{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.35213896912888837, "eval_steps": 160, "global_step": 2240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00015720489693253945, "grad_norm": 1.3751904964447021, "learning_rate": 0.0, "loss": 3.5741, "step": 1 }, { "epoch": 0.00015720489693253945, "eval_loss": 3.4173049926757812, "eval_runtime": 2315.7248, "eval_samples_per_second": 3.998, "eval_steps_per_second": 1.999, "step": 1 }, { "epoch": 0.0003144097938650789, "grad_norm": 1.231239676475525, "learning_rate": 5e-06, "loss": 3.3021, "step": 2 }, { "epoch": 0.00047161469079761836, "grad_norm": 1.3657807111740112, "learning_rate": 1e-05, "loss": 3.6333, "step": 3 }, { "epoch": 0.0006288195877301578, "grad_norm": 1.3117496967315674, "learning_rate": 1.5e-05, "loss": 3.3731, "step": 4 }, { "epoch": 0.0007860244846626972, "grad_norm": 1.4118576049804688, "learning_rate": 2e-05, "loss": 3.612, "step": 5 }, { "epoch": 0.0009432293815952367, "grad_norm": 1.3155895471572876, "learning_rate": 2.5e-05, "loss": 3.3296, "step": 6 }, { "epoch": 0.001100434278527776, "grad_norm": 1.2847192287445068, "learning_rate": 3e-05, "loss": 3.2168, "step": 7 }, { "epoch": 0.0012576391754603156, "grad_norm": 1.1421078443527222, "learning_rate": 3.5e-05, "loss": 3.085, "step": 8 }, { "epoch": 0.0014148440723928551, "grad_norm": 0.9923035502433777, "learning_rate": 4e-05, "loss": 3.0472, "step": 9 }, { "epoch": 0.0015720489693253944, "grad_norm": 0.795043408870697, "learning_rate": 4.5e-05, "loss": 2.6666, "step": 10 }, { "epoch": 0.001729253866257934, "grad_norm": 0.5987974405288696, "learning_rate": 5e-05, "loss": 2.473, "step": 11 }, { "epoch": 0.0018864587631904734, "grad_norm": 0.4488905668258667, "learning_rate": 4.9999999236547564e-05, "loss": 2.3731, "step": 12 }, { "epoch": 0.002043663660123013, "grad_norm": 0.3517301380634308, "learning_rate": 4.999999694619029e-05, "loss": 2.2158, "step": 13 }, { "epoch": 0.002200868557055552, "grad_norm": 0.3045121431350708, "learning_rate": 4.999999312892831e-05, "loss": 2.3351, "step": 14 }, { "epoch": 0.002358073453988092, "grad_norm": 0.24488244950771332, "learning_rate": 4.9999987784761884e-05, "loss": 2.2693, "step": 15 }, { "epoch": 0.0025152783509206312, "grad_norm": 0.22892728447914124, "learning_rate": 4.999998091369132e-05, "loss": 2.1006, "step": 16 }, { "epoch": 0.0026724832478531705, "grad_norm": 0.23219206929206848, "learning_rate": 4.999997251571704e-05, "loss": 2.215, "step": 17 }, { "epoch": 0.0028296881447857102, "grad_norm": 0.24427154660224915, "learning_rate": 4.999996259083956e-05, "loss": 2.1708, "step": 18 }, { "epoch": 0.0029868930417182495, "grad_norm": 0.2640205919742584, "learning_rate": 4.999995113905947e-05, "loss": 2.1709, "step": 19 }, { "epoch": 0.003144097938650789, "grad_norm": 0.26644033193588257, "learning_rate": 4.999993816037749e-05, "loss": 2.1733, "step": 20 }, { "epoch": 0.0033013028355833285, "grad_norm": 0.2621535062789917, "learning_rate": 4.9999923654794414e-05, "loss": 2.0059, "step": 21 }, { "epoch": 0.003458507732515868, "grad_norm": 0.2586187422275543, "learning_rate": 4.999990762231111e-05, "loss": 2.0336, "step": 22 }, { "epoch": 0.003615712629448407, "grad_norm": 0.26732271909713745, "learning_rate": 4.9999890062928566e-05, "loss": 2.0566, "step": 23 }, { "epoch": 0.003772917526380947, "grad_norm": 0.2357867807149887, "learning_rate": 4.999987097664787e-05, "loss": 1.9529, "step": 24 }, { "epoch": 0.003930122423313486, "grad_norm": 0.2297009825706482, "learning_rate": 4.999985036347016e-05, "loss": 2.0369, "step": 25 }, { "epoch": 0.004087327320246026, "grad_norm": 0.20529747009277344, "learning_rate": 4.9999828223396705e-05, "loss": 1.9781, "step": 26 }, { "epoch": 0.004244532217178565, "grad_norm": 0.18342873454093933, "learning_rate": 4.999980455642887e-05, "loss": 1.9986, "step": 27 }, { "epoch": 0.004401737114111104, "grad_norm": 0.16487397253513336, "learning_rate": 4.999977936256809e-05, "loss": 1.9063, "step": 28 }, { "epoch": 0.004558942011043644, "grad_norm": 0.1762266606092453, "learning_rate": 4.99997526418159e-05, "loss": 1.9517, "step": 29 }, { "epoch": 0.004716146907976184, "grad_norm": 0.16371938586235046, "learning_rate": 4.999972439417394e-05, "loss": 1.7734, "step": 30 }, { "epoch": 0.004873351804908723, "grad_norm": 0.17309769988059998, "learning_rate": 4.999969461964392e-05, "loss": 1.8732, "step": 31 }, { "epoch": 0.0050305567018412625, "grad_norm": 0.15772338211536407, "learning_rate": 4.9999663318227683e-05, "loss": 1.7537, "step": 32 }, { "epoch": 0.005187761598773802, "grad_norm": 0.17521986365318298, "learning_rate": 4.9999630489927126e-05, "loss": 2.0077, "step": 33 }, { "epoch": 0.005344966495706341, "grad_norm": 0.15462292730808258, "learning_rate": 4.999959613474425e-05, "loss": 1.8576, "step": 34 }, { "epoch": 0.005502171392638881, "grad_norm": 0.15280336141586304, "learning_rate": 4.999956025268117e-05, "loss": 1.862, "step": 35 }, { "epoch": 0.0056593762895714205, "grad_norm": 0.14518432319164276, "learning_rate": 4.999952284374006e-05, "loss": 1.8893, "step": 36 }, { "epoch": 0.005816581186503959, "grad_norm": 0.16087624430656433, "learning_rate": 4.999948390792321e-05, "loss": 1.8658, "step": 37 }, { "epoch": 0.005973786083436499, "grad_norm": 0.17504698038101196, "learning_rate": 4.999944344523301e-05, "loss": 1.7647, "step": 38 }, { "epoch": 0.006130990980369039, "grad_norm": 0.17786233127117157, "learning_rate": 4.999940145567191e-05, "loss": 1.8133, "step": 39 }, { "epoch": 0.006288195877301578, "grad_norm": 0.1628972887992859, "learning_rate": 4.999935793924249e-05, "loss": 1.7731, "step": 40 }, { "epoch": 0.006445400774234117, "grad_norm": 0.13461466133594513, "learning_rate": 4.9999312895947406e-05, "loss": 1.7558, "step": 41 }, { "epoch": 0.006602605671166657, "grad_norm": 0.12960125505924225, "learning_rate": 4.99992663257894e-05, "loss": 1.7639, "step": 42 }, { "epoch": 0.006759810568099196, "grad_norm": 0.10991287231445312, "learning_rate": 4.9999218228771324e-05, "loss": 1.7538, "step": 43 }, { "epoch": 0.006917015465031736, "grad_norm": 0.11583230644464493, "learning_rate": 4.999916860489612e-05, "loss": 1.715, "step": 44 }, { "epoch": 0.007074220361964275, "grad_norm": 0.10344280302524567, "learning_rate": 4.999911745416681e-05, "loss": 1.6907, "step": 45 }, { "epoch": 0.007231425258896814, "grad_norm": 0.10546118766069412, "learning_rate": 4.999906477658651e-05, "loss": 1.7294, "step": 46 }, { "epoch": 0.007388630155829354, "grad_norm": 0.11775675415992737, "learning_rate": 4.9999010572158465e-05, "loss": 1.7146, "step": 47 }, { "epoch": 0.007545835052761894, "grad_norm": 0.11109112203121185, "learning_rate": 4.999895484088596e-05, "loss": 1.6939, "step": 48 }, { "epoch": 0.007703039949694433, "grad_norm": 0.1116517186164856, "learning_rate": 4.999889758277242e-05, "loss": 1.7271, "step": 49 }, { "epoch": 0.007860244846626972, "grad_norm": 0.11245547980070114, "learning_rate": 4.999883879782132e-05, "loss": 1.7333, "step": 50 }, { "epoch": 0.008017449743559512, "grad_norm": 0.1150551363825798, "learning_rate": 4.999877848603626e-05, "loss": 1.7036, "step": 51 }, { "epoch": 0.008174654640492052, "grad_norm": 0.10856381803750992, "learning_rate": 4.999871664742093e-05, "loss": 1.7493, "step": 52 }, { "epoch": 0.008331859537424591, "grad_norm": 0.10760089010000229, "learning_rate": 4.9998653281979095e-05, "loss": 1.6292, "step": 53 }, { "epoch": 0.00848906443435713, "grad_norm": 0.0932115837931633, "learning_rate": 4.9998588389714634e-05, "loss": 1.6608, "step": 54 }, { "epoch": 0.00864626933128967, "grad_norm": 0.09837482124567032, "learning_rate": 4.9998521970631504e-05, "loss": 1.7834, "step": 55 }, { "epoch": 0.008803474228222209, "grad_norm": 0.08872833847999573, "learning_rate": 4.9998454024733775e-05, "loss": 1.6484, "step": 56 }, { "epoch": 0.008960679125154749, "grad_norm": 0.08829163759946823, "learning_rate": 4.9998384552025577e-05, "loss": 1.5913, "step": 57 }, { "epoch": 0.009117884022087288, "grad_norm": 0.09087682515382767, "learning_rate": 4.999831355251117e-05, "loss": 1.6809, "step": 58 }, { "epoch": 0.009275088919019828, "grad_norm": 0.08675853163003922, "learning_rate": 4.9998241026194884e-05, "loss": 1.6519, "step": 59 }, { "epoch": 0.009432293815952368, "grad_norm": 0.08463481813669205, "learning_rate": 4.999816697308114e-05, "loss": 1.6234, "step": 60 }, { "epoch": 0.009589498712884906, "grad_norm": 0.08403950184583664, "learning_rate": 4.999809139317448e-05, "loss": 1.6533, "step": 61 }, { "epoch": 0.009746703609817445, "grad_norm": 0.08155622333288193, "learning_rate": 4.99980142864795e-05, "loss": 1.6726, "step": 62 }, { "epoch": 0.009903908506749985, "grad_norm": 0.08056480437517166, "learning_rate": 4.999793565300093e-05, "loss": 1.5881, "step": 63 }, { "epoch": 0.010061113403682525, "grad_norm": 0.07879023998975754, "learning_rate": 4.999785549274355e-05, "loss": 1.5568, "step": 64 }, { "epoch": 0.010218318300615065, "grad_norm": 0.07828455418348312, "learning_rate": 4.9997773805712265e-05, "loss": 1.6464, "step": 65 }, { "epoch": 0.010375523197547604, "grad_norm": 0.08054805546998978, "learning_rate": 4.9997690591912075e-05, "loss": 1.6213, "step": 66 }, { "epoch": 0.010532728094480142, "grad_norm": 0.07610727101564407, "learning_rate": 4.999760585134805e-05, "loss": 1.5729, "step": 67 }, { "epoch": 0.010689932991412682, "grad_norm": 0.07693428546190262, "learning_rate": 4.999751958402537e-05, "loss": 1.5444, "step": 68 }, { "epoch": 0.010847137888345222, "grad_norm": 0.0810319185256958, "learning_rate": 4.99974317899493e-05, "loss": 1.7045, "step": 69 }, { "epoch": 0.011004342785277762, "grad_norm": 0.07729896157979965, "learning_rate": 4.9997342469125205e-05, "loss": 1.6268, "step": 70 }, { "epoch": 0.011161547682210301, "grad_norm": 0.07730107754468918, "learning_rate": 4.999725162155855e-05, "loss": 1.658, "step": 71 }, { "epoch": 0.011318752579142841, "grad_norm": 0.08072328567504883, "learning_rate": 4.9997159247254864e-05, "loss": 1.5045, "step": 72 }, { "epoch": 0.011475957476075379, "grad_norm": 0.08120577782392502, "learning_rate": 4.9997065346219805e-05, "loss": 1.568, "step": 73 }, { "epoch": 0.011633162373007919, "grad_norm": 0.08131498098373413, "learning_rate": 4.99969699184591e-05, "loss": 1.6035, "step": 74 }, { "epoch": 0.011790367269940458, "grad_norm": 0.08395873010158539, "learning_rate": 4.9996872963978584e-05, "loss": 1.5844, "step": 75 }, { "epoch": 0.011947572166872998, "grad_norm": 0.08502068370580673, "learning_rate": 4.999677448278417e-05, "loss": 1.6661, "step": 76 }, { "epoch": 0.012104777063805538, "grad_norm": 0.08467952907085419, "learning_rate": 4.999667447488188e-05, "loss": 1.5537, "step": 77 }, { "epoch": 0.012261981960738078, "grad_norm": 0.19682182371616364, "learning_rate": 4.999657294027782e-05, "loss": 1.5051, "step": 78 }, { "epoch": 0.012419186857670617, "grad_norm": 0.08586428314447403, "learning_rate": 4.999646987897818e-05, "loss": 1.565, "step": 79 }, { "epoch": 0.012576391754603155, "grad_norm": 0.08156823366880417, "learning_rate": 4.999636529098928e-05, "loss": 1.6627, "step": 80 }, { "epoch": 0.012733596651535695, "grad_norm": 0.08715341240167618, "learning_rate": 4.9996259176317486e-05, "loss": 1.5862, "step": 81 }, { "epoch": 0.012890801548468235, "grad_norm": 0.09664586186408997, "learning_rate": 4.999615153496928e-05, "loss": 1.5741, "step": 82 }, { "epoch": 0.013048006445400774, "grad_norm": 0.08438891172409058, "learning_rate": 4.999604236695125e-05, "loss": 1.5933, "step": 83 }, { "epoch": 0.013205211342333314, "grad_norm": 0.08333732932806015, "learning_rate": 4.999593167227006e-05, "loss": 1.5904, "step": 84 }, { "epoch": 0.013362416239265854, "grad_norm": 0.07945791631937027, "learning_rate": 4.9995819450932455e-05, "loss": 1.5763, "step": 85 }, { "epoch": 0.013519621136198392, "grad_norm": 0.07682961225509644, "learning_rate": 4.9995705702945304e-05, "loss": 1.5197, "step": 86 }, { "epoch": 0.013676826033130932, "grad_norm": 0.07547677308320999, "learning_rate": 4.999559042831555e-05, "loss": 1.6825, "step": 87 }, { "epoch": 0.013834030930063471, "grad_norm": 0.07293456047773361, "learning_rate": 4.999547362705025e-05, "loss": 1.5466, "step": 88 }, { "epoch": 0.013991235826996011, "grad_norm": 0.07730914652347565, "learning_rate": 4.999535529915651e-05, "loss": 1.5775, "step": 89 }, { "epoch": 0.01414844072392855, "grad_norm": 0.07689664512872696, "learning_rate": 4.9995235444641565e-05, "loss": 1.5881, "step": 90 }, { "epoch": 0.01430564562086109, "grad_norm": 0.07754997909069061, "learning_rate": 4.999511406351275e-05, "loss": 1.5037, "step": 91 }, { "epoch": 0.014462850517793628, "grad_norm": 0.07229866087436676, "learning_rate": 4.999499115577746e-05, "loss": 1.5077, "step": 92 }, { "epoch": 0.014620055414726168, "grad_norm": 0.07491567730903625, "learning_rate": 4.9994866721443215e-05, "loss": 1.5461, "step": 93 }, { "epoch": 0.014777260311658708, "grad_norm": 0.07258685678243637, "learning_rate": 4.9994740760517605e-05, "loss": 1.5516, "step": 94 }, { "epoch": 0.014934465208591248, "grad_norm": 0.07643327116966248, "learning_rate": 4.9994613273008334e-05, "loss": 1.6223, "step": 95 }, { "epoch": 0.015091670105523787, "grad_norm": 0.0740588903427124, "learning_rate": 4.999448425892318e-05, "loss": 1.5322, "step": 96 }, { "epoch": 0.015248875002456327, "grad_norm": 0.44172239303588867, "learning_rate": 4.999435371827003e-05, "loss": 1.5498, "step": 97 }, { "epoch": 0.015406079899388867, "grad_norm": 0.0756363570690155, "learning_rate": 4.999422165105684e-05, "loss": 1.559, "step": 98 }, { "epoch": 0.015563284796321405, "grad_norm": 0.07251248508691788, "learning_rate": 4.99940880572917e-05, "loss": 1.5903, "step": 99 }, { "epoch": 0.015720489693253945, "grad_norm": 0.06931837648153305, "learning_rate": 4.999395293698275e-05, "loss": 1.4849, "step": 100 }, { "epoch": 0.015877694590186484, "grad_norm": 0.07403590530157089, "learning_rate": 4.9993816290138254e-05, "loss": 1.5191, "step": 101 }, { "epoch": 0.016034899487119024, "grad_norm": 0.07027724385261536, "learning_rate": 4.999367811676655e-05, "loss": 1.5655, "step": 102 }, { "epoch": 0.016192104384051564, "grad_norm": 0.07320379465818405, "learning_rate": 4.9993538416876093e-05, "loss": 1.4869, "step": 103 }, { "epoch": 0.016349309280984103, "grad_norm": 0.0726180374622345, "learning_rate": 4.9993397190475396e-05, "loss": 1.4629, "step": 104 }, { "epoch": 0.016506514177916643, "grad_norm": 0.07542011886835098, "learning_rate": 4.999325443757309e-05, "loss": 1.5976, "step": 105 }, { "epoch": 0.016663719074849183, "grad_norm": 0.07440067082643509, "learning_rate": 4.9993110158177895e-05, "loss": 1.5469, "step": 106 }, { "epoch": 0.016820923971781723, "grad_norm": 0.07547372579574585, "learning_rate": 4.999296435229863e-05, "loss": 1.5328, "step": 107 }, { "epoch": 0.01697812886871426, "grad_norm": 0.07532137632369995, "learning_rate": 4.999281701994419e-05, "loss": 1.6742, "step": 108 }, { "epoch": 0.0171353337656468, "grad_norm": 0.07249438762664795, "learning_rate": 4.999266816112358e-05, "loss": 1.4799, "step": 109 }, { "epoch": 0.01729253866257934, "grad_norm": 0.07399806380271912, "learning_rate": 4.999251777584589e-05, "loss": 1.5438, "step": 110 }, { "epoch": 0.017449743559511878, "grad_norm": 0.08135057240724564, "learning_rate": 4.99923658641203e-05, "loss": 1.5608, "step": 111 }, { "epoch": 0.017606948456444418, "grad_norm": 0.07508935779333115, "learning_rate": 4.99922124259561e-05, "loss": 1.5894, "step": 112 }, { "epoch": 0.017764153353376957, "grad_norm": 0.07432372123003006, "learning_rate": 4.999205746136265e-05, "loss": 1.4818, "step": 113 }, { "epoch": 0.017921358250309497, "grad_norm": 0.07694194465875626, "learning_rate": 4.999190097034942e-05, "loss": 1.5629, "step": 114 }, { "epoch": 0.018078563147242037, "grad_norm": 0.07384433597326279, "learning_rate": 4.999174295292597e-05, "loss": 1.4829, "step": 115 }, { "epoch": 0.018235768044174577, "grad_norm": 0.07152919471263885, "learning_rate": 4.999158340910195e-05, "loss": 1.4748, "step": 116 }, { "epoch": 0.018392972941107116, "grad_norm": 0.07719701528549194, "learning_rate": 4.999142233888709e-05, "loss": 1.5524, "step": 117 }, { "epoch": 0.018550177838039656, "grad_norm": 0.07540587335824966, "learning_rate": 4.999125974229125e-05, "loss": 1.4661, "step": 118 }, { "epoch": 0.018707382734972196, "grad_norm": 0.0787581130862236, "learning_rate": 4.9991095619324344e-05, "loss": 1.6455, "step": 119 }, { "epoch": 0.018864587631904736, "grad_norm": 0.07454577833414078, "learning_rate": 4.999092996999641e-05, "loss": 1.5083, "step": 120 }, { "epoch": 0.019021792528837272, "grad_norm": 0.0751076266169548, "learning_rate": 4.9990762794317545e-05, "loss": 1.4874, "step": 121 }, { "epoch": 0.01917899742576981, "grad_norm": 0.07733119279146194, "learning_rate": 4.999059409229798e-05, "loss": 1.6308, "step": 122 }, { "epoch": 0.01933620232270235, "grad_norm": 0.07897089421749115, "learning_rate": 4.999042386394802e-05, "loss": 1.5906, "step": 123 }, { "epoch": 0.01949340721963489, "grad_norm": 0.07758141309022903, "learning_rate": 4.999025210927804e-05, "loss": 1.5604, "step": 124 }, { "epoch": 0.01965061211656743, "grad_norm": 0.07845707982778549, "learning_rate": 4.9990078828298544e-05, "loss": 1.5901, "step": 125 }, { "epoch": 0.01980781701349997, "grad_norm": 0.0772818773984909, "learning_rate": 4.998990402102012e-05, "loss": 1.4516, "step": 126 }, { "epoch": 0.01996502191043251, "grad_norm": 0.07795504480600357, "learning_rate": 4.998972768745344e-05, "loss": 1.4642, "step": 127 }, { "epoch": 0.02012222680736505, "grad_norm": 0.0784008800983429, "learning_rate": 4.998954982760926e-05, "loss": 1.5936, "step": 128 }, { "epoch": 0.02027943170429759, "grad_norm": 0.07791212201118469, "learning_rate": 4.9989370441498465e-05, "loss": 1.4705, "step": 129 }, { "epoch": 0.02043663660123013, "grad_norm": 0.07785367220640182, "learning_rate": 4.9989189529132004e-05, "loss": 1.5085, "step": 130 }, { "epoch": 0.02059384149816267, "grad_norm": 0.07916689664125443, "learning_rate": 4.9989007090520925e-05, "loss": 1.5365, "step": 131 }, { "epoch": 0.02075104639509521, "grad_norm": 0.0775083601474762, "learning_rate": 4.9988823125676367e-05, "loss": 1.5286, "step": 132 }, { "epoch": 0.020908251292027745, "grad_norm": 0.08110442757606506, "learning_rate": 4.998863763460956e-05, "loss": 1.5779, "step": 133 }, { "epoch": 0.021065456188960285, "grad_norm": 0.0814640000462532, "learning_rate": 4.998845061733185e-05, "loss": 1.4778, "step": 134 }, { "epoch": 0.021222661085892824, "grad_norm": 0.08069492131471634, "learning_rate": 4.998826207385465e-05, "loss": 1.5317, "step": 135 }, { "epoch": 0.021379865982825364, "grad_norm": 0.07377774268388748, "learning_rate": 4.998807200418948e-05, "loss": 1.5258, "step": 136 }, { "epoch": 0.021537070879757904, "grad_norm": 0.0787922590970993, "learning_rate": 4.9987880408347945e-05, "loss": 1.5185, "step": 137 }, { "epoch": 0.021694275776690444, "grad_norm": 0.07662995159626007, "learning_rate": 4.9987687286341745e-05, "loss": 1.4637, "step": 138 }, { "epoch": 0.021851480673622983, "grad_norm": 0.08528955280780792, "learning_rate": 4.9987492638182676e-05, "loss": 1.4776, "step": 139 }, { "epoch": 0.022008685570555523, "grad_norm": 0.08089053630828857, "learning_rate": 4.9987296463882626e-05, "loss": 1.5885, "step": 140 }, { "epoch": 0.022165890467488063, "grad_norm": 0.08029694855213165, "learning_rate": 4.998709876345358e-05, "loss": 1.4557, "step": 141 }, { "epoch": 0.022323095364420602, "grad_norm": 0.07918502390384674, "learning_rate": 4.9986899536907614e-05, "loss": 1.4285, "step": 142 }, { "epoch": 0.022480300261353142, "grad_norm": 0.0813126415014267, "learning_rate": 4.998669878425689e-05, "loss": 1.5958, "step": 143 }, { "epoch": 0.022637505158285682, "grad_norm": 0.07935188710689545, "learning_rate": 4.998649650551368e-05, "loss": 1.5249, "step": 144 }, { "epoch": 0.02279471005521822, "grad_norm": 0.08163304626941681, "learning_rate": 4.9986292700690324e-05, "loss": 1.483, "step": 145 }, { "epoch": 0.022951914952150758, "grad_norm": 0.08277447521686554, "learning_rate": 4.998608736979928e-05, "loss": 1.6212, "step": 146 }, { "epoch": 0.023109119849083298, "grad_norm": 0.08285827934741974, "learning_rate": 4.9985880512853076e-05, "loss": 1.4495, "step": 147 }, { "epoch": 0.023266324746015837, "grad_norm": 0.082750603556633, "learning_rate": 4.998567212986437e-05, "loss": 1.4335, "step": 148 }, { "epoch": 0.023423529642948377, "grad_norm": 0.07986058294773102, "learning_rate": 4.998546222084587e-05, "loss": 1.4704, "step": 149 }, { "epoch": 0.023580734539880917, "grad_norm": 0.08105576783418655, "learning_rate": 4.9985250785810396e-05, "loss": 1.5183, "step": 150 }, { "epoch": 0.023737939436813457, "grad_norm": 0.08202917128801346, "learning_rate": 4.9985037824770866e-05, "loss": 1.5423, "step": 151 }, { "epoch": 0.023895144333745996, "grad_norm": 0.08937894552946091, "learning_rate": 4.998482333774029e-05, "loss": 1.5731, "step": 152 }, { "epoch": 0.024052349230678536, "grad_norm": 0.08333728462457657, "learning_rate": 4.9984607324731766e-05, "loss": 1.5133, "step": 153 }, { "epoch": 0.024209554127611076, "grad_norm": 0.08529175072908401, "learning_rate": 4.998438978575849e-05, "loss": 1.516, "step": 154 }, { "epoch": 0.024366759024543615, "grad_norm": 0.08508963882923126, "learning_rate": 4.998417072083374e-05, "loss": 1.5646, "step": 155 }, { "epoch": 0.024523963921476155, "grad_norm": 0.08971578627824783, "learning_rate": 4.99839501299709e-05, "loss": 1.4714, "step": 156 }, { "epoch": 0.024681168818408695, "grad_norm": 0.08380109816789627, "learning_rate": 4.998372801318345e-05, "loss": 1.4476, "step": 157 }, { "epoch": 0.024838373715341235, "grad_norm": 0.08533143252134323, "learning_rate": 4.9983504370484945e-05, "loss": 1.4866, "step": 158 }, { "epoch": 0.02499557861227377, "grad_norm": 0.08318709582090378, "learning_rate": 4.998327920188905e-05, "loss": 1.5274, "step": 159 }, { "epoch": 0.02515278350920631, "grad_norm": 0.08486370742321014, "learning_rate": 4.9983052507409525e-05, "loss": 1.4713, "step": 160 }, { "epoch": 0.02515278350920631, "eval_loss": 1.5136528015136719, "eval_runtime": 2318.8971, "eval_samples_per_second": 3.992, "eval_steps_per_second": 1.996, "step": 160 }, { "epoch": 0.02530998840613885, "grad_norm": 0.08242359757423401, "learning_rate": 4.9982824287060195e-05, "loss": 1.5069, "step": 161 }, { "epoch": 0.02546719330307139, "grad_norm": 0.08547423779964447, "learning_rate": 4.9982594540855014e-05, "loss": 1.4973, "step": 162 }, { "epoch": 0.02562439820000393, "grad_norm": 0.08345580101013184, "learning_rate": 4.9982363268808016e-05, "loss": 1.5078, "step": 163 }, { "epoch": 0.02578160309693647, "grad_norm": 0.0830339640378952, "learning_rate": 4.9982130470933316e-05, "loss": 1.4098, "step": 164 }, { "epoch": 0.02593880799386901, "grad_norm": 0.08568515628576279, "learning_rate": 4.998189614724514e-05, "loss": 1.4628, "step": 165 }, { "epoch": 0.02609601289080155, "grad_norm": 0.08261829614639282, "learning_rate": 4.998166029775779e-05, "loss": 1.4492, "step": 166 }, { "epoch": 0.02625321778773409, "grad_norm": 0.08944887667894363, "learning_rate": 4.998142292248569e-05, "loss": 1.5633, "step": 167 }, { "epoch": 0.02641042268466663, "grad_norm": 0.08632911741733551, "learning_rate": 4.998118402144332e-05, "loss": 1.5106, "step": 168 }, { "epoch": 0.026567627581599168, "grad_norm": 0.08733859658241272, "learning_rate": 4.998094359464528e-05, "loss": 1.5607, "step": 169 }, { "epoch": 0.026724832478531708, "grad_norm": 0.08667927235364914, "learning_rate": 4.9980701642106245e-05, "loss": 1.4544, "step": 170 }, { "epoch": 0.026882037375464244, "grad_norm": 0.08655022084712982, "learning_rate": 4.9980458163841006e-05, "loss": 1.5264, "step": 171 }, { "epoch": 0.027039242272396784, "grad_norm": 0.08899988234043121, "learning_rate": 4.9980213159864426e-05, "loss": 1.4778, "step": 172 }, { "epoch": 0.027196447169329323, "grad_norm": 0.09411856532096863, "learning_rate": 4.997996663019147e-05, "loss": 1.5269, "step": 173 }, { "epoch": 0.027353652066261863, "grad_norm": 0.087191641330719, "learning_rate": 4.997971857483719e-05, "loss": 1.5166, "step": 174 }, { "epoch": 0.027510856963194403, "grad_norm": 0.08959636092185974, "learning_rate": 4.997946899381675e-05, "loss": 1.5503, "step": 175 }, { "epoch": 0.027668061860126943, "grad_norm": 0.0951187014579773, "learning_rate": 4.997921788714537e-05, "loss": 1.4879, "step": 176 }, { "epoch": 0.027825266757059482, "grad_norm": 0.09324768930673599, "learning_rate": 4.997896525483841e-05, "loss": 1.5714, "step": 177 }, { "epoch": 0.027982471653992022, "grad_norm": 0.08633986115455627, "learning_rate": 4.997871109691129e-05, "loss": 1.4198, "step": 178 }, { "epoch": 0.028139676550924562, "grad_norm": 0.08947525173425674, "learning_rate": 4.9978455413379535e-05, "loss": 1.4702, "step": 179 }, { "epoch": 0.0282968814478571, "grad_norm": 0.09275490790605545, "learning_rate": 4.9978198204258766e-05, "loss": 1.5252, "step": 180 }, { "epoch": 0.02845408634478964, "grad_norm": 0.08761609345674515, "learning_rate": 4.9977939469564676e-05, "loss": 1.505, "step": 181 }, { "epoch": 0.02861129124172218, "grad_norm": 0.08683087676763535, "learning_rate": 4.997767920931308e-05, "loss": 1.5059, "step": 182 }, { "epoch": 0.02876849613865472, "grad_norm": 0.08931361883878708, "learning_rate": 4.997741742351988e-05, "loss": 1.5003, "step": 183 }, { "epoch": 0.028925701035587257, "grad_norm": 0.08820109069347382, "learning_rate": 4.997715411220105e-05, "loss": 1.5132, "step": 184 }, { "epoch": 0.029082905932519797, "grad_norm": 0.09284964948892593, "learning_rate": 4.997688927537268e-05, "loss": 1.4561, "step": 185 }, { "epoch": 0.029240110829452336, "grad_norm": 0.09472864121198654, "learning_rate": 4.997662291305094e-05, "loss": 1.4729, "step": 186 }, { "epoch": 0.029397315726384876, "grad_norm": 0.08725330233573914, "learning_rate": 4.997635502525211e-05, "loss": 1.3994, "step": 187 }, { "epoch": 0.029554520623317416, "grad_norm": 0.09085626900196075, "learning_rate": 4.9976085611992536e-05, "loss": 1.4695, "step": 188 }, { "epoch": 0.029711725520249956, "grad_norm": 0.09322400391101837, "learning_rate": 4.9975814673288684e-05, "loss": 1.4753, "step": 189 }, { "epoch": 0.029868930417182495, "grad_norm": 0.08927160501480103, "learning_rate": 4.99755422091571e-05, "loss": 1.4465, "step": 190 }, { "epoch": 0.030026135314115035, "grad_norm": 0.09317070990800858, "learning_rate": 4.997526821961442e-05, "loss": 1.5124, "step": 191 }, { "epoch": 0.030183340211047575, "grad_norm": 0.08911167085170746, "learning_rate": 4.9974992704677385e-05, "loss": 1.4515, "step": 192 }, { "epoch": 0.030340545107980114, "grad_norm": 0.09432853013277054, "learning_rate": 4.997471566436282e-05, "loss": 1.4623, "step": 193 }, { "epoch": 0.030497750004912654, "grad_norm": 0.09417332708835602, "learning_rate": 4.997443709868764e-05, "loss": 1.5103, "step": 194 }, { "epoch": 0.030654954901845194, "grad_norm": 0.09564542025327682, "learning_rate": 4.997415700766887e-05, "loss": 1.4929, "step": 195 }, { "epoch": 0.030812159798777734, "grad_norm": 0.09101004898548126, "learning_rate": 4.997387539132361e-05, "loss": 1.4225, "step": 196 }, { "epoch": 0.03096936469571027, "grad_norm": 0.09196274727582932, "learning_rate": 4.997359224966906e-05, "loss": 1.4701, "step": 197 }, { "epoch": 0.03112656959264281, "grad_norm": 0.09573279321193695, "learning_rate": 4.997330758272251e-05, "loss": 1.4425, "step": 198 }, { "epoch": 0.03128377448957535, "grad_norm": 0.09180758893489838, "learning_rate": 4.9973021390501354e-05, "loss": 1.4426, "step": 199 }, { "epoch": 0.03144097938650789, "grad_norm": 0.09583238512277603, "learning_rate": 4.997273367302306e-05, "loss": 1.5158, "step": 200 }, { "epoch": 0.03159818428344043, "grad_norm": 0.09394747018814087, "learning_rate": 4.997244443030521e-05, "loss": 1.4306, "step": 201 }, { "epoch": 0.03175538918037297, "grad_norm": 0.09470199793577194, "learning_rate": 4.9972153662365474e-05, "loss": 1.5286, "step": 202 }, { "epoch": 0.031912594077305505, "grad_norm": 0.09274959564208984, "learning_rate": 4.997186136922161e-05, "loss": 1.4803, "step": 203 }, { "epoch": 0.03206979897423805, "grad_norm": 0.09344369918107986, "learning_rate": 4.997156755089145e-05, "loss": 1.5449, "step": 204 }, { "epoch": 0.032227003871170584, "grad_norm": 0.09794919937849045, "learning_rate": 4.997127220739296e-05, "loss": 1.4383, "step": 205 }, { "epoch": 0.03238420876810313, "grad_norm": 0.09698093682527542, "learning_rate": 4.997097533874418e-05, "loss": 1.4462, "step": 206 }, { "epoch": 0.032541413665035664, "grad_norm": 0.09690559655427933, "learning_rate": 4.997067694496323e-05, "loss": 1.4735, "step": 207 }, { "epoch": 0.03269861856196821, "grad_norm": 0.09657544642686844, "learning_rate": 4.9970377026068336e-05, "loss": 1.5672, "step": 208 }, { "epoch": 0.03285582345890074, "grad_norm": 0.09483659267425537, "learning_rate": 4.9970075582077825e-05, "loss": 1.4931, "step": 209 }, { "epoch": 0.033013028355833286, "grad_norm": 0.09744243323802948, "learning_rate": 4.9969772613010104e-05, "loss": 1.4638, "step": 210 }, { "epoch": 0.03317023325276582, "grad_norm": 0.09521006047725677, "learning_rate": 4.9969468118883665e-05, "loss": 1.4127, "step": 211 }, { "epoch": 0.033327438149698366, "grad_norm": 0.09646004438400269, "learning_rate": 4.996916209971713e-05, "loss": 1.5139, "step": 212 }, { "epoch": 0.0334846430466309, "grad_norm": 0.09292810410261154, "learning_rate": 4.996885455552916e-05, "loss": 1.4399, "step": 213 }, { "epoch": 0.033641847943563445, "grad_norm": 0.09986516088247299, "learning_rate": 4.996854548633857e-05, "loss": 1.4637, "step": 214 }, { "epoch": 0.03379905284049598, "grad_norm": 0.09723702073097229, "learning_rate": 4.996823489216421e-05, "loss": 1.5673, "step": 215 }, { "epoch": 0.03395625773742852, "grad_norm": 0.09608977288007736, "learning_rate": 4.996792277302507e-05, "loss": 1.4428, "step": 216 }, { "epoch": 0.03411346263436106, "grad_norm": 0.09329380095005035, "learning_rate": 4.99676091289402e-05, "loss": 1.3892, "step": 217 }, { "epoch": 0.0342706675312936, "grad_norm": 0.0959913358092308, "learning_rate": 4.996729395992875e-05, "loss": 1.5219, "step": 218 }, { "epoch": 0.03442787242822614, "grad_norm": 0.09832671284675598, "learning_rate": 4.996697726600999e-05, "loss": 1.5259, "step": 219 }, { "epoch": 0.03458507732515868, "grad_norm": 0.10061636567115784, "learning_rate": 4.996665904720325e-05, "loss": 1.5216, "step": 220 }, { "epoch": 0.03474228222209122, "grad_norm": 0.09742400050163269, "learning_rate": 4.9966339303527965e-05, "loss": 1.3819, "step": 221 }, { "epoch": 0.034899487119023756, "grad_norm": 0.09629969298839569, "learning_rate": 4.996601803500367e-05, "loss": 1.5341, "step": 222 }, { "epoch": 0.0350566920159563, "grad_norm": 0.09776200354099274, "learning_rate": 4.996569524164998e-05, "loss": 1.5054, "step": 223 }, { "epoch": 0.035213896912888835, "grad_norm": 0.1008530780673027, "learning_rate": 4.996537092348661e-05, "loss": 1.5333, "step": 224 }, { "epoch": 0.03537110180982138, "grad_norm": 0.09749735891819, "learning_rate": 4.996504508053338e-05, "loss": 1.3899, "step": 225 }, { "epoch": 0.035528306706753915, "grad_norm": 0.10522401332855225, "learning_rate": 4.9964717712810175e-05, "loss": 1.5413, "step": 226 }, { "epoch": 0.03568551160368646, "grad_norm": 0.09566272795200348, "learning_rate": 4.9964388820336996e-05, "loss": 1.435, "step": 227 }, { "epoch": 0.035842716500618994, "grad_norm": 0.10133984684944153, "learning_rate": 4.996405840313393e-05, "loss": 1.445, "step": 228 }, { "epoch": 0.03599992139755153, "grad_norm": 0.09702739119529724, "learning_rate": 4.996372646122116e-05, "loss": 1.4287, "step": 229 }, { "epoch": 0.036157126294484074, "grad_norm": 0.1012992411851883, "learning_rate": 4.996339299461896e-05, "loss": 1.382, "step": 230 }, { "epoch": 0.03631433119141661, "grad_norm": 0.09877166152000427, "learning_rate": 4.99630580033477e-05, "loss": 1.5729, "step": 231 }, { "epoch": 0.03647153608834915, "grad_norm": 0.1033129170536995, "learning_rate": 4.996272148742783e-05, "loss": 1.4754, "step": 232 }, { "epoch": 0.03662874098528169, "grad_norm": 0.09901215881109238, "learning_rate": 4.9962383446879914e-05, "loss": 1.5153, "step": 233 }, { "epoch": 0.03678594588221423, "grad_norm": 0.10241983830928802, "learning_rate": 4.996204388172458e-05, "loss": 1.5131, "step": 234 }, { "epoch": 0.03694315077914677, "grad_norm": 0.09574593603610992, "learning_rate": 4.9961702791982594e-05, "loss": 1.5285, "step": 235 }, { "epoch": 0.03710035567607931, "grad_norm": 0.10309838503599167, "learning_rate": 4.996136017767477e-05, "loss": 1.5751, "step": 236 }, { "epoch": 0.03725756057301185, "grad_norm": 0.09928470849990845, "learning_rate": 4.996101603882204e-05, "loss": 1.5108, "step": 237 }, { "epoch": 0.03741476546994439, "grad_norm": 0.10514767467975616, "learning_rate": 4.996067037544542e-05, "loss": 1.4206, "step": 238 }, { "epoch": 0.03757197036687693, "grad_norm": 0.10411518812179565, "learning_rate": 4.996032318756601e-05, "loss": 1.5628, "step": 239 }, { "epoch": 0.03772917526380947, "grad_norm": 0.0989808738231659, "learning_rate": 4.9959974475205045e-05, "loss": 1.4444, "step": 240 }, { "epoch": 0.03788638016074201, "grad_norm": 0.10069911926984787, "learning_rate": 4.9959624238383804e-05, "loss": 1.4805, "step": 241 }, { "epoch": 0.038043585057674544, "grad_norm": 0.10637518763542175, "learning_rate": 4.995927247712367e-05, "loss": 1.5289, "step": 242 }, { "epoch": 0.03820078995460709, "grad_norm": 0.10085684061050415, "learning_rate": 4.995891919144614e-05, "loss": 1.5288, "step": 243 }, { "epoch": 0.03835799485153962, "grad_norm": 0.09989017248153687, "learning_rate": 4.995856438137279e-05, "loss": 1.5444, "step": 244 }, { "epoch": 0.038515199748472166, "grad_norm": 0.10382463037967682, "learning_rate": 4.9958208046925294e-05, "loss": 1.4621, "step": 245 }, { "epoch": 0.0386724046454047, "grad_norm": 0.10208063572645187, "learning_rate": 4.99578501881254e-05, "loss": 1.5003, "step": 246 }, { "epoch": 0.038829609542337246, "grad_norm": 0.1028011366724968, "learning_rate": 4.9957490804994977e-05, "loss": 1.516, "step": 247 }, { "epoch": 0.03898681443926978, "grad_norm": 0.10475701838731766, "learning_rate": 4.995712989755598e-05, "loss": 1.5333, "step": 248 }, { "epoch": 0.039144019336202325, "grad_norm": 0.1038154736161232, "learning_rate": 4.995676746583044e-05, "loss": 1.4779, "step": 249 }, { "epoch": 0.03930122423313486, "grad_norm": 0.10413440316915512, "learning_rate": 4.99564035098405e-05, "loss": 1.5241, "step": 250 }, { "epoch": 0.039458429130067404, "grad_norm": 0.09869382530450821, "learning_rate": 4.995603802960838e-05, "loss": 1.442, "step": 251 }, { "epoch": 0.03961563402699994, "grad_norm": 0.10138234496116638, "learning_rate": 4.995567102515641e-05, "loss": 1.5393, "step": 252 }, { "epoch": 0.039772838923932484, "grad_norm": 0.10225867480039597, "learning_rate": 4.995530249650701e-05, "loss": 1.4516, "step": 253 }, { "epoch": 0.03993004382086502, "grad_norm": 0.09942895174026489, "learning_rate": 4.995493244368268e-05, "loss": 1.4543, "step": 254 }, { "epoch": 0.040087248717797556, "grad_norm": 0.11218860745429993, "learning_rate": 4.995456086670602e-05, "loss": 1.4985, "step": 255 }, { "epoch": 0.0402444536147301, "grad_norm": 0.10839337855577469, "learning_rate": 4.9954187765599736e-05, "loss": 1.4805, "step": 256 }, { "epoch": 0.040401658511662636, "grad_norm": 0.10317599028348923, "learning_rate": 4.9953813140386595e-05, "loss": 1.4412, "step": 257 }, { "epoch": 0.04055886340859518, "grad_norm": 0.10285656154155731, "learning_rate": 4.99534369910895e-05, "loss": 1.476, "step": 258 }, { "epoch": 0.040716068305527715, "grad_norm": 0.10330680012702942, "learning_rate": 4.995305931773141e-05, "loss": 1.5157, "step": 259 }, { "epoch": 0.04087327320246026, "grad_norm": 0.1086694598197937, "learning_rate": 4.99526801203354e-05, "loss": 1.4999, "step": 260 }, { "epoch": 0.041030478099392795, "grad_norm": 0.10800144821405411, "learning_rate": 4.995229939892464e-05, "loss": 1.4764, "step": 261 }, { "epoch": 0.04118768299632534, "grad_norm": 0.10645303875207901, "learning_rate": 4.9951917153522355e-05, "loss": 1.4404, "step": 262 }, { "epoch": 0.041344887893257874, "grad_norm": 0.10440964996814728, "learning_rate": 4.9951533384151906e-05, "loss": 1.3678, "step": 263 }, { "epoch": 0.04150209279019042, "grad_norm": 0.10993078351020813, "learning_rate": 4.995114809083673e-05, "loss": 1.5064, "step": 264 }, { "epoch": 0.041659297687122954, "grad_norm": 0.10710245370864868, "learning_rate": 4.9950761273600366e-05, "loss": 1.4134, "step": 265 }, { "epoch": 0.04181650258405549, "grad_norm": 0.11030582338571548, "learning_rate": 4.995037293246644e-05, "loss": 1.5299, "step": 266 }, { "epoch": 0.04197370748098803, "grad_norm": 0.1058267131447792, "learning_rate": 4.994998306745866e-05, "loss": 1.3654, "step": 267 }, { "epoch": 0.04213091237792057, "grad_norm": 0.10541702806949615, "learning_rate": 4.994959167860084e-05, "loss": 1.4297, "step": 268 }, { "epoch": 0.04228811727485311, "grad_norm": 0.11085420846939087, "learning_rate": 4.994919876591689e-05, "loss": 1.4876, "step": 269 }, { "epoch": 0.04244532217178565, "grad_norm": 0.11054470390081406, "learning_rate": 4.994880432943081e-05, "loss": 1.574, "step": 270 }, { "epoch": 0.04260252706871819, "grad_norm": 0.11234510689973831, "learning_rate": 4.994840836916668e-05, "loss": 1.5079, "step": 271 }, { "epoch": 0.04275973196565073, "grad_norm": 0.11040106415748596, "learning_rate": 4.994801088514869e-05, "loss": 1.5091, "step": 272 }, { "epoch": 0.04291693686258327, "grad_norm": 0.10639887303113937, "learning_rate": 4.994761187740111e-05, "loss": 1.4495, "step": 273 }, { "epoch": 0.04307414175951581, "grad_norm": 0.11268071085214615, "learning_rate": 4.994721134594833e-05, "loss": 1.5057, "step": 274 }, { "epoch": 0.04323134665644835, "grad_norm": 0.10079260170459747, "learning_rate": 4.994680929081479e-05, "loss": 1.4145, "step": 275 }, { "epoch": 0.04338855155338089, "grad_norm": 0.11474710702896118, "learning_rate": 4.994640571202506e-05, "loss": 1.5061, "step": 276 }, { "epoch": 0.04354575645031343, "grad_norm": 0.10946876555681229, "learning_rate": 4.994600060960377e-05, "loss": 1.5306, "step": 277 }, { "epoch": 0.04370296134724597, "grad_norm": 0.11192137002944946, "learning_rate": 4.994559398357569e-05, "loss": 1.5347, "step": 278 }, { "epoch": 0.0438601662441785, "grad_norm": 0.10744784027338028, "learning_rate": 4.994518583396564e-05, "loss": 1.4686, "step": 279 }, { "epoch": 0.044017371141111046, "grad_norm": 0.11113352328538895, "learning_rate": 4.9944776160798544e-05, "loss": 1.4101, "step": 280 }, { "epoch": 0.04417457603804358, "grad_norm": 0.11456230282783508, "learning_rate": 4.994436496409943e-05, "loss": 1.4036, "step": 281 }, { "epoch": 0.044331780934976125, "grad_norm": 0.11608672887086868, "learning_rate": 4.994395224389342e-05, "loss": 1.4949, "step": 282 }, { "epoch": 0.04448898583190866, "grad_norm": 0.1232326403260231, "learning_rate": 4.9943538000205705e-05, "loss": 1.5501, "step": 283 }, { "epoch": 0.044646190728841205, "grad_norm": 0.11791515350341797, "learning_rate": 4.994312223306159e-05, "loss": 1.4542, "step": 284 }, { "epoch": 0.04480339562577374, "grad_norm": 0.11657550930976868, "learning_rate": 4.9942704942486476e-05, "loss": 1.4724, "step": 285 }, { "epoch": 0.044960600522706284, "grad_norm": 0.11560262739658356, "learning_rate": 4.994228612850584e-05, "loss": 1.4036, "step": 286 }, { "epoch": 0.04511780541963882, "grad_norm": 0.10999175906181335, "learning_rate": 4.994186579114527e-05, "loss": 1.4489, "step": 287 }, { "epoch": 0.045275010316571364, "grad_norm": 0.11586826294660568, "learning_rate": 4.9941443930430436e-05, "loss": 1.5486, "step": 288 }, { "epoch": 0.0454322152135039, "grad_norm": 0.11349951475858688, "learning_rate": 4.994102054638711e-05, "loss": 1.5698, "step": 289 }, { "epoch": 0.04558942011043644, "grad_norm": 0.11978698521852493, "learning_rate": 4.9940595639041134e-05, "loss": 1.3933, "step": 290 }, { "epoch": 0.04574662500736898, "grad_norm": 0.11438622325658798, "learning_rate": 4.994016920841846e-05, "loss": 1.5005, "step": 291 }, { "epoch": 0.045903829904301516, "grad_norm": 0.11395915597677231, "learning_rate": 4.9939741254545155e-05, "loss": 1.4521, "step": 292 }, { "epoch": 0.04606103480123406, "grad_norm": 0.11659599095582962, "learning_rate": 4.993931177744734e-05, "loss": 1.5166, "step": 293 }, { "epoch": 0.046218239698166595, "grad_norm": 0.11053171753883362, "learning_rate": 4.9938880777151254e-05, "loss": 1.4459, "step": 294 }, { "epoch": 0.04637544459509914, "grad_norm": 0.11428084224462509, "learning_rate": 4.993844825368321e-05, "loss": 1.4448, "step": 295 }, { "epoch": 0.046532649492031675, "grad_norm": 0.10734150558710098, "learning_rate": 4.993801420706964e-05, "loss": 1.3388, "step": 296 }, { "epoch": 0.04668985438896422, "grad_norm": 0.11137369275093079, "learning_rate": 4.993757863733703e-05, "loss": 1.4155, "step": 297 }, { "epoch": 0.046847059285896754, "grad_norm": 0.1221408098936081, "learning_rate": 4.993714154451202e-05, "loss": 1.4884, "step": 298 }, { "epoch": 0.0470042641828293, "grad_norm": 0.11707969009876251, "learning_rate": 4.993670292862127e-05, "loss": 1.4605, "step": 299 }, { "epoch": 0.047161469079761834, "grad_norm": 0.11751751601696014, "learning_rate": 4.993626278969158e-05, "loss": 1.5538, "step": 300 }, { "epoch": 0.04731867397669438, "grad_norm": 0.11617731302976608, "learning_rate": 4.993582112774984e-05, "loss": 1.438, "step": 301 }, { "epoch": 0.04747587887362691, "grad_norm": 0.15164637565612793, "learning_rate": 4.993537794282302e-05, "loss": 1.4607, "step": 302 }, { "epoch": 0.047633083770559456, "grad_norm": 0.12434446811676025, "learning_rate": 4.9934933234938193e-05, "loss": 1.4167, "step": 303 }, { "epoch": 0.04779028866749199, "grad_norm": 0.12518739700317383, "learning_rate": 4.993448700412251e-05, "loss": 1.4003, "step": 304 }, { "epoch": 0.04794749356442453, "grad_norm": 0.11146944761276245, "learning_rate": 4.993403925040323e-05, "loss": 1.3913, "step": 305 }, { "epoch": 0.04810469846135707, "grad_norm": 0.11682326346635818, "learning_rate": 4.993358997380771e-05, "loss": 1.3415, "step": 306 }, { "epoch": 0.04826190335828961, "grad_norm": 0.1197504773736, "learning_rate": 4.993313917436336e-05, "loss": 1.515, "step": 307 }, { "epoch": 0.04841910825522215, "grad_norm": 0.14647473394870758, "learning_rate": 4.993268685209775e-05, "loss": 1.4529, "step": 308 }, { "epoch": 0.04857631315215469, "grad_norm": 0.12431525439023972, "learning_rate": 4.9932233007038484e-05, "loss": 1.5426, "step": 309 }, { "epoch": 0.04873351804908723, "grad_norm": 0.11715538799762726, "learning_rate": 4.9931777639213284e-05, "loss": 1.4615, "step": 310 }, { "epoch": 0.04889072294601977, "grad_norm": 0.12391876429319382, "learning_rate": 4.993132074864997e-05, "loss": 1.4138, "step": 311 }, { "epoch": 0.04904792784295231, "grad_norm": 0.11894181370735168, "learning_rate": 4.9930862335376444e-05, "loss": 1.4383, "step": 312 }, { "epoch": 0.049205132739884846, "grad_norm": 0.1225295439362526, "learning_rate": 4.9930402399420695e-05, "loss": 1.3847, "step": 313 }, { "epoch": 0.04936233763681739, "grad_norm": 0.11435995995998383, "learning_rate": 4.9929940940810825e-05, "loss": 1.4254, "step": 314 }, { "epoch": 0.049519542533749926, "grad_norm": 0.11988761276006699, "learning_rate": 4.9929477959575024e-05, "loss": 1.4787, "step": 315 }, { "epoch": 0.04967674743068247, "grad_norm": 0.11983373016119003, "learning_rate": 4.992901345574155e-05, "loss": 1.4341, "step": 316 }, { "epoch": 0.049833952327615005, "grad_norm": 0.13395054638385773, "learning_rate": 4.992854742933878e-05, "loss": 1.4315, "step": 317 }, { "epoch": 0.04999115722454754, "grad_norm": 0.12578143179416656, "learning_rate": 4.9928079880395186e-05, "loss": 1.4143, "step": 318 }, { "epoch": 0.050148362121480085, "grad_norm": 0.1401878446340561, "learning_rate": 4.992761080893932e-05, "loss": 1.4665, "step": 319 }, { "epoch": 0.05030556701841262, "grad_norm": 0.13048145174980164, "learning_rate": 4.9927140214999826e-05, "loss": 1.4266, "step": 320 }, { "epoch": 0.05030556701841262, "eval_loss": 1.450086236000061, "eval_runtime": 2316.1877, "eval_samples_per_second": 3.997, "eval_steps_per_second": 1.999, "step": 320 }, { "epoch": 0.050462771915345164, "grad_norm": 0.13121232390403748, "learning_rate": 4.992666809860545e-05, "loss": 1.4946, "step": 321 }, { "epoch": 0.0506199768122777, "grad_norm": 0.13547195494174957, "learning_rate": 4.9926194459785015e-05, "loss": 1.5532, "step": 322 }, { "epoch": 0.050777181709210244, "grad_norm": 0.11797169595956802, "learning_rate": 4.992571929856747e-05, "loss": 1.4118, "step": 323 }, { "epoch": 0.05093438660614278, "grad_norm": 0.12734922766685486, "learning_rate": 4.992524261498183e-05, "loss": 1.4427, "step": 324 }, { "epoch": 0.05109159150307532, "grad_norm": 0.12444902211427689, "learning_rate": 4.99247644090572e-05, "loss": 1.4369, "step": 325 }, { "epoch": 0.05124879640000786, "grad_norm": 0.12244518846273422, "learning_rate": 4.99242846808228e-05, "loss": 1.4587, "step": 326 }, { "epoch": 0.0514060012969404, "grad_norm": 0.12424397468566895, "learning_rate": 4.9923803430307916e-05, "loss": 1.3949, "step": 327 }, { "epoch": 0.05156320619387294, "grad_norm": 0.1352718621492386, "learning_rate": 4.9923320657541944e-05, "loss": 1.504, "step": 328 }, { "epoch": 0.05172041109080548, "grad_norm": 0.12855666875839233, "learning_rate": 4.992283636255438e-05, "loss": 1.4271, "step": 329 }, { "epoch": 0.05187761598773802, "grad_norm": 0.129829540848732, "learning_rate": 4.99223505453748e-05, "loss": 1.455, "step": 330 }, { "epoch": 0.052034820884670555, "grad_norm": 0.12780050933361053, "learning_rate": 4.992186320603286e-05, "loss": 1.4045, "step": 331 }, { "epoch": 0.0521920257816031, "grad_norm": 0.13515712320804596, "learning_rate": 4.992137434455834e-05, "loss": 1.4335, "step": 332 }, { "epoch": 0.052349230678535634, "grad_norm": 0.15026766061782837, "learning_rate": 4.99208839609811e-05, "loss": 1.5386, "step": 333 }, { "epoch": 0.05250643557546818, "grad_norm": 0.13422101736068726, "learning_rate": 4.992039205533108e-05, "loss": 1.454, "step": 334 }, { "epoch": 0.05266364047240071, "grad_norm": 0.13735777139663696, "learning_rate": 4.991989862763833e-05, "loss": 1.4415, "step": 335 }, { "epoch": 0.05282084536933326, "grad_norm": 0.12985137104988098, "learning_rate": 4.9919403677932994e-05, "loss": 1.385, "step": 336 }, { "epoch": 0.05297805026626579, "grad_norm": 0.1301167607307434, "learning_rate": 4.9918907206245285e-05, "loss": 1.4364, "step": 337 }, { "epoch": 0.053135255163198336, "grad_norm": 0.1407599002122879, "learning_rate": 4.991840921260553e-05, "loss": 1.4454, "step": 338 }, { "epoch": 0.05329246006013087, "grad_norm": 0.12763133645057678, "learning_rate": 4.9917909697044164e-05, "loss": 1.4008, "step": 339 }, { "epoch": 0.053449664957063416, "grad_norm": 0.1443052589893341, "learning_rate": 4.991740865959167e-05, "loss": 1.5184, "step": 340 }, { "epoch": 0.05360686985399595, "grad_norm": 0.13496418297290802, "learning_rate": 4.991690610027866e-05, "loss": 1.3888, "step": 341 }, { "epoch": 0.05376407475092849, "grad_norm": 0.12681293487548828, "learning_rate": 4.991640201913583e-05, "loss": 1.42, "step": 342 }, { "epoch": 0.05392127964786103, "grad_norm": 0.13178062438964844, "learning_rate": 4.9915896416193965e-05, "loss": 1.4178, "step": 343 }, { "epoch": 0.05407848454479357, "grad_norm": 0.14452503621578217, "learning_rate": 4.991538929148394e-05, "loss": 1.4248, "step": 344 }, { "epoch": 0.05423568944172611, "grad_norm": 0.1352955400943756, "learning_rate": 4.991488064503674e-05, "loss": 1.4304, "step": 345 }, { "epoch": 0.05439289433865865, "grad_norm": 0.14846469461917877, "learning_rate": 4.991437047688343e-05, "loss": 1.4784, "step": 346 }, { "epoch": 0.05455009923559119, "grad_norm": 0.12475849688053131, "learning_rate": 4.9913858787055156e-05, "loss": 1.4131, "step": 347 }, { "epoch": 0.054707304132523726, "grad_norm": 0.13835409283638, "learning_rate": 4.991334557558318e-05, "loss": 1.4913, "step": 348 }, { "epoch": 0.05486450902945627, "grad_norm": 0.13921529054641724, "learning_rate": 4.991283084249885e-05, "loss": 1.3713, "step": 349 }, { "epoch": 0.055021713926388806, "grad_norm": 0.13188250362873077, "learning_rate": 4.9912314587833586e-05, "loss": 1.3608, "step": 350 }, { "epoch": 0.05517891882332135, "grad_norm": 0.12457428872585297, "learning_rate": 4.991179681161895e-05, "loss": 1.4427, "step": 351 }, { "epoch": 0.055336123720253885, "grad_norm": 0.12452542781829834, "learning_rate": 4.9911277513886535e-05, "loss": 1.4179, "step": 352 }, { "epoch": 0.05549332861718643, "grad_norm": 0.14799195528030396, "learning_rate": 4.9910756694668074e-05, "loss": 1.4532, "step": 353 }, { "epoch": 0.055650533514118965, "grad_norm": 0.13485541939735413, "learning_rate": 4.991023435399538e-05, "loss": 1.4114, "step": 354 }, { "epoch": 0.0558077384110515, "grad_norm": 0.1422443389892578, "learning_rate": 4.990971049190034e-05, "loss": 1.377, "step": 355 }, { "epoch": 0.055964943307984044, "grad_norm": 0.12994804978370667, "learning_rate": 4.990918510841496e-05, "loss": 1.4474, "step": 356 }, { "epoch": 0.05612214820491658, "grad_norm": 0.1429785192012787, "learning_rate": 4.990865820357133e-05, "loss": 1.4435, "step": 357 }, { "epoch": 0.056279353101849124, "grad_norm": 0.12979790568351746, "learning_rate": 4.9908129777401625e-05, "loss": 1.4039, "step": 358 }, { "epoch": 0.05643655799878166, "grad_norm": 0.1332644671201706, "learning_rate": 4.990759982993812e-05, "loss": 1.4377, "step": 359 }, { "epoch": 0.0565937628957142, "grad_norm": 0.13796579837799072, "learning_rate": 4.99070683612132e-05, "loss": 1.3951, "step": 360 }, { "epoch": 0.05675096779264674, "grad_norm": 0.14315246045589447, "learning_rate": 4.9906535371259294e-05, "loss": 1.4042, "step": 361 }, { "epoch": 0.05690817268957928, "grad_norm": 0.1463768631219864, "learning_rate": 4.9906000860108974e-05, "loss": 1.461, "step": 362 }, { "epoch": 0.05706537758651182, "grad_norm": 0.14041170477867126, "learning_rate": 4.9905464827794884e-05, "loss": 1.4147, "step": 363 }, { "epoch": 0.05722258248344436, "grad_norm": 0.19242697954177856, "learning_rate": 4.990492727434976e-05, "loss": 1.3435, "step": 364 }, { "epoch": 0.0573797873803769, "grad_norm": 0.1556611955165863, "learning_rate": 4.990438819980644e-05, "loss": 1.4075, "step": 365 }, { "epoch": 0.05753699227730944, "grad_norm": 0.13157570362091064, "learning_rate": 4.990384760419784e-05, "loss": 1.3334, "step": 366 }, { "epoch": 0.05769419717424198, "grad_norm": 0.17953743040561676, "learning_rate": 4.990330548755698e-05, "loss": 1.4609, "step": 367 }, { "epoch": 0.057851402071174514, "grad_norm": 0.14179491996765137, "learning_rate": 4.990276184991697e-05, "loss": 1.4344, "step": 368 }, { "epoch": 0.05800860696810706, "grad_norm": 0.16522593796253204, "learning_rate": 4.9902216691311024e-05, "loss": 1.3794, "step": 369 }, { "epoch": 0.05816581186503959, "grad_norm": 0.12736016511917114, "learning_rate": 4.9901670011772425e-05, "loss": 1.4167, "step": 370 }, { "epoch": 0.058323016761972137, "grad_norm": 0.15869787335395813, "learning_rate": 4.990112181133456e-05, "loss": 1.4293, "step": 371 }, { "epoch": 0.05848022165890467, "grad_norm": 0.14410504698753357, "learning_rate": 4.990057209003093e-05, "loss": 1.4357, "step": 372 }, { "epoch": 0.058637426555837216, "grad_norm": 0.1567080020904541, "learning_rate": 4.9900020847895086e-05, "loss": 1.4146, "step": 373 }, { "epoch": 0.05879463145276975, "grad_norm": 0.1430107057094574, "learning_rate": 4.989946808496071e-05, "loss": 1.3415, "step": 374 }, { "epoch": 0.058951836349702295, "grad_norm": 0.146332785487175, "learning_rate": 4.989891380126156e-05, "loss": 1.496, "step": 375 }, { "epoch": 0.05910904124663483, "grad_norm": 0.13674487173557281, "learning_rate": 4.989835799683149e-05, "loss": 1.3611, "step": 376 }, { "epoch": 0.059266246143567375, "grad_norm": 0.1321984827518463, "learning_rate": 4.989780067170444e-05, "loss": 1.4695, "step": 377 }, { "epoch": 0.05942345104049991, "grad_norm": 0.1535942554473877, "learning_rate": 4.9897241825914464e-05, "loss": 1.3564, "step": 378 }, { "epoch": 0.059580655937432454, "grad_norm": 0.1538037806749344, "learning_rate": 4.989668145949568e-05, "loss": 1.3502, "step": 379 }, { "epoch": 0.05973786083436499, "grad_norm": 0.15744829177856445, "learning_rate": 4.989611957248232e-05, "loss": 1.4318, "step": 380 }, { "epoch": 0.05989506573129753, "grad_norm": 0.17178332805633545, "learning_rate": 4.98955561649087e-05, "loss": 1.4643, "step": 381 }, { "epoch": 0.06005227062823007, "grad_norm": 0.15913072228431702, "learning_rate": 4.989499123680923e-05, "loss": 1.487, "step": 382 }, { "epoch": 0.060209475525162606, "grad_norm": 0.15134060382843018, "learning_rate": 4.9894424788218415e-05, "loss": 1.4705, "step": 383 }, { "epoch": 0.06036668042209515, "grad_norm": 0.13704389333724976, "learning_rate": 4.989385681917085e-05, "loss": 1.4756, "step": 384 }, { "epoch": 0.060523885319027686, "grad_norm": 0.14025503396987915, "learning_rate": 4.989328732970122e-05, "loss": 1.443, "step": 385 }, { "epoch": 0.06068109021596023, "grad_norm": 0.1822325438261032, "learning_rate": 4.9892716319844325e-05, "loss": 1.3996, "step": 386 }, { "epoch": 0.060838295112892765, "grad_norm": 0.15639656782150269, "learning_rate": 4.989214378963502e-05, "loss": 1.3656, "step": 387 }, { "epoch": 0.06099550000982531, "grad_norm": 0.15097728371620178, "learning_rate": 4.989156973910828e-05, "loss": 1.4055, "step": 388 }, { "epoch": 0.061152704906757845, "grad_norm": 0.18977142870426178, "learning_rate": 4.989099416829917e-05, "loss": 1.4472, "step": 389 }, { "epoch": 0.06130990980369039, "grad_norm": 0.1596304178237915, "learning_rate": 4.989041707724284e-05, "loss": 1.4373, "step": 390 }, { "epoch": 0.061467114700622924, "grad_norm": 0.171820729970932, "learning_rate": 4.988983846597454e-05, "loss": 1.468, "step": 391 }, { "epoch": 0.06162431959755547, "grad_norm": 0.14266176521778107, "learning_rate": 4.98892583345296e-05, "loss": 1.4037, "step": 392 }, { "epoch": 0.061781524494488003, "grad_norm": 0.13375528156757355, "learning_rate": 4.988867668294346e-05, "loss": 1.437, "step": 393 }, { "epoch": 0.06193872939142054, "grad_norm": 0.13332228362560272, "learning_rate": 4.988809351125165e-05, "loss": 1.3892, "step": 394 }, { "epoch": 0.06209593428835308, "grad_norm": 0.17180980741977692, "learning_rate": 4.988750881948977e-05, "loss": 1.3494, "step": 395 }, { "epoch": 0.06225313918528562, "grad_norm": 0.1419111043214798, "learning_rate": 4.988692260769355e-05, "loss": 1.3748, "step": 396 }, { "epoch": 0.06241034408221816, "grad_norm": 0.17256620526313782, "learning_rate": 4.9886334875898776e-05, "loss": 1.3549, "step": 397 }, { "epoch": 0.0625675489791507, "grad_norm": 0.2243422418832779, "learning_rate": 4.988574562414137e-05, "loss": 1.4465, "step": 398 }, { "epoch": 0.06272475387608324, "grad_norm": 0.15700723230838776, "learning_rate": 4.9885154852457294e-05, "loss": 1.4477, "step": 399 }, { "epoch": 0.06288195877301578, "grad_norm": 0.14497259259223938, "learning_rate": 4.988456256088264e-05, "loss": 1.3861, "step": 400 }, { "epoch": 0.06303916366994831, "grad_norm": 0.14747034013271332, "learning_rate": 4.988396874945359e-05, "loss": 1.4206, "step": 401 }, { "epoch": 0.06319636856688086, "grad_norm": 0.17671054601669312, "learning_rate": 4.98833734182064e-05, "loss": 1.2475, "step": 402 }, { "epoch": 0.0633535734638134, "grad_norm": 0.16974316537380219, "learning_rate": 4.9882776567177446e-05, "loss": 1.4955, "step": 403 }, { "epoch": 0.06351077836074594, "grad_norm": 0.15419775247573853, "learning_rate": 4.988217819640317e-05, "loss": 1.4209, "step": 404 }, { "epoch": 0.06366798325767847, "grad_norm": 0.13987664878368378, "learning_rate": 4.988157830592012e-05, "loss": 1.456, "step": 405 }, { "epoch": 0.06382518815461101, "grad_norm": 0.24560455977916718, "learning_rate": 4.988097689576493e-05, "loss": 1.3567, "step": 406 }, { "epoch": 0.06398239305154356, "grad_norm": 0.13870076835155487, "learning_rate": 4.9880373965974334e-05, "loss": 1.3752, "step": 407 }, { "epoch": 0.0641395979484761, "grad_norm": 0.16167718172073364, "learning_rate": 4.987976951658517e-05, "loss": 1.4766, "step": 408 }, { "epoch": 0.06429680284540863, "grad_norm": 0.1700398474931717, "learning_rate": 4.9879163547634346e-05, "loss": 1.427, "step": 409 }, { "epoch": 0.06445400774234117, "grad_norm": 0.15502458810806274, "learning_rate": 4.987855605915887e-05, "loss": 1.3965, "step": 410 }, { "epoch": 0.06461121263927372, "grad_norm": 0.14834032952785492, "learning_rate": 4.987794705119584e-05, "loss": 1.4399, "step": 411 }, { "epoch": 0.06476841753620625, "grad_norm": 0.22443649172782898, "learning_rate": 4.987733652378246e-05, "loss": 1.3736, "step": 412 }, { "epoch": 0.06492562243313879, "grad_norm": 0.14396560192108154, "learning_rate": 4.9876724476956015e-05, "loss": 1.4648, "step": 413 }, { "epoch": 0.06508282733007133, "grad_norm": 0.15352006256580353, "learning_rate": 4.987611091075389e-05, "loss": 1.4988, "step": 414 }, { "epoch": 0.06524003222700388, "grad_norm": 0.13210074603557587, "learning_rate": 4.987549582521356e-05, "loss": 1.3705, "step": 415 }, { "epoch": 0.06539723712393641, "grad_norm": 0.16056782007217407, "learning_rate": 4.98748792203726e-05, "loss": 1.3388, "step": 416 }, { "epoch": 0.06555444202086895, "grad_norm": 0.18992343544960022, "learning_rate": 4.9874261096268647e-05, "loss": 1.3842, "step": 417 }, { "epoch": 0.06571164691780149, "grad_norm": 0.1789916455745697, "learning_rate": 4.9873641452939466e-05, "loss": 1.3622, "step": 418 }, { "epoch": 0.06586885181473402, "grad_norm": 0.21043789386749268, "learning_rate": 4.9873020290422915e-05, "loss": 1.3477, "step": 419 }, { "epoch": 0.06602605671166657, "grad_norm": 0.15355254709720612, "learning_rate": 4.987239760875691e-05, "loss": 1.3643, "step": 420 }, { "epoch": 0.06618326160859911, "grad_norm": 0.1433190107345581, "learning_rate": 4.9871773407979496e-05, "loss": 1.3753, "step": 421 }, { "epoch": 0.06634046650553165, "grad_norm": 0.17479249835014343, "learning_rate": 4.987114768812879e-05, "loss": 1.3809, "step": 422 }, { "epoch": 0.06649767140246418, "grad_norm": 0.186944842338562, "learning_rate": 4.987052044924302e-05, "loss": 1.3616, "step": 423 }, { "epoch": 0.06665487629939673, "grad_norm": 0.15202952921390533, "learning_rate": 4.986989169136048e-05, "loss": 1.4479, "step": 424 }, { "epoch": 0.06681208119632927, "grad_norm": 0.16295532882213593, "learning_rate": 4.9869261414519575e-05, "loss": 1.3713, "step": 425 }, { "epoch": 0.0669692860932618, "grad_norm": 0.19577625393867493, "learning_rate": 4.986862961875881e-05, "loss": 1.4199, "step": 426 }, { "epoch": 0.06712649099019434, "grad_norm": 0.22768542170524597, "learning_rate": 4.986799630411677e-05, "loss": 1.3529, "step": 427 }, { "epoch": 0.06728369588712689, "grad_norm": 0.25184011459350586, "learning_rate": 4.986736147063212e-05, "loss": 1.3944, "step": 428 }, { "epoch": 0.06744090078405943, "grad_norm": 0.15565118193626404, "learning_rate": 4.986672511834366e-05, "loss": 1.4505, "step": 429 }, { "epoch": 0.06759810568099196, "grad_norm": 0.16559922695159912, "learning_rate": 4.986608724729024e-05, "loss": 1.3742, "step": 430 }, { "epoch": 0.0677553105779245, "grad_norm": 0.14826242625713348, "learning_rate": 4.986544785751081e-05, "loss": 1.4008, "step": 431 }, { "epoch": 0.06791251547485704, "grad_norm": 0.16543184220790863, "learning_rate": 4.986480694904444e-05, "loss": 1.3433, "step": 432 }, { "epoch": 0.06806972037178959, "grad_norm": 0.15332931280136108, "learning_rate": 4.986416452193027e-05, "loss": 1.4459, "step": 433 }, { "epoch": 0.06822692526872212, "grad_norm": 0.18880733847618103, "learning_rate": 4.986352057620752e-05, "loss": 1.3902, "step": 434 }, { "epoch": 0.06838413016565466, "grad_norm": 0.1513829231262207, "learning_rate": 4.986287511191554e-05, "loss": 1.3485, "step": 435 }, { "epoch": 0.0685413350625872, "grad_norm": 0.15241704881191254, "learning_rate": 4.9862228129093745e-05, "loss": 1.3051, "step": 436 }, { "epoch": 0.06869853995951974, "grad_norm": 0.1956702321767807, "learning_rate": 4.986157962778165e-05, "loss": 1.4647, "step": 437 }, { "epoch": 0.06885574485645228, "grad_norm": 0.2027936428785324, "learning_rate": 4.9860929608018866e-05, "loss": 1.3602, "step": 438 }, { "epoch": 0.06901294975338482, "grad_norm": 0.1623186320066452, "learning_rate": 4.986027806984509e-05, "loss": 1.4154, "step": 439 }, { "epoch": 0.06917015465031735, "grad_norm": 0.16111283004283905, "learning_rate": 4.985962501330011e-05, "loss": 1.4311, "step": 440 }, { "epoch": 0.0693273595472499, "grad_norm": 0.16754299402236938, "learning_rate": 4.985897043842382e-05, "loss": 1.349, "step": 441 }, { "epoch": 0.06948456444418244, "grad_norm": 0.1766330897808075, "learning_rate": 4.985831434525621e-05, "loss": 1.3714, "step": 442 }, { "epoch": 0.06964176934111498, "grad_norm": 0.1742810308933258, "learning_rate": 4.985765673383733e-05, "loss": 1.4161, "step": 443 }, { "epoch": 0.06979897423804751, "grad_norm": 0.17025281488895416, "learning_rate": 4.985699760420736e-05, "loss": 1.3925, "step": 444 }, { "epoch": 0.06995617913498005, "grad_norm": 0.19201375544071198, "learning_rate": 4.985633695640655e-05, "loss": 1.4158, "step": 445 }, { "epoch": 0.0701133840319126, "grad_norm": 0.1636267751455307, "learning_rate": 4.985567479047524e-05, "loss": 1.4071, "step": 446 }, { "epoch": 0.07027058892884513, "grad_norm": 0.19676333665847778, "learning_rate": 4.9855011106453894e-05, "loss": 1.3449, "step": 447 }, { "epoch": 0.07042779382577767, "grad_norm": 0.17712907493114471, "learning_rate": 4.985434590438303e-05, "loss": 1.3421, "step": 448 }, { "epoch": 0.07058499872271021, "grad_norm": 0.18515101075172424, "learning_rate": 4.985367918430329e-05, "loss": 1.4051, "step": 449 }, { "epoch": 0.07074220361964276, "grad_norm": 0.17168915271759033, "learning_rate": 4.985301094625538e-05, "loss": 1.3093, "step": 450 }, { "epoch": 0.0708994085165753, "grad_norm": 0.1891397386789322, "learning_rate": 4.9852341190280127e-05, "loss": 1.3075, "step": 451 }, { "epoch": 0.07105661341350783, "grad_norm": 0.17731457948684692, "learning_rate": 4.985166991641843e-05, "loss": 1.3986, "step": 452 }, { "epoch": 0.07121381831044037, "grad_norm": 0.18817296624183655, "learning_rate": 4.985099712471129e-05, "loss": 1.3531, "step": 453 }, { "epoch": 0.07137102320737292, "grad_norm": 0.1782791018486023, "learning_rate": 4.9850322815199795e-05, "loss": 1.4064, "step": 454 }, { "epoch": 0.07152822810430545, "grad_norm": 0.18053874373435974, "learning_rate": 4.984964698792514e-05, "loss": 1.4607, "step": 455 }, { "epoch": 0.07168543300123799, "grad_norm": 0.286338746547699, "learning_rate": 4.984896964292858e-05, "loss": 1.3036, "step": 456 }, { "epoch": 0.07184263789817052, "grad_norm": 0.2560707926750183, "learning_rate": 4.98482907802515e-05, "loss": 1.3428, "step": 457 }, { "epoch": 0.07199984279510306, "grad_norm": 0.19296897947788239, "learning_rate": 4.984761039993537e-05, "loss": 1.3502, "step": 458 }, { "epoch": 0.07215704769203561, "grad_norm": 0.19685949385166168, "learning_rate": 4.9846928502021725e-05, "loss": 1.4015, "step": 459 }, { "epoch": 0.07231425258896815, "grad_norm": 0.1548481583595276, "learning_rate": 4.984624508655223e-05, "loss": 1.3698, "step": 460 }, { "epoch": 0.07247145748590068, "grad_norm": 0.16076034307479858, "learning_rate": 4.984556015356862e-05, "loss": 1.3627, "step": 461 }, { "epoch": 0.07262866238283322, "grad_norm": 0.18571603298187256, "learning_rate": 4.9844873703112726e-05, "loss": 1.3506, "step": 462 }, { "epoch": 0.07278586727976577, "grad_norm": 0.1540035605430603, "learning_rate": 4.984418573522648e-05, "loss": 1.4483, "step": 463 }, { "epoch": 0.0729430721766983, "grad_norm": 0.1730145364999771, "learning_rate": 4.984349624995188e-05, "loss": 1.3678, "step": 464 }, { "epoch": 0.07310027707363084, "grad_norm": 0.26254212856292725, "learning_rate": 4.984280524733107e-05, "loss": 1.401, "step": 465 }, { "epoch": 0.07325748197056338, "grad_norm": 0.2079063057899475, "learning_rate": 4.984211272740623e-05, "loss": 1.3655, "step": 466 }, { "epoch": 0.07341468686749593, "grad_norm": 0.21711499989032745, "learning_rate": 4.9841418690219653e-05, "loss": 1.4011, "step": 467 }, { "epoch": 0.07357189176442847, "grad_norm": 0.18226252496242523, "learning_rate": 4.984072313581375e-05, "loss": 1.4213, "step": 468 }, { "epoch": 0.073729096661361, "grad_norm": 0.1463780552148819, "learning_rate": 4.9840026064230984e-05, "loss": 1.4519, "step": 469 }, { "epoch": 0.07388630155829354, "grad_norm": 0.18232892453670502, "learning_rate": 4.983932747551394e-05, "loss": 1.3657, "step": 470 }, { "epoch": 0.07404350645522607, "grad_norm": 0.19644559919834137, "learning_rate": 4.9838627369705285e-05, "loss": 1.3988, "step": 471 }, { "epoch": 0.07420071135215862, "grad_norm": 0.16292576491832733, "learning_rate": 4.983792574684776e-05, "loss": 1.4369, "step": 472 }, { "epoch": 0.07435791624909116, "grad_norm": 0.2244543433189392, "learning_rate": 4.983722260698425e-05, "loss": 1.4269, "step": 473 }, { "epoch": 0.0745151211460237, "grad_norm": 0.2582489848136902, "learning_rate": 4.9836517950157666e-05, "loss": 1.3986, "step": 474 }, { "epoch": 0.07467232604295623, "grad_norm": 0.15564194321632385, "learning_rate": 4.983581177641108e-05, "loss": 1.3871, "step": 475 }, { "epoch": 0.07482953093988878, "grad_norm": 0.2301008552312851, "learning_rate": 4.9835104085787596e-05, "loss": 1.3572, "step": 476 }, { "epoch": 0.07498673583682132, "grad_norm": 0.21603424847126007, "learning_rate": 4.9834394878330444e-05, "loss": 1.3803, "step": 477 }, { "epoch": 0.07514394073375386, "grad_norm": 0.16744717955589294, "learning_rate": 4.9833684154082937e-05, "loss": 1.4233, "step": 478 }, { "epoch": 0.07530114563068639, "grad_norm": 0.23016415536403656, "learning_rate": 4.98329719130885e-05, "loss": 1.3962, "step": 479 }, { "epoch": 0.07545835052761894, "grad_norm": 0.19687114655971527, "learning_rate": 4.983225815539061e-05, "loss": 1.3667, "step": 480 }, { "epoch": 0.07545835052761894, "eval_loss": 1.3748993873596191, "eval_runtime": 2315.5952, "eval_samples_per_second": 3.998, "eval_steps_per_second": 1.999, "step": 480 }, { "epoch": 0.07561555542455148, "grad_norm": 0.1833205670118332, "learning_rate": 4.9831542881032884e-05, "loss": 1.4365, "step": 481 }, { "epoch": 0.07577276032148401, "grad_norm": 0.17124423384666443, "learning_rate": 4.983082609005899e-05, "loss": 1.3641, "step": 482 }, { "epoch": 0.07592996521841655, "grad_norm": 0.17352670431137085, "learning_rate": 4.9830107782512715e-05, "loss": 1.3415, "step": 483 }, { "epoch": 0.07608717011534909, "grad_norm": 0.20768220722675323, "learning_rate": 4.982938795843793e-05, "loss": 1.3261, "step": 484 }, { "epoch": 0.07624437501228164, "grad_norm": 0.21459853649139404, "learning_rate": 4.982866661787859e-05, "loss": 1.4185, "step": 485 }, { "epoch": 0.07640157990921417, "grad_norm": 0.26912233233451843, "learning_rate": 4.982794376087877e-05, "loss": 1.3941, "step": 486 }, { "epoch": 0.07655878480614671, "grad_norm": 0.28497114777565, "learning_rate": 4.982721938748261e-05, "loss": 1.3201, "step": 487 }, { "epoch": 0.07671598970307925, "grad_norm": 0.15378472208976746, "learning_rate": 4.982649349773435e-05, "loss": 1.3615, "step": 488 }, { "epoch": 0.0768731946000118, "grad_norm": 0.16169893741607666, "learning_rate": 4.982576609167831e-05, "loss": 1.3342, "step": 489 }, { "epoch": 0.07703039949694433, "grad_norm": 0.24693650007247925, "learning_rate": 4.982503716935896e-05, "loss": 1.3788, "step": 490 }, { "epoch": 0.07718760439387687, "grad_norm": 0.1769181787967682, "learning_rate": 4.982430673082077e-05, "loss": 1.3664, "step": 491 }, { "epoch": 0.0773448092908094, "grad_norm": 0.26325106620788574, "learning_rate": 4.982357477610839e-05, "loss": 1.3173, "step": 492 }, { "epoch": 0.07750201418774195, "grad_norm": 0.2063319832086563, "learning_rate": 4.9822841305266506e-05, "loss": 1.4125, "step": 493 }, { "epoch": 0.07765921908467449, "grad_norm": 0.29141879081726074, "learning_rate": 4.982210631833992e-05, "loss": 1.3596, "step": 494 }, { "epoch": 0.07781642398160703, "grad_norm": 0.18967591226100922, "learning_rate": 4.982136981537352e-05, "loss": 1.4128, "step": 495 }, { "epoch": 0.07797362887853956, "grad_norm": 0.2291795313358307, "learning_rate": 4.9820631796412287e-05, "loss": 1.3772, "step": 496 }, { "epoch": 0.0781308337754721, "grad_norm": 0.200834721326828, "learning_rate": 4.98198922615013e-05, "loss": 1.369, "step": 497 }, { "epoch": 0.07828803867240465, "grad_norm": 0.22960609197616577, "learning_rate": 4.9819151210685736e-05, "loss": 1.3979, "step": 498 }, { "epoch": 0.07844524356933719, "grad_norm": 0.17247427999973297, "learning_rate": 4.981840864401084e-05, "loss": 1.3927, "step": 499 }, { "epoch": 0.07860244846626972, "grad_norm": 0.2623608112335205, "learning_rate": 4.981766456152198e-05, "loss": 1.3919, "step": 500 }, { "epoch": 0.07875965336320226, "grad_norm": 0.19911788403987885, "learning_rate": 4.981691896326459e-05, "loss": 1.3925, "step": 501 }, { "epoch": 0.07891685826013481, "grad_norm": 0.24869734048843384, "learning_rate": 4.9816171849284205e-05, "loss": 1.3562, "step": 502 }, { "epoch": 0.07907406315706735, "grad_norm": 0.31372350454330444, "learning_rate": 4.981542321962647e-05, "loss": 1.3211, "step": 503 }, { "epoch": 0.07923126805399988, "grad_norm": 0.21760910749435425, "learning_rate": 4.981467307433709e-05, "loss": 1.3042, "step": 504 }, { "epoch": 0.07938847295093242, "grad_norm": 0.2469843477010727, "learning_rate": 4.9813921413461906e-05, "loss": 1.2831, "step": 505 }, { "epoch": 0.07954567784786497, "grad_norm": 0.24319148063659668, "learning_rate": 4.981316823704681e-05, "loss": 1.2703, "step": 506 }, { "epoch": 0.0797028827447975, "grad_norm": 0.19718031585216522, "learning_rate": 4.98124135451378e-05, "loss": 1.3258, "step": 507 }, { "epoch": 0.07986008764173004, "grad_norm": 0.17459236085414886, "learning_rate": 4.981165733778098e-05, "loss": 1.4248, "step": 508 }, { "epoch": 0.08001729253866258, "grad_norm": 0.17684616148471832, "learning_rate": 4.981089961502253e-05, "loss": 1.3939, "step": 509 }, { "epoch": 0.08017449743559511, "grad_norm": 0.17499729990959167, "learning_rate": 4.981014037690874e-05, "loss": 1.4156, "step": 510 }, { "epoch": 0.08033170233252766, "grad_norm": 0.1901170015335083, "learning_rate": 4.9809379623485964e-05, "loss": 1.4209, "step": 511 }, { "epoch": 0.0804889072294602, "grad_norm": 0.18230682611465454, "learning_rate": 4.980861735480067e-05, "loss": 1.4607, "step": 512 }, { "epoch": 0.08064611212639274, "grad_norm": 0.22843636572360992, "learning_rate": 4.9807853570899427e-05, "loss": 1.3671, "step": 513 }, { "epoch": 0.08080331702332527, "grad_norm": 0.2288489192724228, "learning_rate": 4.980708827182887e-05, "loss": 1.3657, "step": 514 }, { "epoch": 0.08096052192025782, "grad_norm": 0.19647593796253204, "learning_rate": 4.980632145763575e-05, "loss": 1.4079, "step": 515 }, { "epoch": 0.08111772681719036, "grad_norm": 0.20980435609817505, "learning_rate": 4.98055531283669e-05, "loss": 1.3746, "step": 516 }, { "epoch": 0.0812749317141229, "grad_norm": 0.19381123781204224, "learning_rate": 4.980478328406923e-05, "loss": 1.3986, "step": 517 }, { "epoch": 0.08143213661105543, "grad_norm": 0.2224361151456833, "learning_rate": 4.980401192478979e-05, "loss": 1.3082, "step": 518 }, { "epoch": 0.08158934150798797, "grad_norm": 0.20567384362220764, "learning_rate": 4.9803239050575664e-05, "loss": 1.4417, "step": 519 }, { "epoch": 0.08174654640492052, "grad_norm": 0.22890503704547882, "learning_rate": 4.9802464661474074e-05, "loss": 1.3034, "step": 520 }, { "epoch": 0.08190375130185305, "grad_norm": 0.23220910131931305, "learning_rate": 4.9801688757532304e-05, "loss": 1.3705, "step": 521 }, { "epoch": 0.08206095619878559, "grad_norm": 0.29084959626197815, "learning_rate": 4.980091133879775e-05, "loss": 1.3246, "step": 522 }, { "epoch": 0.08221816109571813, "grad_norm": 0.15776456892490387, "learning_rate": 4.9800132405317895e-05, "loss": 1.4311, "step": 523 }, { "epoch": 0.08237536599265068, "grad_norm": 0.2636071443557739, "learning_rate": 4.9799351957140314e-05, "loss": 1.3265, "step": 524 }, { "epoch": 0.08253257088958321, "grad_norm": 0.20042134821414948, "learning_rate": 4.979856999431266e-05, "loss": 1.3257, "step": 525 }, { "epoch": 0.08268977578651575, "grad_norm": 0.24039289355278015, "learning_rate": 4.9797786516882714e-05, "loss": 1.3999, "step": 526 }, { "epoch": 0.08284698068344828, "grad_norm": 0.16932524740695953, "learning_rate": 4.9797001524898315e-05, "loss": 1.4113, "step": 527 }, { "epoch": 0.08300418558038083, "grad_norm": 0.2101370096206665, "learning_rate": 4.97962150184074e-05, "loss": 1.3973, "step": 528 }, { "epoch": 0.08316139047731337, "grad_norm": 0.20983585715293884, "learning_rate": 4.979542699745803e-05, "loss": 1.3255, "step": 529 }, { "epoch": 0.08331859537424591, "grad_norm": 0.20477800071239471, "learning_rate": 4.97946374620983e-05, "loss": 1.4349, "step": 530 }, { "epoch": 0.08347580027117844, "grad_norm": 0.22637289762496948, "learning_rate": 4.979384641237647e-05, "loss": 1.3263, "step": 531 }, { "epoch": 0.08363300516811098, "grad_norm": 0.20332221686840057, "learning_rate": 4.9793053848340835e-05, "loss": 1.3411, "step": 532 }, { "epoch": 0.08379021006504353, "grad_norm": 0.22744616866111755, "learning_rate": 4.979225977003979e-05, "loss": 1.4042, "step": 533 }, { "epoch": 0.08394741496197607, "grad_norm": 0.20091576874256134, "learning_rate": 4.979146417752185e-05, "loss": 1.3218, "step": 534 }, { "epoch": 0.0841046198589086, "grad_norm": 0.2225920408964157, "learning_rate": 4.9790667070835604e-05, "loss": 1.4223, "step": 535 }, { "epoch": 0.08426182475584114, "grad_norm": 0.20447570085525513, "learning_rate": 4.9789868450029745e-05, "loss": 1.3884, "step": 536 }, { "epoch": 0.08441902965277369, "grad_norm": 0.22765719890594482, "learning_rate": 4.9789068315153035e-05, "loss": 1.3575, "step": 537 }, { "epoch": 0.08457623454970623, "grad_norm": 0.18886259198188782, "learning_rate": 4.9788266666254343e-05, "loss": 1.2737, "step": 538 }, { "epoch": 0.08473343944663876, "grad_norm": 0.26551586389541626, "learning_rate": 4.978746350338264e-05, "loss": 1.3867, "step": 539 }, { "epoch": 0.0848906443435713, "grad_norm": 0.29268744587898254, "learning_rate": 4.9786658826586975e-05, "loss": 1.4266, "step": 540 }, { "epoch": 0.08504784924050385, "grad_norm": 0.2537211775779724, "learning_rate": 4.97858526359165e-05, "loss": 1.3402, "step": 541 }, { "epoch": 0.08520505413743638, "grad_norm": 0.20287925004959106, "learning_rate": 4.978504493142045e-05, "loss": 1.3148, "step": 542 }, { "epoch": 0.08536225903436892, "grad_norm": 0.18584851920604706, "learning_rate": 4.978423571314814e-05, "loss": 1.3293, "step": 543 }, { "epoch": 0.08551946393130146, "grad_norm": 0.1944153755903244, "learning_rate": 4.978342498114903e-05, "loss": 1.4084, "step": 544 }, { "epoch": 0.08567666882823399, "grad_norm": 0.18139739334583282, "learning_rate": 4.978261273547261e-05, "loss": 1.2734, "step": 545 }, { "epoch": 0.08583387372516654, "grad_norm": 0.20824116468429565, "learning_rate": 4.97817989761685e-05, "loss": 1.3346, "step": 546 }, { "epoch": 0.08599107862209908, "grad_norm": 0.16180047392845154, "learning_rate": 4.978098370328639e-05, "loss": 1.4547, "step": 547 }, { "epoch": 0.08614828351903162, "grad_norm": 0.17156392335891724, "learning_rate": 4.978016691687609e-05, "loss": 1.366, "step": 548 }, { "epoch": 0.08630548841596415, "grad_norm": 0.17913401126861572, "learning_rate": 4.977934861698746e-05, "loss": 1.2771, "step": 549 }, { "epoch": 0.0864626933128967, "grad_norm": 0.17393502593040466, "learning_rate": 4.977852880367051e-05, "loss": 1.3061, "step": 550 }, { "epoch": 0.08661989820982924, "grad_norm": 0.21741637587547302, "learning_rate": 4.97777074769753e-05, "loss": 1.3232, "step": 551 }, { "epoch": 0.08677710310676177, "grad_norm": 0.26123344898223877, "learning_rate": 4.977688463695198e-05, "loss": 1.2678, "step": 552 }, { "epoch": 0.08693430800369431, "grad_norm": 0.2508600354194641, "learning_rate": 4.9776060283650826e-05, "loss": 1.4543, "step": 553 }, { "epoch": 0.08709151290062686, "grad_norm": 0.18527132272720337, "learning_rate": 4.977523441712217e-05, "loss": 1.3359, "step": 554 }, { "epoch": 0.0872487177975594, "grad_norm": 0.24495406448841095, "learning_rate": 4.977440703741646e-05, "loss": 1.2892, "step": 555 }, { "epoch": 0.08740592269449193, "grad_norm": 0.22759339213371277, "learning_rate": 4.9773578144584235e-05, "loss": 1.2212, "step": 556 }, { "epoch": 0.08756312759142447, "grad_norm": 0.1627693474292755, "learning_rate": 4.977274773867611e-05, "loss": 1.3461, "step": 557 }, { "epoch": 0.087720332488357, "grad_norm": 0.2068985551595688, "learning_rate": 4.9771915819742804e-05, "loss": 1.3348, "step": 558 }, { "epoch": 0.08787753738528956, "grad_norm": 0.19731195271015167, "learning_rate": 4.9771082387835135e-05, "loss": 1.3727, "step": 559 }, { "epoch": 0.08803474228222209, "grad_norm": 0.26571184396743774, "learning_rate": 4.977024744300399e-05, "loss": 1.3911, "step": 560 }, { "epoch": 0.08819194717915463, "grad_norm": 0.23141519725322723, "learning_rate": 4.976941098530039e-05, "loss": 1.3978, "step": 561 }, { "epoch": 0.08834915207608716, "grad_norm": 0.2507224380970001, "learning_rate": 4.97685730147754e-05, "loss": 1.3017, "step": 562 }, { "epoch": 0.08850635697301971, "grad_norm": 0.2453109323978424, "learning_rate": 4.976773353148022e-05, "loss": 1.2977, "step": 563 }, { "epoch": 0.08866356186995225, "grad_norm": 0.2600953280925751, "learning_rate": 4.9766892535466105e-05, "loss": 1.4015, "step": 564 }, { "epoch": 0.08882076676688479, "grad_norm": 0.19863371551036835, "learning_rate": 4.9766050026784416e-05, "loss": 1.3593, "step": 565 }, { "epoch": 0.08897797166381732, "grad_norm": 0.2115338146686554, "learning_rate": 4.976520600548663e-05, "loss": 1.2928, "step": 566 }, { "epoch": 0.08913517656074987, "grad_norm": 0.18994684517383575, "learning_rate": 4.976436047162429e-05, "loss": 1.3506, "step": 567 }, { "epoch": 0.08929238145768241, "grad_norm": 0.22891771793365479, "learning_rate": 4.976351342524903e-05, "loss": 1.4449, "step": 568 }, { "epoch": 0.08944958635461495, "grad_norm": 0.19313135743141174, "learning_rate": 4.976266486641259e-05, "loss": 1.2916, "step": 569 }, { "epoch": 0.08960679125154748, "grad_norm": 0.17697346210479736, "learning_rate": 4.976181479516679e-05, "loss": 1.3696, "step": 570 }, { "epoch": 0.08976399614848002, "grad_norm": 0.22902925312519073, "learning_rate": 4.976096321156356e-05, "loss": 1.3688, "step": 571 }, { "epoch": 0.08992120104541257, "grad_norm": 0.25305554270744324, "learning_rate": 4.97601101156549e-05, "loss": 1.3057, "step": 572 }, { "epoch": 0.0900784059423451, "grad_norm": 0.23255370557308197, "learning_rate": 4.975925550749293e-05, "loss": 1.3571, "step": 573 }, { "epoch": 0.09023561083927764, "grad_norm": 0.25259101390838623, "learning_rate": 4.9758399387129834e-05, "loss": 1.3152, "step": 574 }, { "epoch": 0.09039281573621018, "grad_norm": 0.26062390208244324, "learning_rate": 4.97575417546179e-05, "loss": 1.3042, "step": 575 }, { "epoch": 0.09055002063314273, "grad_norm": 0.16536732017993927, "learning_rate": 4.9756682610009515e-05, "loss": 1.2797, "step": 576 }, { "epoch": 0.09070722553007526, "grad_norm": 0.19088499248027802, "learning_rate": 4.9755821953357144e-05, "loss": 1.3774, "step": 577 }, { "epoch": 0.0908644304270078, "grad_norm": 0.2181147336959839, "learning_rate": 4.975495978471336e-05, "loss": 1.3364, "step": 578 }, { "epoch": 0.09102163532394034, "grad_norm": 0.18012750148773193, "learning_rate": 4.975409610413082e-05, "loss": 1.3852, "step": 579 }, { "epoch": 0.09117884022087289, "grad_norm": 0.18108834326267242, "learning_rate": 4.975323091166227e-05, "loss": 1.3214, "step": 580 }, { "epoch": 0.09133604511780542, "grad_norm": 0.25102898478507996, "learning_rate": 4.975236420736056e-05, "loss": 1.3199, "step": 581 }, { "epoch": 0.09149325001473796, "grad_norm": 0.20121383666992188, "learning_rate": 4.9751495991278626e-05, "loss": 1.3328, "step": 582 }, { "epoch": 0.0916504549116705, "grad_norm": 0.24183815717697144, "learning_rate": 4.975062626346948e-05, "loss": 1.3881, "step": 583 }, { "epoch": 0.09180765980860303, "grad_norm": 0.23274902999401093, "learning_rate": 4.974975502398626e-05, "loss": 1.3674, "step": 584 }, { "epoch": 0.09196486470553558, "grad_norm": 0.224375382065773, "learning_rate": 4.9748882272882165e-05, "loss": 1.362, "step": 585 }, { "epoch": 0.09212206960246812, "grad_norm": 0.2743482291698456, "learning_rate": 4.97480080102105e-05, "loss": 1.3028, "step": 586 }, { "epoch": 0.09227927449940065, "grad_norm": 0.30631452798843384, "learning_rate": 4.974713223602467e-05, "loss": 1.3541, "step": 587 }, { "epoch": 0.09243647939633319, "grad_norm": 0.1999395489692688, "learning_rate": 4.9746254950378166e-05, "loss": 1.3515, "step": 588 }, { "epoch": 0.09259368429326574, "grad_norm": 0.3005799353122711, "learning_rate": 4.974537615332455e-05, "loss": 1.3872, "step": 589 }, { "epoch": 0.09275088919019828, "grad_norm": 0.21795117855072021, "learning_rate": 4.9744495844917524e-05, "loss": 1.2804, "step": 590 }, { "epoch": 0.09290809408713081, "grad_norm": 0.2832283675670624, "learning_rate": 4.9743614025210825e-05, "loss": 1.3209, "step": 591 }, { "epoch": 0.09306529898406335, "grad_norm": 0.21391350030899048, "learning_rate": 4.9742730694258334e-05, "loss": 1.3041, "step": 592 }, { "epoch": 0.0932225038809959, "grad_norm": 0.21651242673397064, "learning_rate": 4.974184585211399e-05, "loss": 1.2529, "step": 593 }, { "epoch": 0.09337970877792844, "grad_norm": 0.22796374559402466, "learning_rate": 4.974095949883183e-05, "loss": 1.3999, "step": 594 }, { "epoch": 0.09353691367486097, "grad_norm": 0.21013247966766357, "learning_rate": 4.9740071634466e-05, "loss": 1.3626, "step": 595 }, { "epoch": 0.09369411857179351, "grad_norm": 0.31589969992637634, "learning_rate": 4.973918225907073e-05, "loss": 1.4096, "step": 596 }, { "epoch": 0.09385132346872604, "grad_norm": 0.2923184931278229, "learning_rate": 4.973829137270033e-05, "loss": 1.2116, "step": 597 }, { "epoch": 0.0940085283656586, "grad_norm": 0.2147187739610672, "learning_rate": 4.9737398975409224e-05, "loss": 1.3909, "step": 598 }, { "epoch": 0.09416573326259113, "grad_norm": 0.20287127792835236, "learning_rate": 4.9736505067251896e-05, "loss": 1.3621, "step": 599 }, { "epoch": 0.09432293815952367, "grad_norm": 0.24703876674175262, "learning_rate": 4.9735609648282965e-05, "loss": 1.3525, "step": 600 }, { "epoch": 0.0944801430564562, "grad_norm": 0.25060412287712097, "learning_rate": 4.97347127185571e-05, "loss": 1.36, "step": 601 }, { "epoch": 0.09463734795338875, "grad_norm": 0.214557945728302, "learning_rate": 4.9733814278129096e-05, "loss": 1.4372, "step": 602 }, { "epoch": 0.09479455285032129, "grad_norm": 0.1984785795211792, "learning_rate": 4.9732914327053825e-05, "loss": 1.3191, "step": 603 }, { "epoch": 0.09495175774725383, "grad_norm": 0.2099440097808838, "learning_rate": 4.9732012865386244e-05, "loss": 1.313, "step": 604 }, { "epoch": 0.09510896264418636, "grad_norm": 0.20393683016300201, "learning_rate": 4.9731109893181423e-05, "loss": 1.3465, "step": 605 }, { "epoch": 0.09526616754111891, "grad_norm": 0.25346165895462036, "learning_rate": 4.97302054104945e-05, "loss": 1.3379, "step": 606 }, { "epoch": 0.09542337243805145, "grad_norm": 0.21876423060894012, "learning_rate": 4.9729299417380725e-05, "loss": 1.2746, "step": 607 }, { "epoch": 0.09558057733498398, "grad_norm": 0.21032990515232086, "learning_rate": 4.9728391913895436e-05, "loss": 1.3215, "step": 608 }, { "epoch": 0.09573778223191652, "grad_norm": 0.2550762891769409, "learning_rate": 4.9727482900094044e-05, "loss": 1.3239, "step": 609 }, { "epoch": 0.09589498712884906, "grad_norm": 0.31706327199935913, "learning_rate": 4.972657237603208e-05, "loss": 1.3467, "step": 610 }, { "epoch": 0.09605219202578161, "grad_norm": 0.17176879942417145, "learning_rate": 4.972566034176516e-05, "loss": 1.3815, "step": 611 }, { "epoch": 0.09620939692271414, "grad_norm": 0.22620820999145508, "learning_rate": 4.972474679734898e-05, "loss": 1.2593, "step": 612 }, { "epoch": 0.09636660181964668, "grad_norm": 0.18735802173614502, "learning_rate": 4.9723831742839334e-05, "loss": 1.424, "step": 613 }, { "epoch": 0.09652380671657922, "grad_norm": 0.2582910656929016, "learning_rate": 4.972291517829211e-05, "loss": 1.2741, "step": 614 }, { "epoch": 0.09668101161351177, "grad_norm": 0.19907522201538086, "learning_rate": 4.97219971037633e-05, "loss": 1.2045, "step": 615 }, { "epoch": 0.0968382165104443, "grad_norm": 0.20451949536800385, "learning_rate": 4.972107751930896e-05, "loss": 1.3026, "step": 616 }, { "epoch": 0.09699542140737684, "grad_norm": 0.29682090878486633, "learning_rate": 4.972015642498527e-05, "loss": 1.3789, "step": 617 }, { "epoch": 0.09715262630430938, "grad_norm": 0.27210530638694763, "learning_rate": 4.9719233820848476e-05, "loss": 1.3968, "step": 618 }, { "epoch": 0.09730983120124193, "grad_norm": 0.24241842329502106, "learning_rate": 4.971830970695493e-05, "loss": 1.2763, "step": 619 }, { "epoch": 0.09746703609817446, "grad_norm": 0.2535828649997711, "learning_rate": 4.9717384083361075e-05, "loss": 1.3463, "step": 620 }, { "epoch": 0.097624240995107, "grad_norm": 0.22121217846870422, "learning_rate": 4.971645695012344e-05, "loss": 1.3384, "step": 621 }, { "epoch": 0.09778144589203953, "grad_norm": 0.28840744495391846, "learning_rate": 4.971552830729866e-05, "loss": 1.2418, "step": 622 }, { "epoch": 0.09793865078897207, "grad_norm": 0.1682664453983307, "learning_rate": 4.971459815494345e-05, "loss": 1.3658, "step": 623 }, { "epoch": 0.09809585568590462, "grad_norm": 0.24955761432647705, "learning_rate": 4.971366649311461e-05, "loss": 1.2372, "step": 624 }, { "epoch": 0.09825306058283716, "grad_norm": 0.2756117582321167, "learning_rate": 4.971273332186906e-05, "loss": 1.3212, "step": 625 }, { "epoch": 0.09841026547976969, "grad_norm": 0.2370867133140564, "learning_rate": 4.971179864126377e-05, "loss": 1.2879, "step": 626 }, { "epoch": 0.09856747037670223, "grad_norm": 0.20566895604133606, "learning_rate": 4.9710862451355846e-05, "loss": 1.4243, "step": 627 }, { "epoch": 0.09872467527363478, "grad_norm": 0.1923399120569229, "learning_rate": 4.970992475220246e-05, "loss": 1.2639, "step": 628 }, { "epoch": 0.09888188017056732, "grad_norm": 0.17972147464752197, "learning_rate": 4.9708985543860896e-05, "loss": 1.3366, "step": 629 }, { "epoch": 0.09903908506749985, "grad_norm": 0.1936875432729721, "learning_rate": 4.97080448263885e-05, "loss": 1.3496, "step": 630 }, { "epoch": 0.09919628996443239, "grad_norm": 0.24409984052181244, "learning_rate": 4.9707102599842735e-05, "loss": 1.3268, "step": 631 }, { "epoch": 0.09935349486136494, "grad_norm": 0.21084928512573242, "learning_rate": 4.970615886428115e-05, "loss": 1.3421, "step": 632 }, { "epoch": 0.09951069975829747, "grad_norm": 0.21201804280281067, "learning_rate": 4.970521361976138e-05, "loss": 1.3189, "step": 633 }, { "epoch": 0.09966790465523001, "grad_norm": 0.2698107063770294, "learning_rate": 4.9704266866341156e-05, "loss": 1.2193, "step": 634 }, { "epoch": 0.09982510955216255, "grad_norm": 0.27072674036026, "learning_rate": 4.970331860407831e-05, "loss": 1.2694, "step": 635 }, { "epoch": 0.09998231444909508, "grad_norm": 0.26514896750450134, "learning_rate": 4.9702368833030754e-05, "loss": 1.2175, "step": 636 }, { "epoch": 0.10013951934602763, "grad_norm": 0.21645940840244293, "learning_rate": 4.970141755325649e-05, "loss": 1.3099, "step": 637 }, { "epoch": 0.10029672424296017, "grad_norm": 0.27035385370254517, "learning_rate": 4.970046476481363e-05, "loss": 1.2723, "step": 638 }, { "epoch": 0.1004539291398927, "grad_norm": 0.20999298989772797, "learning_rate": 4.969951046776036e-05, "loss": 1.369, "step": 639 }, { "epoch": 0.10061113403682524, "grad_norm": 0.18554192781448364, "learning_rate": 4.969855466215497e-05, "loss": 1.3483, "step": 640 }, { "epoch": 0.10061113403682524, "eval_loss": 1.314468502998352, "eval_runtime": 2275.7115, "eval_samples_per_second": 4.068, "eval_steps_per_second": 2.034, "step": 640 }, { "epoch": 0.10076833893375779, "grad_norm": 0.19117292761802673, "learning_rate": 4.969759734805582e-05, "loss": 1.3538, "step": 641 }, { "epoch": 0.10092554383069033, "grad_norm": 0.21971918642520905, "learning_rate": 4.969663852552141e-05, "loss": 1.2827, "step": 642 }, { "epoch": 0.10108274872762286, "grad_norm": 0.2663845121860504, "learning_rate": 4.969567819461027e-05, "loss": 1.3332, "step": 643 }, { "epoch": 0.1012399536245554, "grad_norm": 0.23752686381340027, "learning_rate": 4.9694716355381076e-05, "loss": 1.2675, "step": 644 }, { "epoch": 0.10139715852148795, "grad_norm": 0.1558876782655716, "learning_rate": 4.9693753007892565e-05, "loss": 1.3356, "step": 645 }, { "epoch": 0.10155436341842049, "grad_norm": 0.2064114212989807, "learning_rate": 4.969278815220356e-05, "loss": 1.3261, "step": 646 }, { "epoch": 0.10171156831535302, "grad_norm": 0.2371819168329239, "learning_rate": 4.969182178837302e-05, "loss": 1.2706, "step": 647 }, { "epoch": 0.10186877321228556, "grad_norm": 0.22757107019424438, "learning_rate": 4.969085391645994e-05, "loss": 1.4035, "step": 648 }, { "epoch": 0.1020259781092181, "grad_norm": 0.16831007599830627, "learning_rate": 4.968988453652345e-05, "loss": 1.3006, "step": 649 }, { "epoch": 0.10218318300615065, "grad_norm": 0.1719575822353363, "learning_rate": 4.968891364862275e-05, "loss": 1.2439, "step": 650 }, { "epoch": 0.10234038790308318, "grad_norm": 0.27235090732574463, "learning_rate": 4.9687941252817144e-05, "loss": 1.3065, "step": 651 }, { "epoch": 0.10249759280001572, "grad_norm": 0.25622984766960144, "learning_rate": 4.968696734916601e-05, "loss": 1.2908, "step": 652 }, { "epoch": 0.10265479769694826, "grad_norm": 0.22526390850543976, "learning_rate": 4.968599193772885e-05, "loss": 1.3081, "step": 653 }, { "epoch": 0.1028120025938808, "grad_norm": 0.2552133798599243, "learning_rate": 4.968501501856522e-05, "loss": 1.3292, "step": 654 }, { "epoch": 0.10296920749081334, "grad_norm": 0.26533272862434387, "learning_rate": 4.96840365917348e-05, "loss": 1.3571, "step": 655 }, { "epoch": 0.10312641238774588, "grad_norm": 0.29065170884132385, "learning_rate": 4.968305665729732e-05, "loss": 1.2799, "step": 656 }, { "epoch": 0.10328361728467841, "grad_norm": 0.27552661299705505, "learning_rate": 4.968207521531267e-05, "loss": 1.2262, "step": 657 }, { "epoch": 0.10344082218161096, "grad_norm": 1.929308533668518, "learning_rate": 4.9681092265840775e-05, "loss": 1.2027, "step": 658 }, { "epoch": 0.1035980270785435, "grad_norm": 0.2610799968242645, "learning_rate": 4.968010780894167e-05, "loss": 1.3527, "step": 659 }, { "epoch": 0.10375523197547604, "grad_norm": 0.28388604521751404, "learning_rate": 4.967912184467547e-05, "loss": 1.2989, "step": 660 }, { "epoch": 0.10391243687240857, "grad_norm": 0.21056891977787018, "learning_rate": 4.9678134373102415e-05, "loss": 1.2748, "step": 661 }, { "epoch": 0.10406964176934111, "grad_norm": 0.268331378698349, "learning_rate": 4.967714539428281e-05, "loss": 1.3712, "step": 662 }, { "epoch": 0.10422684666627366, "grad_norm": 0.28430554270744324, "learning_rate": 4.967615490827705e-05, "loss": 1.3641, "step": 663 }, { "epoch": 0.1043840515632062, "grad_norm": 0.254165917634964, "learning_rate": 4.9675162915145636e-05, "loss": 1.3042, "step": 664 }, { "epoch": 0.10454125646013873, "grad_norm": 0.19123367965221405, "learning_rate": 4.967416941494914e-05, "loss": 1.3613, "step": 665 }, { "epoch": 0.10469846135707127, "grad_norm": 0.20710323750972748, "learning_rate": 4.967317440774828e-05, "loss": 1.2815, "step": 666 }, { "epoch": 0.10485566625400382, "grad_norm": 0.2143716812133789, "learning_rate": 4.967217789360379e-05, "loss": 1.3136, "step": 667 }, { "epoch": 0.10501287115093635, "grad_norm": 0.2556392550468445, "learning_rate": 4.967117987257654e-05, "loss": 1.384, "step": 668 }, { "epoch": 0.10517007604786889, "grad_norm": 0.28254854679107666, "learning_rate": 4.9670180344727505e-05, "loss": 1.3218, "step": 669 }, { "epoch": 0.10532728094480143, "grad_norm": 0.24643027782440186, "learning_rate": 4.9669179310117706e-05, "loss": 1.278, "step": 670 }, { "epoch": 0.10548448584173396, "grad_norm": 0.34323665499687195, "learning_rate": 4.9668176768808304e-05, "loss": 1.2511, "step": 671 }, { "epoch": 0.10564169073866651, "grad_norm": 0.2499508410692215, "learning_rate": 4.966717272086052e-05, "loss": 1.338, "step": 672 }, { "epoch": 0.10579889563559905, "grad_norm": 0.2145325094461441, "learning_rate": 4.966616716633567e-05, "loss": 1.3304, "step": 673 }, { "epoch": 0.10595610053253159, "grad_norm": 0.19230923056602478, "learning_rate": 4.9665160105295185e-05, "loss": 1.3535, "step": 674 }, { "epoch": 0.10611330542946412, "grad_norm": 0.20243465900421143, "learning_rate": 4.966415153780056e-05, "loss": 1.3118, "step": 675 }, { "epoch": 0.10627051032639667, "grad_norm": 0.24927914142608643, "learning_rate": 4.966314146391341e-05, "loss": 1.3136, "step": 676 }, { "epoch": 0.10642771522332921, "grad_norm": 0.21791934967041016, "learning_rate": 4.9662129883695406e-05, "loss": 1.3314, "step": 677 }, { "epoch": 0.10658492012026174, "grad_norm": 0.24318841099739075, "learning_rate": 4.966111679720835e-05, "loss": 1.3929, "step": 678 }, { "epoch": 0.10674212501719428, "grad_norm": 0.2829376757144928, "learning_rate": 4.966010220451411e-05, "loss": 1.3232, "step": 679 }, { "epoch": 0.10689932991412683, "grad_norm": 0.2353716641664505, "learning_rate": 4.965908610567465e-05, "loss": 1.2851, "step": 680 }, { "epoch": 0.10705653481105937, "grad_norm": 0.2615984380245209, "learning_rate": 4.965806850075203e-05, "loss": 1.2552, "step": 681 }, { "epoch": 0.1072137397079919, "grad_norm": 0.23773109912872314, "learning_rate": 4.965704938980841e-05, "loss": 1.2961, "step": 682 }, { "epoch": 0.10737094460492444, "grad_norm": 0.2622957229614258, "learning_rate": 4.9656028772906014e-05, "loss": 1.3073, "step": 683 }, { "epoch": 0.10752814950185698, "grad_norm": 0.24974018335342407, "learning_rate": 4.965500665010721e-05, "loss": 1.2774, "step": 684 }, { "epoch": 0.10768535439878953, "grad_norm": 0.17124338448047638, "learning_rate": 4.9653983021474395e-05, "loss": 1.4159, "step": 685 }, { "epoch": 0.10784255929572206, "grad_norm": 0.16673363745212555, "learning_rate": 4.96529578870701e-05, "loss": 1.3748, "step": 686 }, { "epoch": 0.1079997641926546, "grad_norm": 0.25368422269821167, "learning_rate": 4.965193124695693e-05, "loss": 1.3958, "step": 687 }, { "epoch": 0.10815696908958713, "grad_norm": 0.22910015285015106, "learning_rate": 4.96509031011976e-05, "loss": 1.3259, "step": 688 }, { "epoch": 0.10831417398651969, "grad_norm": 0.277851939201355, "learning_rate": 4.96498734498549e-05, "loss": 1.3254, "step": 689 }, { "epoch": 0.10847137888345222, "grad_norm": 0.32443082332611084, "learning_rate": 4.964884229299172e-05, "loss": 1.3007, "step": 690 }, { "epoch": 0.10862858378038476, "grad_norm": 0.20710885524749756, "learning_rate": 4.964780963067102e-05, "loss": 1.3297, "step": 691 }, { "epoch": 0.1087857886773173, "grad_norm": 0.25522252917289734, "learning_rate": 4.96467754629559e-05, "loss": 1.2487, "step": 692 }, { "epoch": 0.10894299357424984, "grad_norm": 0.3286147713661194, "learning_rate": 4.9645739789909504e-05, "loss": 1.2255, "step": 693 }, { "epoch": 0.10910019847118238, "grad_norm": 0.3795601725578308, "learning_rate": 4.964470261159509e-05, "loss": 1.2725, "step": 694 }, { "epoch": 0.10925740336811492, "grad_norm": 0.3112131655216217, "learning_rate": 4.964366392807602e-05, "loss": 1.252, "step": 695 }, { "epoch": 0.10941460826504745, "grad_norm": 0.2891729176044464, "learning_rate": 4.964262373941571e-05, "loss": 1.3377, "step": 696 }, { "epoch": 0.10957181316197999, "grad_norm": 0.26973745226860046, "learning_rate": 4.96415820456777e-05, "loss": 1.3186, "step": 697 }, { "epoch": 0.10972901805891254, "grad_norm": 0.2832094430923462, "learning_rate": 4.964053884692562e-05, "loss": 1.3248, "step": 698 }, { "epoch": 0.10988622295584508, "grad_norm": 0.2840999960899353, "learning_rate": 4.963949414322318e-05, "loss": 1.2677, "step": 699 }, { "epoch": 0.11004342785277761, "grad_norm": 0.2891542911529541, "learning_rate": 4.963844793463418e-05, "loss": 1.3274, "step": 700 }, { "epoch": 0.11020063274971015, "grad_norm": 0.23569005727767944, "learning_rate": 4.963740022122252e-05, "loss": 1.2259, "step": 701 }, { "epoch": 0.1103578376466427, "grad_norm": 0.2174285501241684, "learning_rate": 4.963635100305221e-05, "loss": 1.2785, "step": 702 }, { "epoch": 0.11051504254357523, "grad_norm": 0.2753438651561737, "learning_rate": 4.96353002801873e-05, "loss": 1.3263, "step": 703 }, { "epoch": 0.11067224744050777, "grad_norm": 0.21094419062137604, "learning_rate": 4.963424805269198e-05, "loss": 1.2439, "step": 704 }, { "epoch": 0.1108294523374403, "grad_norm": 0.20501388609409332, "learning_rate": 4.963319432063052e-05, "loss": 1.3091, "step": 705 }, { "epoch": 0.11098665723437286, "grad_norm": 0.2041424810886383, "learning_rate": 4.963213908406728e-05, "loss": 1.2951, "step": 706 }, { "epoch": 0.1111438621313054, "grad_norm": 0.24955442547798157, "learning_rate": 4.963108234306669e-05, "loss": 1.2208, "step": 707 }, { "epoch": 0.11130106702823793, "grad_norm": 0.39431118965148926, "learning_rate": 4.9630024097693314e-05, "loss": 1.306, "step": 708 }, { "epoch": 0.11145827192517047, "grad_norm": 0.24803434312343597, "learning_rate": 4.962896434801178e-05, "loss": 1.2951, "step": 709 }, { "epoch": 0.111615476822103, "grad_norm": 0.2736116349697113, "learning_rate": 4.962790309408681e-05, "loss": 1.3245, "step": 710 }, { "epoch": 0.11177268171903555, "grad_norm": 0.24502034485340118, "learning_rate": 4.9626840335983215e-05, "loss": 1.2961, "step": 711 }, { "epoch": 0.11192988661596809, "grad_norm": 0.24158692359924316, "learning_rate": 4.962577607376592e-05, "loss": 1.2387, "step": 712 }, { "epoch": 0.11208709151290062, "grad_norm": 0.24977251887321472, "learning_rate": 4.962471030749991e-05, "loss": 1.2976, "step": 713 }, { "epoch": 0.11224429640983316, "grad_norm": 0.15401019155979156, "learning_rate": 4.962364303725029e-05, "loss": 1.2684, "step": 714 }, { "epoch": 0.11240150130676571, "grad_norm": 0.2611544132232666, "learning_rate": 4.962257426308224e-05, "loss": 1.2928, "step": 715 }, { "epoch": 0.11255870620369825, "grad_norm": 0.434600830078125, "learning_rate": 4.962150398506103e-05, "loss": 1.3657, "step": 716 }, { "epoch": 0.11271591110063078, "grad_norm": 0.2896519601345062, "learning_rate": 4.9620432203252045e-05, "loss": 1.3055, "step": 717 }, { "epoch": 0.11287311599756332, "grad_norm": 0.1891547590494156, "learning_rate": 4.961935891772073e-05, "loss": 1.3355, "step": 718 }, { "epoch": 0.11303032089449587, "grad_norm": 0.2223133146762848, "learning_rate": 4.9618284128532644e-05, "loss": 1.2939, "step": 719 }, { "epoch": 0.1131875257914284, "grad_norm": 0.27313077449798584, "learning_rate": 4.961720783575343e-05, "loss": 1.2596, "step": 720 }, { "epoch": 0.11334473068836094, "grad_norm": 0.24807053804397583, "learning_rate": 4.961613003944883e-05, "loss": 1.2851, "step": 721 }, { "epoch": 0.11350193558529348, "grad_norm": 0.2343195378780365, "learning_rate": 4.9615050739684656e-05, "loss": 1.2899, "step": 722 }, { "epoch": 0.11365914048222601, "grad_norm": 0.229730024933815, "learning_rate": 4.961396993652684e-05, "loss": 1.3118, "step": 723 }, { "epoch": 0.11381634537915856, "grad_norm": 0.2397170215845108, "learning_rate": 4.9612887630041394e-05, "loss": 1.2148, "step": 724 }, { "epoch": 0.1139735502760911, "grad_norm": 0.2167958915233612, "learning_rate": 4.9611803820294414e-05, "loss": 1.2597, "step": 725 }, { "epoch": 0.11413075517302364, "grad_norm": 0.21318721771240234, "learning_rate": 4.961071850735209e-05, "loss": 1.3949, "step": 726 }, { "epoch": 0.11428796006995617, "grad_norm": 0.21988382935523987, "learning_rate": 4.960963169128073e-05, "loss": 1.3196, "step": 727 }, { "epoch": 0.11444516496688872, "grad_norm": 0.17555692791938782, "learning_rate": 4.96085433721467e-05, "loss": 1.3661, "step": 728 }, { "epoch": 0.11460236986382126, "grad_norm": 0.3545222282409668, "learning_rate": 4.960745355001647e-05, "loss": 1.2659, "step": 729 }, { "epoch": 0.1147595747607538, "grad_norm": 0.3196569085121155, "learning_rate": 4.960636222495659e-05, "loss": 1.2893, "step": 730 }, { "epoch": 0.11491677965768633, "grad_norm": 0.2241334766149521, "learning_rate": 4.960526939703374e-05, "loss": 1.2155, "step": 731 }, { "epoch": 0.11507398455461888, "grad_norm": 0.26543980836868286, "learning_rate": 4.960417506631465e-05, "loss": 1.3615, "step": 732 }, { "epoch": 0.11523118945155142, "grad_norm": 0.21146585047245026, "learning_rate": 4.960307923286616e-05, "loss": 1.3516, "step": 733 }, { "epoch": 0.11538839434848396, "grad_norm": 0.18095079064369202, "learning_rate": 4.960198189675519e-05, "loss": 1.3581, "step": 734 }, { "epoch": 0.11554559924541649, "grad_norm": 0.26687100529670715, "learning_rate": 4.9600883058048775e-05, "loss": 1.1971, "step": 735 }, { "epoch": 0.11570280414234903, "grad_norm": 0.2271047830581665, "learning_rate": 4.959978271681402e-05, "loss": 1.1867, "step": 736 }, { "epoch": 0.11586000903928158, "grad_norm": 0.2102867215871811, "learning_rate": 4.959868087311814e-05, "loss": 1.2749, "step": 737 }, { "epoch": 0.11601721393621411, "grad_norm": 0.2752761244773865, "learning_rate": 4.9597577527028424e-05, "loss": 1.1753, "step": 738 }, { "epoch": 0.11617441883314665, "grad_norm": 0.22385725378990173, "learning_rate": 4.959647267861226e-05, "loss": 1.343, "step": 739 }, { "epoch": 0.11633162373007919, "grad_norm": 0.2597412168979645, "learning_rate": 4.959536632793712e-05, "loss": 1.2539, "step": 740 }, { "epoch": 0.11648882862701174, "grad_norm": 0.27975237369537354, "learning_rate": 4.959425847507059e-05, "loss": 1.2883, "step": 741 }, { "epoch": 0.11664603352394427, "grad_norm": 0.29127049446105957, "learning_rate": 4.959314912008033e-05, "loss": 1.3139, "step": 742 }, { "epoch": 0.11680323842087681, "grad_norm": 0.19929318130016327, "learning_rate": 4.9592038263034094e-05, "loss": 1.271, "step": 743 }, { "epoch": 0.11696044331780935, "grad_norm": 0.23164550960063934, "learning_rate": 4.9590925903999716e-05, "loss": 1.3359, "step": 744 }, { "epoch": 0.1171176482147419, "grad_norm": 0.27876612544059753, "learning_rate": 4.958981204304516e-05, "loss": 1.2568, "step": 745 }, { "epoch": 0.11727485311167443, "grad_norm": 0.2459796965122223, "learning_rate": 4.9588696680238435e-05, "loss": 1.2426, "step": 746 }, { "epoch": 0.11743205800860697, "grad_norm": 0.2039456069469452, "learning_rate": 4.958757981564767e-05, "loss": 1.2681, "step": 747 }, { "epoch": 0.1175892629055395, "grad_norm": 0.24796408414840698, "learning_rate": 4.958646144934108e-05, "loss": 1.257, "step": 748 }, { "epoch": 0.11774646780247204, "grad_norm": 0.2779620289802551, "learning_rate": 4.958534158138697e-05, "loss": 1.2933, "step": 749 }, { "epoch": 0.11790367269940459, "grad_norm": 0.20878851413726807, "learning_rate": 4.9584220211853735e-05, "loss": 1.2902, "step": 750 }, { "epoch": 0.11806087759633713, "grad_norm": 0.24720412492752075, "learning_rate": 4.958309734080987e-05, "loss": 1.203, "step": 751 }, { "epoch": 0.11821808249326966, "grad_norm": 0.287654846906662, "learning_rate": 4.9581972968323956e-05, "loss": 1.3141, "step": 752 }, { "epoch": 0.1183752873902022, "grad_norm": 0.23071719706058502, "learning_rate": 4.958084709446466e-05, "loss": 1.3145, "step": 753 }, { "epoch": 0.11853249228713475, "grad_norm": 0.21027110517024994, "learning_rate": 4.9579719719300746e-05, "loss": 1.2893, "step": 754 }, { "epoch": 0.11868969718406729, "grad_norm": 0.17173202335834503, "learning_rate": 4.9578590842901066e-05, "loss": 1.2618, "step": 755 }, { "epoch": 0.11884690208099982, "grad_norm": 0.24606984853744507, "learning_rate": 4.957746046533457e-05, "loss": 1.1904, "step": 756 }, { "epoch": 0.11900410697793236, "grad_norm": 0.248653382062912, "learning_rate": 4.957632858667031e-05, "loss": 1.331, "step": 757 }, { "epoch": 0.11916131187486491, "grad_norm": 0.1904144436120987, "learning_rate": 4.9575195206977406e-05, "loss": 1.3303, "step": 758 }, { "epoch": 0.11931851677179744, "grad_norm": 0.39540621638298035, "learning_rate": 4.9574060326325075e-05, "loss": 1.3455, "step": 759 }, { "epoch": 0.11947572166872998, "grad_norm": 0.20992301404476166, "learning_rate": 4.957292394478265e-05, "loss": 1.2911, "step": 760 }, { "epoch": 0.11963292656566252, "grad_norm": 0.23418502509593964, "learning_rate": 4.957178606241951e-05, "loss": 1.35, "step": 761 }, { "epoch": 0.11979013146259505, "grad_norm": 0.24480225145816803, "learning_rate": 4.957064667930517e-05, "loss": 1.2138, "step": 762 }, { "epoch": 0.1199473363595276, "grad_norm": 0.22909322381019592, "learning_rate": 4.956950579550922e-05, "loss": 1.1915, "step": 763 }, { "epoch": 0.12010454125646014, "grad_norm": 0.16839763522148132, "learning_rate": 4.956836341110134e-05, "loss": 1.234, "step": 764 }, { "epoch": 0.12026174615339268, "grad_norm": 0.2291131466627121, "learning_rate": 4.956721952615129e-05, "loss": 1.2964, "step": 765 }, { "epoch": 0.12041895105032521, "grad_norm": 0.2606765329837799, "learning_rate": 4.956607414072895e-05, "loss": 1.2785, "step": 766 }, { "epoch": 0.12057615594725776, "grad_norm": 0.24100011587142944, "learning_rate": 4.956492725490426e-05, "loss": 1.2389, "step": 767 }, { "epoch": 0.1207333608441903, "grad_norm": 0.2868693172931671, "learning_rate": 4.956377886874729e-05, "loss": 1.3852, "step": 768 }, { "epoch": 0.12089056574112284, "grad_norm": 0.29049259424209595, "learning_rate": 4.956262898232816e-05, "loss": 1.1511, "step": 769 }, { "epoch": 0.12104777063805537, "grad_norm": 0.31396448612213135, "learning_rate": 4.9561477595717106e-05, "loss": 1.2687, "step": 770 }, { "epoch": 0.12120497553498792, "grad_norm": 0.3348733186721802, "learning_rate": 4.956032470898445e-05, "loss": 1.1933, "step": 771 }, { "epoch": 0.12136218043192046, "grad_norm": 0.2009342461824417, "learning_rate": 4.955917032220061e-05, "loss": 1.3299, "step": 772 }, { "epoch": 0.121519385328853, "grad_norm": 0.2037377655506134, "learning_rate": 4.9558014435436084e-05, "loss": 1.3208, "step": 773 }, { "epoch": 0.12167659022578553, "grad_norm": 0.3118877410888672, "learning_rate": 4.955685704876147e-05, "loss": 1.1927, "step": 774 }, { "epoch": 0.12183379512271807, "grad_norm": 0.21884632110595703, "learning_rate": 4.955569816224747e-05, "loss": 1.2661, "step": 775 }, { "epoch": 0.12199100001965062, "grad_norm": 0.25817862153053284, "learning_rate": 4.9554537775964846e-05, "loss": 1.3077, "step": 776 }, { "epoch": 0.12214820491658315, "grad_norm": 0.27827751636505127, "learning_rate": 4.955337588998449e-05, "loss": 1.2709, "step": 777 }, { "epoch": 0.12230540981351569, "grad_norm": 0.30520737171173096, "learning_rate": 4.955221250437735e-05, "loss": 1.2407, "step": 778 }, { "epoch": 0.12246261471044823, "grad_norm": 0.21729423105716705, "learning_rate": 4.9551047619214473e-05, "loss": 1.3392, "step": 779 }, { "epoch": 0.12261981960738078, "grad_norm": 0.2408866286277771, "learning_rate": 4.954988123456703e-05, "loss": 1.215, "step": 780 }, { "epoch": 0.12277702450431331, "grad_norm": 0.23833869397640228, "learning_rate": 4.954871335050625e-05, "loss": 1.3607, "step": 781 }, { "epoch": 0.12293422940124585, "grad_norm": 0.27017349004745483, "learning_rate": 4.954754396710345e-05, "loss": 1.2662, "step": 782 }, { "epoch": 0.12309143429817838, "grad_norm": 0.21869684755802155, "learning_rate": 4.954637308443007e-05, "loss": 1.2384, "step": 783 }, { "epoch": 0.12324863919511093, "grad_norm": 0.18912911415100098, "learning_rate": 4.9545200702557615e-05, "loss": 1.2958, "step": 784 }, { "epoch": 0.12340584409204347, "grad_norm": 0.27320876717567444, "learning_rate": 4.954402682155768e-05, "loss": 1.2546, "step": 785 }, { "epoch": 0.12356304898897601, "grad_norm": 0.2938046455383301, "learning_rate": 4.954285144150198e-05, "loss": 1.3451, "step": 786 }, { "epoch": 0.12372025388590854, "grad_norm": 0.18271508812904358, "learning_rate": 4.954167456246229e-05, "loss": 1.2239, "step": 787 }, { "epoch": 0.12387745878284108, "grad_norm": 0.21799346804618835, "learning_rate": 4.9540496184510495e-05, "loss": 1.2471, "step": 788 }, { "epoch": 0.12403466367977363, "grad_norm": 0.21574997901916504, "learning_rate": 4.9539316307718564e-05, "loss": 1.3137, "step": 789 }, { "epoch": 0.12419186857670617, "grad_norm": 0.21586358547210693, "learning_rate": 4.953813493215855e-05, "loss": 1.2763, "step": 790 }, { "epoch": 0.1243490734736387, "grad_norm": 0.2723408043384552, "learning_rate": 4.953695205790262e-05, "loss": 1.4148, "step": 791 }, { "epoch": 0.12450627837057124, "grad_norm": 0.29501527547836304, "learning_rate": 4.9535767685023026e-05, "loss": 1.3093, "step": 792 }, { "epoch": 0.12466348326750379, "grad_norm": 0.2884112000465393, "learning_rate": 4.9534581813592086e-05, "loss": 1.3276, "step": 793 }, { "epoch": 0.12482068816443632, "grad_norm": 0.24246759712696075, "learning_rate": 4.9533394443682234e-05, "loss": 1.3203, "step": 794 }, { "epoch": 0.12497789306136886, "grad_norm": 0.23493270576000214, "learning_rate": 4.9532205575365995e-05, "loss": 1.2567, "step": 795 }, { "epoch": 0.1251350979583014, "grad_norm": 0.26456305384635925, "learning_rate": 4.953101520871598e-05, "loss": 1.3194, "step": 796 }, { "epoch": 0.12529230285523393, "grad_norm": 0.18891221284866333, "learning_rate": 4.952982334380489e-05, "loss": 1.3041, "step": 797 }, { "epoch": 0.12544950775216648, "grad_norm": 0.21460258960723877, "learning_rate": 4.952862998070552e-05, "loss": 1.2274, "step": 798 }, { "epoch": 0.125606712649099, "grad_norm": 0.2832646667957306, "learning_rate": 4.9527435119490753e-05, "loss": 1.2009, "step": 799 }, { "epoch": 0.12576391754603156, "grad_norm": 0.22183702886104584, "learning_rate": 4.9526238760233576e-05, "loss": 1.31, "step": 800 }, { "epoch": 0.12576391754603156, "eval_loss": 1.2745521068572998, "eval_runtime": 2292.1003, "eval_samples_per_second": 4.039, "eval_steps_per_second": 2.02, "step": 800 }, { "epoch": 0.1259211224429641, "grad_norm": 0.21697020530700684, "learning_rate": 4.9525040903007046e-05, "loss": 1.3197, "step": 801 }, { "epoch": 0.12607832733989663, "grad_norm": 0.36354196071624756, "learning_rate": 4.952384154788433e-05, "loss": 1.1926, "step": 802 }, { "epoch": 0.12623553223682918, "grad_norm": 0.27054232358932495, "learning_rate": 4.952264069493868e-05, "loss": 1.3199, "step": 803 }, { "epoch": 0.12639273713376173, "grad_norm": 0.2425469011068344, "learning_rate": 4.952143834424344e-05, "loss": 1.2906, "step": 804 }, { "epoch": 0.12654994203069425, "grad_norm": 0.1988941729068756, "learning_rate": 4.952023449587205e-05, "loss": 1.3183, "step": 805 }, { "epoch": 0.1267071469276268, "grad_norm": 0.2429157942533493, "learning_rate": 4.951902914989802e-05, "loss": 1.2497, "step": 806 }, { "epoch": 0.12686435182455932, "grad_norm": 0.2704293727874756, "learning_rate": 4.951782230639499e-05, "loss": 1.3113, "step": 807 }, { "epoch": 0.12702155672149187, "grad_norm": 0.31801360845565796, "learning_rate": 4.951661396543664e-05, "loss": 1.2354, "step": 808 }, { "epoch": 0.12717876161842442, "grad_norm": 0.21358463168144226, "learning_rate": 4.951540412709681e-05, "loss": 1.3512, "step": 809 }, { "epoch": 0.12733596651535695, "grad_norm": 0.24300484359264374, "learning_rate": 4.951419279144936e-05, "loss": 1.213, "step": 810 }, { "epoch": 0.1274931714122895, "grad_norm": 0.39550015330314636, "learning_rate": 4.951297995856828e-05, "loss": 1.1872, "step": 811 }, { "epoch": 0.12765037630922202, "grad_norm": 0.20150414109230042, "learning_rate": 4.951176562852765e-05, "loss": 1.3469, "step": 812 }, { "epoch": 0.12780758120615457, "grad_norm": 0.2050725519657135, "learning_rate": 4.951054980140164e-05, "loss": 1.259, "step": 813 }, { "epoch": 0.12796478610308712, "grad_norm": 0.23815183341503143, "learning_rate": 4.950933247726451e-05, "loss": 1.2961, "step": 814 }, { "epoch": 0.12812199100001964, "grad_norm": 0.28224676847457886, "learning_rate": 4.95081136561906e-05, "loss": 1.2631, "step": 815 }, { "epoch": 0.1282791958969522, "grad_norm": 0.294791042804718, "learning_rate": 4.9506893338254353e-05, "loss": 1.1834, "step": 816 }, { "epoch": 0.12843640079388474, "grad_norm": 0.29148972034454346, "learning_rate": 4.9505671523530306e-05, "loss": 1.2573, "step": 817 }, { "epoch": 0.12859360569081726, "grad_norm": 0.29371243715286255, "learning_rate": 4.950444821209308e-05, "loss": 1.4532, "step": 818 }, { "epoch": 0.12875081058774981, "grad_norm": 0.2303713709115982, "learning_rate": 4.9503223404017396e-05, "loss": 1.2828, "step": 819 }, { "epoch": 0.12890801548468234, "grad_norm": 0.24906295537948608, "learning_rate": 4.9501997099378046e-05, "loss": 1.2759, "step": 820 }, { "epoch": 0.1290652203816149, "grad_norm": 0.1983998864889145, "learning_rate": 4.950076929824994e-05, "loss": 1.3111, "step": 821 }, { "epoch": 0.12922242527854744, "grad_norm": 0.2079075276851654, "learning_rate": 4.9499540000708064e-05, "loss": 1.3416, "step": 822 }, { "epoch": 0.12937963017547996, "grad_norm": 0.22548237442970276, "learning_rate": 4.94983092068275e-05, "loss": 1.3879, "step": 823 }, { "epoch": 0.1295368350724125, "grad_norm": 0.2052278220653534, "learning_rate": 4.949707691668343e-05, "loss": 1.3347, "step": 824 }, { "epoch": 0.12969403996934503, "grad_norm": 0.21978795528411865, "learning_rate": 4.949584313035109e-05, "loss": 1.1345, "step": 825 }, { "epoch": 0.12985124486627758, "grad_norm": 0.18930193781852722, "learning_rate": 4.9494607847905863e-05, "loss": 1.319, "step": 826 }, { "epoch": 0.13000844976321013, "grad_norm": 0.24538543820381165, "learning_rate": 4.9493371069423176e-05, "loss": 1.3103, "step": 827 }, { "epoch": 0.13016565466014265, "grad_norm": 0.2874930799007416, "learning_rate": 4.9492132794978586e-05, "loss": 1.3388, "step": 828 }, { "epoch": 0.1303228595570752, "grad_norm": 0.23338377475738525, "learning_rate": 4.949089302464771e-05, "loss": 1.2793, "step": 829 }, { "epoch": 0.13048006445400775, "grad_norm": 0.23670902848243713, "learning_rate": 4.948965175850626e-05, "loss": 1.2808, "step": 830 }, { "epoch": 0.13063726935094028, "grad_norm": 0.2617732584476471, "learning_rate": 4.9488408996630066e-05, "loss": 1.2641, "step": 831 }, { "epoch": 0.13079447424787283, "grad_norm": 0.24584044516086578, "learning_rate": 4.948716473909502e-05, "loss": 1.2462, "step": 832 }, { "epoch": 0.13095167914480535, "grad_norm": 0.2507297098636627, "learning_rate": 4.948591898597712e-05, "loss": 1.2211, "step": 833 }, { "epoch": 0.1311088840417379, "grad_norm": 0.25439611077308655, "learning_rate": 4.948467173735245e-05, "loss": 1.2762, "step": 834 }, { "epoch": 0.13126608893867045, "grad_norm": 0.19934779405593872, "learning_rate": 4.948342299329719e-05, "loss": 1.1798, "step": 835 }, { "epoch": 0.13142329383560297, "grad_norm": 0.24154123663902283, "learning_rate": 4.948217275388761e-05, "loss": 1.2608, "step": 836 }, { "epoch": 0.13158049873253552, "grad_norm": 0.2484877109527588, "learning_rate": 4.948092101920006e-05, "loss": 1.2466, "step": 837 }, { "epoch": 0.13173770362946804, "grad_norm": 0.28683343529701233, "learning_rate": 4.9479667789311e-05, "loss": 1.1915, "step": 838 }, { "epoch": 0.1318949085264006, "grad_norm": 0.21289369463920593, "learning_rate": 4.9478413064296976e-05, "loss": 1.2642, "step": 839 }, { "epoch": 0.13205211342333315, "grad_norm": 0.22933778166770935, "learning_rate": 4.947715684423461e-05, "loss": 1.2182, "step": 840 }, { "epoch": 0.13220931832026567, "grad_norm": 0.2507724463939667, "learning_rate": 4.9475899129200635e-05, "loss": 1.3089, "step": 841 }, { "epoch": 0.13236652321719822, "grad_norm": 0.251770943403244, "learning_rate": 4.947463991927187e-05, "loss": 1.3194, "step": 842 }, { "epoch": 0.13252372811413077, "grad_norm": 0.2533280849456787, "learning_rate": 4.947337921452521e-05, "loss": 1.2141, "step": 843 }, { "epoch": 0.1326809330110633, "grad_norm": 0.26309993863105774, "learning_rate": 4.9472117015037664e-05, "loss": 1.2265, "step": 844 }, { "epoch": 0.13283813790799584, "grad_norm": 0.29711806774139404, "learning_rate": 4.9470853320886335e-05, "loss": 1.2538, "step": 845 }, { "epoch": 0.13299534280492836, "grad_norm": 0.24551883339881897, "learning_rate": 4.9469588132148373e-05, "loss": 1.2927, "step": 846 }, { "epoch": 0.1331525477018609, "grad_norm": 0.28027257323265076, "learning_rate": 4.946832144890108e-05, "loss": 1.2712, "step": 847 }, { "epoch": 0.13330975259879346, "grad_norm": 0.22099149227142334, "learning_rate": 4.9467053271221804e-05, "loss": 1.2095, "step": 848 }, { "epoch": 0.13346695749572599, "grad_norm": 0.19661381840705872, "learning_rate": 4.946578359918801e-05, "loss": 1.2855, "step": 849 }, { "epoch": 0.13362416239265854, "grad_norm": 0.22767631709575653, "learning_rate": 4.946451243287723e-05, "loss": 1.2932, "step": 850 }, { "epoch": 0.13378136728959106, "grad_norm": 0.28008589148521423, "learning_rate": 4.946323977236712e-05, "loss": 1.2335, "step": 851 }, { "epoch": 0.1339385721865236, "grad_norm": 0.2091825157403946, "learning_rate": 4.94619656177354e-05, "loss": 1.3151, "step": 852 }, { "epoch": 0.13409577708345616, "grad_norm": 0.1978277713060379, "learning_rate": 4.946068996905989e-05, "loss": 1.3359, "step": 853 }, { "epoch": 0.13425298198038868, "grad_norm": 0.21397674083709717, "learning_rate": 4.9459412826418505e-05, "loss": 1.2998, "step": 854 }, { "epoch": 0.13441018687732123, "grad_norm": 0.30490776896476746, "learning_rate": 4.945813418988925e-05, "loss": 1.2607, "step": 855 }, { "epoch": 0.13456739177425378, "grad_norm": 0.2896914780139923, "learning_rate": 4.945685405955021e-05, "loss": 1.2329, "step": 856 }, { "epoch": 0.1347245966711863, "grad_norm": 0.1988048106431961, "learning_rate": 4.945557243547958e-05, "loss": 1.2877, "step": 857 }, { "epoch": 0.13488180156811885, "grad_norm": 0.17888212203979492, "learning_rate": 4.945428931775563e-05, "loss": 1.2543, "step": 858 }, { "epoch": 0.13503900646505138, "grad_norm": 0.2748056650161743, "learning_rate": 4.945300470645673e-05, "loss": 1.3461, "step": 859 }, { "epoch": 0.13519621136198393, "grad_norm": 0.23218591511249542, "learning_rate": 4.945171860166135e-05, "loss": 1.2878, "step": 860 }, { "epoch": 0.13535341625891648, "grad_norm": 0.33142325282096863, "learning_rate": 4.9450431003448015e-05, "loss": 1.294, "step": 861 }, { "epoch": 0.135510621155849, "grad_norm": 0.2330816686153412, "learning_rate": 4.944914191189539e-05, "loss": 1.3593, "step": 862 }, { "epoch": 0.13566782605278155, "grad_norm": 0.23989921808242798, "learning_rate": 4.9447851327082204e-05, "loss": 1.2879, "step": 863 }, { "epoch": 0.13582503094971407, "grad_norm": 0.21358944475650787, "learning_rate": 4.944655924908727e-05, "loss": 1.222, "step": 864 }, { "epoch": 0.13598223584664662, "grad_norm": 0.30434924364089966, "learning_rate": 4.9445265677989515e-05, "loss": 1.273, "step": 865 }, { "epoch": 0.13613944074357917, "grad_norm": 0.22028383612632751, "learning_rate": 4.944397061386794e-05, "loss": 1.2494, "step": 866 }, { "epoch": 0.1362966456405117, "grad_norm": 0.2354927659034729, "learning_rate": 4.944267405680164e-05, "loss": 1.1469, "step": 867 }, { "epoch": 0.13645385053744424, "grad_norm": 0.28941988945007324, "learning_rate": 4.944137600686981e-05, "loss": 1.1678, "step": 868 }, { "epoch": 0.1366110554343768, "grad_norm": 0.2538214325904846, "learning_rate": 4.944007646415172e-05, "loss": 1.2636, "step": 869 }, { "epoch": 0.13676826033130932, "grad_norm": 0.3719157576560974, "learning_rate": 4.943877542872676e-05, "loss": 1.2901, "step": 870 }, { "epoch": 0.13692546522824187, "grad_norm": 0.2994091212749481, "learning_rate": 4.943747290067438e-05, "loss": 1.2209, "step": 871 }, { "epoch": 0.1370826701251744, "grad_norm": 0.23586580157279968, "learning_rate": 4.9436168880074115e-05, "loss": 1.2989, "step": 872 }, { "epoch": 0.13723987502210694, "grad_norm": 0.193126380443573, "learning_rate": 4.943486336700564e-05, "loss": 1.204, "step": 873 }, { "epoch": 0.1373970799190395, "grad_norm": 0.18505080044269562, "learning_rate": 4.943355636154868e-05, "loss": 1.3247, "step": 874 }, { "epoch": 0.137554284815972, "grad_norm": 0.2586881220340729, "learning_rate": 4.9432247863783064e-05, "loss": 1.3315, "step": 875 }, { "epoch": 0.13771148971290456, "grad_norm": 0.2904506027698517, "learning_rate": 4.943093787378871e-05, "loss": 1.2593, "step": 876 }, { "epoch": 0.13786869460983708, "grad_norm": 0.2971174120903015, "learning_rate": 4.9429626391645615e-05, "loss": 1.2241, "step": 877 }, { "epoch": 0.13802589950676963, "grad_norm": 0.42521703243255615, "learning_rate": 4.9428313417433894e-05, "loss": 1.2638, "step": 878 }, { "epoch": 0.13818310440370218, "grad_norm": 0.2515777349472046, "learning_rate": 4.9426998951233735e-05, "loss": 1.3111, "step": 879 }, { "epoch": 0.1383403093006347, "grad_norm": 0.25959545373916626, "learning_rate": 4.942568299312541e-05, "loss": 1.2505, "step": 880 }, { "epoch": 0.13849751419756726, "grad_norm": 0.28090932965278625, "learning_rate": 4.942436554318931e-05, "loss": 1.1604, "step": 881 }, { "epoch": 0.1386547190944998, "grad_norm": 0.21833541989326477, "learning_rate": 4.942304660150588e-05, "loss": 1.2246, "step": 882 }, { "epoch": 0.13881192399143233, "grad_norm": 0.26167765259742737, "learning_rate": 4.9421726168155704e-05, "loss": 1.2399, "step": 883 }, { "epoch": 0.13896912888836488, "grad_norm": 0.23778817057609558, "learning_rate": 4.9420404243219395e-05, "loss": 1.2692, "step": 884 }, { "epoch": 0.1391263337852974, "grad_norm": 0.43253734707832336, "learning_rate": 4.941908082677773e-05, "loss": 1.2302, "step": 885 }, { "epoch": 0.13928353868222995, "grad_norm": 0.2448786050081253, "learning_rate": 4.94177559189115e-05, "loss": 1.3163, "step": 886 }, { "epoch": 0.1394407435791625, "grad_norm": 0.24711786210536957, "learning_rate": 4.941642951970165e-05, "loss": 1.2756, "step": 887 }, { "epoch": 0.13959794847609502, "grad_norm": 0.22932004928588867, "learning_rate": 4.941510162922917e-05, "loss": 1.3087, "step": 888 }, { "epoch": 0.13975515337302757, "grad_norm": 0.24999158084392548, "learning_rate": 4.941377224757518e-05, "loss": 1.3328, "step": 889 }, { "epoch": 0.1399123582699601, "grad_norm": 0.21222981810569763, "learning_rate": 4.941244137482088e-05, "loss": 1.3177, "step": 890 }, { "epoch": 0.14006956316689265, "grad_norm": 0.22691656649112701, "learning_rate": 4.941110901104754e-05, "loss": 1.2937, "step": 891 }, { "epoch": 0.1402267680638252, "grad_norm": 0.3120933771133423, "learning_rate": 4.940977515633653e-05, "loss": 1.1604, "step": 892 }, { "epoch": 0.14038397296075772, "grad_norm": 0.24279998242855072, "learning_rate": 4.940843981076934e-05, "loss": 1.3234, "step": 893 }, { "epoch": 0.14054117785769027, "grad_norm": 0.25406959652900696, "learning_rate": 4.940710297442751e-05, "loss": 1.3216, "step": 894 }, { "epoch": 0.14069838275462282, "grad_norm": 0.29678472876548767, "learning_rate": 4.940576464739269e-05, "loss": 1.2706, "step": 895 }, { "epoch": 0.14085558765155534, "grad_norm": 0.25185081362724304, "learning_rate": 4.9404424829746634e-05, "loss": 1.2456, "step": 896 }, { "epoch": 0.1410127925484879, "grad_norm": 0.2171952873468399, "learning_rate": 4.940308352157115e-05, "loss": 1.2943, "step": 897 }, { "epoch": 0.14116999744542041, "grad_norm": 0.21498677134513855, "learning_rate": 4.940174072294818e-05, "loss": 1.3466, "step": 898 }, { "epoch": 0.14132720234235296, "grad_norm": 0.2881999611854553, "learning_rate": 4.940039643395972e-05, "loss": 1.2322, "step": 899 }, { "epoch": 0.14148440723928551, "grad_norm": 0.2709384858608246, "learning_rate": 4.939905065468789e-05, "loss": 1.2228, "step": 900 }, { "epoch": 0.14164161213621804, "grad_norm": 0.2723088562488556, "learning_rate": 4.9397703385214875e-05, "loss": 1.1937, "step": 901 }, { "epoch": 0.1417988170331506, "grad_norm": 0.4296363294124603, "learning_rate": 4.939635462562297e-05, "loss": 1.2043, "step": 902 }, { "epoch": 0.1419560219300831, "grad_norm": 0.3255182206630707, "learning_rate": 4.939500437599454e-05, "loss": 1.1563, "step": 903 }, { "epoch": 0.14211322682701566, "grad_norm": 0.33772897720336914, "learning_rate": 4.939365263641206e-05, "loss": 1.3019, "step": 904 }, { "epoch": 0.1422704317239482, "grad_norm": 0.18991219997406006, "learning_rate": 4.93922994069581e-05, "loss": 1.3575, "step": 905 }, { "epoch": 0.14242763662088073, "grad_norm": 0.23950403928756714, "learning_rate": 4.939094468771529e-05, "loss": 1.2512, "step": 906 }, { "epoch": 0.14258484151781328, "grad_norm": 0.29783302545547485, "learning_rate": 4.938958847876637e-05, "loss": 1.3033, "step": 907 }, { "epoch": 0.14274204641474583, "grad_norm": 0.3168744742870331, "learning_rate": 4.93882307801942e-05, "loss": 1.1934, "step": 908 }, { "epoch": 0.14289925131167835, "grad_norm": 0.22578391432762146, "learning_rate": 4.9386871592081675e-05, "loss": 1.3307, "step": 909 }, { "epoch": 0.1430564562086109, "grad_norm": 0.32671108841896057, "learning_rate": 4.9385510914511824e-05, "loss": 1.2436, "step": 910 }, { "epoch": 0.14321366110554343, "grad_norm": 0.2524665296077728, "learning_rate": 4.938414874756774e-05, "loss": 1.2611, "step": 911 }, { "epoch": 0.14337086600247598, "grad_norm": 0.3576960563659668, "learning_rate": 4.9382785091332625e-05, "loss": 1.3721, "step": 912 }, { "epoch": 0.14352807089940853, "grad_norm": 0.2915900945663452, "learning_rate": 4.9381419945889776e-05, "loss": 1.3539, "step": 913 }, { "epoch": 0.14368527579634105, "grad_norm": 0.3168608844280243, "learning_rate": 4.938005331132256e-05, "loss": 1.224, "step": 914 }, { "epoch": 0.1438424806932736, "grad_norm": 0.24886426329612732, "learning_rate": 4.937868518771445e-05, "loss": 1.2299, "step": 915 }, { "epoch": 0.14399968559020612, "grad_norm": 0.26588642597198486, "learning_rate": 4.9377315575149e-05, "loss": 1.1947, "step": 916 }, { "epoch": 0.14415689048713867, "grad_norm": 0.28032201528549194, "learning_rate": 4.937594447370986e-05, "loss": 1.3756, "step": 917 }, { "epoch": 0.14431409538407122, "grad_norm": 0.3017072081565857, "learning_rate": 4.937457188348078e-05, "loss": 1.2723, "step": 918 }, { "epoch": 0.14447130028100374, "grad_norm": 0.2926197648048401, "learning_rate": 4.937319780454559e-05, "loss": 1.2716, "step": 919 }, { "epoch": 0.1446285051779363, "grad_norm": 0.24066713452339172, "learning_rate": 4.937182223698821e-05, "loss": 1.2828, "step": 920 }, { "epoch": 0.14478571007486885, "grad_norm": 0.30001577734947205, "learning_rate": 4.937044518089266e-05, "loss": 1.2407, "step": 921 }, { "epoch": 0.14494291497180137, "grad_norm": 0.25927406549453735, "learning_rate": 4.9369066636343044e-05, "loss": 1.3004, "step": 922 }, { "epoch": 0.14510011986873392, "grad_norm": 0.2542930543422699, "learning_rate": 4.936768660342355e-05, "loss": 1.3312, "step": 923 }, { "epoch": 0.14525732476566644, "grad_norm": 0.25233832001686096, "learning_rate": 4.936630508221847e-05, "loss": 1.1879, "step": 924 }, { "epoch": 0.145414529662599, "grad_norm": 0.22136953473091125, "learning_rate": 4.9364922072812185e-05, "loss": 1.2649, "step": 925 }, { "epoch": 0.14557173455953154, "grad_norm": 0.21759863197803497, "learning_rate": 4.936353757528916e-05, "loss": 1.2467, "step": 926 }, { "epoch": 0.14572893945646406, "grad_norm": 0.27614825963974, "learning_rate": 4.936215158973396e-05, "loss": 1.1901, "step": 927 }, { "epoch": 0.1458861443533966, "grad_norm": 0.2502923309803009, "learning_rate": 4.936076411623124e-05, "loss": 1.3358, "step": 928 }, { "epoch": 0.14604334925032914, "grad_norm": 0.2419285923242569, "learning_rate": 4.935937515486573e-05, "loss": 1.24, "step": 929 }, { "epoch": 0.14620055414726169, "grad_norm": 0.35315272212028503, "learning_rate": 4.935798470572226e-05, "loss": 1.2452, "step": 930 }, { "epoch": 0.14635775904419424, "grad_norm": 0.28915464878082275, "learning_rate": 4.935659276888577e-05, "loss": 1.3369, "step": 931 }, { "epoch": 0.14651496394112676, "grad_norm": 0.23898139595985413, "learning_rate": 4.9355199344441254e-05, "loss": 1.2328, "step": 932 }, { "epoch": 0.1466721688380593, "grad_norm": 0.25197896361351013, "learning_rate": 4.935380443247384e-05, "loss": 1.2826, "step": 933 }, { "epoch": 0.14682937373499186, "grad_norm": 0.26547369360923767, "learning_rate": 4.9352408033068695e-05, "loss": 1.2284, "step": 934 }, { "epoch": 0.14698657863192438, "grad_norm": 0.22031289339065552, "learning_rate": 4.935101014631114e-05, "loss": 1.2918, "step": 935 }, { "epoch": 0.14714378352885693, "grad_norm": 0.2603214979171753, "learning_rate": 4.9349610772286525e-05, "loss": 1.1767, "step": 936 }, { "epoch": 0.14730098842578945, "grad_norm": 0.29469192028045654, "learning_rate": 4.934820991108032e-05, "loss": 1.2845, "step": 937 }, { "epoch": 0.147458193322722, "grad_norm": 0.30825692415237427, "learning_rate": 4.934680756277811e-05, "loss": 1.1999, "step": 938 }, { "epoch": 0.14761539821965455, "grad_norm": 0.25342094898223877, "learning_rate": 4.934540372746552e-05, "loss": 1.2285, "step": 939 }, { "epoch": 0.14777260311658708, "grad_norm": 0.26036733388900757, "learning_rate": 4.9343998405228295e-05, "loss": 1.2367, "step": 940 }, { "epoch": 0.14792980801351963, "grad_norm": 0.27401411533355713, "learning_rate": 4.934259159615228e-05, "loss": 1.1985, "step": 941 }, { "epoch": 0.14808701291045215, "grad_norm": 0.23039095103740692, "learning_rate": 4.934118330032338e-05, "loss": 1.2649, "step": 942 }, { "epoch": 0.1482442178073847, "grad_norm": 0.29547953605651855, "learning_rate": 4.933977351782761e-05, "loss": 1.1345, "step": 943 }, { "epoch": 0.14840142270431725, "grad_norm": 0.22598884999752045, "learning_rate": 4.933836224875109e-05, "loss": 1.2965, "step": 944 }, { "epoch": 0.14855862760124977, "grad_norm": 0.31008240580558777, "learning_rate": 4.9336949493180006e-05, "loss": 1.1144, "step": 945 }, { "epoch": 0.14871583249818232, "grad_norm": 0.28397658467292786, "learning_rate": 4.9335535251200636e-05, "loss": 1.266, "step": 946 }, { "epoch": 0.14887303739511487, "grad_norm": 0.2284776121377945, "learning_rate": 4.933411952289937e-05, "loss": 1.2164, "step": 947 }, { "epoch": 0.1490302422920474, "grad_norm": 0.2107551246881485, "learning_rate": 4.9332702308362665e-05, "loss": 1.2719, "step": 948 }, { "epoch": 0.14918744718897994, "grad_norm": 0.26652616262435913, "learning_rate": 4.933128360767709e-05, "loss": 1.2304, "step": 949 }, { "epoch": 0.14934465208591247, "grad_norm": 0.22624680399894714, "learning_rate": 4.932986342092928e-05, "loss": 1.2999, "step": 950 }, { "epoch": 0.14950185698284502, "grad_norm": 0.20410288870334625, "learning_rate": 4.932844174820598e-05, "loss": 1.2269, "step": 951 }, { "epoch": 0.14965906187977757, "grad_norm": 0.24987919628620148, "learning_rate": 4.932701858959403e-05, "loss": 1.3042, "step": 952 }, { "epoch": 0.1498162667767101, "grad_norm": 0.191947340965271, "learning_rate": 4.932559394518033e-05, "loss": 1.2803, "step": 953 }, { "epoch": 0.14997347167364264, "grad_norm": 0.3396085798740387, "learning_rate": 4.932416781505191e-05, "loss": 1.2014, "step": 954 }, { "epoch": 0.15013067657057516, "grad_norm": 0.22375932335853577, "learning_rate": 4.932274019929587e-05, "loss": 1.242, "step": 955 }, { "epoch": 0.1502878814675077, "grad_norm": 0.281097412109375, "learning_rate": 4.93213110979994e-05, "loss": 1.2742, "step": 956 }, { "epoch": 0.15044508636444026, "grad_norm": 0.24049919843673706, "learning_rate": 4.931988051124979e-05, "loss": 1.3166, "step": 957 }, { "epoch": 0.15060229126137278, "grad_norm": 0.24433936178684235, "learning_rate": 4.93184484391344e-05, "loss": 1.2933, "step": 958 }, { "epoch": 0.15075949615830533, "grad_norm": 0.3671477138996124, "learning_rate": 4.9317014881740706e-05, "loss": 1.1731, "step": 959 }, { "epoch": 0.15091670105523788, "grad_norm": 0.22575189173221588, "learning_rate": 4.931557983915627e-05, "loss": 1.2509, "step": 960 }, { "epoch": 0.15091670105523788, "eval_loss": 1.2455451488494873, "eval_runtime": 2308.563, "eval_samples_per_second": 4.01, "eval_steps_per_second": 2.005, "step": 960 }, { "epoch": 0.1510739059521704, "grad_norm": 0.25838157534599304, "learning_rate": 4.931414331146873e-05, "loss": 1.3554, "step": 961 }, { "epoch": 0.15123111084910296, "grad_norm": 0.3163435459136963, "learning_rate": 4.931270529876583e-05, "loss": 1.3133, "step": 962 }, { "epoch": 0.15138831574603548, "grad_norm": 0.30024880170822144, "learning_rate": 4.9311265801135384e-05, "loss": 1.2303, "step": 963 }, { "epoch": 0.15154552064296803, "grad_norm": 0.3424816429615021, "learning_rate": 4.9309824818665325e-05, "loss": 1.1929, "step": 964 }, { "epoch": 0.15170272553990058, "grad_norm": 0.27401861548423767, "learning_rate": 4.930838235144366e-05, "loss": 1.2198, "step": 965 }, { "epoch": 0.1518599304368331, "grad_norm": 0.24110247194766998, "learning_rate": 4.930693839955848e-05, "loss": 1.2381, "step": 966 }, { "epoch": 0.15201713533376565, "grad_norm": 0.159100741147995, "learning_rate": 4.9305492963098e-05, "loss": 1.2666, "step": 967 }, { "epoch": 0.15217434023069817, "grad_norm": 0.23810729384422302, "learning_rate": 4.9304046042150474e-05, "loss": 1.2592, "step": 968 }, { "epoch": 0.15233154512763072, "grad_norm": 0.19887159764766693, "learning_rate": 4.930259763680429e-05, "loss": 1.3179, "step": 969 }, { "epoch": 0.15248875002456327, "grad_norm": 0.22060149908065796, "learning_rate": 4.930114774714791e-05, "loss": 1.2712, "step": 970 }, { "epoch": 0.1526459549214958, "grad_norm": 0.2443406730890274, "learning_rate": 4.929969637326989e-05, "loss": 1.2021, "step": 971 }, { "epoch": 0.15280315981842835, "grad_norm": 0.2185499370098114, "learning_rate": 4.9298243515258855e-05, "loss": 1.3017, "step": 972 }, { "epoch": 0.1529603647153609, "grad_norm": 0.23159849643707275, "learning_rate": 4.929678917320357e-05, "loss": 1.2122, "step": 973 }, { "epoch": 0.15311756961229342, "grad_norm": 0.3281627595424652, "learning_rate": 4.929533334719284e-05, "loss": 1.1151, "step": 974 }, { "epoch": 0.15327477450922597, "grad_norm": 0.24456332623958588, "learning_rate": 4.929387603731558e-05, "loss": 1.2107, "step": 975 }, { "epoch": 0.1534319794061585, "grad_norm": 0.3623602092266083, "learning_rate": 4.9292417243660814e-05, "loss": 1.3041, "step": 976 }, { "epoch": 0.15358918430309104, "grad_norm": 0.2319325953722, "learning_rate": 4.929095696631763e-05, "loss": 1.2331, "step": 977 }, { "epoch": 0.1537463892000236, "grad_norm": 0.324660986661911, "learning_rate": 4.92894952053752e-05, "loss": 1.2511, "step": 978 }, { "epoch": 0.15390359409695611, "grad_norm": 0.23866824805736542, "learning_rate": 4.9288031960922834e-05, "loss": 1.2709, "step": 979 }, { "epoch": 0.15406079899388866, "grad_norm": 0.21922123432159424, "learning_rate": 4.928656723304989e-05, "loss": 1.3108, "step": 980 }, { "epoch": 0.1542180038908212, "grad_norm": 0.2919687032699585, "learning_rate": 4.92851010218458e-05, "loss": 1.0465, "step": 981 }, { "epoch": 0.15437520878775374, "grad_norm": 0.26380711793899536, "learning_rate": 4.9283633327400156e-05, "loss": 1.2066, "step": 982 }, { "epoch": 0.1545324136846863, "grad_norm": 0.2607104182243347, "learning_rate": 4.9282164149802576e-05, "loss": 1.1227, "step": 983 }, { "epoch": 0.1546896185816188, "grad_norm": 0.32302606105804443, "learning_rate": 4.92806934891428e-05, "loss": 1.2066, "step": 984 }, { "epoch": 0.15484682347855136, "grad_norm": 0.28476232290267944, "learning_rate": 4.927922134551065e-05, "loss": 1.1447, "step": 985 }, { "epoch": 0.1550040283754839, "grad_norm": 0.253738671541214, "learning_rate": 4.9277747718996036e-05, "loss": 1.2438, "step": 986 }, { "epoch": 0.15516123327241643, "grad_norm": 0.29659610986709595, "learning_rate": 4.927627260968896e-05, "loss": 1.1946, "step": 987 }, { "epoch": 0.15531843816934898, "grad_norm": 0.27436473965644836, "learning_rate": 4.927479601767952e-05, "loss": 1.2783, "step": 988 }, { "epoch": 0.1554756430662815, "grad_norm": 0.21257497370243073, "learning_rate": 4.9273317943057896e-05, "loss": 1.204, "step": 989 }, { "epoch": 0.15563284796321405, "grad_norm": 0.2546120584011078, "learning_rate": 4.927183838591437e-05, "loss": 1.2224, "step": 990 }, { "epoch": 0.1557900528601466, "grad_norm": 0.2336018979549408, "learning_rate": 4.92703573463393e-05, "loss": 1.2903, "step": 991 }, { "epoch": 0.15594725775707913, "grad_norm": 0.2048567533493042, "learning_rate": 4.926887482442315e-05, "loss": 1.2529, "step": 992 }, { "epoch": 0.15610446265401168, "grad_norm": 0.20950450003147125, "learning_rate": 4.926739082025646e-05, "loss": 1.4247, "step": 993 }, { "epoch": 0.1562616675509442, "grad_norm": 0.22965438663959503, "learning_rate": 4.926590533392987e-05, "loss": 1.2573, "step": 994 }, { "epoch": 0.15641887244787675, "grad_norm": 0.25654879212379456, "learning_rate": 4.9264418365534105e-05, "loss": 1.2115, "step": 995 }, { "epoch": 0.1565760773448093, "grad_norm": 0.26419493556022644, "learning_rate": 4.9262929915159995e-05, "loss": 1.2994, "step": 996 }, { "epoch": 0.15673328224174182, "grad_norm": 0.3215327560901642, "learning_rate": 4.926143998289843e-05, "loss": 1.2268, "step": 997 }, { "epoch": 0.15689048713867437, "grad_norm": 0.3190024495124817, "learning_rate": 4.925994856884042e-05, "loss": 1.2747, "step": 998 }, { "epoch": 0.15704769203560692, "grad_norm": 0.26742681860923767, "learning_rate": 4.9258455673077065e-05, "loss": 1.2791, "step": 999 }, { "epoch": 0.15720489693253945, "grad_norm": 0.21026930212974548, "learning_rate": 4.925696129569953e-05, "loss": 1.3694, "step": 1000 }, { "epoch": 0.157362101829472, "grad_norm": 0.2838928699493408, "learning_rate": 4.925546543679909e-05, "loss": 1.2589, "step": 1001 }, { "epoch": 0.15751930672640452, "grad_norm": 0.3007844090461731, "learning_rate": 4.9253968096467104e-05, "loss": 1.2488, "step": 1002 }, { "epoch": 0.15767651162333707, "grad_norm": 0.2473154515028, "learning_rate": 4.925246927479503e-05, "loss": 1.2288, "step": 1003 }, { "epoch": 0.15783371652026962, "grad_norm": 0.2360457479953766, "learning_rate": 4.925096897187441e-05, "loss": 1.3148, "step": 1004 }, { "epoch": 0.15799092141720214, "grad_norm": 0.3671962320804596, "learning_rate": 4.924946718779687e-05, "loss": 1.1999, "step": 1005 }, { "epoch": 0.1581481263141347, "grad_norm": 0.28391456604003906, "learning_rate": 4.924796392265414e-05, "loss": 1.2296, "step": 1006 }, { "epoch": 0.1583053312110672, "grad_norm": 0.2550790011882782, "learning_rate": 4.924645917653802e-05, "loss": 1.1923, "step": 1007 }, { "epoch": 0.15846253610799976, "grad_norm": 0.301740825176239, "learning_rate": 4.924495294954044e-05, "loss": 1.2363, "step": 1008 }, { "epoch": 0.1586197410049323, "grad_norm": 0.2608097195625305, "learning_rate": 4.9243445241753374e-05, "loss": 1.298, "step": 1009 }, { "epoch": 0.15877694590186484, "grad_norm": 0.31052571535110474, "learning_rate": 4.924193605326891e-05, "loss": 1.3037, "step": 1010 }, { "epoch": 0.15893415079879739, "grad_norm": 0.2420274317264557, "learning_rate": 4.924042538417923e-05, "loss": 1.2541, "step": 1011 }, { "epoch": 0.15909135569572994, "grad_norm": 0.21056459844112396, "learning_rate": 4.92389132345766e-05, "loss": 1.3229, "step": 1012 }, { "epoch": 0.15924856059266246, "grad_norm": 0.3283076286315918, "learning_rate": 4.923739960455337e-05, "loss": 1.1036, "step": 1013 }, { "epoch": 0.159405765489595, "grad_norm": 0.2461448460817337, "learning_rate": 4.9235884494201987e-05, "loss": 1.2503, "step": 1014 }, { "epoch": 0.15956297038652753, "grad_norm": 0.2919829189777374, "learning_rate": 4.923436790361499e-05, "loss": 1.2108, "step": 1015 }, { "epoch": 0.15972017528346008, "grad_norm": 0.22029368579387665, "learning_rate": 4.923284983288501e-05, "loss": 1.2369, "step": 1016 }, { "epoch": 0.15987738018039263, "grad_norm": 0.27863630652427673, "learning_rate": 4.9231330282104756e-05, "loss": 1.2555, "step": 1017 }, { "epoch": 0.16003458507732515, "grad_norm": 0.28123825788497925, "learning_rate": 4.9229809251367055e-05, "loss": 1.1519, "step": 1018 }, { "epoch": 0.1601917899742577, "grad_norm": 0.231231227517128, "learning_rate": 4.922828674076478e-05, "loss": 1.3488, "step": 1019 }, { "epoch": 0.16034899487119023, "grad_norm": 0.2704828977584839, "learning_rate": 4.9226762750390944e-05, "loss": 1.1102, "step": 1020 }, { "epoch": 0.16050619976812278, "grad_norm": 0.17970627546310425, "learning_rate": 4.922523728033861e-05, "loss": 1.2237, "step": 1021 }, { "epoch": 0.16066340466505533, "grad_norm": 0.22386445105075836, "learning_rate": 4.9223710330700956e-05, "loss": 1.2564, "step": 1022 }, { "epoch": 0.16082060956198785, "grad_norm": 0.21347114443778992, "learning_rate": 4.922218190157124e-05, "loss": 1.2433, "step": 1023 }, { "epoch": 0.1609778144589204, "grad_norm": 0.23873676359653473, "learning_rate": 4.9220651993042813e-05, "loss": 1.2018, "step": 1024 }, { "epoch": 0.16113501935585295, "grad_norm": 0.26954975724220276, "learning_rate": 4.921912060520912e-05, "loss": 1.2118, "step": 1025 }, { "epoch": 0.16129222425278547, "grad_norm": 0.3023718595504761, "learning_rate": 4.9217587738163686e-05, "loss": 1.2717, "step": 1026 }, { "epoch": 0.16144942914971802, "grad_norm": 0.31107062101364136, "learning_rate": 4.921605339200013e-05, "loss": 1.2017, "step": 1027 }, { "epoch": 0.16160663404665054, "grad_norm": 0.2795855402946472, "learning_rate": 4.921451756681217e-05, "loss": 1.3492, "step": 1028 }, { "epoch": 0.1617638389435831, "grad_norm": 0.24515774846076965, "learning_rate": 4.921298026269361e-05, "loss": 1.2763, "step": 1029 }, { "epoch": 0.16192104384051564, "grad_norm": 0.2603877782821655, "learning_rate": 4.921144147973834e-05, "loss": 1.3196, "step": 1030 }, { "epoch": 0.16207824873744817, "grad_norm": 0.265697181224823, "learning_rate": 4.9209901218040335e-05, "loss": 1.3067, "step": 1031 }, { "epoch": 0.16223545363438072, "grad_norm": 0.2554314136505127, "learning_rate": 4.9208359477693686e-05, "loss": 1.2347, "step": 1032 }, { "epoch": 0.16239265853131324, "grad_norm": 0.3272973597049713, "learning_rate": 4.920681625879254e-05, "loss": 1.2104, "step": 1033 }, { "epoch": 0.1625498634282458, "grad_norm": 0.21259547770023346, "learning_rate": 4.9205271561431166e-05, "loss": 1.2857, "step": 1034 }, { "epoch": 0.16270706832517834, "grad_norm": 0.2505529820919037, "learning_rate": 4.92037253857039e-05, "loss": 1.1988, "step": 1035 }, { "epoch": 0.16286427322211086, "grad_norm": 0.2369750738143921, "learning_rate": 4.920217773170517e-05, "loss": 1.2384, "step": 1036 }, { "epoch": 0.1630214781190434, "grad_norm": 0.26577889919281006, "learning_rate": 4.920062859952951e-05, "loss": 1.1521, "step": 1037 }, { "epoch": 0.16317868301597593, "grad_norm": 0.2224215418100357, "learning_rate": 4.919907798927153e-05, "loss": 1.3065, "step": 1038 }, { "epoch": 0.16333588791290848, "grad_norm": 0.1882622092962265, "learning_rate": 4.9197525901025944e-05, "loss": 1.2472, "step": 1039 }, { "epoch": 0.16349309280984103, "grad_norm": 0.25392916798591614, "learning_rate": 4.919597233488754e-05, "loss": 1.3387, "step": 1040 }, { "epoch": 0.16365029770677356, "grad_norm": 0.3185995817184448, "learning_rate": 4.91944172909512e-05, "loss": 1.161, "step": 1041 }, { "epoch": 0.1638075026037061, "grad_norm": 0.26924118399620056, "learning_rate": 4.919286076931191e-05, "loss": 1.0659, "step": 1042 }, { "epoch": 0.16396470750063866, "grad_norm": 0.2224770486354828, "learning_rate": 4.919130277006473e-05, "loss": 1.2303, "step": 1043 }, { "epoch": 0.16412191239757118, "grad_norm": 0.24008037149906158, "learning_rate": 4.918974329330482e-05, "loss": 1.2762, "step": 1044 }, { "epoch": 0.16427911729450373, "grad_norm": 0.2728358209133148, "learning_rate": 4.918818233912742e-05, "loss": 1.1582, "step": 1045 }, { "epoch": 0.16443632219143625, "grad_norm": 0.24911800026893616, "learning_rate": 4.918661990762788e-05, "loss": 1.2155, "step": 1046 }, { "epoch": 0.1645935270883688, "grad_norm": 0.2444472759962082, "learning_rate": 4.918505599890162e-05, "loss": 1.2838, "step": 1047 }, { "epoch": 0.16475073198530135, "grad_norm": 0.2379113882780075, "learning_rate": 4.918349061304416e-05, "loss": 1.3043, "step": 1048 }, { "epoch": 0.16490793688223387, "grad_norm": 0.3085183799266815, "learning_rate": 4.9181923750151095e-05, "loss": 1.2568, "step": 1049 }, { "epoch": 0.16506514177916642, "grad_norm": 0.2629674971103668, "learning_rate": 4.918035541031814e-05, "loss": 1.2171, "step": 1050 }, { "epoch": 0.16522234667609895, "grad_norm": 0.2707282602787018, "learning_rate": 4.917878559364107e-05, "loss": 1.1597, "step": 1051 }, { "epoch": 0.1653795515730315, "grad_norm": 0.27305370569229126, "learning_rate": 4.9177214300215784e-05, "loss": 1.36, "step": 1052 }, { "epoch": 0.16553675646996405, "grad_norm": 0.20558474957942963, "learning_rate": 4.9175641530138226e-05, "loss": 1.2225, "step": 1053 }, { "epoch": 0.16569396136689657, "grad_norm": 0.23680521547794342, "learning_rate": 4.917406728350448e-05, "loss": 1.2149, "step": 1054 }, { "epoch": 0.16585116626382912, "grad_norm": 0.2101297229528427, "learning_rate": 4.917249156041066e-05, "loss": 1.2313, "step": 1055 }, { "epoch": 0.16600837116076167, "grad_norm": 0.2601447105407715, "learning_rate": 4.917091436095304e-05, "loss": 1.1907, "step": 1056 }, { "epoch": 0.1661655760576942, "grad_norm": 0.2113189995288849, "learning_rate": 4.916933568522793e-05, "loss": 1.2852, "step": 1057 }, { "epoch": 0.16632278095462674, "grad_norm": 0.21135227382183075, "learning_rate": 4.916775553333176e-05, "loss": 1.2852, "step": 1058 }, { "epoch": 0.16647998585155926, "grad_norm": 0.2743116617202759, "learning_rate": 4.916617390536102e-05, "loss": 1.2032, "step": 1059 }, { "epoch": 0.16663719074849181, "grad_norm": 0.2520056664943695, "learning_rate": 4.916459080141234e-05, "loss": 1.3038, "step": 1060 }, { "epoch": 0.16679439564542436, "grad_norm": 0.21614307165145874, "learning_rate": 4.916300622158239e-05, "loss": 1.2216, "step": 1061 }, { "epoch": 0.1669516005423569, "grad_norm": 0.28615444898605347, "learning_rate": 4.9161420165967956e-05, "loss": 1.2162, "step": 1062 }, { "epoch": 0.16710880543928944, "grad_norm": 0.33522137999534607, "learning_rate": 4.91598326346659e-05, "loss": 1.1458, "step": 1063 }, { "epoch": 0.16726601033622196, "grad_norm": 0.30597957968711853, "learning_rate": 4.9158243627773194e-05, "loss": 1.2623, "step": 1064 }, { "epoch": 0.1674232152331545, "grad_norm": 0.2643260061740875, "learning_rate": 4.915665314538688e-05, "loss": 1.2092, "step": 1065 }, { "epoch": 0.16758042013008706, "grad_norm": 0.3190208673477173, "learning_rate": 4.91550611876041e-05, "loss": 1.0543, "step": 1066 }, { "epoch": 0.16773762502701958, "grad_norm": 0.3194860816001892, "learning_rate": 4.9153467754522095e-05, "loss": 1.1393, "step": 1067 }, { "epoch": 0.16789482992395213, "grad_norm": 0.2031661570072174, "learning_rate": 4.915187284623817e-05, "loss": 1.2136, "step": 1068 }, { "epoch": 0.16805203482088468, "grad_norm": 0.2047189325094223, "learning_rate": 4.915027646284974e-05, "loss": 1.2962, "step": 1069 }, { "epoch": 0.1682092397178172, "grad_norm": 0.31565096974372864, "learning_rate": 4.9148678604454325e-05, "loss": 1.1979, "step": 1070 }, { "epoch": 0.16836644461474976, "grad_norm": 0.23875312507152557, "learning_rate": 4.914707927114949e-05, "loss": 1.3002, "step": 1071 }, { "epoch": 0.16852364951168228, "grad_norm": 0.24424909055233002, "learning_rate": 4.9145478463032924e-05, "loss": 1.1491, "step": 1072 }, { "epoch": 0.16868085440861483, "grad_norm": 0.28339752554893494, "learning_rate": 4.91438761802024e-05, "loss": 1.2398, "step": 1073 }, { "epoch": 0.16883805930554738, "grad_norm": 0.2435888648033142, "learning_rate": 4.9142272422755786e-05, "loss": 1.3292, "step": 1074 }, { "epoch": 0.1689952642024799, "grad_norm": 0.21540984511375427, "learning_rate": 4.9140667190791026e-05, "loss": 1.3665, "step": 1075 }, { "epoch": 0.16915246909941245, "grad_norm": 0.2556820809841156, "learning_rate": 4.913906048440617e-05, "loss": 1.2557, "step": 1076 }, { "epoch": 0.16930967399634497, "grad_norm": 0.23769475519657135, "learning_rate": 4.913745230369934e-05, "loss": 1.2163, "step": 1077 }, { "epoch": 0.16946687889327752, "grad_norm": 0.31578120589256287, "learning_rate": 4.913584264876875e-05, "loss": 1.3176, "step": 1078 }, { "epoch": 0.16962408379021007, "grad_norm": 0.22278232872486115, "learning_rate": 4.913423151971273e-05, "loss": 1.2206, "step": 1079 }, { "epoch": 0.1697812886871426, "grad_norm": 0.31810736656188965, "learning_rate": 4.913261891662967e-05, "loss": 1.2254, "step": 1080 }, { "epoch": 0.16993849358407515, "grad_norm": 0.22623823583126068, "learning_rate": 4.913100483961807e-05, "loss": 1.208, "step": 1081 }, { "epoch": 0.1700956984810077, "grad_norm": 0.27108776569366455, "learning_rate": 4.9129389288776504e-05, "loss": 1.2989, "step": 1082 }, { "epoch": 0.17025290337794022, "grad_norm": 0.194550558924675, "learning_rate": 4.912777226420365e-05, "loss": 1.3849, "step": 1083 }, { "epoch": 0.17041010827487277, "grad_norm": 0.20856213569641113, "learning_rate": 4.912615376599826e-05, "loss": 1.2736, "step": 1084 }, { "epoch": 0.1705673131718053, "grad_norm": 0.19355203211307526, "learning_rate": 4.91245337942592e-05, "loss": 1.256, "step": 1085 }, { "epoch": 0.17072451806873784, "grad_norm": 0.21832303702831268, "learning_rate": 4.9122912349085395e-05, "loss": 1.1987, "step": 1086 }, { "epoch": 0.1708817229656704, "grad_norm": 0.22642913460731506, "learning_rate": 4.912128943057589e-05, "loss": 1.3043, "step": 1087 }, { "epoch": 0.1710389278626029, "grad_norm": 0.22713351249694824, "learning_rate": 4.911966503882981e-05, "loss": 1.1951, "step": 1088 }, { "epoch": 0.17119613275953546, "grad_norm": 0.29707837104797363, "learning_rate": 4.911803917394634e-05, "loss": 1.2674, "step": 1089 }, { "epoch": 0.17135333765646799, "grad_norm": 0.27017104625701904, "learning_rate": 4.911641183602481e-05, "loss": 1.1727, "step": 1090 }, { "epoch": 0.17151054255340054, "grad_norm": 0.23867738246917725, "learning_rate": 4.911478302516461e-05, "loss": 1.2061, "step": 1091 }, { "epoch": 0.17166774745033309, "grad_norm": 0.26270055770874023, "learning_rate": 4.911315274146521e-05, "loss": 1.2735, "step": 1092 }, { "epoch": 0.1718249523472656, "grad_norm": 0.25167661905288696, "learning_rate": 4.911152098502617e-05, "loss": 1.2643, "step": 1093 }, { "epoch": 0.17198215724419816, "grad_norm": 0.46500882506370544, "learning_rate": 4.9109887755947185e-05, "loss": 1.1743, "step": 1094 }, { "epoch": 0.1721393621411307, "grad_norm": 0.2189512848854065, "learning_rate": 4.910825305432798e-05, "loss": 1.1232, "step": 1095 }, { "epoch": 0.17229656703806323, "grad_norm": 0.21188926696777344, "learning_rate": 4.9106616880268405e-05, "loss": 1.2031, "step": 1096 }, { "epoch": 0.17245377193499578, "grad_norm": 0.2135314792394638, "learning_rate": 4.910497923386839e-05, "loss": 1.2547, "step": 1097 }, { "epoch": 0.1726109768319283, "grad_norm": 0.27527931332588196, "learning_rate": 4.910334011522796e-05, "loss": 1.119, "step": 1098 }, { "epoch": 0.17276818172886085, "grad_norm": 0.23516559600830078, "learning_rate": 4.910169952444722e-05, "loss": 1.3006, "step": 1099 }, { "epoch": 0.1729253866257934, "grad_norm": 0.23166057467460632, "learning_rate": 4.910005746162637e-05, "loss": 1.281, "step": 1100 }, { "epoch": 0.17308259152272593, "grad_norm": 0.22281822562217712, "learning_rate": 4.9098413926865714e-05, "loss": 1.1526, "step": 1101 }, { "epoch": 0.17323979641965848, "grad_norm": 0.32062655687332153, "learning_rate": 4.909676892026563e-05, "loss": 1.2388, "step": 1102 }, { "epoch": 0.173397001316591, "grad_norm": 0.29868969321250916, "learning_rate": 4.909512244192657e-05, "loss": 1.2303, "step": 1103 }, { "epoch": 0.17355420621352355, "grad_norm": 0.23143270611763, "learning_rate": 4.90934744919491e-05, "loss": 1.2137, "step": 1104 }, { "epoch": 0.1737114111104561, "grad_norm": 0.2830474376678467, "learning_rate": 4.909182507043389e-05, "loss": 1.2178, "step": 1105 }, { "epoch": 0.17386861600738862, "grad_norm": 0.22427986562252045, "learning_rate": 4.909017417748166e-05, "loss": 1.3153, "step": 1106 }, { "epoch": 0.17402582090432117, "grad_norm": 0.2587423622608185, "learning_rate": 4.908852181319326e-05, "loss": 1.2669, "step": 1107 }, { "epoch": 0.17418302580125372, "grad_norm": 0.24905993044376373, "learning_rate": 4.9086867977669594e-05, "loss": 1.2549, "step": 1108 }, { "epoch": 0.17434023069818624, "grad_norm": 0.26877379417419434, "learning_rate": 4.908521267101167e-05, "loss": 1.2694, "step": 1109 }, { "epoch": 0.1744974355951188, "grad_norm": 0.2501152753829956, "learning_rate": 4.9083555893320596e-05, "loss": 1.2241, "step": 1110 }, { "epoch": 0.17465464049205132, "grad_norm": 0.27815014123916626, "learning_rate": 4.908189764469757e-05, "loss": 1.2152, "step": 1111 }, { "epoch": 0.17481184538898387, "grad_norm": 0.32891881465911865, "learning_rate": 4.9080237925243856e-05, "loss": 1.2638, "step": 1112 }, { "epoch": 0.17496905028591642, "grad_norm": 0.2137015461921692, "learning_rate": 4.9078576735060825e-05, "loss": 1.2041, "step": 1113 }, { "epoch": 0.17512625518284894, "grad_norm": 0.17862486839294434, "learning_rate": 4.907691407424995e-05, "loss": 1.3349, "step": 1114 }, { "epoch": 0.1752834600797815, "grad_norm": 0.25791284441947937, "learning_rate": 4.907524994291276e-05, "loss": 1.2337, "step": 1115 }, { "epoch": 0.175440664976714, "grad_norm": 0.24266491830348969, "learning_rate": 4.90735843411509e-05, "loss": 1.0939, "step": 1116 }, { "epoch": 0.17559786987364656, "grad_norm": 0.2618250250816345, "learning_rate": 4.9071917269066114e-05, "loss": 1.2855, "step": 1117 }, { "epoch": 0.1757550747705791, "grad_norm": 0.2477390021085739, "learning_rate": 4.9070248726760206e-05, "loss": 1.1675, "step": 1118 }, { "epoch": 0.17591227966751163, "grad_norm": 0.29105502367019653, "learning_rate": 4.906857871433508e-05, "loss": 1.183, "step": 1119 }, { "epoch": 0.17606948456444418, "grad_norm": 0.2923283874988556, "learning_rate": 4.906690723189275e-05, "loss": 1.1386, "step": 1120 }, { "epoch": 0.17606948456444418, "eval_loss": 1.219694972038269, "eval_runtime": 2300.2931, "eval_samples_per_second": 4.025, "eval_steps_per_second": 2.012, "step": 1120 }, { "epoch": 0.17622668946137673, "grad_norm": 0.3278633952140808, "learning_rate": 4.906523427953529e-05, "loss": 1.1738, "step": 1121 }, { "epoch": 0.17638389435830926, "grad_norm": 0.31546783447265625, "learning_rate": 4.906355985736488e-05, "loss": 1.0894, "step": 1122 }, { "epoch": 0.1765410992552418, "grad_norm": 0.28350481390953064, "learning_rate": 4.906188396548379e-05, "loss": 1.2774, "step": 1123 }, { "epoch": 0.17669830415217433, "grad_norm": 0.21374982595443726, "learning_rate": 4.9060206603994385e-05, "loss": 1.37, "step": 1124 }, { "epoch": 0.17685550904910688, "grad_norm": 0.2343566119670868, "learning_rate": 4.9058527772999095e-05, "loss": 1.2065, "step": 1125 }, { "epoch": 0.17701271394603943, "grad_norm": 0.29571887850761414, "learning_rate": 4.905684747260047e-05, "loss": 1.1967, "step": 1126 }, { "epoch": 0.17716991884297195, "grad_norm": 0.2689303457736969, "learning_rate": 4.905516570290113e-05, "loss": 1.2337, "step": 1127 }, { "epoch": 0.1773271237399045, "grad_norm": 0.22743673622608185, "learning_rate": 4.90534824640038e-05, "loss": 1.1673, "step": 1128 }, { "epoch": 0.17748432863683702, "grad_norm": 0.36731019616127014, "learning_rate": 4.905179775601126e-05, "loss": 1.1397, "step": 1129 }, { "epoch": 0.17764153353376957, "grad_norm": 0.2571149468421936, "learning_rate": 4.905011157902645e-05, "loss": 1.1166, "step": 1130 }, { "epoch": 0.17779873843070212, "grad_norm": 0.2615256905555725, "learning_rate": 4.904842393315231e-05, "loss": 1.2095, "step": 1131 }, { "epoch": 0.17795594332763465, "grad_norm": 0.28919360041618347, "learning_rate": 4.904673481849194e-05, "loss": 1.0976, "step": 1132 }, { "epoch": 0.1781131482245672, "grad_norm": 0.3858489990234375, "learning_rate": 4.90450442351485e-05, "loss": 1.1934, "step": 1133 }, { "epoch": 0.17827035312149975, "grad_norm": 0.2448245733976364, "learning_rate": 4.904335218322524e-05, "loss": 1.1604, "step": 1134 }, { "epoch": 0.17842755801843227, "grad_norm": 0.2626294195652008, "learning_rate": 4.9041658662825514e-05, "loss": 1.1301, "step": 1135 }, { "epoch": 0.17858476291536482, "grad_norm": 0.3016091287136078, "learning_rate": 4.903996367405275e-05, "loss": 1.2579, "step": 1136 }, { "epoch": 0.17874196781229734, "grad_norm": 0.28168612718582153, "learning_rate": 4.9038267217010455e-05, "loss": 1.1471, "step": 1137 }, { "epoch": 0.1788991727092299, "grad_norm": 0.29256439208984375, "learning_rate": 4.903656929180228e-05, "loss": 1.1598, "step": 1138 }, { "epoch": 0.17905637760616244, "grad_norm": 0.19786624610424042, "learning_rate": 4.9034869898531895e-05, "loss": 1.2115, "step": 1139 }, { "epoch": 0.17921358250309496, "grad_norm": 0.17216260731220245, "learning_rate": 4.9033169037303106e-05, "loss": 1.2471, "step": 1140 }, { "epoch": 0.17937078740002751, "grad_norm": 0.22571730613708496, "learning_rate": 4.9031466708219785e-05, "loss": 1.2226, "step": 1141 }, { "epoch": 0.17952799229696004, "grad_norm": 0.25510528683662415, "learning_rate": 4.9029762911385915e-05, "loss": 1.1428, "step": 1142 }, { "epoch": 0.1796851971938926, "grad_norm": 0.19014020264148712, "learning_rate": 4.902805764690556e-05, "loss": 1.2268, "step": 1143 }, { "epoch": 0.17984240209082514, "grad_norm": 0.25155729055404663, "learning_rate": 4.902635091488286e-05, "loss": 1.1943, "step": 1144 }, { "epoch": 0.17999960698775766, "grad_norm": 0.3109387159347534, "learning_rate": 4.902464271542206e-05, "loss": 1.176, "step": 1145 }, { "epoch": 0.1801568118846902, "grad_norm": 0.2269504815340042, "learning_rate": 4.9022933048627496e-05, "loss": 1.2166, "step": 1146 }, { "epoch": 0.18031401678162276, "grad_norm": 0.20270425081253052, "learning_rate": 4.902122191460358e-05, "loss": 1.235, "step": 1147 }, { "epoch": 0.18047122167855528, "grad_norm": 0.2519841194152832, "learning_rate": 4.901950931345481e-05, "loss": 1.2418, "step": 1148 }, { "epoch": 0.18062842657548783, "grad_norm": 0.1967516988515854, "learning_rate": 4.901779524528582e-05, "loss": 1.2979, "step": 1149 }, { "epoch": 0.18078563147242036, "grad_norm": 0.21120384335517883, "learning_rate": 4.901607971020127e-05, "loss": 1.1557, "step": 1150 }, { "epoch": 0.1809428363693529, "grad_norm": 0.31649792194366455, "learning_rate": 4.9014362708305944e-05, "loss": 1.3237, "step": 1151 }, { "epoch": 0.18110004126628546, "grad_norm": 0.24945318698883057, "learning_rate": 4.901264423970471e-05, "loss": 1.2099, "step": 1152 }, { "epoch": 0.18125724616321798, "grad_norm": 0.30652904510498047, "learning_rate": 4.901092430450254e-05, "loss": 1.1918, "step": 1153 }, { "epoch": 0.18141445106015053, "grad_norm": 0.2480253279209137, "learning_rate": 4.900920290280446e-05, "loss": 1.2675, "step": 1154 }, { "epoch": 0.18157165595708305, "grad_norm": 0.3034304976463318, "learning_rate": 4.900748003471561e-05, "loss": 1.2012, "step": 1155 }, { "epoch": 0.1817288608540156, "grad_norm": 0.2113679200410843, "learning_rate": 4.900575570034124e-05, "loss": 1.2824, "step": 1156 }, { "epoch": 0.18188606575094815, "grad_norm": 0.34726831316947937, "learning_rate": 4.9004029899786627e-05, "loss": 1.1426, "step": 1157 }, { "epoch": 0.18204327064788067, "grad_norm": 0.20344194769859314, "learning_rate": 4.900230263315722e-05, "loss": 1.2096, "step": 1158 }, { "epoch": 0.18220047554481322, "grad_norm": 0.28635072708129883, "learning_rate": 4.900057390055847e-05, "loss": 1.166, "step": 1159 }, { "epoch": 0.18235768044174577, "grad_norm": 0.21670344471931458, "learning_rate": 4.8998843702095995e-05, "loss": 1.2103, "step": 1160 }, { "epoch": 0.1825148853386783, "grad_norm": 0.31661516427993774, "learning_rate": 4.899711203787545e-05, "loss": 1.2345, "step": 1161 }, { "epoch": 0.18267209023561085, "grad_norm": 0.30255556106567383, "learning_rate": 4.899537890800261e-05, "loss": 1.2342, "step": 1162 }, { "epoch": 0.18282929513254337, "grad_norm": 0.23636944591999054, "learning_rate": 4.899364431258332e-05, "loss": 1.1685, "step": 1163 }, { "epoch": 0.18298650002947592, "grad_norm": 0.27452319860458374, "learning_rate": 4.8991908251723524e-05, "loss": 1.1263, "step": 1164 }, { "epoch": 0.18314370492640847, "grad_norm": 0.28636041283607483, "learning_rate": 4.899017072552926e-05, "loss": 1.1961, "step": 1165 }, { "epoch": 0.183300909823341, "grad_norm": 0.29220953583717346, "learning_rate": 4.8988431734106635e-05, "loss": 1.2414, "step": 1166 }, { "epoch": 0.18345811472027354, "grad_norm": 0.20738068222999573, "learning_rate": 4.898669127756188e-05, "loss": 1.1499, "step": 1167 }, { "epoch": 0.18361531961720606, "grad_norm": 0.19913551211357117, "learning_rate": 4.898494935600127e-05, "loss": 1.3538, "step": 1168 }, { "epoch": 0.1837725245141386, "grad_norm": 0.256979763507843, "learning_rate": 4.8983205969531234e-05, "loss": 1.1979, "step": 1169 }, { "epoch": 0.18392972941107116, "grad_norm": 0.26307129859924316, "learning_rate": 4.898146111825821e-05, "loss": 1.2054, "step": 1170 }, { "epoch": 0.18408693430800369, "grad_norm": 0.2451772540807724, "learning_rate": 4.897971480228879e-05, "loss": 1.1901, "step": 1171 }, { "epoch": 0.18424413920493624, "grad_norm": 0.3223975896835327, "learning_rate": 4.897796702172962e-05, "loss": 1.1825, "step": 1172 }, { "epoch": 0.18440134410186879, "grad_norm": 0.34991317987442017, "learning_rate": 4.897621777668746e-05, "loss": 1.1371, "step": 1173 }, { "epoch": 0.1845585489988013, "grad_norm": 0.2680002748966217, "learning_rate": 4.897446706726915e-05, "loss": 1.2179, "step": 1174 }, { "epoch": 0.18471575389573386, "grad_norm": 0.21509090065956116, "learning_rate": 4.897271489358159e-05, "loss": 1.1284, "step": 1175 }, { "epoch": 0.18487295879266638, "grad_norm": 0.20545831322669983, "learning_rate": 4.8970961255731826e-05, "loss": 1.2188, "step": 1176 }, { "epoch": 0.18503016368959893, "grad_norm": 0.23479585349559784, "learning_rate": 4.896920615382695e-05, "loss": 1.2947, "step": 1177 }, { "epoch": 0.18518736858653148, "grad_norm": 0.2880757749080658, "learning_rate": 4.896744958797417e-05, "loss": 1.1443, "step": 1178 }, { "epoch": 0.185344573483464, "grad_norm": 0.2431318610906601, "learning_rate": 4.8965691558280744e-05, "loss": 1.1123, "step": 1179 }, { "epoch": 0.18550177838039655, "grad_norm": 0.21252453327178955, "learning_rate": 4.896393206485407e-05, "loss": 1.326, "step": 1180 }, { "epoch": 0.18565898327732908, "grad_norm": 0.28821709752082825, "learning_rate": 4.8962171107801596e-05, "loss": 1.1508, "step": 1181 }, { "epoch": 0.18581618817426163, "grad_norm": 0.2636358141899109, "learning_rate": 4.8960408687230886e-05, "loss": 1.1061, "step": 1182 }, { "epoch": 0.18597339307119418, "grad_norm": 0.23121225833892822, "learning_rate": 4.895864480324957e-05, "loss": 1.2486, "step": 1183 }, { "epoch": 0.1861305979681267, "grad_norm": 0.29034245014190674, "learning_rate": 4.895687945596539e-05, "loss": 1.186, "step": 1184 }, { "epoch": 0.18628780286505925, "grad_norm": 0.3220363259315491, "learning_rate": 4.895511264548617e-05, "loss": 1.1727, "step": 1185 }, { "epoch": 0.1864450077619918, "grad_norm": 0.2863159477710724, "learning_rate": 4.89533443719198e-05, "loss": 1.1946, "step": 1186 }, { "epoch": 0.18660221265892432, "grad_norm": 0.27671483159065247, "learning_rate": 4.89515746353743e-05, "loss": 1.2271, "step": 1187 }, { "epoch": 0.18675941755585687, "grad_norm": 0.2535041570663452, "learning_rate": 4.894980343595775e-05, "loss": 1.2437, "step": 1188 }, { "epoch": 0.1869166224527894, "grad_norm": 0.34405645728111267, "learning_rate": 4.894803077377833e-05, "loss": 1.1397, "step": 1189 }, { "epoch": 0.18707382734972194, "grad_norm": 0.28299692273139954, "learning_rate": 4.8946256648944307e-05, "loss": 1.1215, "step": 1190 }, { "epoch": 0.1872310322466545, "grad_norm": 0.1962118297815323, "learning_rate": 4.8944481061564035e-05, "loss": 1.1908, "step": 1191 }, { "epoch": 0.18738823714358702, "grad_norm": 0.24563154578208923, "learning_rate": 4.894270401174597e-05, "loss": 1.2265, "step": 1192 }, { "epoch": 0.18754544204051957, "grad_norm": 0.22452424466609955, "learning_rate": 4.894092549959862e-05, "loss": 1.1673, "step": 1193 }, { "epoch": 0.1877026469374521, "grad_norm": 0.1847248673439026, "learning_rate": 4.8939145525230646e-05, "loss": 1.2706, "step": 1194 }, { "epoch": 0.18785985183438464, "grad_norm": 0.2578265964984894, "learning_rate": 4.893736408875075e-05, "loss": 1.2011, "step": 1195 }, { "epoch": 0.1880170567313172, "grad_norm": 0.2686786353588104, "learning_rate": 4.893558119026772e-05, "loss": 1.3191, "step": 1196 }, { "epoch": 0.1881742616282497, "grad_norm": 0.27492383122444153, "learning_rate": 4.893379682989047e-05, "loss": 1.1755, "step": 1197 }, { "epoch": 0.18833146652518226, "grad_norm": 0.2544412612915039, "learning_rate": 4.8932011007727965e-05, "loss": 1.1842, "step": 1198 }, { "epoch": 0.1884886714221148, "grad_norm": 0.24790935218334198, "learning_rate": 4.893022372388928e-05, "loss": 1.2408, "step": 1199 }, { "epoch": 0.18864587631904733, "grad_norm": 0.2788006067276001, "learning_rate": 4.892843497848358e-05, "loss": 1.2671, "step": 1200 }, { "epoch": 0.18880308121597988, "grad_norm": 0.2571476101875305, "learning_rate": 4.892664477162012e-05, "loss": 1.1894, "step": 1201 }, { "epoch": 0.1889602861129124, "grad_norm": 0.22788426280021667, "learning_rate": 4.892485310340822e-05, "loss": 1.2261, "step": 1202 }, { "epoch": 0.18911749100984496, "grad_norm": 0.2010507732629776, "learning_rate": 4.892305997395733e-05, "loss": 1.2399, "step": 1203 }, { "epoch": 0.1892746959067775, "grad_norm": 0.23946425318717957, "learning_rate": 4.892126538337696e-05, "loss": 1.2727, "step": 1204 }, { "epoch": 0.18943190080371003, "grad_norm": 0.2885929346084595, "learning_rate": 4.8919469331776714e-05, "loss": 1.2376, "step": 1205 }, { "epoch": 0.18958910570064258, "grad_norm": 0.31879860162734985, "learning_rate": 4.891767181926629e-05, "loss": 1.22, "step": 1206 }, { "epoch": 0.1897463105975751, "grad_norm": 0.2895459532737732, "learning_rate": 4.891587284595546e-05, "loss": 1.2387, "step": 1207 }, { "epoch": 0.18990351549450765, "grad_norm": 0.27507272362709045, "learning_rate": 4.891407241195412e-05, "loss": 1.1723, "step": 1208 }, { "epoch": 0.1900607203914402, "grad_norm": 0.26780039072036743, "learning_rate": 4.8912270517372224e-05, "loss": 1.1549, "step": 1209 }, { "epoch": 0.19021792528837272, "grad_norm": 0.1915176510810852, "learning_rate": 4.8910467162319826e-05, "loss": 1.109, "step": 1210 }, { "epoch": 0.19037513018530527, "grad_norm": 0.25054261088371277, "learning_rate": 4.8908662346907064e-05, "loss": 1.1197, "step": 1211 }, { "epoch": 0.19053233508223782, "grad_norm": 0.24239963293075562, "learning_rate": 4.8906856071244176e-05, "loss": 1.2614, "step": 1212 }, { "epoch": 0.19068953997917035, "grad_norm": 0.21543578803539276, "learning_rate": 4.890504833544147e-05, "loss": 1.3804, "step": 1213 }, { "epoch": 0.1908467448761029, "grad_norm": 0.2045610100030899, "learning_rate": 4.8903239139609376e-05, "loss": 1.2108, "step": 1214 }, { "epoch": 0.19100394977303542, "grad_norm": 0.2209930568933487, "learning_rate": 4.890142848385838e-05, "loss": 1.2329, "step": 1215 }, { "epoch": 0.19116115466996797, "grad_norm": 0.24921675026416779, "learning_rate": 4.889961636829906e-05, "loss": 1.2009, "step": 1216 }, { "epoch": 0.19131835956690052, "grad_norm": 0.2356979250907898, "learning_rate": 4.8897802793042115e-05, "loss": 1.211, "step": 1217 }, { "epoch": 0.19147556446383304, "grad_norm": 0.20199252665042877, "learning_rate": 4.88959877581983e-05, "loss": 1.18, "step": 1218 }, { "epoch": 0.1916327693607656, "grad_norm": 0.24907195568084717, "learning_rate": 4.889417126387846e-05, "loss": 1.2438, "step": 1219 }, { "epoch": 0.19178997425769811, "grad_norm": 0.2976427674293518, "learning_rate": 4.889235331019356e-05, "loss": 1.1526, "step": 1220 }, { "epoch": 0.19194717915463066, "grad_norm": 0.25074872374534607, "learning_rate": 4.889053389725463e-05, "loss": 1.1805, "step": 1221 }, { "epoch": 0.19210438405156322, "grad_norm": 0.2157672792673111, "learning_rate": 4.8888713025172776e-05, "loss": 1.2103, "step": 1222 }, { "epoch": 0.19226158894849574, "grad_norm": 0.24573171138763428, "learning_rate": 4.888689069405923e-05, "loss": 1.1981, "step": 1223 }, { "epoch": 0.1924187938454283, "grad_norm": 0.294160932302475, "learning_rate": 4.888506690402528e-05, "loss": 1.2667, "step": 1224 }, { "epoch": 0.19257599874236084, "grad_norm": 0.8444136381149292, "learning_rate": 4.8883241655182314e-05, "loss": 1.1977, "step": 1225 }, { "epoch": 0.19273320363929336, "grad_norm": 0.4191160798072815, "learning_rate": 4.888141494764182e-05, "loss": 1.1981, "step": 1226 }, { "epoch": 0.1928904085362259, "grad_norm": 0.31621554493904114, "learning_rate": 4.8879586781515376e-05, "loss": 1.2224, "step": 1227 }, { "epoch": 0.19304761343315843, "grad_norm": 0.2715776860713959, "learning_rate": 4.887775715691462e-05, "loss": 1.1029, "step": 1228 }, { "epoch": 0.19320481833009098, "grad_norm": 0.2641848623752594, "learning_rate": 4.88759260739513e-05, "loss": 1.1738, "step": 1229 }, { "epoch": 0.19336202322702353, "grad_norm": 0.2537270188331604, "learning_rate": 4.887409353273727e-05, "loss": 1.2847, "step": 1230 }, { "epoch": 0.19351922812395606, "grad_norm": 0.2998782694339752, "learning_rate": 4.8872259533384423e-05, "loss": 1.1814, "step": 1231 }, { "epoch": 0.1936764330208886, "grad_norm": 0.2254815697669983, "learning_rate": 4.8870424076004806e-05, "loss": 1.2004, "step": 1232 }, { "epoch": 0.19383363791782113, "grad_norm": 0.3711993396282196, "learning_rate": 4.88685871607105e-05, "loss": 1.1502, "step": 1233 }, { "epoch": 0.19399084281475368, "grad_norm": 0.24783778190612793, "learning_rate": 4.886674878761371e-05, "loss": 1.1185, "step": 1234 }, { "epoch": 0.19414804771168623, "grad_norm": 0.1896362453699112, "learning_rate": 4.88649089568267e-05, "loss": 1.1856, "step": 1235 }, { "epoch": 0.19430525260861875, "grad_norm": 0.28106558322906494, "learning_rate": 4.886306766846187e-05, "loss": 1.2196, "step": 1236 }, { "epoch": 0.1944624575055513, "grad_norm": 0.3023208975791931, "learning_rate": 4.8861224922631645e-05, "loss": 1.1836, "step": 1237 }, { "epoch": 0.19461966240248385, "grad_norm": 0.36752450466156006, "learning_rate": 4.8859380719448596e-05, "loss": 1.1831, "step": 1238 }, { "epoch": 0.19477686729941637, "grad_norm": 0.2593975365161896, "learning_rate": 4.885753505902535e-05, "loss": 1.1955, "step": 1239 }, { "epoch": 0.19493407219634892, "grad_norm": 0.2952882647514343, "learning_rate": 4.885568794147463e-05, "loss": 1.108, "step": 1240 }, { "epoch": 0.19509127709328145, "grad_norm": 0.2335767149925232, "learning_rate": 4.885383936690926e-05, "loss": 1.2389, "step": 1241 }, { "epoch": 0.195248481990214, "grad_norm": 0.3618619441986084, "learning_rate": 4.885198933544214e-05, "loss": 1.0247, "step": 1242 }, { "epoch": 0.19540568688714655, "grad_norm": 0.26691627502441406, "learning_rate": 4.885013784718626e-05, "loss": 1.1516, "step": 1243 }, { "epoch": 0.19556289178407907, "grad_norm": 0.2977723777294159, "learning_rate": 4.8848284902254705e-05, "loss": 1.1617, "step": 1244 }, { "epoch": 0.19572009668101162, "grad_norm": 0.33515632152557373, "learning_rate": 4.884643050076064e-05, "loss": 1.1789, "step": 1245 }, { "epoch": 0.19587730157794414, "grad_norm": 0.275840163230896, "learning_rate": 4.8844574642817334e-05, "loss": 1.1103, "step": 1246 }, { "epoch": 0.1960345064748767, "grad_norm": 0.26756566762924194, "learning_rate": 4.884271732853813e-05, "loss": 1.2101, "step": 1247 }, { "epoch": 0.19619171137180924, "grad_norm": 0.20770548284053802, "learning_rate": 4.884085855803647e-05, "loss": 1.2506, "step": 1248 }, { "epoch": 0.19634891626874176, "grad_norm": 0.2700664699077606, "learning_rate": 4.883899833142588e-05, "loss": 1.2034, "step": 1249 }, { "epoch": 0.1965061211656743, "grad_norm": 0.2403496950864792, "learning_rate": 4.883713664881997e-05, "loss": 1.1622, "step": 1250 }, { "epoch": 0.19666332606260686, "grad_norm": 0.2710270881652832, "learning_rate": 4.883527351033245e-05, "loss": 1.0679, "step": 1251 }, { "epoch": 0.19682053095953939, "grad_norm": 0.2600773870944977, "learning_rate": 4.8833408916077104e-05, "loss": 1.3343, "step": 1252 }, { "epoch": 0.19697773585647194, "grad_norm": 0.25740665197372437, "learning_rate": 4.883154286616783e-05, "loss": 1.2206, "step": 1253 }, { "epoch": 0.19713494075340446, "grad_norm": 0.3393601179122925, "learning_rate": 4.8829675360718585e-05, "loss": 1.1518, "step": 1254 }, { "epoch": 0.197292145650337, "grad_norm": 0.2968616783618927, "learning_rate": 4.8827806399843444e-05, "loss": 1.2547, "step": 1255 }, { "epoch": 0.19744935054726956, "grad_norm": 0.24990178644657135, "learning_rate": 4.8825935983656535e-05, "loss": 1.2733, "step": 1256 }, { "epoch": 0.19760655544420208, "grad_norm": 0.31955957412719727, "learning_rate": 4.882406411227212e-05, "loss": 1.2138, "step": 1257 }, { "epoch": 0.19776376034113463, "grad_norm": 0.22445374727249146, "learning_rate": 4.88221907858045e-05, "loss": 1.1845, "step": 1258 }, { "epoch": 0.19792096523806715, "grad_norm": 0.32888510823249817, "learning_rate": 4.8820316004368116e-05, "loss": 1.2339, "step": 1259 }, { "epoch": 0.1980781701349997, "grad_norm": 0.29760921001434326, "learning_rate": 4.8818439768077456e-05, "loss": 1.2216, "step": 1260 }, { "epoch": 0.19823537503193225, "grad_norm": 0.19965974986553192, "learning_rate": 4.881656207704712e-05, "loss": 1.2608, "step": 1261 }, { "epoch": 0.19839257992886478, "grad_norm": 0.2538587749004364, "learning_rate": 4.881468293139179e-05, "loss": 1.1989, "step": 1262 }, { "epoch": 0.19854978482579733, "grad_norm": 0.35299167037010193, "learning_rate": 4.8812802331226224e-05, "loss": 1.1426, "step": 1263 }, { "epoch": 0.19870698972272988, "grad_norm": 0.3230816423892975, "learning_rate": 4.8810920276665306e-05, "loss": 1.2546, "step": 1264 }, { "epoch": 0.1988641946196624, "grad_norm": 0.3077559769153595, "learning_rate": 4.880903676782397e-05, "loss": 1.1661, "step": 1265 }, { "epoch": 0.19902139951659495, "grad_norm": 0.32157936692237854, "learning_rate": 4.8807151804817254e-05, "loss": 1.2141, "step": 1266 }, { "epoch": 0.19917860441352747, "grad_norm": 0.32653504610061646, "learning_rate": 4.880526538776029e-05, "loss": 1.0623, "step": 1267 }, { "epoch": 0.19933580931046002, "grad_norm": 0.2675210237503052, "learning_rate": 4.880337751676828e-05, "loss": 1.1408, "step": 1268 }, { "epoch": 0.19949301420739257, "grad_norm": 0.28380653262138367, "learning_rate": 4.880148819195654e-05, "loss": 1.223, "step": 1269 }, { "epoch": 0.1996502191043251, "grad_norm": 0.2532847821712494, "learning_rate": 4.8799597413440466e-05, "loss": 1.2133, "step": 1270 }, { "epoch": 0.19980742400125764, "grad_norm": 0.2972438633441925, "learning_rate": 4.8797705181335526e-05, "loss": 1.2806, "step": 1271 }, { "epoch": 0.19996462889819017, "grad_norm": 0.2725450098514557, "learning_rate": 4.8795811495757306e-05, "loss": 1.1627, "step": 1272 }, { "epoch": 0.20012183379512272, "grad_norm": 0.2451506108045578, "learning_rate": 4.879391635682145e-05, "loss": 1.3242, "step": 1273 }, { "epoch": 0.20027903869205527, "grad_norm": 0.22880415618419647, "learning_rate": 4.8792019764643714e-05, "loss": 1.1535, "step": 1274 }, { "epoch": 0.2004362435889878, "grad_norm": 0.22470681369304657, "learning_rate": 4.8790121719339935e-05, "loss": 1.268, "step": 1275 }, { "epoch": 0.20059344848592034, "grad_norm": 0.2413133829832077, "learning_rate": 4.878822222102604e-05, "loss": 1.2291, "step": 1276 }, { "epoch": 0.2007506533828529, "grad_norm": 0.23373375833034515, "learning_rate": 4.878632126981804e-05, "loss": 1.1007, "step": 1277 }, { "epoch": 0.2009078582797854, "grad_norm": 0.3018023371696472, "learning_rate": 4.878441886583203e-05, "loss": 1.2393, "step": 1278 }, { "epoch": 0.20106506317671796, "grad_norm": 0.2107972353696823, "learning_rate": 4.878251500918421e-05, "loss": 1.3164, "step": 1279 }, { "epoch": 0.20122226807365048, "grad_norm": 0.24787524342536926, "learning_rate": 4.878060969999087e-05, "loss": 1.217, "step": 1280 }, { "epoch": 0.20122226807365048, "eval_loss": 1.2021143436431885, "eval_runtime": 2276.1827, "eval_samples_per_second": 4.067, "eval_steps_per_second": 2.034, "step": 1280 }, { "epoch": 0.20137947297058303, "grad_norm": 0.22325897216796875, "learning_rate": 4.877870293836837e-05, "loss": 1.2739, "step": 1281 }, { "epoch": 0.20153667786751558, "grad_norm": 0.20739248394966125, "learning_rate": 4.877679472443315e-05, "loss": 1.2458, "step": 1282 }, { "epoch": 0.2016938827644481, "grad_norm": 0.38787081837654114, "learning_rate": 4.877488505830179e-05, "loss": 1.2039, "step": 1283 }, { "epoch": 0.20185108766138066, "grad_norm": 0.2838675379753113, "learning_rate": 4.8772973940090895e-05, "loss": 1.1647, "step": 1284 }, { "epoch": 0.20200829255831318, "grad_norm": 0.2516341805458069, "learning_rate": 4.877106136991721e-05, "loss": 1.1952, "step": 1285 }, { "epoch": 0.20216549745524573, "grad_norm": 0.24181115627288818, "learning_rate": 4.8769147347897535e-05, "loss": 1.1822, "step": 1286 }, { "epoch": 0.20232270235217828, "grad_norm": 0.3560396730899811, "learning_rate": 4.876723187414878e-05, "loss": 1.1863, "step": 1287 }, { "epoch": 0.2024799072491108, "grad_norm": 0.41868817806243896, "learning_rate": 4.8765314948787934e-05, "loss": 1.1446, "step": 1288 }, { "epoch": 0.20263711214604335, "grad_norm": 0.21799515187740326, "learning_rate": 4.8763396571932066e-05, "loss": 1.155, "step": 1289 }, { "epoch": 0.2027943170429759, "grad_norm": 0.24254827201366425, "learning_rate": 4.876147674369834e-05, "loss": 1.1363, "step": 1290 }, { "epoch": 0.20295152193990842, "grad_norm": 0.24996933341026306, "learning_rate": 4.875955546420404e-05, "loss": 1.1773, "step": 1291 }, { "epoch": 0.20310872683684097, "grad_norm": 0.33059096336364746, "learning_rate": 4.8757632733566484e-05, "loss": 1.2217, "step": 1292 }, { "epoch": 0.2032659317337735, "grad_norm": 0.3126905858516693, "learning_rate": 4.875570855190311e-05, "loss": 1.2031, "step": 1293 }, { "epoch": 0.20342313663070605, "grad_norm": 0.26368340849876404, "learning_rate": 4.8753782919331436e-05, "loss": 1.2348, "step": 1294 }, { "epoch": 0.2035803415276386, "grad_norm": 0.2591840922832489, "learning_rate": 4.875185583596909e-05, "loss": 1.2303, "step": 1295 }, { "epoch": 0.20373754642457112, "grad_norm": 0.3512473702430725, "learning_rate": 4.874992730193375e-05, "loss": 1.149, "step": 1296 }, { "epoch": 0.20389475132150367, "grad_norm": 0.28538331389427185, "learning_rate": 4.874799731734322e-05, "loss": 1.2177, "step": 1297 }, { "epoch": 0.2040519562184362, "grad_norm": 0.27414706349372864, "learning_rate": 4.8746065882315375e-05, "loss": 1.1767, "step": 1298 }, { "epoch": 0.20420916111536874, "grad_norm": 0.2481449991464615, "learning_rate": 4.874413299696816e-05, "loss": 1.1928, "step": 1299 }, { "epoch": 0.2043663660123013, "grad_norm": 0.26623374223709106, "learning_rate": 4.8742198661419646e-05, "loss": 1.2455, "step": 1300 }, { "epoch": 0.20452357090923381, "grad_norm": 0.22189855575561523, "learning_rate": 4.874026287578798e-05, "loss": 1.1934, "step": 1301 }, { "epoch": 0.20468077580616637, "grad_norm": 0.2549070119857788, "learning_rate": 4.873832564019137e-05, "loss": 1.0798, "step": 1302 }, { "epoch": 0.20483798070309892, "grad_norm": 0.2875777781009674, "learning_rate": 4.873638695474816e-05, "loss": 1.2025, "step": 1303 }, { "epoch": 0.20499518560003144, "grad_norm": 0.26281726360321045, "learning_rate": 4.873444681957674e-05, "loss": 1.2533, "step": 1304 }, { "epoch": 0.205152390496964, "grad_norm": 0.294112890958786, "learning_rate": 4.873250523479561e-05, "loss": 1.177, "step": 1305 }, { "epoch": 0.2053095953938965, "grad_norm": 0.24901710450649261, "learning_rate": 4.873056220052336e-05, "loss": 1.2624, "step": 1306 }, { "epoch": 0.20546680029082906, "grad_norm": 0.2856816351413727, "learning_rate": 4.8728617716878664e-05, "loss": 1.0575, "step": 1307 }, { "epoch": 0.2056240051877616, "grad_norm": 0.34814971685409546, "learning_rate": 4.872667178398027e-05, "loss": 1.1829, "step": 1308 }, { "epoch": 0.20578121008469413, "grad_norm": 0.3625463545322418, "learning_rate": 4.872472440194704e-05, "loss": 1.1988, "step": 1309 }, { "epoch": 0.20593841498162668, "grad_norm": 0.2561318576335907, "learning_rate": 4.8722775570897915e-05, "loss": 1.185, "step": 1310 }, { "epoch": 0.2060956198785592, "grad_norm": 0.25931569933891296, "learning_rate": 4.872082529095191e-05, "loss": 1.15, "step": 1311 }, { "epoch": 0.20625282477549176, "grad_norm": 0.21291913092136383, "learning_rate": 4.871887356222815e-05, "loss": 1.2019, "step": 1312 }, { "epoch": 0.2064100296724243, "grad_norm": 0.21931886672973633, "learning_rate": 4.8716920384845844e-05, "loss": 1.3054, "step": 1313 }, { "epoch": 0.20656723456935683, "grad_norm": 0.22694003582000732, "learning_rate": 4.8714965758924276e-05, "loss": 1.1884, "step": 1314 }, { "epoch": 0.20672443946628938, "grad_norm": 0.25568726658821106, "learning_rate": 4.871300968458282e-05, "loss": 1.2516, "step": 1315 }, { "epoch": 0.20688164436322193, "grad_norm": 0.24557702243328094, "learning_rate": 4.871105216194096e-05, "loss": 1.3418, "step": 1316 }, { "epoch": 0.20703884926015445, "grad_norm": 0.2368367463350296, "learning_rate": 4.870909319111825e-05, "loss": 1.2088, "step": 1317 }, { "epoch": 0.207196054157087, "grad_norm": 0.3659820556640625, "learning_rate": 4.870713277223434e-05, "loss": 1.0446, "step": 1318 }, { "epoch": 0.20735325905401952, "grad_norm": 0.2582785189151764, "learning_rate": 4.870517090540896e-05, "loss": 1.3072, "step": 1319 }, { "epoch": 0.20751046395095207, "grad_norm": 0.27062729001045227, "learning_rate": 4.870320759076192e-05, "loss": 1.2977, "step": 1320 }, { "epoch": 0.20766766884788462, "grad_norm": 0.2483299970626831, "learning_rate": 4.870124282841316e-05, "loss": 1.2001, "step": 1321 }, { "epoch": 0.20782487374481715, "grad_norm": 0.17987532913684845, "learning_rate": 4.869927661848266e-05, "loss": 1.253, "step": 1322 }, { "epoch": 0.2079820786417497, "grad_norm": 0.24004274606704712, "learning_rate": 4.869730896109051e-05, "loss": 1.2242, "step": 1323 }, { "epoch": 0.20813928353868222, "grad_norm": 0.259755402803421, "learning_rate": 4.869533985635689e-05, "loss": 1.2338, "step": 1324 }, { "epoch": 0.20829648843561477, "grad_norm": 0.2481742948293686, "learning_rate": 4.869336930440207e-05, "loss": 1.0983, "step": 1325 }, { "epoch": 0.20845369333254732, "grad_norm": 0.2746635675430298, "learning_rate": 4.8691397305346404e-05, "loss": 1.2491, "step": 1326 }, { "epoch": 0.20861089822947984, "grad_norm": 0.3291498124599457, "learning_rate": 4.868942385931032e-05, "loss": 1.2045, "step": 1327 }, { "epoch": 0.2087681031264124, "grad_norm": 0.2693649232387543, "learning_rate": 4.8687448966414376e-05, "loss": 1.2367, "step": 1328 }, { "epoch": 0.2089253080233449, "grad_norm": 0.3101809620857239, "learning_rate": 4.868547262677916e-05, "loss": 1.1759, "step": 1329 }, { "epoch": 0.20908251292027746, "grad_norm": 0.22869786620140076, "learning_rate": 4.86834948405254e-05, "loss": 1.2219, "step": 1330 }, { "epoch": 0.20923971781721, "grad_norm": 0.32914999127388, "learning_rate": 4.868151560777388e-05, "loss": 1.1465, "step": 1331 }, { "epoch": 0.20939692271414254, "grad_norm": 0.2585611343383789, "learning_rate": 4.867953492864549e-05, "loss": 1.202, "step": 1332 }, { "epoch": 0.2095541276110751, "grad_norm": 0.22913898527622223, "learning_rate": 4.8677552803261203e-05, "loss": 1.1182, "step": 1333 }, { "epoch": 0.20971133250800764, "grad_norm": 0.19015748798847198, "learning_rate": 4.867556923174208e-05, "loss": 1.2997, "step": 1334 }, { "epoch": 0.20986853740494016, "grad_norm": 0.28327012062072754, "learning_rate": 4.867358421420927e-05, "loss": 1.2135, "step": 1335 }, { "epoch": 0.2100257423018727, "grad_norm": 0.20216205716133118, "learning_rate": 4.8671597750784006e-05, "loss": 1.0898, "step": 1336 }, { "epoch": 0.21018294719880523, "grad_norm": 0.23876157402992249, "learning_rate": 4.8669609841587607e-05, "loss": 1.1954, "step": 1337 }, { "epoch": 0.21034015209573778, "grad_norm": 0.22727316617965698, "learning_rate": 4.86676204867415e-05, "loss": 1.1578, "step": 1338 }, { "epoch": 0.21049735699267033, "grad_norm": 0.23092162609100342, "learning_rate": 4.8665629686367185e-05, "loss": 1.2883, "step": 1339 }, { "epoch": 0.21065456188960285, "grad_norm": 0.5870768427848816, "learning_rate": 4.8663637440586255e-05, "loss": 1.1947, "step": 1340 }, { "epoch": 0.2108117667865354, "grad_norm": 0.20369422435760498, "learning_rate": 4.866164374952038e-05, "loss": 1.1551, "step": 1341 }, { "epoch": 0.21096897168346793, "grad_norm": 0.21872729063034058, "learning_rate": 4.865964861329133e-05, "loss": 1.195, "step": 1342 }, { "epoch": 0.21112617658040048, "grad_norm": 0.23386871814727783, "learning_rate": 4.8657652032020965e-05, "loss": 1.1417, "step": 1343 }, { "epoch": 0.21128338147733303, "grad_norm": 0.25174540281295776, "learning_rate": 4.865565400583123e-05, "loss": 1.2826, "step": 1344 }, { "epoch": 0.21144058637426555, "grad_norm": 0.2607194781303406, "learning_rate": 4.865365453484415e-05, "loss": 1.1529, "step": 1345 }, { "epoch": 0.2115977912711981, "grad_norm": 0.28137052059173584, "learning_rate": 4.8651653619181835e-05, "loss": 1.2041, "step": 1346 }, { "epoch": 0.21175499616813065, "grad_norm": 0.3423405587673187, "learning_rate": 4.864965125896652e-05, "loss": 1.217, "step": 1347 }, { "epoch": 0.21191220106506317, "grad_norm": 0.35149091482162476, "learning_rate": 4.864764745432048e-05, "loss": 1.234, "step": 1348 }, { "epoch": 0.21206940596199572, "grad_norm": 0.19579124450683594, "learning_rate": 4.864564220536611e-05, "loss": 1.1335, "step": 1349 }, { "epoch": 0.21222661085892824, "grad_norm": 0.2690495550632477, "learning_rate": 4.8643635512225874e-05, "loss": 1.2253, "step": 1350 }, { "epoch": 0.2123838157558608, "grad_norm": 0.2389325350522995, "learning_rate": 4.8641627375022346e-05, "loss": 1.2362, "step": 1351 }, { "epoch": 0.21254102065279334, "grad_norm": 0.2795645296573639, "learning_rate": 4.863961779387817e-05, "loss": 1.0509, "step": 1352 }, { "epoch": 0.21269822554972587, "grad_norm": 0.1987501084804535, "learning_rate": 4.863760676891608e-05, "loss": 1.0825, "step": 1353 }, { "epoch": 0.21285543044665842, "grad_norm": 0.2520771324634552, "learning_rate": 4.8635594300258905e-05, "loss": 1.2879, "step": 1354 }, { "epoch": 0.21301263534359094, "grad_norm": 0.3053792119026184, "learning_rate": 4.863358038802955e-05, "loss": 1.1737, "step": 1355 }, { "epoch": 0.2131698402405235, "grad_norm": 0.24323543906211853, "learning_rate": 4.863156503235102e-05, "loss": 1.3105, "step": 1356 }, { "epoch": 0.21332704513745604, "grad_norm": 0.2622387111186981, "learning_rate": 4.862954823334643e-05, "loss": 1.1817, "step": 1357 }, { "epoch": 0.21348425003438856, "grad_norm": 0.28524765372276306, "learning_rate": 4.862752999113893e-05, "loss": 1.1191, "step": 1358 }, { "epoch": 0.2136414549313211, "grad_norm": 0.2917231619358063, "learning_rate": 4.8625510305851784e-05, "loss": 1.1717, "step": 1359 }, { "epoch": 0.21379865982825366, "grad_norm": 0.23248536884784698, "learning_rate": 4.862348917760837e-05, "loss": 1.1472, "step": 1360 }, { "epoch": 0.21395586472518618, "grad_norm": 0.30648741126060486, "learning_rate": 4.862146660653212e-05, "loss": 1.1325, "step": 1361 }, { "epoch": 0.21411306962211873, "grad_norm": 0.26287996768951416, "learning_rate": 4.8619442592746554e-05, "loss": 1.1891, "step": 1362 }, { "epoch": 0.21427027451905126, "grad_norm": 0.2846413254737854, "learning_rate": 4.861741713637531e-05, "loss": 1.2429, "step": 1363 }, { "epoch": 0.2144274794159838, "grad_norm": 0.26600465178489685, "learning_rate": 4.861539023754208e-05, "loss": 1.2825, "step": 1364 }, { "epoch": 0.21458468431291636, "grad_norm": 0.23721352219581604, "learning_rate": 4.861336189637066e-05, "loss": 1.16, "step": 1365 }, { "epoch": 0.21474188920984888, "grad_norm": 0.21250367164611816, "learning_rate": 4.8611332112984946e-05, "loss": 1.1917, "step": 1366 }, { "epoch": 0.21489909410678143, "grad_norm": 0.2471015751361847, "learning_rate": 4.86093008875089e-05, "loss": 1.2403, "step": 1367 }, { "epoch": 0.21505629900371395, "grad_norm": 0.2969186007976532, "learning_rate": 4.860726822006659e-05, "loss": 1.2443, "step": 1368 }, { "epoch": 0.2152135039006465, "grad_norm": 0.24633657932281494, "learning_rate": 4.860523411078215e-05, "loss": 1.2468, "step": 1369 }, { "epoch": 0.21537070879757905, "grad_norm": 0.24867349863052368, "learning_rate": 4.860319855977982e-05, "loss": 1.194, "step": 1370 }, { "epoch": 0.21552791369451157, "grad_norm": 0.27883896231651306, "learning_rate": 4.8601161567183925e-05, "loss": 1.2181, "step": 1371 }, { "epoch": 0.21568511859144412, "grad_norm": 0.20830753445625305, "learning_rate": 4.859912313311888e-05, "loss": 1.1579, "step": 1372 }, { "epoch": 0.21584232348837668, "grad_norm": 0.25265562534332275, "learning_rate": 4.8597083257709194e-05, "loss": 1.1455, "step": 1373 }, { "epoch": 0.2159995283853092, "grad_norm": 0.23922070860862732, "learning_rate": 4.859504194107943e-05, "loss": 1.1434, "step": 1374 }, { "epoch": 0.21615673328224175, "grad_norm": 0.1661684215068817, "learning_rate": 4.859299918335428e-05, "loss": 1.2426, "step": 1375 }, { "epoch": 0.21631393817917427, "grad_norm": 0.20215146243572235, "learning_rate": 4.859095498465851e-05, "loss": 1.2485, "step": 1376 }, { "epoch": 0.21647114307610682, "grad_norm": 0.2150086909532547, "learning_rate": 4.858890934511697e-05, "loss": 1.2034, "step": 1377 }, { "epoch": 0.21662834797303937, "grad_norm": 0.24220801889896393, "learning_rate": 4.8586862264854595e-05, "loss": 1.2063, "step": 1378 }, { "epoch": 0.2167855528699719, "grad_norm": 0.3015029728412628, "learning_rate": 4.85848137439964e-05, "loss": 1.2371, "step": 1379 }, { "epoch": 0.21694275776690444, "grad_norm": 0.32718077301979065, "learning_rate": 4.8582763782667534e-05, "loss": 1.1915, "step": 1380 }, { "epoch": 0.21709996266383697, "grad_norm": 0.2721043527126312, "learning_rate": 4.858071238099318e-05, "loss": 1.1692, "step": 1381 }, { "epoch": 0.21725716756076952, "grad_norm": 0.21820539236068726, "learning_rate": 4.857865953909862e-05, "loss": 1.2529, "step": 1382 }, { "epoch": 0.21741437245770207, "grad_norm": 0.24213218688964844, "learning_rate": 4.857660525710927e-05, "loss": 1.2273, "step": 1383 }, { "epoch": 0.2175715773546346, "grad_norm": 0.25169044733047485, "learning_rate": 4.857454953515055e-05, "loss": 1.1917, "step": 1384 }, { "epoch": 0.21772878225156714, "grad_norm": 0.1900857836008072, "learning_rate": 4.8572492373348055e-05, "loss": 1.1924, "step": 1385 }, { "epoch": 0.2178859871484997, "grad_norm": 0.27372950315475464, "learning_rate": 4.857043377182741e-05, "loss": 1.2062, "step": 1386 }, { "epoch": 0.2180431920454322, "grad_norm": 0.2589268386363983, "learning_rate": 4.8568373730714344e-05, "loss": 1.1294, "step": 1387 }, { "epoch": 0.21820039694236476, "grad_norm": 0.32234373688697815, "learning_rate": 4.856631225013468e-05, "loss": 1.0772, "step": 1388 }, { "epoch": 0.21835760183929728, "grad_norm": 0.34723344445228577, "learning_rate": 4.8564249330214337e-05, "loss": 1.1989, "step": 1389 }, { "epoch": 0.21851480673622983, "grad_norm": 0.23347902297973633, "learning_rate": 4.85621849710793e-05, "loss": 1.3154, "step": 1390 }, { "epoch": 0.21867201163316238, "grad_norm": 0.19097594916820526, "learning_rate": 4.856011917285565e-05, "loss": 1.241, "step": 1391 }, { "epoch": 0.2188292165300949, "grad_norm": 0.17387251555919647, "learning_rate": 4.855805193566956e-05, "loss": 1.2396, "step": 1392 }, { "epoch": 0.21898642142702746, "grad_norm": 0.25310957431793213, "learning_rate": 4.85559832596473e-05, "loss": 1.1169, "step": 1393 }, { "epoch": 0.21914362632395998, "grad_norm": 0.20254233479499817, "learning_rate": 4.85539131449152e-05, "loss": 1.2233, "step": 1394 }, { "epoch": 0.21930083122089253, "grad_norm": 0.22399556636810303, "learning_rate": 4.8551841591599696e-05, "loss": 1.1257, "step": 1395 }, { "epoch": 0.21945803611782508, "grad_norm": 0.24220795929431915, "learning_rate": 4.854976859982732e-05, "loss": 1.1364, "step": 1396 }, { "epoch": 0.2196152410147576, "grad_norm": 0.2996869683265686, "learning_rate": 4.854769416972468e-05, "loss": 1.277, "step": 1397 }, { "epoch": 0.21977244591169015, "grad_norm": 0.22607775032520294, "learning_rate": 4.854561830141848e-05, "loss": 1.1538, "step": 1398 }, { "epoch": 0.2199296508086227, "grad_norm": 0.3184898793697357, "learning_rate": 4.854354099503549e-05, "loss": 1.2933, "step": 1399 }, { "epoch": 0.22008685570555522, "grad_norm": 0.26133978366851807, "learning_rate": 4.8541462250702595e-05, "loss": 1.1539, "step": 1400 }, { "epoch": 0.22024406060248777, "grad_norm": 0.2325352430343628, "learning_rate": 4.853938206854676e-05, "loss": 1.1242, "step": 1401 }, { "epoch": 0.2204012654994203, "grad_norm": 0.20906803011894226, "learning_rate": 4.853730044869503e-05, "loss": 1.1166, "step": 1402 }, { "epoch": 0.22055847039635285, "grad_norm": 0.27185148000717163, "learning_rate": 4.853521739127453e-05, "loss": 1.1067, "step": 1403 }, { "epoch": 0.2207156752932854, "grad_norm": 0.2759348452091217, "learning_rate": 4.8533132896412514e-05, "loss": 1.1237, "step": 1404 }, { "epoch": 0.22087288019021792, "grad_norm": 0.21420961618423462, "learning_rate": 4.853104696423627e-05, "loss": 1.1362, "step": 1405 }, { "epoch": 0.22103008508715047, "grad_norm": 0.22797361016273499, "learning_rate": 4.852895959487321e-05, "loss": 1.1114, "step": 1406 }, { "epoch": 0.221187289984083, "grad_norm": 0.303501695394516, "learning_rate": 4.8526870788450816e-05, "loss": 1.1836, "step": 1407 }, { "epoch": 0.22134449488101554, "grad_norm": 0.22205421328544617, "learning_rate": 4.852478054509667e-05, "loss": 1.2281, "step": 1408 }, { "epoch": 0.2215016997779481, "grad_norm": 0.31351590156555176, "learning_rate": 4.852268886493844e-05, "loss": 1.161, "step": 1409 }, { "epoch": 0.2216589046748806, "grad_norm": 0.2639506757259369, "learning_rate": 4.852059574810386e-05, "loss": 1.1397, "step": 1410 }, { "epoch": 0.22181610957181316, "grad_norm": 0.2192612588405609, "learning_rate": 4.851850119472079e-05, "loss": 1.1476, "step": 1411 }, { "epoch": 0.2219733144687457, "grad_norm": 0.27032819390296936, "learning_rate": 4.851640520491715e-05, "loss": 1.2416, "step": 1412 }, { "epoch": 0.22213051936567824, "grad_norm": 0.3159463405609131, "learning_rate": 4.851430777882095e-05, "loss": 1.1131, "step": 1413 }, { "epoch": 0.2222877242626108, "grad_norm": 0.20352302491664886, "learning_rate": 4.85122089165603e-05, "loss": 1.2406, "step": 1414 }, { "epoch": 0.2224449291595433, "grad_norm": 0.2544812858104706, "learning_rate": 4.8510108618263385e-05, "loss": 1.2213, "step": 1415 }, { "epoch": 0.22260213405647586, "grad_norm": 0.38868314027786255, "learning_rate": 4.8508006884058485e-05, "loss": 1.1821, "step": 1416 }, { "epoch": 0.2227593389534084, "grad_norm": 0.293169766664505, "learning_rate": 4.850590371407397e-05, "loss": 1.2964, "step": 1417 }, { "epoch": 0.22291654385034093, "grad_norm": 0.282044917345047, "learning_rate": 4.850379910843829e-05, "loss": 1.2502, "step": 1418 }, { "epoch": 0.22307374874727348, "grad_norm": 0.2392999529838562, "learning_rate": 4.850169306727999e-05, "loss": 1.1388, "step": 1419 }, { "epoch": 0.223230953644206, "grad_norm": 0.18563714623451233, "learning_rate": 4.849958559072768e-05, "loss": 1.2458, "step": 1420 }, { "epoch": 0.22338815854113855, "grad_norm": 0.21055997908115387, "learning_rate": 4.84974766789101e-05, "loss": 1.1713, "step": 1421 }, { "epoch": 0.2235453634380711, "grad_norm": 0.21380756795406342, "learning_rate": 4.849536633195606e-05, "loss": 1.2113, "step": 1422 }, { "epoch": 0.22370256833500363, "grad_norm": 0.2441507875919342, "learning_rate": 4.849325454999443e-05, "loss": 1.275, "step": 1423 }, { "epoch": 0.22385977323193618, "grad_norm": 0.2958972454071045, "learning_rate": 4.849114133315419e-05, "loss": 1.1454, "step": 1424 }, { "epoch": 0.22401697812886873, "grad_norm": 0.30271342396736145, "learning_rate": 4.848902668156442e-05, "loss": 1.1151, "step": 1425 }, { "epoch": 0.22417418302580125, "grad_norm": 0.2217937856912613, "learning_rate": 4.848691059535427e-05, "loss": 1.2406, "step": 1426 }, { "epoch": 0.2243313879227338, "grad_norm": 0.22862625122070312, "learning_rate": 4.848479307465299e-05, "loss": 1.1971, "step": 1427 }, { "epoch": 0.22448859281966632, "grad_norm": 0.2594766616821289, "learning_rate": 4.8482674119589896e-05, "loss": 1.1845, "step": 1428 }, { "epoch": 0.22464579771659887, "grad_norm": 0.21012337505817413, "learning_rate": 4.848055373029441e-05, "loss": 1.1644, "step": 1429 }, { "epoch": 0.22480300261353142, "grad_norm": 0.3061239421367645, "learning_rate": 4.847843190689605e-05, "loss": 1.1177, "step": 1430 }, { "epoch": 0.22496020751046394, "grad_norm": 0.27125445008277893, "learning_rate": 4.847630864952439e-05, "loss": 1.2566, "step": 1431 }, { "epoch": 0.2251174124073965, "grad_norm": 0.16465838253498077, "learning_rate": 4.847418395830911e-05, "loss": 1.2477, "step": 1432 }, { "epoch": 0.22527461730432902, "grad_norm": 0.2064242959022522, "learning_rate": 4.8472057833380005e-05, "loss": 1.2777, "step": 1433 }, { "epoch": 0.22543182220126157, "grad_norm": 0.1920788437128067, "learning_rate": 4.846993027486691e-05, "loss": 1.0682, "step": 1434 }, { "epoch": 0.22558902709819412, "grad_norm": 0.2445499151945114, "learning_rate": 4.8467801282899775e-05, "loss": 1.1468, "step": 1435 }, { "epoch": 0.22574623199512664, "grad_norm": 0.2358809858560562, "learning_rate": 4.846567085760861e-05, "loss": 1.2001, "step": 1436 }, { "epoch": 0.2259034368920592, "grad_norm": 0.2340790331363678, "learning_rate": 4.846353899912356e-05, "loss": 1.1401, "step": 1437 }, { "epoch": 0.22606064178899174, "grad_norm": 0.23034712672233582, "learning_rate": 4.8461405707574824e-05, "loss": 1.1617, "step": 1438 }, { "epoch": 0.22621784668592426, "grad_norm": 0.2453261762857437, "learning_rate": 4.8459270983092686e-05, "loss": 1.2889, "step": 1439 }, { "epoch": 0.2263750515828568, "grad_norm": 0.23590409755706787, "learning_rate": 4.8457134825807535e-05, "loss": 1.1807, "step": 1440 }, { "epoch": 0.2263750515828568, "eval_loss": 1.1835801601409912, "eval_runtime": 2320.0135, "eval_samples_per_second": 3.99, "eval_steps_per_second": 1.995, "step": 1440 }, { "epoch": 0.22653225647978933, "grad_norm": 0.2932775318622589, "learning_rate": 4.845499723584984e-05, "loss": 1.1899, "step": 1441 }, { "epoch": 0.22668946137672188, "grad_norm": 0.23739026486873627, "learning_rate": 4.845285821335015e-05, "loss": 1.2037, "step": 1442 }, { "epoch": 0.22684666627365443, "grad_norm": 0.1993498057126999, "learning_rate": 4.8450717758439115e-05, "loss": 1.1681, "step": 1443 }, { "epoch": 0.22700387117058696, "grad_norm": 0.23352783918380737, "learning_rate": 4.8448575871247465e-05, "loss": 1.2268, "step": 1444 }, { "epoch": 0.2271610760675195, "grad_norm": 0.22201289236545563, "learning_rate": 4.844643255190602e-05, "loss": 1.2883, "step": 1445 }, { "epoch": 0.22731828096445203, "grad_norm": 0.2467186152935028, "learning_rate": 4.8444287800545676e-05, "loss": 1.1602, "step": 1446 }, { "epoch": 0.22747548586138458, "grad_norm": 0.20474953949451447, "learning_rate": 4.844214161729743e-05, "loss": 1.3125, "step": 1447 }, { "epoch": 0.22763269075831713, "grad_norm": 0.1738600730895996, "learning_rate": 4.843999400229238e-05, "loss": 1.275, "step": 1448 }, { "epoch": 0.22778989565524965, "grad_norm": 0.23264817893505096, "learning_rate": 4.843784495566166e-05, "loss": 1.2151, "step": 1449 }, { "epoch": 0.2279471005521822, "grad_norm": 0.2865363657474518, "learning_rate": 4.843569447753656e-05, "loss": 1.2071, "step": 1450 }, { "epoch": 0.22810430544911475, "grad_norm": 0.2803146243095398, "learning_rate": 4.84335425680484e-05, "loss": 1.207, "step": 1451 }, { "epoch": 0.22826151034604727, "grad_norm": 0.22393058240413666, "learning_rate": 4.843138922732863e-05, "loss": 1.1584, "step": 1452 }, { "epoch": 0.22841871524297983, "grad_norm": 0.22326746582984924, "learning_rate": 4.8429234455508746e-05, "loss": 1.2231, "step": 1453 }, { "epoch": 0.22857592013991235, "grad_norm": 0.21326550841331482, "learning_rate": 4.8427078252720366e-05, "loss": 1.138, "step": 1454 }, { "epoch": 0.2287331250368449, "grad_norm": 0.19931691884994507, "learning_rate": 4.842492061909518e-05, "loss": 1.2628, "step": 1455 }, { "epoch": 0.22889032993377745, "grad_norm": 0.4217057228088379, "learning_rate": 4.8422761554764974e-05, "loss": 1.0622, "step": 1456 }, { "epoch": 0.22904753483070997, "grad_norm": 0.2697439193725586, "learning_rate": 4.8420601059861605e-05, "loss": 1.1374, "step": 1457 }, { "epoch": 0.22920473972764252, "grad_norm": 0.25051379203796387, "learning_rate": 4.841843913451703e-05, "loss": 1.1349, "step": 1458 }, { "epoch": 0.22936194462457504, "grad_norm": 0.30390244722366333, "learning_rate": 4.841627577886329e-05, "loss": 1.1929, "step": 1459 }, { "epoch": 0.2295191495215076, "grad_norm": 0.2657618820667267, "learning_rate": 4.8414110993032535e-05, "loss": 1.1913, "step": 1460 }, { "epoch": 0.22967635441844014, "grad_norm": 0.27086779475212097, "learning_rate": 4.841194477715696e-05, "loss": 1.2208, "step": 1461 }, { "epoch": 0.22983355931537267, "grad_norm": 0.2512415647506714, "learning_rate": 4.840977713136887e-05, "loss": 1.1451, "step": 1462 }, { "epoch": 0.22999076421230522, "grad_norm": 0.2690669298171997, "learning_rate": 4.8407608055800656e-05, "loss": 1.2148, "step": 1463 }, { "epoch": 0.23014796910923777, "grad_norm": 0.2143964320421219, "learning_rate": 4.8405437550584816e-05, "loss": 1.2793, "step": 1464 }, { "epoch": 0.2303051740061703, "grad_norm": 0.21806931495666504, "learning_rate": 4.8403265615853894e-05, "loss": 1.1906, "step": 1465 }, { "epoch": 0.23046237890310284, "grad_norm": 0.23820914328098297, "learning_rate": 4.8401092251740555e-05, "loss": 1.3061, "step": 1466 }, { "epoch": 0.23061958380003536, "grad_norm": 0.2485547661781311, "learning_rate": 4.839891745837753e-05, "loss": 1.2976, "step": 1467 }, { "epoch": 0.2307767886969679, "grad_norm": 0.2677574157714844, "learning_rate": 4.8396741235897655e-05, "loss": 1.0837, "step": 1468 }, { "epoch": 0.23093399359390046, "grad_norm": 0.21473652124404907, "learning_rate": 4.839456358443385e-05, "loss": 1.1982, "step": 1469 }, { "epoch": 0.23109119849083298, "grad_norm": 0.27341189980506897, "learning_rate": 4.8392384504119116e-05, "loss": 1.1508, "step": 1470 }, { "epoch": 0.23124840338776553, "grad_norm": 0.20952042937278748, "learning_rate": 4.8390203995086525e-05, "loss": 1.2806, "step": 1471 }, { "epoch": 0.23140560828469806, "grad_norm": 0.19671006500720978, "learning_rate": 4.838802205746927e-05, "loss": 1.1397, "step": 1472 }, { "epoch": 0.2315628131816306, "grad_norm": 0.2122461348772049, "learning_rate": 4.838583869140063e-05, "loss": 1.2206, "step": 1473 }, { "epoch": 0.23172001807856316, "grad_norm": 0.2837271988391876, "learning_rate": 4.838365389701392e-05, "loss": 1.1932, "step": 1474 }, { "epoch": 0.23187722297549568, "grad_norm": 0.20707464218139648, "learning_rate": 4.838146767444261e-05, "loss": 1.1619, "step": 1475 }, { "epoch": 0.23203442787242823, "grad_norm": 0.2600030303001404, "learning_rate": 4.837928002382021e-05, "loss": 1.1239, "step": 1476 }, { "epoch": 0.23219163276936078, "grad_norm": 0.1838095337152481, "learning_rate": 4.837709094528035e-05, "loss": 1.2329, "step": 1477 }, { "epoch": 0.2323488376662933, "grad_norm": 0.21165348589420319, "learning_rate": 4.83749004389567e-05, "loss": 1.2103, "step": 1478 }, { "epoch": 0.23250604256322585, "grad_norm": 0.21954618394374847, "learning_rate": 4.837270850498308e-05, "loss": 1.1957, "step": 1479 }, { "epoch": 0.23266324746015837, "grad_norm": 0.1750314086675644, "learning_rate": 4.8370515143493346e-05, "loss": 1.2116, "step": 1480 }, { "epoch": 0.23282045235709092, "grad_norm": 0.24269573390483856, "learning_rate": 4.8368320354621474e-05, "loss": 1.1123, "step": 1481 }, { "epoch": 0.23297765725402347, "grad_norm": 0.2614949345588684, "learning_rate": 4.83661241385015e-05, "loss": 1.2208, "step": 1482 }, { "epoch": 0.233134862150956, "grad_norm": 0.252847284078598, "learning_rate": 4.836392649526756e-05, "loss": 1.1886, "step": 1483 }, { "epoch": 0.23329206704788855, "grad_norm": 0.26073166728019714, "learning_rate": 4.8361727425053895e-05, "loss": 1.1645, "step": 1484 }, { "epoch": 0.23344927194482107, "grad_norm": 0.33702126145362854, "learning_rate": 4.83595269279948e-05, "loss": 1.278, "step": 1485 }, { "epoch": 0.23360647684175362, "grad_norm": 0.23756952583789825, "learning_rate": 4.8357325004224675e-05, "loss": 1.2537, "step": 1486 }, { "epoch": 0.23376368173868617, "grad_norm": 0.2630425989627838, "learning_rate": 4.835512165387801e-05, "loss": 1.1478, "step": 1487 }, { "epoch": 0.2339208866356187, "grad_norm": 0.2218979150056839, "learning_rate": 4.835291687708937e-05, "loss": 1.1887, "step": 1488 }, { "epoch": 0.23407809153255124, "grad_norm": 0.23264755308628082, "learning_rate": 4.8350710673993425e-05, "loss": 1.1516, "step": 1489 }, { "epoch": 0.2342352964294838, "grad_norm": 0.2717369794845581, "learning_rate": 4.834850304472491e-05, "loss": 1.1529, "step": 1490 }, { "epoch": 0.2343925013264163, "grad_norm": 0.19794031977653503, "learning_rate": 4.8346293989418666e-05, "loss": 1.2004, "step": 1491 }, { "epoch": 0.23454970622334886, "grad_norm": 0.21382425725460052, "learning_rate": 4.8344083508209614e-05, "loss": 1.1859, "step": 1492 }, { "epoch": 0.2347069111202814, "grad_norm": 0.3373514413833618, "learning_rate": 4.834187160123276e-05, "loss": 1.1786, "step": 1493 }, { "epoch": 0.23486411601721394, "grad_norm": 0.2665463089942932, "learning_rate": 4.83396582686232e-05, "loss": 1.2605, "step": 1494 }, { "epoch": 0.2350213209141465, "grad_norm": 0.2265530377626419, "learning_rate": 4.833744351051611e-05, "loss": 1.2043, "step": 1495 }, { "epoch": 0.235178525811079, "grad_norm": 0.23344869911670685, "learning_rate": 4.833522732704677e-05, "loss": 1.2518, "step": 1496 }, { "epoch": 0.23533573070801156, "grad_norm": 0.2448417842388153, "learning_rate": 4.833300971835053e-05, "loss": 1.2219, "step": 1497 }, { "epoch": 0.23549293560494408, "grad_norm": 0.21578192710876465, "learning_rate": 4.8330790684562827e-05, "loss": 1.1285, "step": 1498 }, { "epoch": 0.23565014050187663, "grad_norm": 0.25885578989982605, "learning_rate": 4.8328570225819195e-05, "loss": 1.1466, "step": 1499 }, { "epoch": 0.23580734539880918, "grad_norm": 0.22060716152191162, "learning_rate": 4.832634834225526e-05, "loss": 1.126, "step": 1500 }, { "epoch": 0.2359645502957417, "grad_norm": 0.23449023067951202, "learning_rate": 4.8324125034006715e-05, "loss": 1.1628, "step": 1501 }, { "epoch": 0.23612175519267425, "grad_norm": 0.2863908112049103, "learning_rate": 4.832190030120936e-05, "loss": 1.1403, "step": 1502 }, { "epoch": 0.2362789600896068, "grad_norm": 0.25917142629623413, "learning_rate": 4.8319674143999063e-05, "loss": 1.1563, "step": 1503 }, { "epoch": 0.23643616498653933, "grad_norm": 0.2577166259288788, "learning_rate": 4.83174465625118e-05, "loss": 1.2036, "step": 1504 }, { "epoch": 0.23659336988347188, "grad_norm": 0.26499852538108826, "learning_rate": 4.831521755688361e-05, "loss": 1.2324, "step": 1505 }, { "epoch": 0.2367505747804044, "grad_norm": 0.32207560539245605, "learning_rate": 4.831298712725065e-05, "loss": 1.109, "step": 1506 }, { "epoch": 0.23690777967733695, "grad_norm": 0.2242748886346817, "learning_rate": 4.831075527374913e-05, "loss": 1.0943, "step": 1507 }, { "epoch": 0.2370649845742695, "grad_norm": 0.23681138455867767, "learning_rate": 4.830852199651537e-05, "loss": 1.2074, "step": 1508 }, { "epoch": 0.23722218947120202, "grad_norm": 0.23508530855178833, "learning_rate": 4.830628729568577e-05, "loss": 1.1592, "step": 1509 }, { "epoch": 0.23737939436813457, "grad_norm": 0.24309656023979187, "learning_rate": 4.8304051171396815e-05, "loss": 1.1721, "step": 1510 }, { "epoch": 0.2375365992650671, "grad_norm": 0.22660185396671295, "learning_rate": 4.830181362378509e-05, "loss": 1.2004, "step": 1511 }, { "epoch": 0.23769380416199964, "grad_norm": 0.19725121557712555, "learning_rate": 4.8299574652987236e-05, "loss": 1.142, "step": 1512 }, { "epoch": 0.2378510090589322, "grad_norm": 0.251643568277359, "learning_rate": 4.8297334259140015e-05, "loss": 1.24, "step": 1513 }, { "epoch": 0.23800821395586472, "grad_norm": 0.21497240662574768, "learning_rate": 4.829509244238026e-05, "loss": 1.0919, "step": 1514 }, { "epoch": 0.23816541885279727, "grad_norm": 0.20723997056484222, "learning_rate": 4.829284920284488e-05, "loss": 1.2857, "step": 1515 }, { "epoch": 0.23832262374972982, "grad_norm": 0.1896023154258728, "learning_rate": 4.82906045406709e-05, "loss": 1.1827, "step": 1516 }, { "epoch": 0.23847982864666234, "grad_norm": 0.2979254126548767, "learning_rate": 4.828835845599542e-05, "loss": 1.1791, "step": 1517 }, { "epoch": 0.2386370335435949, "grad_norm": 0.22216519713401794, "learning_rate": 4.82861109489556e-05, "loss": 1.1883, "step": 1518 }, { "epoch": 0.2387942384405274, "grad_norm": 0.21972821652889252, "learning_rate": 4.828386201968873e-05, "loss": 1.1213, "step": 1519 }, { "epoch": 0.23895144333745996, "grad_norm": 0.22813616693019867, "learning_rate": 4.828161166833215e-05, "loss": 1.1776, "step": 1520 }, { "epoch": 0.2391086482343925, "grad_norm": 0.2273959070444107, "learning_rate": 4.827935989502331e-05, "loss": 1.0856, "step": 1521 }, { "epoch": 0.23926585313132503, "grad_norm": 0.219630166888237, "learning_rate": 4.827710669989974e-05, "loss": 1.1141, "step": 1522 }, { "epoch": 0.23942305802825758, "grad_norm": 0.278255432844162, "learning_rate": 4.8274852083099065e-05, "loss": 1.1454, "step": 1523 }, { "epoch": 0.2395802629251901, "grad_norm": 0.27673086524009705, "learning_rate": 4.8272596044758974e-05, "loss": 1.1366, "step": 1524 }, { "epoch": 0.23973746782212266, "grad_norm": 0.32165762782096863, "learning_rate": 4.827033858501726e-05, "loss": 1.2367, "step": 1525 }, { "epoch": 0.2398946727190552, "grad_norm": 0.20993465185165405, "learning_rate": 4.82680797040118e-05, "loss": 1.2368, "step": 1526 }, { "epoch": 0.24005187761598773, "grad_norm": 0.23632824420928955, "learning_rate": 4.8265819401880575e-05, "loss": 1.2395, "step": 1527 }, { "epoch": 0.24020908251292028, "grad_norm": 0.2364589422941208, "learning_rate": 4.826355767876161e-05, "loss": 1.1977, "step": 1528 }, { "epoch": 0.24036628740985283, "grad_norm": 0.2299450784921646, "learning_rate": 4.826129453479306e-05, "loss": 1.2261, "step": 1529 }, { "epoch": 0.24052349230678535, "grad_norm": 0.2673666477203369, "learning_rate": 4.825902997011314e-05, "loss": 1.2938, "step": 1530 }, { "epoch": 0.2406806972037179, "grad_norm": 0.19846846163272858, "learning_rate": 4.8256763984860164e-05, "loss": 1.1954, "step": 1531 }, { "epoch": 0.24083790210065043, "grad_norm": 0.23329487442970276, "learning_rate": 4.825449657917253e-05, "loss": 1.2483, "step": 1532 }, { "epoch": 0.24099510699758298, "grad_norm": 0.22006340324878693, "learning_rate": 4.825222775318872e-05, "loss": 1.14, "step": 1533 }, { "epoch": 0.24115231189451553, "grad_norm": 0.2518123984336853, "learning_rate": 4.8249957507047315e-05, "loss": 1.041, "step": 1534 }, { "epoch": 0.24130951679144805, "grad_norm": 0.26999431848526, "learning_rate": 4.824768584088696e-05, "loss": 1.1871, "step": 1535 }, { "epoch": 0.2414667216883806, "grad_norm": 0.26676446199417114, "learning_rate": 4.824541275484641e-05, "loss": 1.1527, "step": 1536 }, { "epoch": 0.24162392658531312, "grad_norm": 0.30466800928115845, "learning_rate": 4.824313824906449e-05, "loss": 1.0792, "step": 1537 }, { "epoch": 0.24178113148224567, "grad_norm": 0.23601631820201874, "learning_rate": 4.824086232368011e-05, "loss": 1.1099, "step": 1538 }, { "epoch": 0.24193833637917822, "grad_norm": 0.21866478025913239, "learning_rate": 4.82385849788323e-05, "loss": 1.2394, "step": 1539 }, { "epoch": 0.24209554127611074, "grad_norm": 0.2272699475288391, "learning_rate": 4.823630621466013e-05, "loss": 1.2503, "step": 1540 }, { "epoch": 0.2422527461730433, "grad_norm": 0.2118367850780487, "learning_rate": 4.823402603130279e-05, "loss": 1.2348, "step": 1541 }, { "epoch": 0.24240995106997584, "grad_norm": 0.2032533437013626, "learning_rate": 4.823174442889953e-05, "loss": 1.205, "step": 1542 }, { "epoch": 0.24256715596690837, "grad_norm": 0.22420114278793335, "learning_rate": 4.822946140758972e-05, "loss": 1.1653, "step": 1543 }, { "epoch": 0.24272436086384092, "grad_norm": 0.22077587246894836, "learning_rate": 4.8227176967512785e-05, "loss": 1.0768, "step": 1544 }, { "epoch": 0.24288156576077344, "grad_norm": 0.25682517886161804, "learning_rate": 4.8224891108808255e-05, "loss": 1.1728, "step": 1545 }, { "epoch": 0.243038770657706, "grad_norm": 0.28335657715797424, "learning_rate": 4.8222603831615744e-05, "loss": 1.1615, "step": 1546 }, { "epoch": 0.24319597555463854, "grad_norm": 0.2960416376590729, "learning_rate": 4.8220315136074946e-05, "loss": 1.1786, "step": 1547 }, { "epoch": 0.24335318045157106, "grad_norm": 0.24929380416870117, "learning_rate": 4.821802502232565e-05, "loss": 1.1626, "step": 1548 }, { "epoch": 0.2435103853485036, "grad_norm": 0.2701495587825775, "learning_rate": 4.821573349050772e-05, "loss": 1.1638, "step": 1549 }, { "epoch": 0.24366759024543613, "grad_norm": 0.21584048867225647, "learning_rate": 4.821344054076111e-05, "loss": 1.083, "step": 1550 }, { "epoch": 0.24382479514236868, "grad_norm": 0.20484817028045654, "learning_rate": 4.8211146173225884e-05, "loss": 1.1552, "step": 1551 }, { "epoch": 0.24398200003930123, "grad_norm": 0.2720612585544586, "learning_rate": 4.8208850388042166e-05, "loss": 1.2093, "step": 1552 }, { "epoch": 0.24413920493623376, "grad_norm": 0.2705625295639038, "learning_rate": 4.820655318535017e-05, "loss": 1.2524, "step": 1553 }, { "epoch": 0.2442964098331663, "grad_norm": 0.2290465533733368, "learning_rate": 4.820425456529019e-05, "loss": 1.2138, "step": 1554 }, { "epoch": 0.24445361473009886, "grad_norm": 0.20924489200115204, "learning_rate": 4.8201954528002634e-05, "loss": 1.215, "step": 1555 }, { "epoch": 0.24461081962703138, "grad_norm": 0.4101350009441376, "learning_rate": 4.819965307362797e-05, "loss": 1.1646, "step": 1556 }, { "epoch": 0.24476802452396393, "grad_norm": 0.3153195381164551, "learning_rate": 4.819735020230677e-05, "loss": 1.0581, "step": 1557 }, { "epoch": 0.24492522942089645, "grad_norm": 0.2894124686717987, "learning_rate": 4.819504591417967e-05, "loss": 1.1619, "step": 1558 }, { "epoch": 0.245082434317829, "grad_norm": 0.2255009263753891, "learning_rate": 4.8192740209387425e-05, "loss": 1.2306, "step": 1559 }, { "epoch": 0.24523963921476155, "grad_norm": 0.23586878180503845, "learning_rate": 4.819043308807085e-05, "loss": 1.0897, "step": 1560 }, { "epoch": 0.24539684411169407, "grad_norm": 0.19656193256378174, "learning_rate": 4.818812455037086e-05, "loss": 1.2547, "step": 1561 }, { "epoch": 0.24555404900862662, "grad_norm": 0.19186031818389893, "learning_rate": 4.818581459642844e-05, "loss": 1.2, "step": 1562 }, { "epoch": 0.24571125390555915, "grad_norm": 0.30889350175857544, "learning_rate": 4.8183503226384685e-05, "loss": 1.0726, "step": 1563 }, { "epoch": 0.2458684588024917, "grad_norm": 0.28955504298210144, "learning_rate": 4.8181190440380755e-05, "loss": 1.0853, "step": 1564 }, { "epoch": 0.24602566369942425, "grad_norm": 0.20687651634216309, "learning_rate": 4.817887623855792e-05, "loss": 1.1026, "step": 1565 }, { "epoch": 0.24618286859635677, "grad_norm": 0.20754511654376984, "learning_rate": 4.817656062105751e-05, "loss": 1.1793, "step": 1566 }, { "epoch": 0.24634007349328932, "grad_norm": 0.20881116390228271, "learning_rate": 4.817424358802096e-05, "loss": 1.1812, "step": 1567 }, { "epoch": 0.24649727839022187, "grad_norm": 0.22653476893901825, "learning_rate": 4.8171925139589777e-05, "loss": 1.1733, "step": 1568 }, { "epoch": 0.2466544832871544, "grad_norm": 0.2595761716365814, "learning_rate": 4.8169605275905574e-05, "loss": 1.1817, "step": 1569 }, { "epoch": 0.24681168818408694, "grad_norm": 0.265379935503006, "learning_rate": 4.8167283997110044e-05, "loss": 1.0631, "step": 1570 }, { "epoch": 0.24696889308101946, "grad_norm": 0.28662997484207153, "learning_rate": 4.816496130334494e-05, "loss": 1.171, "step": 1571 }, { "epoch": 0.24712609797795201, "grad_norm": 0.28659316897392273, "learning_rate": 4.8162637194752146e-05, "loss": 1.158, "step": 1572 }, { "epoch": 0.24728330287488456, "grad_norm": 0.25767526030540466, "learning_rate": 4.8160311671473596e-05, "loss": 1.1179, "step": 1573 }, { "epoch": 0.2474405077718171, "grad_norm": 0.2345789521932602, "learning_rate": 4.815798473365133e-05, "loss": 1.0722, "step": 1574 }, { "epoch": 0.24759771266874964, "grad_norm": 0.20178478956222534, "learning_rate": 4.8155656381427464e-05, "loss": 1.1837, "step": 1575 }, { "epoch": 0.24775491756568216, "grad_norm": 0.2254524976015091, "learning_rate": 4.815332661494421e-05, "loss": 1.0697, "step": 1576 }, { "epoch": 0.2479121224626147, "grad_norm": 0.24192413687705994, "learning_rate": 4.815099543434386e-05, "loss": 1.1321, "step": 1577 }, { "epoch": 0.24806932735954726, "grad_norm": 0.24133244156837463, "learning_rate": 4.814866283976879e-05, "loss": 0.9985, "step": 1578 }, { "epoch": 0.24822653225647978, "grad_norm": 0.25949928164482117, "learning_rate": 4.814632883136146e-05, "loss": 1.1197, "step": 1579 }, { "epoch": 0.24838373715341233, "grad_norm": 0.2583736479282379, "learning_rate": 4.8143993409264446e-05, "loss": 1.1219, "step": 1580 }, { "epoch": 0.24854094205034488, "grad_norm": 0.27031365036964417, "learning_rate": 4.814165657362037e-05, "loss": 1.1165, "step": 1581 }, { "epoch": 0.2486981469472774, "grad_norm": 0.21057547628879547, "learning_rate": 4.813931832457195e-05, "loss": 1.2199, "step": 1582 }, { "epoch": 0.24885535184420995, "grad_norm": 0.16742344200611115, "learning_rate": 4.813697866226201e-05, "loss": 1.1562, "step": 1583 }, { "epoch": 0.24901255674114248, "grad_norm": 0.17304782569408417, "learning_rate": 4.813463758683345e-05, "loss": 1.28, "step": 1584 }, { "epoch": 0.24916976163807503, "grad_norm": 0.22176629304885864, "learning_rate": 4.813229509842924e-05, "loss": 1.261, "step": 1585 }, { "epoch": 0.24932696653500758, "grad_norm": 0.2836274802684784, "learning_rate": 4.812995119719246e-05, "loss": 1.0815, "step": 1586 }, { "epoch": 0.2494841714319401, "grad_norm": 0.28422456979751587, "learning_rate": 4.812760588326627e-05, "loss": 1.2035, "step": 1587 }, { "epoch": 0.24964137632887265, "grad_norm": 0.21732592582702637, "learning_rate": 4.81252591567939e-05, "loss": 1.2436, "step": 1588 }, { "epoch": 0.24979858122580517, "grad_norm": 0.24617740511894226, "learning_rate": 4.8122911017918694e-05, "loss": 1.1604, "step": 1589 }, { "epoch": 0.24995578612273772, "grad_norm": 0.22099459171295166, "learning_rate": 4.8120561466784056e-05, "loss": 1.2597, "step": 1590 }, { "epoch": 0.25011299101967027, "grad_norm": 0.21290400624275208, "learning_rate": 4.811821050353349e-05, "loss": 1.2431, "step": 1591 }, { "epoch": 0.2502701959166028, "grad_norm": 0.32815465331077576, "learning_rate": 4.811585812831059e-05, "loss": 1.202, "step": 1592 }, { "epoch": 0.2504274008135353, "grad_norm": 0.19528530538082123, "learning_rate": 4.811350434125902e-05, "loss": 1.2669, "step": 1593 }, { "epoch": 0.25058460571046787, "grad_norm": 0.2987164855003357, "learning_rate": 4.8111149142522545e-05, "loss": 1.1263, "step": 1594 }, { "epoch": 0.2507418106074004, "grad_norm": 0.22502346336841583, "learning_rate": 4.810879253224502e-05, "loss": 1.0881, "step": 1595 }, { "epoch": 0.25089901550433297, "grad_norm": 0.18701261281967163, "learning_rate": 4.810643451057036e-05, "loss": 1.1612, "step": 1596 }, { "epoch": 0.2510562204012655, "grad_norm": 0.2719084322452545, "learning_rate": 4.81040750776426e-05, "loss": 1.0618, "step": 1597 }, { "epoch": 0.251213425298198, "grad_norm": 0.2659015655517578, "learning_rate": 4.8101714233605845e-05, "loss": 1.2364, "step": 1598 }, { "epoch": 0.25137063019513056, "grad_norm": 0.23821905255317688, "learning_rate": 4.809935197860427e-05, "loss": 1.2978, "step": 1599 }, { "epoch": 0.2515278350920631, "grad_norm": 0.2146797925233841, "learning_rate": 4.8096988312782174e-05, "loss": 1.2891, "step": 1600 }, { "epoch": 0.2515278350920631, "eval_loss": 1.1705546379089355, "eval_runtime": 2320.6543, "eval_samples_per_second": 3.989, "eval_steps_per_second": 1.995, "step": 1600 }, { "epoch": 0.25168503998899566, "grad_norm": 0.2162550389766693, "learning_rate": 4.80946232362839e-05, "loss": 1.1259, "step": 1601 }, { "epoch": 0.2518422448859282, "grad_norm": 0.24746862053871155, "learning_rate": 4.809225674925392e-05, "loss": 1.1532, "step": 1602 }, { "epoch": 0.25199944978286076, "grad_norm": 0.21859407424926758, "learning_rate": 4.808988885183675e-05, "loss": 1.2823, "step": 1603 }, { "epoch": 0.25215665467979326, "grad_norm": 0.24052634835243225, "learning_rate": 4.808751954417702e-05, "loss": 1.2075, "step": 1604 }, { "epoch": 0.2523138595767258, "grad_norm": 0.3079143464565277, "learning_rate": 4.808514882641944e-05, "loss": 1.2258, "step": 1605 }, { "epoch": 0.25247106447365836, "grad_norm": 0.2353007048368454, "learning_rate": 4.8082776698708805e-05, "loss": 1.0725, "step": 1606 }, { "epoch": 0.2526282693705909, "grad_norm": 0.24463996291160583, "learning_rate": 4.808040316118999e-05, "loss": 1.1763, "step": 1607 }, { "epoch": 0.25278547426752346, "grad_norm": 0.24785065650939941, "learning_rate": 4.807802821400796e-05, "loss": 1.146, "step": 1608 }, { "epoch": 0.25294267916445595, "grad_norm": 0.3086298704147339, "learning_rate": 4.8075651857307786e-05, "loss": 1.1113, "step": 1609 }, { "epoch": 0.2530998840613885, "grad_norm": 0.2168322056531906, "learning_rate": 4.807327409123459e-05, "loss": 1.2544, "step": 1610 }, { "epoch": 0.25325708895832105, "grad_norm": 0.23688393831253052, "learning_rate": 4.807089491593359e-05, "loss": 1.2082, "step": 1611 }, { "epoch": 0.2534142938552536, "grad_norm": 0.184744194149971, "learning_rate": 4.8068514331550116e-05, "loss": 1.2256, "step": 1612 }, { "epoch": 0.25357149875218615, "grad_norm": 0.26341819763183594, "learning_rate": 4.8066132338229564e-05, "loss": 1.0727, "step": 1613 }, { "epoch": 0.25372870364911865, "grad_norm": 0.24953097105026245, "learning_rate": 4.80637489361174e-05, "loss": 1.197, "step": 1614 }, { "epoch": 0.2538859085460512, "grad_norm": 0.26930662989616394, "learning_rate": 4.8061364125359204e-05, "loss": 1.2333, "step": 1615 }, { "epoch": 0.25404311344298375, "grad_norm": 0.22073645889759064, "learning_rate": 4.805897790610063e-05, "loss": 1.1749, "step": 1616 }, { "epoch": 0.2542003183399163, "grad_norm": 0.23651045560836792, "learning_rate": 4.805659027848742e-05, "loss": 1.2632, "step": 1617 }, { "epoch": 0.25435752323684885, "grad_norm": 0.2211553007364273, "learning_rate": 4.80542012426654e-05, "loss": 1.2297, "step": 1618 }, { "epoch": 0.25451472813378134, "grad_norm": 0.21516153216362, "learning_rate": 4.805181079878048e-05, "loss": 1.2396, "step": 1619 }, { "epoch": 0.2546719330307139, "grad_norm": 0.2111159861087799, "learning_rate": 4.804941894697867e-05, "loss": 1.0961, "step": 1620 }, { "epoch": 0.25482913792764644, "grad_norm": 0.2587776184082031, "learning_rate": 4.804702568740604e-05, "loss": 1.1243, "step": 1621 }, { "epoch": 0.254986342824579, "grad_norm": 0.21024088561534882, "learning_rate": 4.804463102020878e-05, "loss": 1.1276, "step": 1622 }, { "epoch": 0.25514354772151154, "grad_norm": 0.2880707383155823, "learning_rate": 4.8042234945533127e-05, "loss": 1.0841, "step": 1623 }, { "epoch": 0.25530075261844404, "grad_norm": 0.21504652500152588, "learning_rate": 4.803983746352544e-05, "loss": 1.0336, "step": 1624 }, { "epoch": 0.2554579575153766, "grad_norm": 0.2103573977947235, "learning_rate": 4.803743857433214e-05, "loss": 1.2497, "step": 1625 }, { "epoch": 0.25561516241230914, "grad_norm": 0.24323992431163788, "learning_rate": 4.803503827809974e-05, "loss": 1.1702, "step": 1626 }, { "epoch": 0.2557723673092417, "grad_norm": 0.1768578141927719, "learning_rate": 4.8032636574974845e-05, "loss": 1.2472, "step": 1627 }, { "epoch": 0.25592957220617424, "grad_norm": 0.20027288794517517, "learning_rate": 4.803023346510415e-05, "loss": 1.1757, "step": 1628 }, { "epoch": 0.2560867771031068, "grad_norm": 0.2526282072067261, "learning_rate": 4.8027828948634405e-05, "loss": 1.1338, "step": 1629 }, { "epoch": 0.2562439820000393, "grad_norm": 0.2610374689102173, "learning_rate": 4.802542302571249e-05, "loss": 1.0715, "step": 1630 }, { "epoch": 0.25640118689697183, "grad_norm": 0.2409505993127823, "learning_rate": 4.802301569648534e-05, "loss": 1.1915, "step": 1631 }, { "epoch": 0.2565583917939044, "grad_norm": 0.3539654314517975, "learning_rate": 4.8020606961099996e-05, "loss": 1.1752, "step": 1632 }, { "epoch": 0.25671559669083693, "grad_norm": 0.23121047019958496, "learning_rate": 4.801819681970357e-05, "loss": 1.1966, "step": 1633 }, { "epoch": 0.2568728015877695, "grad_norm": 0.2272186279296875, "learning_rate": 4.801578527244325e-05, "loss": 1.1131, "step": 1634 }, { "epoch": 0.257030006484702, "grad_norm": 0.183711439371109, "learning_rate": 4.801337231946633e-05, "loss": 1.1569, "step": 1635 }, { "epoch": 0.25718721138163453, "grad_norm": 0.22497425973415375, "learning_rate": 4.80109579609202e-05, "loss": 1.1491, "step": 1636 }, { "epoch": 0.2573444162785671, "grad_norm": 0.22534243762493134, "learning_rate": 4.80085421969523e-05, "loss": 1.1827, "step": 1637 }, { "epoch": 0.25750162117549963, "grad_norm": 0.14900386333465576, "learning_rate": 4.800612502771019e-05, "loss": 1.1905, "step": 1638 }, { "epoch": 0.2576588260724322, "grad_norm": 0.26234182715415955, "learning_rate": 4.80037064533415e-05, "loss": 1.1197, "step": 1639 }, { "epoch": 0.2578160309693647, "grad_norm": 0.23205281794071198, "learning_rate": 4.800128647399393e-05, "loss": 1.1325, "step": 1640 }, { "epoch": 0.2579732358662972, "grad_norm": 0.22982637584209442, "learning_rate": 4.799886508981531e-05, "loss": 1.1097, "step": 1641 }, { "epoch": 0.2581304407632298, "grad_norm": 0.2268412709236145, "learning_rate": 4.799644230095351e-05, "loss": 1.2452, "step": 1642 }, { "epoch": 0.2582876456601623, "grad_norm": 0.2071431428194046, "learning_rate": 4.799401810755651e-05, "loss": 1.2338, "step": 1643 }, { "epoch": 0.2584448505570949, "grad_norm": 0.21589568257331848, "learning_rate": 4.799159250977237e-05, "loss": 1.1177, "step": 1644 }, { "epoch": 0.25860205545402737, "grad_norm": 0.19412028789520264, "learning_rate": 4.798916550774924e-05, "loss": 1.113, "step": 1645 }, { "epoch": 0.2587592603509599, "grad_norm": 0.2528839707374573, "learning_rate": 4.798673710163535e-05, "loss": 1.2024, "step": 1646 }, { "epoch": 0.25891646524789247, "grad_norm": 0.21848973631858826, "learning_rate": 4.798430729157901e-05, "loss": 0.9633, "step": 1647 }, { "epoch": 0.259073670144825, "grad_norm": 0.2397562563419342, "learning_rate": 4.7981876077728625e-05, "loss": 1.14, "step": 1648 }, { "epoch": 0.25923087504175757, "grad_norm": 0.19531749188899994, "learning_rate": 4.7979443460232703e-05, "loss": 1.1785, "step": 1649 }, { "epoch": 0.25938807993869006, "grad_norm": 0.20693683624267578, "learning_rate": 4.79770094392398e-05, "loss": 1.2601, "step": 1650 }, { "epoch": 0.2595452848356226, "grad_norm": 0.26081717014312744, "learning_rate": 4.797457401489858e-05, "loss": 1.1489, "step": 1651 }, { "epoch": 0.25970248973255516, "grad_norm": 0.23810921609401703, "learning_rate": 4.7972137187357795e-05, "loss": 1.1896, "step": 1652 }, { "epoch": 0.2598596946294877, "grad_norm": 0.22295863926410675, "learning_rate": 4.796969895676627e-05, "loss": 1.1954, "step": 1653 }, { "epoch": 0.26001689952642026, "grad_norm": 0.26917287707328796, "learning_rate": 4.7967259323272935e-05, "loss": 1.2985, "step": 1654 }, { "epoch": 0.2601741044233528, "grad_norm": 0.20920400321483612, "learning_rate": 4.796481828702678e-05, "loss": 1.2217, "step": 1655 }, { "epoch": 0.2603313093202853, "grad_norm": 0.24075470864772797, "learning_rate": 4.79623758481769e-05, "loss": 1.2349, "step": 1656 }, { "epoch": 0.26048851421721786, "grad_norm": 0.2986784279346466, "learning_rate": 4.795993200687247e-05, "loss": 0.9704, "step": 1657 }, { "epoch": 0.2606457191141504, "grad_norm": 0.21063056588172913, "learning_rate": 4.795748676326275e-05, "loss": 1.1764, "step": 1658 }, { "epoch": 0.26080292401108296, "grad_norm": 0.17640748620033264, "learning_rate": 4.7955040117497084e-05, "loss": 1.1998, "step": 1659 }, { "epoch": 0.2609601289080155, "grad_norm": 0.3088374137878418, "learning_rate": 4.79525920697249e-05, "loss": 1.108, "step": 1660 }, { "epoch": 0.261117333804948, "grad_norm": 0.21907536685466766, "learning_rate": 4.795014262009573e-05, "loss": 1.2134, "step": 1661 }, { "epoch": 0.26127453870188055, "grad_norm": 0.25505852699279785, "learning_rate": 4.794769176875917e-05, "loss": 1.2556, "step": 1662 }, { "epoch": 0.2614317435988131, "grad_norm": 0.2881571650505066, "learning_rate": 4.79452395158649e-05, "loss": 1.1194, "step": 1663 }, { "epoch": 0.26158894849574565, "grad_norm": 0.24344374239444733, "learning_rate": 4.794278586156271e-05, "loss": 1.114, "step": 1664 }, { "epoch": 0.2617461533926782, "grad_norm": 0.2396748960018158, "learning_rate": 4.794033080600244e-05, "loss": 1.0962, "step": 1665 }, { "epoch": 0.2619033582896107, "grad_norm": 0.2769325375556946, "learning_rate": 4.7937874349334056e-05, "loss": 1.0822, "step": 1666 }, { "epoch": 0.26206056318654325, "grad_norm": 0.20145046710968018, "learning_rate": 4.793541649170757e-05, "loss": 1.1856, "step": 1667 }, { "epoch": 0.2622177680834758, "grad_norm": 0.2373715192079544, "learning_rate": 4.7932957233273123e-05, "loss": 1.195, "step": 1668 }, { "epoch": 0.26237497298040835, "grad_norm": 0.26741209626197815, "learning_rate": 4.7930496574180894e-05, "loss": 1.1812, "step": 1669 }, { "epoch": 0.2625321778773409, "grad_norm": 0.18911203742027283, "learning_rate": 4.7928034514581174e-05, "loss": 1.1561, "step": 1670 }, { "epoch": 0.2626893827742734, "grad_norm": 0.23078587651252747, "learning_rate": 4.792557105462434e-05, "loss": 1.1746, "step": 1671 }, { "epoch": 0.26284658767120594, "grad_norm": 0.25644704699516296, "learning_rate": 4.792310619446087e-05, "loss": 1.1917, "step": 1672 }, { "epoch": 0.2630037925681385, "grad_norm": 0.2350175380706787, "learning_rate": 4.7920639934241274e-05, "loss": 1.2823, "step": 1673 }, { "epoch": 0.26316099746507104, "grad_norm": 0.28728723526000977, "learning_rate": 4.79181722741162e-05, "loss": 1.2291, "step": 1674 }, { "epoch": 0.2633182023620036, "grad_norm": 0.28967636823654175, "learning_rate": 4.791570321423637e-05, "loss": 1.0443, "step": 1675 }, { "epoch": 0.2634754072589361, "grad_norm": 0.20903019607067108, "learning_rate": 4.791323275475257e-05, "loss": 1.2378, "step": 1676 }, { "epoch": 0.26363261215586864, "grad_norm": 0.25918152928352356, "learning_rate": 4.791076089581569e-05, "loss": 1.0944, "step": 1677 }, { "epoch": 0.2637898170528012, "grad_norm": 0.2330826371908188, "learning_rate": 4.790828763757671e-05, "loss": 1.2158, "step": 1678 }, { "epoch": 0.26394702194973374, "grad_norm": 0.22032979130744934, "learning_rate": 4.790581298018667e-05, "loss": 1.0987, "step": 1679 }, { "epoch": 0.2641042268466663, "grad_norm": 0.25354599952697754, "learning_rate": 4.7903336923796736e-05, "loss": 1.2191, "step": 1680 }, { "epoch": 0.26426143174359884, "grad_norm": 0.29382967948913574, "learning_rate": 4.7900859468558123e-05, "loss": 1.1906, "step": 1681 }, { "epoch": 0.26441863664053133, "grad_norm": 0.26418671011924744, "learning_rate": 4.7898380614622144e-05, "loss": 1.1357, "step": 1682 }, { "epoch": 0.2645758415374639, "grad_norm": 0.4071028232574463, "learning_rate": 4.78959003621402e-05, "loss": 1.1547, "step": 1683 }, { "epoch": 0.26473304643439644, "grad_norm": 0.20212817192077637, "learning_rate": 4.789341871126378e-05, "loss": 1.2021, "step": 1684 }, { "epoch": 0.264890251331329, "grad_norm": 0.22908912599086761, "learning_rate": 4.789093566214444e-05, "loss": 1.3014, "step": 1685 }, { "epoch": 0.26504745622826154, "grad_norm": 0.2181553691625595, "learning_rate": 4.788845121493385e-05, "loss": 1.2074, "step": 1686 }, { "epoch": 0.26520466112519403, "grad_norm": 0.2574876546859741, "learning_rate": 4.788596536978374e-05, "loss": 1.1311, "step": 1687 }, { "epoch": 0.2653618660221266, "grad_norm": 0.20918451249599457, "learning_rate": 4.7883478126845945e-05, "loss": 1.1652, "step": 1688 }, { "epoch": 0.26551907091905913, "grad_norm": 0.1706637293100357, "learning_rate": 4.7880989486272366e-05, "loss": 1.1742, "step": 1689 }, { "epoch": 0.2656762758159917, "grad_norm": 0.22445620596408844, "learning_rate": 4.787849944821501e-05, "loss": 1.1579, "step": 1690 }, { "epoch": 0.26583348071292423, "grad_norm": 0.25707024335861206, "learning_rate": 4.787600801282596e-05, "loss": 1.0905, "step": 1691 }, { "epoch": 0.2659906856098567, "grad_norm": 0.22234748303890228, "learning_rate": 4.787351518025737e-05, "loss": 1.1657, "step": 1692 }, { "epoch": 0.2661478905067893, "grad_norm": 0.26487070322036743, "learning_rate": 4.78710209506615e-05, "loss": 1.2234, "step": 1693 }, { "epoch": 0.2663050954037218, "grad_norm": 0.26550817489624023, "learning_rate": 4.786852532419069e-05, "loss": 1.1627, "step": 1694 }, { "epoch": 0.2664623003006544, "grad_norm": 0.24167504906654358, "learning_rate": 4.786602830099737e-05, "loss": 1.2216, "step": 1695 }, { "epoch": 0.2666195051975869, "grad_norm": 0.30660563707351685, "learning_rate": 4.786352988123403e-05, "loss": 1.1593, "step": 1696 }, { "epoch": 0.2667767100945194, "grad_norm": 0.222911536693573, "learning_rate": 4.786103006505328e-05, "loss": 1.2223, "step": 1697 }, { "epoch": 0.26693391499145197, "grad_norm": 0.19984647631645203, "learning_rate": 4.78585288526078e-05, "loss": 1.2453, "step": 1698 }, { "epoch": 0.2670911198883845, "grad_norm": 0.19021253287792206, "learning_rate": 4.785602624405034e-05, "loss": 1.2094, "step": 1699 }, { "epoch": 0.26724832478531707, "grad_norm": 0.24404233694076538, "learning_rate": 4.785352223953376e-05, "loss": 1.1247, "step": 1700 }, { "epoch": 0.2674055296822496, "grad_norm": 0.24491339921951294, "learning_rate": 4.7851016839210995e-05, "loss": 1.2667, "step": 1701 }, { "epoch": 0.2675627345791821, "grad_norm": 0.2175382375717163, "learning_rate": 4.7848510043235064e-05, "loss": 1.1664, "step": 1702 }, { "epoch": 0.26771993947611467, "grad_norm": 0.31710314750671387, "learning_rate": 4.784600185175907e-05, "loss": 1.1909, "step": 1703 }, { "epoch": 0.2678771443730472, "grad_norm": 0.21164441108703613, "learning_rate": 4.7843492264936214e-05, "loss": 1.1851, "step": 1704 }, { "epoch": 0.26803434926997977, "grad_norm": 0.18407614529132843, "learning_rate": 4.784098128291976e-05, "loss": 1.175, "step": 1705 }, { "epoch": 0.2681915541669123, "grad_norm": 0.2403915822505951, "learning_rate": 4.783846890586307e-05, "loss": 1.1775, "step": 1706 }, { "epoch": 0.26834875906384487, "grad_norm": 0.25552019476890564, "learning_rate": 4.78359551339196e-05, "loss": 1.1153, "step": 1707 }, { "epoch": 0.26850596396077736, "grad_norm": 0.24397552013397217, "learning_rate": 4.783343996724287e-05, "loss": 1.1795, "step": 1708 }, { "epoch": 0.2686631688577099, "grad_norm": 0.2242836058139801, "learning_rate": 4.78309234059865e-05, "loss": 1.1872, "step": 1709 }, { "epoch": 0.26882037375464246, "grad_norm": 0.2508406639099121, "learning_rate": 4.78284054503042e-05, "loss": 1.177, "step": 1710 }, { "epoch": 0.268977578651575, "grad_norm": 0.31523460149765015, "learning_rate": 4.7825886100349756e-05, "loss": 1.0777, "step": 1711 }, { "epoch": 0.26913478354850756, "grad_norm": 0.2677992582321167, "learning_rate": 4.782336535627703e-05, "loss": 1.135, "step": 1712 }, { "epoch": 0.26929198844544006, "grad_norm": 0.2804398536682129, "learning_rate": 4.782084321823998e-05, "loss": 1.2421, "step": 1713 }, { "epoch": 0.2694491933423726, "grad_norm": 0.2114623337984085, "learning_rate": 4.781831968639266e-05, "loss": 1.1308, "step": 1714 }, { "epoch": 0.26960639823930516, "grad_norm": 0.24072472751140594, "learning_rate": 4.7815794760889196e-05, "loss": 1.0372, "step": 1715 }, { "epoch": 0.2697636031362377, "grad_norm": 0.25040605664253235, "learning_rate": 4.7813268441883784e-05, "loss": 1.2141, "step": 1716 }, { "epoch": 0.26992080803317026, "grad_norm": 0.2768096625804901, "learning_rate": 4.781074072953074e-05, "loss": 1.1514, "step": 1717 }, { "epoch": 0.27007801293010275, "grad_norm": 0.21474453806877136, "learning_rate": 4.780821162398444e-05, "loss": 1.1375, "step": 1718 }, { "epoch": 0.2702352178270353, "grad_norm": 0.2671203911304474, "learning_rate": 4.780568112539936e-05, "loss": 1.1992, "step": 1719 }, { "epoch": 0.27039242272396785, "grad_norm": 0.19425953924655914, "learning_rate": 4.780314923393005e-05, "loss": 1.1796, "step": 1720 }, { "epoch": 0.2705496276209004, "grad_norm": 0.21683478355407715, "learning_rate": 4.780061594973114e-05, "loss": 1.102, "step": 1721 }, { "epoch": 0.27070683251783295, "grad_norm": 0.22656619548797607, "learning_rate": 4.779808127295735e-05, "loss": 1.2064, "step": 1722 }, { "epoch": 0.27086403741476545, "grad_norm": 0.27819374203681946, "learning_rate": 4.779554520376351e-05, "loss": 1.1881, "step": 1723 }, { "epoch": 0.271021242311698, "grad_norm": 0.2076413929462433, "learning_rate": 4.779300774230449e-05, "loss": 1.1586, "step": 1724 }, { "epoch": 0.27117844720863055, "grad_norm": 0.28731614351272583, "learning_rate": 4.779046888873529e-05, "loss": 0.9967, "step": 1725 }, { "epoch": 0.2713356521055631, "grad_norm": 0.23616041243076324, "learning_rate": 4.7787928643210955e-05, "loss": 1.2109, "step": 1726 }, { "epoch": 0.27149285700249565, "grad_norm": 0.3352977931499481, "learning_rate": 4.778538700588664e-05, "loss": 1.316, "step": 1727 }, { "epoch": 0.27165006189942814, "grad_norm": 0.275420218706131, "learning_rate": 4.778284397691758e-05, "loss": 1.2542, "step": 1728 }, { "epoch": 0.2718072667963607, "grad_norm": 0.20388716459274292, "learning_rate": 4.77802995564591e-05, "loss": 1.187, "step": 1729 }, { "epoch": 0.27196447169329324, "grad_norm": 0.24802348017692566, "learning_rate": 4.777775374466659e-05, "loss": 1.2225, "step": 1730 }, { "epoch": 0.2721216765902258, "grad_norm": 0.2412358671426773, "learning_rate": 4.777520654169554e-05, "loss": 1.1717, "step": 1731 }, { "epoch": 0.27227888148715834, "grad_norm": 0.24212747812271118, "learning_rate": 4.777265794770153e-05, "loss": 1.0965, "step": 1732 }, { "epoch": 0.2724360863840909, "grad_norm": 0.2116345465183258, "learning_rate": 4.7770107962840225e-05, "loss": 1.0881, "step": 1733 }, { "epoch": 0.2725932912810234, "grad_norm": 0.1802130490541458, "learning_rate": 4.7767556587267356e-05, "loss": 1.2671, "step": 1734 }, { "epoch": 0.27275049617795594, "grad_norm": 0.22544068098068237, "learning_rate": 4.776500382113875e-05, "loss": 1.1557, "step": 1735 }, { "epoch": 0.2729077010748885, "grad_norm": 0.22627638280391693, "learning_rate": 4.776244966461034e-05, "loss": 1.0589, "step": 1736 }, { "epoch": 0.27306490597182104, "grad_norm": 0.24201518297195435, "learning_rate": 4.77598941178381e-05, "loss": 1.217, "step": 1737 }, { "epoch": 0.2732221108687536, "grad_norm": 0.2708778381347656, "learning_rate": 4.775733718097812e-05, "loss": 1.1762, "step": 1738 }, { "epoch": 0.2733793157656861, "grad_norm": 0.22093307971954346, "learning_rate": 4.775477885418658e-05, "loss": 1.1599, "step": 1739 }, { "epoch": 0.27353652066261863, "grad_norm": 0.2544403672218323, "learning_rate": 4.775221913761971e-05, "loss": 1.1177, "step": 1740 }, { "epoch": 0.2736937255595512, "grad_norm": 0.2178761512041092, "learning_rate": 4.7749658031433873e-05, "loss": 1.1493, "step": 1741 }, { "epoch": 0.27385093045648373, "grad_norm": 0.2969399392604828, "learning_rate": 4.774709553578548e-05, "loss": 1.1048, "step": 1742 }, { "epoch": 0.2740081353534163, "grad_norm": 0.18794193863868713, "learning_rate": 4.7744531650831034e-05, "loss": 1.2954, "step": 1743 }, { "epoch": 0.2741653402503488, "grad_norm": 0.19405794143676758, "learning_rate": 4.774196637672714e-05, "loss": 1.1211, "step": 1744 }, { "epoch": 0.2743225451472813, "grad_norm": 0.24118031561374664, "learning_rate": 4.773939971363046e-05, "loss": 1.0933, "step": 1745 }, { "epoch": 0.2744797500442139, "grad_norm": 0.24610240757465363, "learning_rate": 4.7736831661697766e-05, "loss": 1.3337, "step": 1746 }, { "epoch": 0.2746369549411464, "grad_norm": 0.1838586926460266, "learning_rate": 4.77342622210859e-05, "loss": 1.1453, "step": 1747 }, { "epoch": 0.274794159838079, "grad_norm": 0.26292094588279724, "learning_rate": 4.77316913919518e-05, "loss": 1.1206, "step": 1748 }, { "epoch": 0.27495136473501147, "grad_norm": 0.15059764683246613, "learning_rate": 4.7729119174452475e-05, "loss": 1.2451, "step": 1749 }, { "epoch": 0.275108569631944, "grad_norm": 0.2880924940109253, "learning_rate": 4.772654556874503e-05, "loss": 1.1765, "step": 1750 }, { "epoch": 0.2752657745288766, "grad_norm": 0.20336076617240906, "learning_rate": 4.7723970574986656e-05, "loss": 1.1369, "step": 1751 }, { "epoch": 0.2754229794258091, "grad_norm": 0.24111486971378326, "learning_rate": 4.77213941933346e-05, "loss": 1.0741, "step": 1752 }, { "epoch": 0.2755801843227417, "grad_norm": 0.23586514592170715, "learning_rate": 4.7718816423946256e-05, "loss": 1.214, "step": 1753 }, { "epoch": 0.27573738921967417, "grad_norm": 0.2305273711681366, "learning_rate": 4.7716237266979036e-05, "loss": 1.1988, "step": 1754 }, { "epoch": 0.2758945941166067, "grad_norm": 0.18287290632724762, "learning_rate": 4.7713656722590475e-05, "loss": 1.1478, "step": 1755 }, { "epoch": 0.27605179901353927, "grad_norm": 0.28260156512260437, "learning_rate": 4.7711074790938184e-05, "loss": 1.0779, "step": 1756 }, { "epoch": 0.2762090039104718, "grad_norm": 0.2582015097141266, "learning_rate": 4.770849147217985e-05, "loss": 1.1002, "step": 1757 }, { "epoch": 0.27636620880740437, "grad_norm": 0.28041109442710876, "learning_rate": 4.770590676647326e-05, "loss": 1.0999, "step": 1758 }, { "epoch": 0.2765234137043369, "grad_norm": 0.23540420830249786, "learning_rate": 4.770332067397627e-05, "loss": 1.1478, "step": 1759 }, { "epoch": 0.2766806186012694, "grad_norm": 0.29831403493881226, "learning_rate": 4.770073319484684e-05, "loss": 1.0932, "step": 1760 }, { "epoch": 0.2766806186012694, "eval_loss": 1.1590473651885986, "eval_runtime": 2321.8933, "eval_samples_per_second": 3.987, "eval_steps_per_second": 1.994, "step": 1760 }, { "epoch": 0.27683782349820196, "grad_norm": 0.21759991347789764, "learning_rate": 4.769814432924299e-05, "loss": 1.166, "step": 1761 }, { "epoch": 0.2769950283951345, "grad_norm": 0.2622128129005432, "learning_rate": 4.7695554077322845e-05, "loss": 1.1155, "step": 1762 }, { "epoch": 0.27715223329206706, "grad_norm": 0.24021178483963013, "learning_rate": 4.769296243924462e-05, "loss": 1.1922, "step": 1763 }, { "epoch": 0.2773094381889996, "grad_norm": 0.23425696790218353, "learning_rate": 4.769036941516658e-05, "loss": 1.0903, "step": 1764 }, { "epoch": 0.2774666430859321, "grad_norm": 0.17257444560527802, "learning_rate": 4.768777500524711e-05, "loss": 1.1319, "step": 1765 }, { "epoch": 0.27762384798286466, "grad_norm": 0.2458474487066269, "learning_rate": 4.7685179209644664e-05, "loss": 1.2144, "step": 1766 }, { "epoch": 0.2777810528797972, "grad_norm": 0.18771199882030487, "learning_rate": 4.7682582028517784e-05, "loss": 1.1654, "step": 1767 }, { "epoch": 0.27793825777672976, "grad_norm": 0.2979085445404053, "learning_rate": 4.767998346202509e-05, "loss": 1.1537, "step": 1768 }, { "epoch": 0.2780954626736623, "grad_norm": 0.22919511795043945, "learning_rate": 4.767738351032531e-05, "loss": 1.15, "step": 1769 }, { "epoch": 0.2782526675705948, "grad_norm": 0.24717088043689728, "learning_rate": 4.7674782173577214e-05, "loss": 1.1205, "step": 1770 }, { "epoch": 0.27840987246752735, "grad_norm": 0.21218983829021454, "learning_rate": 4.7672179451939704e-05, "loss": 1.2007, "step": 1771 }, { "epoch": 0.2785670773644599, "grad_norm": 0.16957569122314453, "learning_rate": 4.766957534557173e-05, "loss": 1.183, "step": 1772 }, { "epoch": 0.27872428226139245, "grad_norm": 0.17404644191265106, "learning_rate": 4.766696985463235e-05, "loss": 1.0727, "step": 1773 }, { "epoch": 0.278881487158325, "grad_norm": 0.22840437293052673, "learning_rate": 4.766436297928068e-05, "loss": 1.0331, "step": 1774 }, { "epoch": 0.2790386920552575, "grad_norm": 0.18459245562553406, "learning_rate": 4.766175471967597e-05, "loss": 1.1837, "step": 1775 }, { "epoch": 0.27919589695219005, "grad_norm": 0.1901281177997589, "learning_rate": 4.7659145075977496e-05, "loss": 1.2815, "step": 1776 }, { "epoch": 0.2793531018491226, "grad_norm": 0.20891423523426056, "learning_rate": 4.7656534048344656e-05, "loss": 1.1554, "step": 1777 }, { "epoch": 0.27951030674605515, "grad_norm": 0.2291899174451828, "learning_rate": 4.765392163693691e-05, "loss": 1.137, "step": 1778 }, { "epoch": 0.2796675116429877, "grad_norm": 0.21621862053871155, "learning_rate": 4.765130784191384e-05, "loss": 1.1818, "step": 1779 }, { "epoch": 0.2798247165399202, "grad_norm": 0.24925227463245392, "learning_rate": 4.7648692663435054e-05, "loss": 1.1611, "step": 1780 }, { "epoch": 0.27998192143685274, "grad_norm": 0.26903223991394043, "learning_rate": 4.76460761016603e-05, "loss": 1.1493, "step": 1781 }, { "epoch": 0.2801391263337853, "grad_norm": 0.17821714282035828, "learning_rate": 4.764345815674937e-05, "loss": 1.1292, "step": 1782 }, { "epoch": 0.28029633123071784, "grad_norm": 0.19402022659778595, "learning_rate": 4.764083882886218e-05, "loss": 1.1462, "step": 1783 }, { "epoch": 0.2804535361276504, "grad_norm": 0.18578824400901794, "learning_rate": 4.7638218118158694e-05, "loss": 1.1745, "step": 1784 }, { "epoch": 0.2806107410245829, "grad_norm": 0.2654874920845032, "learning_rate": 4.763559602479898e-05, "loss": 1.1627, "step": 1785 }, { "epoch": 0.28076794592151544, "grad_norm": 0.3167024850845337, "learning_rate": 4.763297254894318e-05, "loss": 1.0536, "step": 1786 }, { "epoch": 0.280925150818448, "grad_norm": 0.21574537456035614, "learning_rate": 4.763034769075153e-05, "loss": 1.1318, "step": 1787 }, { "epoch": 0.28108235571538054, "grad_norm": 0.2255323827266693, "learning_rate": 4.7627721450384354e-05, "loss": 1.299, "step": 1788 }, { "epoch": 0.2812395606123131, "grad_norm": 0.23979492485523224, "learning_rate": 4.7625093828002035e-05, "loss": 1.2133, "step": 1789 }, { "epoch": 0.28139676550924564, "grad_norm": 0.26039859652519226, "learning_rate": 4.762246482376507e-05, "loss": 1.1472, "step": 1790 }, { "epoch": 0.28155397040617813, "grad_norm": 0.2618144452571869, "learning_rate": 4.761983443783403e-05, "loss": 1.1851, "step": 1791 }, { "epoch": 0.2817111753031107, "grad_norm": 0.2428680956363678, "learning_rate": 4.7617202670369556e-05, "loss": 1.1583, "step": 1792 }, { "epoch": 0.28186838020004323, "grad_norm": 0.280111700296402, "learning_rate": 4.76145695215324e-05, "loss": 1.2112, "step": 1793 }, { "epoch": 0.2820255850969758, "grad_norm": 0.24310675263404846, "learning_rate": 4.761193499148339e-05, "loss": 1.1552, "step": 1794 }, { "epoch": 0.28218278999390833, "grad_norm": 0.28014469146728516, "learning_rate": 4.7609299080383415e-05, "loss": 1.2571, "step": 1795 }, { "epoch": 0.28233999489084083, "grad_norm": 0.3029899001121521, "learning_rate": 4.760666178839347e-05, "loss": 1.1485, "step": 1796 }, { "epoch": 0.2824971997877734, "grad_norm": 0.34275612235069275, "learning_rate": 4.7604023115674644e-05, "loss": 1.101, "step": 1797 }, { "epoch": 0.28265440468470593, "grad_norm": 0.20968905091285706, "learning_rate": 4.760138306238809e-05, "loss": 1.0953, "step": 1798 }, { "epoch": 0.2828116095816385, "grad_norm": 0.19242046773433685, "learning_rate": 4.759874162869505e-05, "loss": 1.1754, "step": 1799 }, { "epoch": 0.28296881447857103, "grad_norm": 0.31839972734451294, "learning_rate": 4.759609881475685e-05, "loss": 1.0707, "step": 1800 }, { "epoch": 0.2831260193755035, "grad_norm": 0.3110107183456421, "learning_rate": 4.7593454620734914e-05, "loss": 1.1576, "step": 1801 }, { "epoch": 0.2832832242724361, "grad_norm": 0.2405199557542801, "learning_rate": 4.759080904679072e-05, "loss": 1.179, "step": 1802 }, { "epoch": 0.2834404291693686, "grad_norm": 0.21492940187454224, "learning_rate": 4.758816209308587e-05, "loss": 1.1806, "step": 1803 }, { "epoch": 0.2835976340663012, "grad_norm": 0.2658292055130005, "learning_rate": 4.758551375978202e-05, "loss": 1.0843, "step": 1804 }, { "epoch": 0.2837548389632337, "grad_norm": 0.2901574969291687, "learning_rate": 4.758286404704092e-05, "loss": 1.1257, "step": 1805 }, { "epoch": 0.2839120438601662, "grad_norm": 0.34066644310951233, "learning_rate": 4.758021295502441e-05, "loss": 1.1073, "step": 1806 }, { "epoch": 0.28406924875709877, "grad_norm": 0.2622152268886566, "learning_rate": 4.7577560483894406e-05, "loss": 1.1046, "step": 1807 }, { "epoch": 0.2842264536540313, "grad_norm": 0.1930551379919052, "learning_rate": 4.757490663381291e-05, "loss": 1.2328, "step": 1808 }, { "epoch": 0.28438365855096387, "grad_norm": 0.30099764466285706, "learning_rate": 4.7572251404942e-05, "loss": 1.1743, "step": 1809 }, { "epoch": 0.2845408634478964, "grad_norm": 0.24297019839286804, "learning_rate": 4.756959479744386e-05, "loss": 1.1596, "step": 1810 }, { "epoch": 0.2846980683448289, "grad_norm": 0.2792363464832306, "learning_rate": 4.7566936811480744e-05, "loss": 1.0061, "step": 1811 }, { "epoch": 0.28485527324176146, "grad_norm": 0.2141488492488861, "learning_rate": 4.756427744721499e-05, "loss": 1.1381, "step": 1812 }, { "epoch": 0.285012478138694, "grad_norm": 0.23667530715465546, "learning_rate": 4.756161670480902e-05, "loss": 1.1434, "step": 1813 }, { "epoch": 0.28516968303562656, "grad_norm": 0.21142233908176422, "learning_rate": 4.755895458442534e-05, "loss": 1.1959, "step": 1814 }, { "epoch": 0.2853268879325591, "grad_norm": 0.22952231764793396, "learning_rate": 4.755629108622655e-05, "loss": 1.132, "step": 1815 }, { "epoch": 0.28548409282949166, "grad_norm": 0.2787313759326935, "learning_rate": 4.7553626210375326e-05, "loss": 0.9319, "step": 1816 }, { "epoch": 0.28564129772642416, "grad_norm": 0.1895769089460373, "learning_rate": 4.755095995703441e-05, "loss": 1.2251, "step": 1817 }, { "epoch": 0.2857985026233567, "grad_norm": 0.2436160445213318, "learning_rate": 4.754829232636667e-05, "loss": 1.14, "step": 1818 }, { "epoch": 0.28595570752028926, "grad_norm": 0.3004104793071747, "learning_rate": 4.7545623318535024e-05, "loss": 1.1939, "step": 1819 }, { "epoch": 0.2861129124172218, "grad_norm": 0.30581870675086975, "learning_rate": 4.754295293370248e-05, "loss": 1.1904, "step": 1820 }, { "epoch": 0.28627011731415436, "grad_norm": 0.3518725335597992, "learning_rate": 4.754028117203215e-05, "loss": 1.1693, "step": 1821 }, { "epoch": 0.28642732221108685, "grad_norm": 0.2542705237865448, "learning_rate": 4.7537608033687204e-05, "loss": 1.0971, "step": 1822 }, { "epoch": 0.2865845271080194, "grad_norm": 0.19205254316329956, "learning_rate": 4.7534933518830904e-05, "loss": 1.0652, "step": 1823 }, { "epoch": 0.28674173200495195, "grad_norm": 0.2477174997329712, "learning_rate": 4.753225762762661e-05, "loss": 1.2265, "step": 1824 }, { "epoch": 0.2868989369018845, "grad_norm": 0.19431249797344208, "learning_rate": 4.7529580360237744e-05, "loss": 1.1722, "step": 1825 }, { "epoch": 0.28705614179881705, "grad_norm": 0.24031542241573334, "learning_rate": 4.7526901716827846e-05, "loss": 1.1593, "step": 1826 }, { "epoch": 0.28721334669574955, "grad_norm": 0.3656073808670044, "learning_rate": 4.752422169756048e-05, "loss": 1.1655, "step": 1827 }, { "epoch": 0.2873705515926821, "grad_norm": 0.24645781517028809, "learning_rate": 4.752154030259936e-05, "loss": 1.0771, "step": 1828 }, { "epoch": 0.28752775648961465, "grad_norm": 0.22396346926689148, "learning_rate": 4.7518857532108245e-05, "loss": 1.2857, "step": 1829 }, { "epoch": 0.2876849613865472, "grad_norm": 0.37394481897354126, "learning_rate": 4.751617338625099e-05, "loss": 1.1057, "step": 1830 }, { "epoch": 0.28784216628347975, "grad_norm": 0.260421484708786, "learning_rate": 4.751348786519154e-05, "loss": 1.1764, "step": 1831 }, { "epoch": 0.28799937118041224, "grad_norm": 0.23633582890033722, "learning_rate": 4.7510800969093904e-05, "loss": 1.2257, "step": 1832 }, { "epoch": 0.2881565760773448, "grad_norm": 0.2963007092475891, "learning_rate": 4.750811269812219e-05, "loss": 1.1226, "step": 1833 }, { "epoch": 0.28831378097427734, "grad_norm": 0.2502879798412323, "learning_rate": 4.75054230524406e-05, "loss": 1.1942, "step": 1834 }, { "epoch": 0.2884709858712099, "grad_norm": 0.20743811130523682, "learning_rate": 4.750273203221339e-05, "loss": 1.2321, "step": 1835 }, { "epoch": 0.28862819076814245, "grad_norm": 0.3560789227485657, "learning_rate": 4.750003963760493e-05, "loss": 1.1188, "step": 1836 }, { "epoch": 0.28878539566507494, "grad_norm": 0.3189667761325836, "learning_rate": 4.7497345868779644e-05, "loss": 1.3159, "step": 1837 }, { "epoch": 0.2889426005620075, "grad_norm": 0.22662386298179626, "learning_rate": 4.749465072590208e-05, "loss": 1.055, "step": 1838 }, { "epoch": 0.28909980545894004, "grad_norm": 0.18307331204414368, "learning_rate": 4.749195420913683e-05, "loss": 1.3076, "step": 1839 }, { "epoch": 0.2892570103558726, "grad_norm": 0.25430959463119507, "learning_rate": 4.74892563186486e-05, "loss": 1.2806, "step": 1840 }, { "epoch": 0.28941421525280514, "grad_norm": 0.21403716504573822, "learning_rate": 4.748655705460215e-05, "loss": 1.1572, "step": 1841 }, { "epoch": 0.2895714201497377, "grad_norm": 0.1849280297756195, "learning_rate": 4.7483856417162365e-05, "loss": 1.1613, "step": 1842 }, { "epoch": 0.2897286250466702, "grad_norm": 0.21544575691223145, "learning_rate": 4.7481154406494164e-05, "loss": 1.1786, "step": 1843 }, { "epoch": 0.28988582994360274, "grad_norm": 0.2631804943084717, "learning_rate": 4.7478451022762596e-05, "loss": 1.1764, "step": 1844 }, { "epoch": 0.2900430348405353, "grad_norm": 0.21197448670864105, "learning_rate": 4.747574626613276e-05, "loss": 1.1235, "step": 1845 }, { "epoch": 0.29020023973746784, "grad_norm": 0.26589202880859375, "learning_rate": 4.7473040136769855e-05, "loss": 1.2104, "step": 1846 }, { "epoch": 0.2903574446344004, "grad_norm": 0.3811849057674408, "learning_rate": 4.7470332634839165e-05, "loss": 1.1747, "step": 1847 }, { "epoch": 0.2905146495313329, "grad_norm": 0.23455043137073517, "learning_rate": 4.7467623760506054e-05, "loss": 1.1475, "step": 1848 }, { "epoch": 0.29067185442826543, "grad_norm": 0.2337242066860199, "learning_rate": 4.746491351393596e-05, "loss": 1.1085, "step": 1849 }, { "epoch": 0.290829059325198, "grad_norm": 0.30514079332351685, "learning_rate": 4.746220189529442e-05, "loss": 1.235, "step": 1850 }, { "epoch": 0.29098626422213053, "grad_norm": 0.20872575044631958, "learning_rate": 4.7459488904747064e-05, "loss": 1.1893, "step": 1851 }, { "epoch": 0.2911434691190631, "grad_norm": 0.23578737676143646, "learning_rate": 4.745677454245957e-05, "loss": 1.1986, "step": 1852 }, { "epoch": 0.2913006740159956, "grad_norm": 0.26326242089271545, "learning_rate": 4.745405880859773e-05, "loss": 1.213, "step": 1853 }, { "epoch": 0.2914578789129281, "grad_norm": 0.23654600977897644, "learning_rate": 4.7451341703327414e-05, "loss": 1.19, "step": 1854 }, { "epoch": 0.2916150838098607, "grad_norm": 0.2641269564628601, "learning_rate": 4.744862322681457e-05, "loss": 1.1428, "step": 1855 }, { "epoch": 0.2917722887067932, "grad_norm": 0.24482795596122742, "learning_rate": 4.744590337922522e-05, "loss": 1.1508, "step": 1856 }, { "epoch": 0.2919294936037258, "grad_norm": 0.28008776903152466, "learning_rate": 4.744318216072551e-05, "loss": 1.3053, "step": 1857 }, { "epoch": 0.29208669850065827, "grad_norm": 0.23641882836818695, "learning_rate": 4.744045957148161e-05, "loss": 1.0879, "step": 1858 }, { "epoch": 0.2922439033975908, "grad_norm": 0.25675082206726074, "learning_rate": 4.743773561165982e-05, "loss": 1.2489, "step": 1859 }, { "epoch": 0.29240110829452337, "grad_norm": 0.25265899300575256, "learning_rate": 4.743501028142652e-05, "loss": 1.1042, "step": 1860 }, { "epoch": 0.2925583131914559, "grad_norm": 0.2375422716140747, "learning_rate": 4.743228358094814e-05, "loss": 1.2003, "step": 1861 }, { "epoch": 0.29271551808838847, "grad_norm": 0.24216416478157043, "learning_rate": 4.742955551039123e-05, "loss": 1.1058, "step": 1862 }, { "epoch": 0.29287272298532097, "grad_norm": 0.3152044713497162, "learning_rate": 4.7426826069922416e-05, "loss": 1.2503, "step": 1863 }, { "epoch": 0.2930299278822535, "grad_norm": 0.19799089431762695, "learning_rate": 4.7424095259708384e-05, "loss": 1.0635, "step": 1864 }, { "epoch": 0.29318713277918607, "grad_norm": 0.24691304564476013, "learning_rate": 4.742136307991594e-05, "loss": 1.232, "step": 1865 }, { "epoch": 0.2933443376761186, "grad_norm": 0.24523115158081055, "learning_rate": 4.741862953071194e-05, "loss": 1.1678, "step": 1866 }, { "epoch": 0.29350154257305117, "grad_norm": 0.26968106627464294, "learning_rate": 4.7415894612263344e-05, "loss": 1.194, "step": 1867 }, { "epoch": 0.2936587474699837, "grad_norm": 0.23013943433761597, "learning_rate": 4.7413158324737206e-05, "loss": 1.1052, "step": 1868 }, { "epoch": 0.2938159523669162, "grad_norm": 0.21373358368873596, "learning_rate": 4.741042066830062e-05, "loss": 1.1555, "step": 1869 }, { "epoch": 0.29397315726384876, "grad_norm": 0.26143181324005127, "learning_rate": 4.740768164312081e-05, "loss": 1.0752, "step": 1870 }, { "epoch": 0.2941303621607813, "grad_norm": 0.216646209359169, "learning_rate": 4.7404941249365066e-05, "loss": 1.1678, "step": 1871 }, { "epoch": 0.29428756705771386, "grad_norm": 0.22760845720767975, "learning_rate": 4.740219948720075e-05, "loss": 1.0414, "step": 1872 }, { "epoch": 0.2944447719546464, "grad_norm": 0.2713901400566101, "learning_rate": 4.739945635679532e-05, "loss": 1.2089, "step": 1873 }, { "epoch": 0.2946019768515789, "grad_norm": 0.2115233987569809, "learning_rate": 4.739671185831633e-05, "loss": 1.2092, "step": 1874 }, { "epoch": 0.29475918174851146, "grad_norm": 0.24696362018585205, "learning_rate": 4.739396599193139e-05, "loss": 1.1438, "step": 1875 }, { "epoch": 0.294916386645444, "grad_norm": 0.29846012592315674, "learning_rate": 4.739121875780821e-05, "loss": 1.1604, "step": 1876 }, { "epoch": 0.29507359154237656, "grad_norm": 0.22811514139175415, "learning_rate": 4.7388470156114576e-05, "loss": 1.2149, "step": 1877 }, { "epoch": 0.2952307964393091, "grad_norm": 0.24468787014484406, "learning_rate": 4.738572018701838e-05, "loss": 1.0935, "step": 1878 }, { "epoch": 0.2953880013362416, "grad_norm": 0.28037217259407043, "learning_rate": 4.738296885068756e-05, "loss": 1.0999, "step": 1879 }, { "epoch": 0.29554520623317415, "grad_norm": 0.2207103669643402, "learning_rate": 4.738021614729016e-05, "loss": 1.242, "step": 1880 }, { "epoch": 0.2957024111301067, "grad_norm": 0.16394954919815063, "learning_rate": 4.7377462076994313e-05, "loss": 1.1138, "step": 1881 }, { "epoch": 0.29585961602703925, "grad_norm": 0.3569471538066864, "learning_rate": 4.7374706639968224e-05, "loss": 1.1034, "step": 1882 }, { "epoch": 0.2960168209239718, "grad_norm": 0.19428043067455292, "learning_rate": 4.737194983638018e-05, "loss": 1.2082, "step": 1883 }, { "epoch": 0.2961740258209043, "grad_norm": 0.23990128934383392, "learning_rate": 4.736919166639856e-05, "loss": 1.2445, "step": 1884 }, { "epoch": 0.29633123071783685, "grad_norm": 0.2282816767692566, "learning_rate": 4.736643213019183e-05, "loss": 1.0495, "step": 1885 }, { "epoch": 0.2964884356147694, "grad_norm": 0.279305100440979, "learning_rate": 4.736367122792852e-05, "loss": 1.0828, "step": 1886 }, { "epoch": 0.29664564051170195, "grad_norm": 0.18702171742916107, "learning_rate": 4.736090895977725e-05, "loss": 1.2215, "step": 1887 }, { "epoch": 0.2968028454086345, "grad_norm": 0.232543483376503, "learning_rate": 4.735814532590675e-05, "loss": 1.0841, "step": 1888 }, { "epoch": 0.296960050305567, "grad_norm": 0.1895131915807724, "learning_rate": 4.7355380326485796e-05, "loss": 1.169, "step": 1889 }, { "epoch": 0.29711725520249954, "grad_norm": 0.1745792180299759, "learning_rate": 4.735261396168327e-05, "loss": 1.2403, "step": 1890 }, { "epoch": 0.2972744600994321, "grad_norm": 0.25837990641593933, "learning_rate": 4.734984623166813e-05, "loss": 1.0719, "step": 1891 }, { "epoch": 0.29743166499636464, "grad_norm": 0.21535173058509827, "learning_rate": 4.7347077136609416e-05, "loss": 1.1058, "step": 1892 }, { "epoch": 0.2975888698932972, "grad_norm": 0.25150197744369507, "learning_rate": 4.7344306676676254e-05, "loss": 1.1828, "step": 1893 }, { "epoch": 0.29774607479022974, "grad_norm": 0.2469971477985382, "learning_rate": 4.734153485203786e-05, "loss": 1.0672, "step": 1894 }, { "epoch": 0.29790327968716224, "grad_norm": 0.2318231761455536, "learning_rate": 4.733876166286352e-05, "loss": 1.1708, "step": 1895 }, { "epoch": 0.2980604845840948, "grad_norm": 0.1978289932012558, "learning_rate": 4.733598710932261e-05, "loss": 1.2133, "step": 1896 }, { "epoch": 0.29821768948102734, "grad_norm": 0.22039441764354706, "learning_rate": 4.733321119158459e-05, "loss": 1.1628, "step": 1897 }, { "epoch": 0.2983748943779599, "grad_norm": 0.1828751415014267, "learning_rate": 4.7330433909819e-05, "loss": 1.0819, "step": 1898 }, { "epoch": 0.29853209927489244, "grad_norm": 0.3677425980567932, "learning_rate": 4.732765526419547e-05, "loss": 1.2305, "step": 1899 }, { "epoch": 0.29868930417182493, "grad_norm": 0.2010020911693573, "learning_rate": 4.732487525488371e-05, "loss": 1.1933, "step": 1900 }, { "epoch": 0.2988465090687575, "grad_norm": 0.2198139876127243, "learning_rate": 4.732209388205351e-05, "loss": 1.1228, "step": 1901 }, { "epoch": 0.29900371396569003, "grad_norm": 0.26586952805519104, "learning_rate": 4.731931114587474e-05, "loss": 1.1978, "step": 1902 }, { "epoch": 0.2991609188626226, "grad_norm": 0.18092826008796692, "learning_rate": 4.731652704651737e-05, "loss": 1.2788, "step": 1903 }, { "epoch": 0.29931812375955513, "grad_norm": 0.23159582912921906, "learning_rate": 4.731374158415144e-05, "loss": 1.0886, "step": 1904 }, { "epoch": 0.2994753286564876, "grad_norm": 0.24246764183044434, "learning_rate": 4.7310954758947066e-05, "loss": 1.1374, "step": 1905 }, { "epoch": 0.2996325335534202, "grad_norm": 0.25246042013168335, "learning_rate": 4.730816657107446e-05, "loss": 1.2619, "step": 1906 }, { "epoch": 0.2997897384503527, "grad_norm": 0.23286496102809906, "learning_rate": 4.730537702070393e-05, "loss": 1.2179, "step": 1907 }, { "epoch": 0.2999469433472853, "grad_norm": 0.23141705989837646, "learning_rate": 4.7302586108005834e-05, "loss": 1.219, "step": 1908 }, { "epoch": 0.3001041482442178, "grad_norm": 0.2506294846534729, "learning_rate": 4.7299793833150624e-05, "loss": 1.173, "step": 1909 }, { "epoch": 0.3002613531411503, "grad_norm": 0.23536083102226257, "learning_rate": 4.729700019630886e-05, "loss": 0.9852, "step": 1910 }, { "epoch": 0.3004185580380829, "grad_norm": 0.27178215980529785, "learning_rate": 4.729420519765115e-05, "loss": 1.2663, "step": 1911 }, { "epoch": 0.3005757629350154, "grad_norm": 0.2641531229019165, "learning_rate": 4.7291408837348224e-05, "loss": 1.2179, "step": 1912 }, { "epoch": 0.300732967831948, "grad_norm": 0.2041010558605194, "learning_rate": 4.728861111557085e-05, "loss": 1.1546, "step": 1913 }, { "epoch": 0.3008901727288805, "grad_norm": 0.24100835621356964, "learning_rate": 4.728581203248992e-05, "loss": 1.2314, "step": 1914 }, { "epoch": 0.301047377625813, "grad_norm": 0.2499173879623413, "learning_rate": 4.7283011588276374e-05, "loss": 1.1391, "step": 1915 }, { "epoch": 0.30120458252274557, "grad_norm": 0.17696775496006012, "learning_rate": 4.7280209783101265e-05, "loss": 1.1424, "step": 1916 }, { "epoch": 0.3013617874196781, "grad_norm": 0.2696003019809723, "learning_rate": 4.727740661713571e-05, "loss": 1.1785, "step": 1917 }, { "epoch": 0.30151899231661067, "grad_norm": 0.19227688014507294, "learning_rate": 4.727460209055092e-05, "loss": 1.1431, "step": 1918 }, { "epoch": 0.3016761972135432, "grad_norm": 0.2030278742313385, "learning_rate": 4.7271796203518184e-05, "loss": 1.1968, "step": 1919 }, { "epoch": 0.30183340211047577, "grad_norm": 0.2489650398492813, "learning_rate": 4.726898895620888e-05, "loss": 1.1133, "step": 1920 }, { "epoch": 0.30183340211047577, "eval_loss": 1.1504688262939453, "eval_runtime": 2346.6617, "eval_samples_per_second": 3.945, "eval_steps_per_second": 1.973, "step": 1920 }, { "epoch": 0.30199060700740826, "grad_norm": 0.20980967581272125, "learning_rate": 4.7266180348794456e-05, "loss": 1.1863, "step": 1921 }, { "epoch": 0.3021478119043408, "grad_norm": 0.25084492564201355, "learning_rate": 4.726337038144645e-05, "loss": 1.1693, "step": 1922 }, { "epoch": 0.30230501680127336, "grad_norm": 0.24765540659427643, "learning_rate": 4.726055905433649e-05, "loss": 1.1415, "step": 1923 }, { "epoch": 0.3024622216982059, "grad_norm": 0.21769382059574127, "learning_rate": 4.725774636763628e-05, "loss": 1.2113, "step": 1924 }, { "epoch": 0.30261942659513846, "grad_norm": 0.18449430167675018, "learning_rate": 4.725493232151761e-05, "loss": 1.1187, "step": 1925 }, { "epoch": 0.30277663149207096, "grad_norm": 0.2294544279575348, "learning_rate": 4.725211691615234e-05, "loss": 1.011, "step": 1926 }, { "epoch": 0.3029338363890035, "grad_norm": 0.235177144408226, "learning_rate": 4.724930015171244e-05, "loss": 1.1604, "step": 1927 }, { "epoch": 0.30309104128593606, "grad_norm": 0.32669633626937866, "learning_rate": 4.724648202836993e-05, "loss": 1.1437, "step": 1928 }, { "epoch": 0.3032482461828686, "grad_norm": 0.1598648726940155, "learning_rate": 4.7243662546296954e-05, "loss": 1.2719, "step": 1929 }, { "epoch": 0.30340545107980116, "grad_norm": 0.222189262509346, "learning_rate": 4.724084170566569e-05, "loss": 1.1576, "step": 1930 }, { "epoch": 0.30356265597673365, "grad_norm": 0.20110036432743073, "learning_rate": 4.723801950664844e-05, "loss": 1.1906, "step": 1931 }, { "epoch": 0.3037198608736662, "grad_norm": 0.25523391366004944, "learning_rate": 4.7235195949417564e-05, "loss": 1.1705, "step": 1932 }, { "epoch": 0.30387706577059875, "grad_norm": 0.20221413671970367, "learning_rate": 4.723237103414553e-05, "loss": 1.13, "step": 1933 }, { "epoch": 0.3040342706675313, "grad_norm": 0.2736821472644806, "learning_rate": 4.722954476100485e-05, "loss": 0.9517, "step": 1934 }, { "epoch": 0.30419147556446385, "grad_norm": 0.20103007555007935, "learning_rate": 4.722671713016816e-05, "loss": 1.2214, "step": 1935 }, { "epoch": 0.30434868046139635, "grad_norm": 0.23819975554943085, "learning_rate": 4.7223888141808156e-05, "loss": 1.1757, "step": 1936 }, { "epoch": 0.3045058853583289, "grad_norm": 0.22529488801956177, "learning_rate": 4.7221057796097614e-05, "loss": 1.2031, "step": 1937 }, { "epoch": 0.30466309025526145, "grad_norm": 0.2521119713783264, "learning_rate": 4.7218226093209416e-05, "loss": 1.1728, "step": 1938 }, { "epoch": 0.304820295152194, "grad_norm": 0.23056873679161072, "learning_rate": 4.72153930333165e-05, "loss": 1.2113, "step": 1939 }, { "epoch": 0.30497750004912655, "grad_norm": 0.29192107915878296, "learning_rate": 4.7212558616591895e-05, "loss": 1.0284, "step": 1940 }, { "epoch": 0.30513470494605904, "grad_norm": 0.1903986781835556, "learning_rate": 4.7209722843208725e-05, "loss": 1.2149, "step": 1941 }, { "epoch": 0.3052919098429916, "grad_norm": 0.18553946912288666, "learning_rate": 4.720688571334019e-05, "loss": 1.1815, "step": 1942 }, { "epoch": 0.30544911473992414, "grad_norm": 0.2598629593849182, "learning_rate": 4.720404722715957e-05, "loss": 1.0763, "step": 1943 }, { "epoch": 0.3056063196368567, "grad_norm": 0.3054077625274658, "learning_rate": 4.720120738484022e-05, "loss": 1.1202, "step": 1944 }, { "epoch": 0.30576352453378924, "grad_norm": 0.20831403136253357, "learning_rate": 4.71983661865556e-05, "loss": 1.1807, "step": 1945 }, { "epoch": 0.3059207294307218, "grad_norm": 0.18264667689800262, "learning_rate": 4.7195523632479226e-05, "loss": 1.172, "step": 1946 }, { "epoch": 0.3060779343276543, "grad_norm": 0.18661391735076904, "learning_rate": 4.719267972278472e-05, "loss": 1.2601, "step": 1947 }, { "epoch": 0.30623513922458684, "grad_norm": 0.19430875778198242, "learning_rate": 4.7189834457645775e-05, "loss": 1.1476, "step": 1948 }, { "epoch": 0.3063923441215194, "grad_norm": 0.23284098505973816, "learning_rate": 4.718698783723616e-05, "loss": 1.0859, "step": 1949 }, { "epoch": 0.30654954901845194, "grad_norm": 0.19055165350437164, "learning_rate": 4.7184139861729756e-05, "loss": 1.1245, "step": 1950 }, { "epoch": 0.3067067539153845, "grad_norm": 0.20607997477054596, "learning_rate": 4.7181290531300496e-05, "loss": 1.2386, "step": 1951 }, { "epoch": 0.306863958812317, "grad_norm": 0.16489629447460175, "learning_rate": 4.717843984612239e-05, "loss": 1.2782, "step": 1952 }, { "epoch": 0.30702116370924953, "grad_norm": 0.22474288940429688, "learning_rate": 4.717558780636957e-05, "loss": 1.219, "step": 1953 }, { "epoch": 0.3071783686061821, "grad_norm": 0.22763358056545258, "learning_rate": 4.7172734412216224e-05, "loss": 1.0742, "step": 1954 }, { "epoch": 0.30733557350311463, "grad_norm": 0.2578442692756653, "learning_rate": 4.7169879663836614e-05, "loss": 1.0377, "step": 1955 }, { "epoch": 0.3074927784000472, "grad_norm": 0.26955658197402954, "learning_rate": 4.716702356140511e-05, "loss": 1.0492, "step": 1956 }, { "epoch": 0.3076499832969797, "grad_norm": 0.2685282826423645, "learning_rate": 4.716416610509614e-05, "loss": 1.1809, "step": 1957 }, { "epoch": 0.30780718819391223, "grad_norm": 0.26675713062286377, "learning_rate": 4.716130729508424e-05, "loss": 1.2132, "step": 1958 }, { "epoch": 0.3079643930908448, "grad_norm": 0.22978322207927704, "learning_rate": 4.7158447131544e-05, "loss": 1.1327, "step": 1959 }, { "epoch": 0.30812159798777733, "grad_norm": 0.2514789402484894, "learning_rate": 4.7155585614650134e-05, "loss": 1.2075, "step": 1960 }, { "epoch": 0.3082788028847099, "grad_norm": 0.2616156339645386, "learning_rate": 4.715272274457738e-05, "loss": 1.0809, "step": 1961 }, { "epoch": 0.3084360077816424, "grad_norm": 0.3378554880619049, "learning_rate": 4.7149858521500614e-05, "loss": 1.1245, "step": 1962 }, { "epoch": 0.3085932126785749, "grad_norm": 0.20755833387374878, "learning_rate": 4.714699294559476e-05, "loss": 1.2121, "step": 1963 }, { "epoch": 0.3087504175755075, "grad_norm": 0.2765822410583496, "learning_rate": 4.714412601703484e-05, "loss": 1.0652, "step": 1964 }, { "epoch": 0.30890762247244, "grad_norm": 0.2736920416355133, "learning_rate": 4.714125773599596e-05, "loss": 1.1376, "step": 1965 }, { "epoch": 0.3090648273693726, "grad_norm": 0.15031395852565765, "learning_rate": 4.7138388102653295e-05, "loss": 1.1655, "step": 1966 }, { "epoch": 0.30922203226630507, "grad_norm": 0.30680006742477417, "learning_rate": 4.713551711718212e-05, "loss": 1.1064, "step": 1967 }, { "epoch": 0.3093792371632376, "grad_norm": 0.2674984335899353, "learning_rate": 4.713264477975777e-05, "loss": 1.2259, "step": 1968 }, { "epoch": 0.30953644206017017, "grad_norm": 0.2514277398586273, "learning_rate": 4.7129771090555694e-05, "loss": 1.1184, "step": 1969 }, { "epoch": 0.3096936469571027, "grad_norm": 0.25297811627388, "learning_rate": 4.71268960497514e-05, "loss": 1.0941, "step": 1970 }, { "epoch": 0.30985085185403527, "grad_norm": 0.24282206594944, "learning_rate": 4.712401965752048e-05, "loss": 1.0437, "step": 1971 }, { "epoch": 0.3100080567509678, "grad_norm": 0.17618770897388458, "learning_rate": 4.712114191403862e-05, "loss": 1.2034, "step": 1972 }, { "epoch": 0.3101652616479003, "grad_norm": 0.2321152687072754, "learning_rate": 4.7118262819481576e-05, "loss": 1.1255, "step": 1973 }, { "epoch": 0.31032246654483286, "grad_norm": 0.2128259241580963, "learning_rate": 4.711538237402518e-05, "loss": 1.2008, "step": 1974 }, { "epoch": 0.3104796714417654, "grad_norm": 0.20988841354846954, "learning_rate": 4.711250057784539e-05, "loss": 1.2586, "step": 1975 }, { "epoch": 0.31063687633869796, "grad_norm": 0.16251863539218903, "learning_rate": 4.7109617431118195e-05, "loss": 1.1829, "step": 1976 }, { "epoch": 0.3107940812356305, "grad_norm": 0.24834904074668884, "learning_rate": 4.710673293401968e-05, "loss": 1.1107, "step": 1977 }, { "epoch": 0.310951286132563, "grad_norm": 0.42274773120880127, "learning_rate": 4.7103847086726026e-05, "loss": 1.1066, "step": 1978 }, { "epoch": 0.31110849102949556, "grad_norm": 0.22393664717674255, "learning_rate": 4.7100959889413505e-05, "loss": 1.1203, "step": 1979 }, { "epoch": 0.3112656959264281, "grad_norm": 0.2756640315055847, "learning_rate": 4.709807134225843e-05, "loss": 1.0559, "step": 1980 }, { "epoch": 0.31142290082336066, "grad_norm": 0.3086540997028351, "learning_rate": 4.709518144543724e-05, "loss": 1.0609, "step": 1981 }, { "epoch": 0.3115801057202932, "grad_norm": 0.22327710688114166, "learning_rate": 4.7092290199126444e-05, "loss": 1.1378, "step": 1982 }, { "epoch": 0.3117373106172257, "grad_norm": 0.24802890419960022, "learning_rate": 4.708939760350261e-05, "loss": 1.3069, "step": 1983 }, { "epoch": 0.31189451551415825, "grad_norm": 0.24601417779922485, "learning_rate": 4.708650365874241e-05, "loss": 0.9913, "step": 1984 }, { "epoch": 0.3120517204110908, "grad_norm": 0.1627042442560196, "learning_rate": 4.70836083650226e-05, "loss": 1.1499, "step": 1985 }, { "epoch": 0.31220892530802336, "grad_norm": 0.28036314249038696, "learning_rate": 4.708071172252002e-05, "loss": 1.2129, "step": 1986 }, { "epoch": 0.3123661302049559, "grad_norm": 0.1716984510421753, "learning_rate": 4.707781373141158e-05, "loss": 1.0891, "step": 1987 }, { "epoch": 0.3125233351018884, "grad_norm": 0.23412878811359406, "learning_rate": 4.707491439187427e-05, "loss": 1.1228, "step": 1988 }, { "epoch": 0.31268053999882095, "grad_norm": 0.1973845213651657, "learning_rate": 4.707201370408518e-05, "loss": 1.0968, "step": 1989 }, { "epoch": 0.3128377448957535, "grad_norm": 0.22760836780071259, "learning_rate": 4.7069111668221476e-05, "loss": 1.0974, "step": 1990 }, { "epoch": 0.31299494979268605, "grad_norm": 0.24710045754909515, "learning_rate": 4.70662082844604e-05, "loss": 1.189, "step": 1991 }, { "epoch": 0.3131521546896186, "grad_norm": 0.1627720296382904, "learning_rate": 4.7063303552979275e-05, "loss": 1.2501, "step": 1992 }, { "epoch": 0.3133093595865511, "grad_norm": 0.2823135554790497, "learning_rate": 4.706039747395552e-05, "loss": 1.1686, "step": 1993 }, { "epoch": 0.31346656448348365, "grad_norm": 0.16645194590091705, "learning_rate": 4.705749004756662e-05, "loss": 1.1522, "step": 1994 }, { "epoch": 0.3136237693804162, "grad_norm": 0.2954739034175873, "learning_rate": 4.705458127399015e-05, "loss": 1.0194, "step": 1995 }, { "epoch": 0.31378097427734875, "grad_norm": 0.21758581697940826, "learning_rate": 4.705167115340376e-05, "loss": 1.1406, "step": 1996 }, { "epoch": 0.3139381791742813, "grad_norm": 0.2450348287820816, "learning_rate": 4.704875968598521e-05, "loss": 1.1483, "step": 1997 }, { "epoch": 0.31409538407121385, "grad_norm": 0.22131307423114777, "learning_rate": 4.70458468719123e-05, "loss": 1.2813, "step": 1998 }, { "epoch": 0.31425258896814634, "grad_norm": 0.20907478034496307, "learning_rate": 4.704293271136294e-05, "loss": 1.2129, "step": 1999 }, { "epoch": 0.3144097938650789, "grad_norm": 0.18173012137413025, "learning_rate": 4.704001720451513e-05, "loss": 1.1854, "step": 2000 }, { "epoch": 0.31456699876201144, "grad_norm": 0.24892567098140717, "learning_rate": 4.7037100351546914e-05, "loss": 1.2041, "step": 2001 }, { "epoch": 0.314724203658944, "grad_norm": 0.24656441807746887, "learning_rate": 4.703418215263646e-05, "loss": 1.1454, "step": 2002 }, { "epoch": 0.31488140855587654, "grad_norm": 0.31545913219451904, "learning_rate": 4.703126260796199e-05, "loss": 1.0506, "step": 2003 }, { "epoch": 0.31503861345280904, "grad_norm": 0.19431567192077637, "learning_rate": 4.702834171770183e-05, "loss": 1.1689, "step": 2004 }, { "epoch": 0.3151958183497416, "grad_norm": 0.22574302554130554, "learning_rate": 4.702541948203436e-05, "loss": 1.1604, "step": 2005 }, { "epoch": 0.31535302324667414, "grad_norm": 0.18993429839611053, "learning_rate": 4.7022495901138084e-05, "loss": 1.2154, "step": 2006 }, { "epoch": 0.3155102281436067, "grad_norm": 0.21746498346328735, "learning_rate": 4.7019570975191544e-05, "loss": 1.1902, "step": 2007 }, { "epoch": 0.31566743304053924, "grad_norm": 0.274848610162735, "learning_rate": 4.701664470437338e-05, "loss": 1.0607, "step": 2008 }, { "epoch": 0.31582463793747173, "grad_norm": 0.2473202347755432, "learning_rate": 4.701371708886233e-05, "loss": 1.1909, "step": 2009 }, { "epoch": 0.3159818428344043, "grad_norm": 0.22823210060596466, "learning_rate": 4.701078812883719e-05, "loss": 1.1499, "step": 2010 }, { "epoch": 0.31613904773133683, "grad_norm": 0.21258869767189026, "learning_rate": 4.700785782447686e-05, "loss": 1.029, "step": 2011 }, { "epoch": 0.3162962526282694, "grad_norm": 0.17091694474220276, "learning_rate": 4.700492617596032e-05, "loss": 1.1888, "step": 2012 }, { "epoch": 0.31645345752520193, "grad_norm": 0.22551240026950836, "learning_rate": 4.70019931834666e-05, "loss": 1.1017, "step": 2013 }, { "epoch": 0.3166106624221344, "grad_norm": 0.24327991902828217, "learning_rate": 4.6999058847174856e-05, "loss": 1.1317, "step": 2014 }, { "epoch": 0.316767867319067, "grad_norm": 0.2276056557893753, "learning_rate": 4.699612316726429e-05, "loss": 1.1481, "step": 2015 }, { "epoch": 0.3169250722159995, "grad_norm": 0.3077816069126129, "learning_rate": 4.699318614391422e-05, "loss": 1.1448, "step": 2016 }, { "epoch": 0.3170822771129321, "grad_norm": 0.17493993043899536, "learning_rate": 4.699024777730402e-05, "loss": 1.1789, "step": 2017 }, { "epoch": 0.3172394820098646, "grad_norm": 0.2110036462545395, "learning_rate": 4.698730806761314e-05, "loss": 1.1272, "step": 2018 }, { "epoch": 0.3173966869067971, "grad_norm": 0.23337996006011963, "learning_rate": 4.6984367015021154e-05, "loss": 1.0113, "step": 2019 }, { "epoch": 0.31755389180372967, "grad_norm": 0.23595476150512695, "learning_rate": 4.698142461970767e-05, "loss": 1.0771, "step": 2020 }, { "epoch": 0.3177110967006622, "grad_norm": 0.3300482928752899, "learning_rate": 4.697848088185241e-05, "loss": 1.0853, "step": 2021 }, { "epoch": 0.31786830159759477, "grad_norm": 0.26284661889076233, "learning_rate": 4.6975535801635146e-05, "loss": 1.1216, "step": 2022 }, { "epoch": 0.3180255064945273, "grad_norm": 0.2010652720928192, "learning_rate": 4.6972589379235775e-05, "loss": 1.1745, "step": 2023 }, { "epoch": 0.31818271139145987, "grad_norm": 0.29553526639938354, "learning_rate": 4.6969641614834244e-05, "loss": 1.1722, "step": 2024 }, { "epoch": 0.31833991628839237, "grad_norm": 0.2783648669719696, "learning_rate": 4.6966692508610586e-05, "loss": 1.1632, "step": 2025 }, { "epoch": 0.3184971211853249, "grad_norm": 0.2214052528142929, "learning_rate": 4.696374206074494e-05, "loss": 1.2004, "step": 2026 }, { "epoch": 0.31865432608225747, "grad_norm": 0.3076881766319275, "learning_rate": 4.696079027141749e-05, "loss": 1.1441, "step": 2027 }, { "epoch": 0.31881153097919, "grad_norm": 0.2371261715888977, "learning_rate": 4.695783714080852e-05, "loss": 1.1661, "step": 2028 }, { "epoch": 0.31896873587612257, "grad_norm": 0.20042219758033752, "learning_rate": 4.695488266909841e-05, "loss": 1.2247, "step": 2029 }, { "epoch": 0.31912594077305506, "grad_norm": 0.1989884376525879, "learning_rate": 4.69519268564676e-05, "loss": 1.1628, "step": 2030 }, { "epoch": 0.3192831456699876, "grad_norm": 0.16829486191272736, "learning_rate": 4.6948969703096614e-05, "loss": 1.0928, "step": 2031 }, { "epoch": 0.31944035056692016, "grad_norm": 0.20480427145957947, "learning_rate": 4.694601120916607e-05, "loss": 1.3009, "step": 2032 }, { "epoch": 0.3195975554638527, "grad_norm": 0.20662841200828552, "learning_rate": 4.694305137485666e-05, "loss": 1.1324, "step": 2033 }, { "epoch": 0.31975476036078526, "grad_norm": 0.22225816547870636, "learning_rate": 4.6940090200349165e-05, "loss": 1.0711, "step": 2034 }, { "epoch": 0.31991196525771776, "grad_norm": 0.1969664990901947, "learning_rate": 4.6937127685824426e-05, "loss": 1.223, "step": 2035 }, { "epoch": 0.3200691701546503, "grad_norm": 0.21913392841815948, "learning_rate": 4.6934163831463405e-05, "loss": 1.1268, "step": 2036 }, { "epoch": 0.32022637505158286, "grad_norm": 0.21957406401634216, "learning_rate": 4.69311986374471e-05, "loss": 1.1839, "step": 2037 }, { "epoch": 0.3203835799485154, "grad_norm": 0.189280167222023, "learning_rate": 4.6928232103956635e-05, "loss": 1.1528, "step": 2038 }, { "epoch": 0.32054078484544796, "grad_norm": 0.26548171043395996, "learning_rate": 4.6925264231173185e-05, "loss": 1.1047, "step": 2039 }, { "epoch": 0.32069798974238045, "grad_norm": 0.20444843173027039, "learning_rate": 4.6922295019278005e-05, "loss": 1.0089, "step": 2040 }, { "epoch": 0.320855194639313, "grad_norm": 0.2803153693675995, "learning_rate": 4.691932446845246e-05, "loss": 1.0605, "step": 2041 }, { "epoch": 0.32101239953624555, "grad_norm": 0.22570960223674774, "learning_rate": 4.691635257887798e-05, "loss": 1.2089, "step": 2042 }, { "epoch": 0.3211696044331781, "grad_norm": 0.2413252294063568, "learning_rate": 4.691337935073606e-05, "loss": 1.2844, "step": 2043 }, { "epoch": 0.32132680933011065, "grad_norm": 0.2029626965522766, "learning_rate": 4.691040478420832e-05, "loss": 1.1383, "step": 2044 }, { "epoch": 0.32148401422704315, "grad_norm": 0.22830082476139069, "learning_rate": 4.6907428879476404e-05, "loss": 1.2141, "step": 2045 }, { "epoch": 0.3216412191239757, "grad_norm": 0.2177695333957672, "learning_rate": 4.690445163672209e-05, "loss": 1.0376, "step": 2046 }, { "epoch": 0.32179842402090825, "grad_norm": 0.24039793014526367, "learning_rate": 4.690147305612721e-05, "loss": 1.1292, "step": 2047 }, { "epoch": 0.3219556289178408, "grad_norm": 0.20877334475517273, "learning_rate": 4.6898493137873687e-05, "loss": 1.1784, "step": 2048 }, { "epoch": 0.32211283381477335, "grad_norm": 0.238002210855484, "learning_rate": 4.689551188214352e-05, "loss": 1.1585, "step": 2049 }, { "epoch": 0.3222700387117059, "grad_norm": 0.24130697548389435, "learning_rate": 4.68925292891188e-05, "loss": 1.0301, "step": 2050 }, { "epoch": 0.3224272436086384, "grad_norm": 0.2053060382604599, "learning_rate": 4.688954535898168e-05, "loss": 1.2153, "step": 2051 }, { "epoch": 0.32258444850557094, "grad_norm": 0.2303832322359085, "learning_rate": 4.6886560091914415e-05, "loss": 1.1762, "step": 2052 }, { "epoch": 0.3227416534025035, "grad_norm": 0.25025346875190735, "learning_rate": 4.688357348809933e-05, "loss": 1.111, "step": 2053 }, { "epoch": 0.32289885829943604, "grad_norm": 0.2501404583454132, "learning_rate": 4.6880585547718845e-05, "loss": 1.0784, "step": 2054 }, { "epoch": 0.3230560631963686, "grad_norm": 0.2034914195537567, "learning_rate": 4.687759627095544e-05, "loss": 1.17, "step": 2055 }, { "epoch": 0.3232132680933011, "grad_norm": 0.17908981442451477, "learning_rate": 4.68746056579917e-05, "loss": 1.1601, "step": 2056 }, { "epoch": 0.32337047299023364, "grad_norm": 0.22271564602851868, "learning_rate": 4.6871613709010266e-05, "loss": 1.2194, "step": 2057 }, { "epoch": 0.3235276778871662, "grad_norm": 0.23161068558692932, "learning_rate": 4.6868620424193885e-05, "loss": 1.1704, "step": 2058 }, { "epoch": 0.32368488278409874, "grad_norm": 0.24310912191867828, "learning_rate": 4.6865625803725375e-05, "loss": 1.0526, "step": 2059 }, { "epoch": 0.3238420876810313, "grad_norm": 0.2134714424610138, "learning_rate": 4.6862629847787633e-05, "loss": 1.1974, "step": 2060 }, { "epoch": 0.3239992925779638, "grad_norm": 0.17639285326004028, "learning_rate": 4.685963255656364e-05, "loss": 1.2024, "step": 2061 }, { "epoch": 0.32415649747489633, "grad_norm": 0.22746217250823975, "learning_rate": 4.6856633930236457e-05, "loss": 1.1888, "step": 2062 }, { "epoch": 0.3243137023718289, "grad_norm": 0.17931844294071198, "learning_rate": 4.6853633968989244e-05, "loss": 1.2835, "step": 2063 }, { "epoch": 0.32447090726876143, "grad_norm": 0.18917080760002136, "learning_rate": 4.68506326730052e-05, "loss": 1.2193, "step": 2064 }, { "epoch": 0.324628112165694, "grad_norm": 0.2527455687522888, "learning_rate": 4.684763004246766e-05, "loss": 1.2187, "step": 2065 }, { "epoch": 0.3247853170626265, "grad_norm": 0.1864977329969406, "learning_rate": 4.684462607756001e-05, "loss": 1.2025, "step": 2066 }, { "epoch": 0.324942521959559, "grad_norm": 0.3215543031692505, "learning_rate": 4.6841620778465695e-05, "loss": 1.16, "step": 2067 }, { "epoch": 0.3250997268564916, "grad_norm": 0.3193890452384949, "learning_rate": 4.683861414536829e-05, "loss": 1.0823, "step": 2068 }, { "epoch": 0.32525693175342413, "grad_norm": 0.2766779959201813, "learning_rate": 4.683560617845143e-05, "loss": 1.0615, "step": 2069 }, { "epoch": 0.3254141366503567, "grad_norm": 0.2471621334552765, "learning_rate": 4.683259687789881e-05, "loss": 1.1033, "step": 2070 }, { "epoch": 0.3255713415472892, "grad_norm": 0.22947926819324493, "learning_rate": 4.682958624389426e-05, "loss": 1.3086, "step": 2071 }, { "epoch": 0.3257285464442217, "grad_norm": 0.19345812499523163, "learning_rate": 4.682657427662163e-05, "loss": 1.1013, "step": 2072 }, { "epoch": 0.3258857513411543, "grad_norm": 0.21470381319522858, "learning_rate": 4.682356097626488e-05, "loss": 1.1439, "step": 2073 }, { "epoch": 0.3260429562380868, "grad_norm": 0.19939084351062775, "learning_rate": 4.682054634300807e-05, "loss": 1.0446, "step": 2074 }, { "epoch": 0.3262001611350194, "grad_norm": 0.23096968233585358, "learning_rate": 4.68175303770353e-05, "loss": 1.1722, "step": 2075 }, { "epoch": 0.32635736603195187, "grad_norm": 0.19705285131931305, "learning_rate": 4.6814513078530796e-05, "loss": 1.2121, "step": 2076 }, { "epoch": 0.3265145709288844, "grad_norm": 0.14740808308124542, "learning_rate": 4.681149444767883e-05, "loss": 1.0735, "step": 2077 }, { "epoch": 0.32667177582581697, "grad_norm": 0.1801644265651703, "learning_rate": 4.680847448466376e-05, "loss": 1.14, "step": 2078 }, { "epoch": 0.3268289807227495, "grad_norm": 0.221450537443161, "learning_rate": 4.680545318967006e-05, "loss": 1.1093, "step": 2079 }, { "epoch": 0.32698618561968207, "grad_norm": 0.25395452976226807, "learning_rate": 4.6802430562882226e-05, "loss": 1.1791, "step": 2080 }, { "epoch": 0.32698618561968207, "eval_loss": 1.1437312364578247, "eval_runtime": 2317.1332, "eval_samples_per_second": 3.995, "eval_steps_per_second": 1.998, "step": 2080 }, { "epoch": 0.3271433905166146, "grad_norm": 0.15380804240703583, "learning_rate": 4.6799406604484894e-05, "loss": 1.1315, "step": 2081 }, { "epoch": 0.3273005954135471, "grad_norm": 0.24307769536972046, "learning_rate": 4.679638131466275e-05, "loss": 1.0447, "step": 2082 }, { "epoch": 0.32745780031047966, "grad_norm": 0.19176973402500153, "learning_rate": 4.6793354693600565e-05, "loss": 1.1879, "step": 2083 }, { "epoch": 0.3276150052074122, "grad_norm": 0.2640921175479889, "learning_rate": 4.679032674148319e-05, "loss": 1.0972, "step": 2084 }, { "epoch": 0.32777221010434476, "grad_norm": 0.2140382081270218, "learning_rate": 4.678729745849557e-05, "loss": 1.2197, "step": 2085 }, { "epoch": 0.3279294150012773, "grad_norm": 0.2784416675567627, "learning_rate": 4.678426684482272e-05, "loss": 1.1183, "step": 2086 }, { "epoch": 0.3280866198982098, "grad_norm": 0.2722271978855133, "learning_rate": 4.678123490064973e-05, "loss": 1.1631, "step": 2087 }, { "epoch": 0.32824382479514236, "grad_norm": 0.18923896551132202, "learning_rate": 4.6778201626161776e-05, "loss": 1.1795, "step": 2088 }, { "epoch": 0.3284010296920749, "grad_norm": 0.18081101775169373, "learning_rate": 4.6775167021544136e-05, "loss": 1.1308, "step": 2089 }, { "epoch": 0.32855823458900746, "grad_norm": 0.24127820134162903, "learning_rate": 4.677213108698214e-05, "loss": 1.1461, "step": 2090 }, { "epoch": 0.32871543948594, "grad_norm": 0.20159919559955597, "learning_rate": 4.6769093822661214e-05, "loss": 1.2414, "step": 2091 }, { "epoch": 0.3288726443828725, "grad_norm": 0.2225925475358963, "learning_rate": 4.676605522876687e-05, "loss": 1.3019, "step": 2092 }, { "epoch": 0.32902984927980505, "grad_norm": 0.20402312278747559, "learning_rate": 4.676301530548468e-05, "loss": 0.9615, "step": 2093 }, { "epoch": 0.3291870541767376, "grad_norm": 0.2000385820865631, "learning_rate": 4.6759974053000324e-05, "loss": 1.1958, "step": 2094 }, { "epoch": 0.32934425907367015, "grad_norm": 0.27835071086883545, "learning_rate": 4.6756931471499546e-05, "loss": 1.1249, "step": 2095 }, { "epoch": 0.3295014639706027, "grad_norm": 0.20449519157409668, "learning_rate": 4.675388756116816e-05, "loss": 1.0496, "step": 2096 }, { "epoch": 0.3296586688675352, "grad_norm": 0.26013025641441345, "learning_rate": 4.67508423221921e-05, "loss": 1.1194, "step": 2097 }, { "epoch": 0.32981587376446775, "grad_norm": 0.19826790690422058, "learning_rate": 4.6747795754757354e-05, "loss": 1.1878, "step": 2098 }, { "epoch": 0.3299730786614003, "grad_norm": 0.20292681455612183, "learning_rate": 4.6744747859049975e-05, "loss": 1.2216, "step": 2099 }, { "epoch": 0.33013028355833285, "grad_norm": 0.2701912224292755, "learning_rate": 4.674169863525614e-05, "loss": 1.1728, "step": 2100 }, { "epoch": 0.3302874884552654, "grad_norm": 0.2722685933113098, "learning_rate": 4.673864808356206e-05, "loss": 1.1967, "step": 2101 }, { "epoch": 0.3304446933521979, "grad_norm": 0.19252356886863708, "learning_rate": 4.673559620415408e-05, "loss": 1.1614, "step": 2102 }, { "epoch": 0.33060189824913044, "grad_norm": 0.2063681185245514, "learning_rate": 4.673254299721858e-05, "loss": 1.192, "step": 2103 }, { "epoch": 0.330759103146063, "grad_norm": 0.22739455103874207, "learning_rate": 4.6729488462942036e-05, "loss": 1.207, "step": 2104 }, { "epoch": 0.33091630804299554, "grad_norm": 0.220958411693573, "learning_rate": 4.672643260151101e-05, "loss": 1.1716, "step": 2105 }, { "epoch": 0.3310735129399281, "grad_norm": 0.24066145718097687, "learning_rate": 4.672337541311215e-05, "loss": 1.1382, "step": 2106 }, { "epoch": 0.33123071783686064, "grad_norm": 0.25801724195480347, "learning_rate": 4.672031689793217e-05, "loss": 1.1494, "step": 2107 }, { "epoch": 0.33138792273379314, "grad_norm": 0.2185317426919937, "learning_rate": 4.671725705615787e-05, "loss": 1.1325, "step": 2108 }, { "epoch": 0.3315451276307257, "grad_norm": 0.25223109126091003, "learning_rate": 4.671419588797615e-05, "loss": 1.1606, "step": 2109 }, { "epoch": 0.33170233252765824, "grad_norm": 0.320625364780426, "learning_rate": 4.6711133393573945e-05, "loss": 1.0385, "step": 2110 }, { "epoch": 0.3318595374245908, "grad_norm": 0.2799528241157532, "learning_rate": 4.6708069573138335e-05, "loss": 1.1079, "step": 2111 }, { "epoch": 0.33201674232152334, "grad_norm": 0.2274870127439499, "learning_rate": 4.670500442685642e-05, "loss": 1.0838, "step": 2112 }, { "epoch": 0.33217394721845583, "grad_norm": 0.27315694093704224, "learning_rate": 4.670193795491542e-05, "loss": 1.095, "step": 2113 }, { "epoch": 0.3323311521153884, "grad_norm": 0.21600811183452606, "learning_rate": 4.669887015750262e-05, "loss": 1.0079, "step": 2114 }, { "epoch": 0.33248835701232093, "grad_norm": 0.2445525974035263, "learning_rate": 4.669580103480539e-05, "loss": 1.1513, "step": 2115 }, { "epoch": 0.3326455619092535, "grad_norm": 0.2136092483997345, "learning_rate": 4.669273058701117e-05, "loss": 1.203, "step": 2116 }, { "epoch": 0.33280276680618603, "grad_norm": 0.2629857361316681, "learning_rate": 4.668965881430751e-05, "loss": 1.0757, "step": 2117 }, { "epoch": 0.33295997170311853, "grad_norm": 0.2461937963962555, "learning_rate": 4.668658571688201e-05, "loss": 1.0329, "step": 2118 }, { "epoch": 0.3331171766000511, "grad_norm": 0.21537308394908905, "learning_rate": 4.668351129492237e-05, "loss": 1.1246, "step": 2119 }, { "epoch": 0.33327438149698363, "grad_norm": 0.21597877144813538, "learning_rate": 4.6680435548616366e-05, "loss": 1.1728, "step": 2120 }, { "epoch": 0.3334315863939162, "grad_norm": 0.2764485776424408, "learning_rate": 4.667735847815183e-05, "loss": 1.1108, "step": 2121 }, { "epoch": 0.33358879129084873, "grad_norm": 0.24224579334259033, "learning_rate": 4.667428008371674e-05, "loss": 1.1129, "step": 2122 }, { "epoch": 0.3337459961877812, "grad_norm": 0.2688109874725342, "learning_rate": 4.667120036549907e-05, "loss": 1.0214, "step": 2123 }, { "epoch": 0.3339032010847138, "grad_norm": 0.30181390047073364, "learning_rate": 4.666811932368693e-05, "loss": 1.1261, "step": 2124 }, { "epoch": 0.3340604059816463, "grad_norm": 0.20725978910923004, "learning_rate": 4.666503695846852e-05, "loss": 1.193, "step": 2125 }, { "epoch": 0.3342176108785789, "grad_norm": 0.1896386742591858, "learning_rate": 4.666195327003208e-05, "loss": 1.1811, "step": 2126 }, { "epoch": 0.3343748157755114, "grad_norm": 0.1871979534626007, "learning_rate": 4.665886825856594e-05, "loss": 1.2184, "step": 2127 }, { "epoch": 0.3345320206724439, "grad_norm": 0.2065206915140152, "learning_rate": 4.665578192425854e-05, "loss": 1.1762, "step": 2128 }, { "epoch": 0.33468922556937647, "grad_norm": 0.293587863445282, "learning_rate": 4.665269426729838e-05, "loss": 1.1041, "step": 2129 }, { "epoch": 0.334846430466309, "grad_norm": 0.23634661734104156, "learning_rate": 4.664960528787403e-05, "loss": 1.0625, "step": 2130 }, { "epoch": 0.33500363536324157, "grad_norm": 0.21932774782180786, "learning_rate": 4.664651498617417e-05, "loss": 1.0748, "step": 2131 }, { "epoch": 0.3351608402601741, "grad_norm": 0.23783034086227417, "learning_rate": 4.6643423362387526e-05, "loss": 1.1149, "step": 2132 }, { "epoch": 0.33531804515710667, "grad_norm": 0.24081239104270935, "learning_rate": 4.664033041670293e-05, "loss": 1.2532, "step": 2133 }, { "epoch": 0.33547525005403916, "grad_norm": 0.23142661154270172, "learning_rate": 4.6637236149309296e-05, "loss": 1.178, "step": 2134 }, { "epoch": 0.3356324549509717, "grad_norm": 0.27728673815727234, "learning_rate": 4.663414056039559e-05, "loss": 1.0755, "step": 2135 }, { "epoch": 0.33578965984790426, "grad_norm": 0.22718535363674164, "learning_rate": 4.6631043650150905e-05, "loss": 1.1332, "step": 2136 }, { "epoch": 0.3359468647448368, "grad_norm": 0.28607967495918274, "learning_rate": 4.6627945418764366e-05, "loss": 1.2812, "step": 2137 }, { "epoch": 0.33610406964176937, "grad_norm": 0.19152137637138367, "learning_rate": 4.6624845866425215e-05, "loss": 1.0983, "step": 2138 }, { "epoch": 0.33626127453870186, "grad_norm": 0.230951726436615, "learning_rate": 4.662174499332275e-05, "loss": 1.1511, "step": 2139 }, { "epoch": 0.3364184794356344, "grad_norm": 0.23929493129253387, "learning_rate": 4.661864279964637e-05, "loss": 1.1524, "step": 2140 }, { "epoch": 0.33657568433256696, "grad_norm": 0.2901476323604584, "learning_rate": 4.661553928558554e-05, "loss": 1.1344, "step": 2141 }, { "epoch": 0.3367328892294995, "grad_norm": 0.26585260033607483, "learning_rate": 4.661243445132981e-05, "loss": 1.1513, "step": 2142 }, { "epoch": 0.33689009412643206, "grad_norm": 0.23678666353225708, "learning_rate": 4.660932829706882e-05, "loss": 1.0852, "step": 2143 }, { "epoch": 0.33704729902336455, "grad_norm": 0.4830389618873596, "learning_rate": 4.660622082299227e-05, "loss": 1.1329, "step": 2144 }, { "epoch": 0.3372045039202971, "grad_norm": 0.21181213855743408, "learning_rate": 4.660311202928996e-05, "loss": 1.1513, "step": 2145 }, { "epoch": 0.33736170881722966, "grad_norm": 0.22538620233535767, "learning_rate": 4.660000191615176e-05, "loss": 1.1165, "step": 2146 }, { "epoch": 0.3375189137141622, "grad_norm": 0.19452321529388428, "learning_rate": 4.659689048376763e-05, "loss": 1.1231, "step": 2147 }, { "epoch": 0.33767611861109476, "grad_norm": 0.2382746934890747, "learning_rate": 4.6593777732327595e-05, "loss": 1.1805, "step": 2148 }, { "epoch": 0.33783332350802725, "grad_norm": 0.26696088910102844, "learning_rate": 4.659066366202178e-05, "loss": 1.1743, "step": 2149 }, { "epoch": 0.3379905284049598, "grad_norm": 0.19131755828857422, "learning_rate": 4.658754827304037e-05, "loss": 1.1538, "step": 2150 }, { "epoch": 0.33814773330189235, "grad_norm": 0.20308135449886322, "learning_rate": 4.658443156557365e-05, "loss": 1.1274, "step": 2151 }, { "epoch": 0.3383049381988249, "grad_norm": 0.25910693407058716, "learning_rate": 4.658131353981198e-05, "loss": 1.2042, "step": 2152 }, { "epoch": 0.33846214309575745, "grad_norm": 0.2608768939971924, "learning_rate": 4.6578194195945776e-05, "loss": 1.0935, "step": 2153 }, { "epoch": 0.33861934799268995, "grad_norm": 0.22527897357940674, "learning_rate": 4.657507353416558e-05, "loss": 1.1007, "step": 2154 }, { "epoch": 0.3387765528896225, "grad_norm": 0.2805950343608856, "learning_rate": 4.657195155466198e-05, "loss": 1.0509, "step": 2155 }, { "epoch": 0.33893375778655505, "grad_norm": 0.244426891207695, "learning_rate": 4.656882825762565e-05, "loss": 1.0811, "step": 2156 }, { "epoch": 0.3390909626834876, "grad_norm": 0.20174293220043182, "learning_rate": 4.656570364324736e-05, "loss": 1.197, "step": 2157 }, { "epoch": 0.33924816758042015, "grad_norm": 0.2714351713657379, "learning_rate": 4.656257771171795e-05, "loss": 1.1018, "step": 2158 }, { "epoch": 0.3394053724773527, "grad_norm": 0.266418993473053, "learning_rate": 4.6559450463228316e-05, "loss": 1.1652, "step": 2159 }, { "epoch": 0.3395625773742852, "grad_norm": 0.29422682523727417, "learning_rate": 4.655632189796949e-05, "loss": 1.0787, "step": 2160 }, { "epoch": 0.33971978227121774, "grad_norm": 0.26280736923217773, "learning_rate": 4.655319201613253e-05, "loss": 1.2125, "step": 2161 }, { "epoch": 0.3398769871681503, "grad_norm": 0.20241224765777588, "learning_rate": 4.655006081790861e-05, "loss": 1.227, "step": 2162 }, { "epoch": 0.34003419206508284, "grad_norm": 0.23165303468704224, "learning_rate": 4.654692830348897e-05, "loss": 1.0675, "step": 2163 }, { "epoch": 0.3401913969620154, "grad_norm": 0.3363533020019531, "learning_rate": 4.654379447306493e-05, "loss": 1.1536, "step": 2164 }, { "epoch": 0.3403486018589479, "grad_norm": 0.21518482267856598, "learning_rate": 4.654065932682789e-05, "loss": 1.1749, "step": 2165 }, { "epoch": 0.34050580675588044, "grad_norm": 0.16388140618801117, "learning_rate": 4.653752286496933e-05, "loss": 1.2337, "step": 2166 }, { "epoch": 0.340663011652813, "grad_norm": 0.20146115124225616, "learning_rate": 4.6534385087680824e-05, "loss": 1.1984, "step": 2167 }, { "epoch": 0.34082021654974554, "grad_norm": 0.22612817585468292, "learning_rate": 4.653124599515401e-05, "loss": 1.1037, "step": 2168 }, { "epoch": 0.3409774214466781, "grad_norm": 0.23036076128482819, "learning_rate": 4.652810558758061e-05, "loss": 1.2341, "step": 2169 }, { "epoch": 0.3411346263436106, "grad_norm": 0.23168572783470154, "learning_rate": 4.652496386515243e-05, "loss": 1.167, "step": 2170 }, { "epoch": 0.34129183124054313, "grad_norm": 0.24674174189567566, "learning_rate": 4.6521820828061354e-05, "loss": 1.1577, "step": 2171 }, { "epoch": 0.3414490361374757, "grad_norm": 0.1900520622730255, "learning_rate": 4.6518676476499353e-05, "loss": 1.1402, "step": 2172 }, { "epoch": 0.34160624103440823, "grad_norm": 0.18541108071804047, "learning_rate": 4.651553081065846e-05, "loss": 1.0964, "step": 2173 }, { "epoch": 0.3417634459313408, "grad_norm": 0.2279513031244278, "learning_rate": 4.651238383073081e-05, "loss": 1.0895, "step": 2174 }, { "epoch": 0.3419206508282733, "grad_norm": 0.2681223750114441, "learning_rate": 4.65092355369086e-05, "loss": 1.0725, "step": 2175 }, { "epoch": 0.3420778557252058, "grad_norm": 0.2828345000743866, "learning_rate": 4.6506085929384124e-05, "loss": 1.0066, "step": 2176 }, { "epoch": 0.3422350606221384, "grad_norm": 0.20930872857570648, "learning_rate": 4.6502935008349747e-05, "loss": 1.2589, "step": 2177 }, { "epoch": 0.3423922655190709, "grad_norm": 0.17460274696350098, "learning_rate": 4.6499782773997906e-05, "loss": 1.1705, "step": 2178 }, { "epoch": 0.3425494704160035, "grad_norm": 0.24282746016979218, "learning_rate": 4.649662922652114e-05, "loss": 1.152, "step": 2179 }, { "epoch": 0.34270667531293597, "grad_norm": 0.21630579233169556, "learning_rate": 4.649347436611205e-05, "loss": 1.1686, "step": 2180 }, { "epoch": 0.3428638802098685, "grad_norm": 0.2531259059906006, "learning_rate": 4.649031819296332e-05, "loss": 1.1321, "step": 2181 }, { "epoch": 0.34302108510680107, "grad_norm": 0.26737165451049805, "learning_rate": 4.648716070726772e-05, "loss": 1.1569, "step": 2182 }, { "epoch": 0.3431782900037336, "grad_norm": 0.27461764216423035, "learning_rate": 4.64840019092181e-05, "loss": 1.0203, "step": 2183 }, { "epoch": 0.34333549490066617, "grad_norm": 0.2627179026603699, "learning_rate": 4.648084179900739e-05, "loss": 1.2092, "step": 2184 }, { "epoch": 0.3434926997975987, "grad_norm": 0.23868447542190552, "learning_rate": 4.647768037682858e-05, "loss": 1.0356, "step": 2185 }, { "epoch": 0.3436499046945312, "grad_norm": 0.24952903389930725, "learning_rate": 4.647451764287478e-05, "loss": 1.0801, "step": 2186 }, { "epoch": 0.34380710959146377, "grad_norm": 0.23518601059913635, "learning_rate": 4.647135359733914e-05, "loss": 1.0065, "step": 2187 }, { "epoch": 0.3439643144883963, "grad_norm": 0.20000571012496948, "learning_rate": 4.6468188240414924e-05, "loss": 1.1048, "step": 2188 }, { "epoch": 0.34412151938532887, "grad_norm": 0.2511467933654785, "learning_rate": 4.646502157229544e-05, "loss": 1.1423, "step": 2189 }, { "epoch": 0.3442787242822614, "grad_norm": 0.18525715172290802, "learning_rate": 4.646185359317412e-05, "loss": 1.1245, "step": 2190 }, { "epoch": 0.3444359291791939, "grad_norm": 0.18307547271251678, "learning_rate": 4.6458684303244435e-05, "loss": 1.0499, "step": 2191 }, { "epoch": 0.34459313407612646, "grad_norm": 0.263036847114563, "learning_rate": 4.645551370269995e-05, "loss": 1.1585, "step": 2192 }, { "epoch": 0.344750338973059, "grad_norm": 0.276479572057724, "learning_rate": 4.6452341791734335e-05, "loss": 1.1801, "step": 2193 }, { "epoch": 0.34490754386999156, "grad_norm": 0.19550184905529022, "learning_rate": 4.644916857054129e-05, "loss": 1.09, "step": 2194 }, { "epoch": 0.3450647487669241, "grad_norm": 0.22002463042736053, "learning_rate": 4.644599403931465e-05, "loss": 1.0928, "step": 2195 }, { "epoch": 0.3452219536638566, "grad_norm": 0.26032400131225586, "learning_rate": 4.6442818198248276e-05, "loss": 1.1699, "step": 2196 }, { "epoch": 0.34537915856078916, "grad_norm": 0.19974485039710999, "learning_rate": 4.643964104753617e-05, "loss": 1.1668, "step": 2197 }, { "epoch": 0.3455363634577217, "grad_norm": 0.2452566772699356, "learning_rate": 4.6436462587372345e-05, "loss": 1.1597, "step": 2198 }, { "epoch": 0.34569356835465426, "grad_norm": 0.2143898904323578, "learning_rate": 4.643328281795095e-05, "loss": 1.1134, "step": 2199 }, { "epoch": 0.3458507732515868, "grad_norm": 0.21687857806682587, "learning_rate": 4.643010173946619e-05, "loss": 1.1802, "step": 2200 }, { "epoch": 0.3460079781485193, "grad_norm": 0.21475355327129364, "learning_rate": 4.6426919352112355e-05, "loss": 1.2065, "step": 2201 }, { "epoch": 0.34616518304545185, "grad_norm": 0.25029754638671875, "learning_rate": 4.64237356560838e-05, "loss": 1.1723, "step": 2202 }, { "epoch": 0.3463223879423844, "grad_norm": 0.23117133975028992, "learning_rate": 4.642055065157499e-05, "loss": 1.1915, "step": 2203 }, { "epoch": 0.34647959283931695, "grad_norm": 0.28021204471588135, "learning_rate": 4.641736433878045e-05, "loss": 1.0219, "step": 2204 }, { "epoch": 0.3466367977362495, "grad_norm": 0.24753634631633759, "learning_rate": 4.641417671789478e-05, "loss": 1.0664, "step": 2205 }, { "epoch": 0.346794002633182, "grad_norm": 0.2812795341014862, "learning_rate": 4.6410987789112676e-05, "loss": 1.1794, "step": 2206 }, { "epoch": 0.34695120753011455, "grad_norm": 0.17878840863704681, "learning_rate": 4.64077975526289e-05, "loss": 1.1064, "step": 2207 }, { "epoch": 0.3471084124270471, "grad_norm": 0.20422449707984924, "learning_rate": 4.6404606008638295e-05, "loss": 1.2858, "step": 2208 }, { "epoch": 0.34726561732397965, "grad_norm": 0.2892017066478729, "learning_rate": 4.6401413157335796e-05, "loss": 1.1227, "step": 2209 }, { "epoch": 0.3474228222209122, "grad_norm": 0.16432105004787445, "learning_rate": 4.639821899891641e-05, "loss": 1.1985, "step": 2210 }, { "epoch": 0.34758002711784475, "grad_norm": 0.2849401533603668, "learning_rate": 4.639502353357522e-05, "loss": 1.056, "step": 2211 }, { "epoch": 0.34773723201477724, "grad_norm": 0.17997637391090393, "learning_rate": 4.6391826761507403e-05, "loss": 1.1146, "step": 2212 }, { "epoch": 0.3478944369117098, "grad_norm": 0.22177653014659882, "learning_rate": 4.6388628682908186e-05, "loss": 1.1871, "step": 2213 }, { "epoch": 0.34805164180864234, "grad_norm": 0.18821461498737335, "learning_rate": 4.6385429297972914e-05, "loss": 1.1184, "step": 2214 }, { "epoch": 0.3482088467055749, "grad_norm": 0.2185995727777481, "learning_rate": 4.6382228606896994e-05, "loss": 1.2098, "step": 2215 }, { "epoch": 0.34836605160250744, "grad_norm": 0.2325439453125, "learning_rate": 4.6379026609875894e-05, "loss": 1.1827, "step": 2216 }, { "epoch": 0.34852325649943994, "grad_norm": 0.23903654515743256, "learning_rate": 4.637582330710519e-05, "loss": 1.1347, "step": 2217 }, { "epoch": 0.3486804613963725, "grad_norm": 0.2033540904521942, "learning_rate": 4.637261869878054e-05, "loss": 1.0412, "step": 2218 }, { "epoch": 0.34883766629330504, "grad_norm": 0.21630467474460602, "learning_rate": 4.6369412785097644e-05, "loss": 1.2504, "step": 2219 }, { "epoch": 0.3489948711902376, "grad_norm": 0.2330722063779831, "learning_rate": 4.636620556625233e-05, "loss": 1.2137, "step": 2220 }, { "epoch": 0.34915207608717014, "grad_norm": 0.22527045011520386, "learning_rate": 4.636299704244047e-05, "loss": 1.1868, "step": 2221 }, { "epoch": 0.34930928098410263, "grad_norm": 0.26466044783592224, "learning_rate": 4.635978721385803e-05, "loss": 1.0843, "step": 2222 }, { "epoch": 0.3494664858810352, "grad_norm": 0.1883208006620407, "learning_rate": 4.6356576080701054e-05, "loss": 1.1547, "step": 2223 }, { "epoch": 0.34962369077796773, "grad_norm": 0.23476985096931458, "learning_rate": 4.635336364316567e-05, "loss": 1.1051, "step": 2224 }, { "epoch": 0.3497808956749003, "grad_norm": 0.224160835146904, "learning_rate": 4.635014990144808e-05, "loss": 1.1945, "step": 2225 }, { "epoch": 0.34993810057183283, "grad_norm": 0.19404496252536774, "learning_rate": 4.634693485574457e-05, "loss": 1.2492, "step": 2226 }, { "epoch": 0.3500953054687653, "grad_norm": 0.253548800945282, "learning_rate": 4.6343718506251485e-05, "loss": 1.2643, "step": 2227 }, { "epoch": 0.3502525103656979, "grad_norm": 0.3645852208137512, "learning_rate": 4.634050085316529e-05, "loss": 1.147, "step": 2228 }, { "epoch": 0.35040971526263043, "grad_norm": 0.25876176357269287, "learning_rate": 4.6337281896682504e-05, "loss": 0.9978, "step": 2229 }, { "epoch": 0.350566920159563, "grad_norm": 0.20177343487739563, "learning_rate": 4.633406163699972e-05, "loss": 1.1169, "step": 2230 }, { "epoch": 0.35072412505649553, "grad_norm": 0.23162272572517395, "learning_rate": 4.633084007431361e-05, "loss": 1.0649, "step": 2231 }, { "epoch": 0.350881329953428, "grad_norm": 0.18436746299266815, "learning_rate": 4.6327617208820964e-05, "loss": 1.2476, "step": 2232 }, { "epoch": 0.3510385348503606, "grad_norm": 0.17864318192005157, "learning_rate": 4.6324393040718596e-05, "loss": 1.1171, "step": 2233 }, { "epoch": 0.3511957397472931, "grad_norm": 0.21066221594810486, "learning_rate": 4.632116757020343e-05, "loss": 1.1708, "step": 2234 }, { "epoch": 0.3513529446442257, "grad_norm": 0.19731271266937256, "learning_rate": 4.631794079747248e-05, "loss": 1.0509, "step": 2235 }, { "epoch": 0.3515101495411582, "grad_norm": 0.24790634214878082, "learning_rate": 4.631471272272281e-05, "loss": 1.0759, "step": 2236 }, { "epoch": 0.3516673544380908, "grad_norm": 0.23736214637756348, "learning_rate": 4.6311483346151587e-05, "loss": 1.1302, "step": 2237 }, { "epoch": 0.35182455933502327, "grad_norm": 0.20299890637397766, "learning_rate": 4.630825266795605e-05, "loss": 1.171, "step": 2238 }, { "epoch": 0.3519817642319558, "grad_norm": 0.17163951694965363, "learning_rate": 4.63050206883335e-05, "loss": 1.1312, "step": 2239 }, { "epoch": 0.35213896912888837, "grad_norm": 0.22283077239990234, "learning_rate": 4.6301787407481356e-05, "loss": 1.2501, "step": 2240 }, { "epoch": 0.35213896912888837, "eval_loss": 1.137406349182129, "eval_runtime": 2319.6638, "eval_samples_per_second": 3.991, "eval_steps_per_second": 1.996, "step": 2240 } ], "logging_steps": 1, "max_steps": 12722, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 160, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1474133293088113e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }