diff --git "a/checkpoint-6880/trainer_state.json" "b/checkpoint-6880/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-6880/trainer_state.json" @@ -0,0 +1,48546 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.081589341507988, + "eval_steps": 160, + "global_step": 6880, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00015720489693253945, + "grad_norm": 1.3751904964447021, + "learning_rate": 0.0, + "loss": 3.5741, + "step": 1 + }, + { + "epoch": 0.00015720489693253945, + "eval_loss": 3.4173049926757812, + "eval_runtime": 2315.7248, + "eval_samples_per_second": 3.998, + "eval_steps_per_second": 1.999, + "step": 1 + }, + { + "epoch": 0.0003144097938650789, + "grad_norm": 1.231239676475525, + "learning_rate": 5e-06, + "loss": 3.3021, + "step": 2 + }, + { + "epoch": 0.00047161469079761836, + "grad_norm": 1.3657807111740112, + "learning_rate": 1e-05, + "loss": 3.6333, + "step": 3 + }, + { + "epoch": 0.0006288195877301578, + "grad_norm": 1.3117496967315674, + "learning_rate": 1.5e-05, + "loss": 3.3731, + "step": 4 + }, + { + "epoch": 0.0007860244846626972, + "grad_norm": 1.4118576049804688, + "learning_rate": 2e-05, + "loss": 3.612, + "step": 5 + }, + { + "epoch": 0.0009432293815952367, + "grad_norm": 1.3155895471572876, + "learning_rate": 2.5e-05, + "loss": 3.3296, + "step": 6 + }, + { + "epoch": 0.001100434278527776, + "grad_norm": 1.2847192287445068, + "learning_rate": 3e-05, + "loss": 3.2168, + "step": 7 + }, + { + "epoch": 0.0012576391754603156, + "grad_norm": 1.1421078443527222, + "learning_rate": 3.5e-05, + "loss": 3.085, + "step": 8 + }, + { + "epoch": 0.0014148440723928551, + "grad_norm": 0.9923035502433777, + "learning_rate": 4e-05, + "loss": 3.0472, + "step": 9 + }, + { + "epoch": 0.0015720489693253944, + "grad_norm": 0.795043408870697, + "learning_rate": 4.5e-05, + "loss": 2.6666, + "step": 10 + }, + { + "epoch": 0.001729253866257934, + "grad_norm": 0.5987974405288696, + "learning_rate": 5e-05, + "loss": 2.473, + "step": 11 + }, + { + "epoch": 0.0018864587631904734, + "grad_norm": 0.4488905668258667, + "learning_rate": 4.9999999236547564e-05, + "loss": 2.3731, + "step": 12 + }, + { + "epoch": 0.002043663660123013, + "grad_norm": 0.3517301380634308, + "learning_rate": 4.999999694619029e-05, + "loss": 2.2158, + "step": 13 + }, + { + "epoch": 0.002200868557055552, + "grad_norm": 0.3045121431350708, + "learning_rate": 4.999999312892831e-05, + "loss": 2.3351, + "step": 14 + }, + { + "epoch": 0.002358073453988092, + "grad_norm": 0.24488244950771332, + "learning_rate": 4.9999987784761884e-05, + "loss": 2.2693, + "step": 15 + }, + { + "epoch": 0.0025152783509206312, + "grad_norm": 0.22892728447914124, + "learning_rate": 4.999998091369132e-05, + "loss": 2.1006, + "step": 16 + }, + { + "epoch": 0.0026724832478531705, + "grad_norm": 0.23219206929206848, + "learning_rate": 4.999997251571704e-05, + "loss": 2.215, + "step": 17 + }, + { + "epoch": 0.0028296881447857102, + "grad_norm": 0.24427154660224915, + "learning_rate": 4.999996259083956e-05, + "loss": 2.1708, + "step": 18 + }, + { + "epoch": 0.0029868930417182495, + "grad_norm": 0.2640205919742584, + "learning_rate": 4.999995113905947e-05, + "loss": 2.1709, + "step": 19 + }, + { + "epoch": 0.003144097938650789, + "grad_norm": 0.26644033193588257, + "learning_rate": 4.999993816037749e-05, + "loss": 2.1733, + "step": 20 + }, + { + "epoch": 0.0033013028355833285, + "grad_norm": 0.2621535062789917, + "learning_rate": 4.9999923654794414e-05, + "loss": 2.0059, + "step": 21 + }, + { + "epoch": 0.003458507732515868, + "grad_norm": 0.2586187422275543, + "learning_rate": 4.999990762231111e-05, + "loss": 2.0336, + "step": 22 + }, + { + "epoch": 0.003615712629448407, + "grad_norm": 0.26732271909713745, + "learning_rate": 4.9999890062928566e-05, + "loss": 2.0566, + "step": 23 + }, + { + "epoch": 0.003772917526380947, + "grad_norm": 0.2357867807149887, + "learning_rate": 4.999987097664787e-05, + "loss": 1.9529, + "step": 24 + }, + { + "epoch": 0.003930122423313486, + "grad_norm": 0.2297009825706482, + "learning_rate": 4.999985036347016e-05, + "loss": 2.0369, + "step": 25 + }, + { + "epoch": 0.004087327320246026, + "grad_norm": 0.20529747009277344, + "learning_rate": 4.9999828223396705e-05, + "loss": 1.9781, + "step": 26 + }, + { + "epoch": 0.004244532217178565, + "grad_norm": 0.18342873454093933, + "learning_rate": 4.999980455642887e-05, + "loss": 1.9986, + "step": 27 + }, + { + "epoch": 0.004401737114111104, + "grad_norm": 0.16487397253513336, + "learning_rate": 4.999977936256809e-05, + "loss": 1.9063, + "step": 28 + }, + { + "epoch": 0.004558942011043644, + "grad_norm": 0.1762266606092453, + "learning_rate": 4.99997526418159e-05, + "loss": 1.9517, + "step": 29 + }, + { + "epoch": 0.004716146907976184, + "grad_norm": 0.16371938586235046, + "learning_rate": 4.999972439417394e-05, + "loss": 1.7734, + "step": 30 + }, + { + "epoch": 0.004873351804908723, + "grad_norm": 0.17309769988059998, + "learning_rate": 4.999969461964392e-05, + "loss": 1.8732, + "step": 31 + }, + { + "epoch": 0.0050305567018412625, + "grad_norm": 0.15772338211536407, + "learning_rate": 4.9999663318227683e-05, + "loss": 1.7537, + "step": 32 + }, + { + "epoch": 0.005187761598773802, + "grad_norm": 0.17521986365318298, + "learning_rate": 4.9999630489927126e-05, + "loss": 2.0077, + "step": 33 + }, + { + "epoch": 0.005344966495706341, + "grad_norm": 0.15462292730808258, + "learning_rate": 4.999959613474425e-05, + "loss": 1.8576, + "step": 34 + }, + { + "epoch": 0.005502171392638881, + "grad_norm": 0.15280336141586304, + "learning_rate": 4.999956025268117e-05, + "loss": 1.862, + "step": 35 + }, + { + "epoch": 0.0056593762895714205, + "grad_norm": 0.14518432319164276, + "learning_rate": 4.999952284374006e-05, + "loss": 1.8893, + "step": 36 + }, + { + "epoch": 0.005816581186503959, + "grad_norm": 0.16087624430656433, + "learning_rate": 4.999948390792321e-05, + "loss": 1.8658, + "step": 37 + }, + { + "epoch": 0.005973786083436499, + "grad_norm": 0.17504698038101196, + "learning_rate": 4.999944344523301e-05, + "loss": 1.7647, + "step": 38 + }, + { + "epoch": 0.006130990980369039, + "grad_norm": 0.17786233127117157, + "learning_rate": 4.999940145567191e-05, + "loss": 1.8133, + "step": 39 + }, + { + "epoch": 0.006288195877301578, + "grad_norm": 0.1628972887992859, + "learning_rate": 4.999935793924249e-05, + "loss": 1.7731, + "step": 40 + }, + { + "epoch": 0.006445400774234117, + "grad_norm": 0.13461466133594513, + "learning_rate": 4.9999312895947406e-05, + "loss": 1.7558, + "step": 41 + }, + { + "epoch": 0.006602605671166657, + "grad_norm": 0.12960125505924225, + "learning_rate": 4.99992663257894e-05, + "loss": 1.7639, + "step": 42 + }, + { + "epoch": 0.006759810568099196, + "grad_norm": 0.10991287231445312, + "learning_rate": 4.9999218228771324e-05, + "loss": 1.7538, + "step": 43 + }, + { + "epoch": 0.006917015465031736, + "grad_norm": 0.11583230644464493, + "learning_rate": 4.999916860489612e-05, + "loss": 1.715, + "step": 44 + }, + { + "epoch": 0.007074220361964275, + "grad_norm": 0.10344280302524567, + "learning_rate": 4.999911745416681e-05, + "loss": 1.6907, + "step": 45 + }, + { + "epoch": 0.007231425258896814, + "grad_norm": 0.10546118766069412, + "learning_rate": 4.999906477658651e-05, + "loss": 1.7294, + "step": 46 + }, + { + "epoch": 0.007388630155829354, + "grad_norm": 0.11775675415992737, + "learning_rate": 4.9999010572158465e-05, + "loss": 1.7146, + "step": 47 + }, + { + "epoch": 0.007545835052761894, + "grad_norm": 0.11109112203121185, + "learning_rate": 4.999895484088596e-05, + "loss": 1.6939, + "step": 48 + }, + { + "epoch": 0.007703039949694433, + "grad_norm": 0.1116517186164856, + "learning_rate": 4.999889758277242e-05, + "loss": 1.7271, + "step": 49 + }, + { + "epoch": 0.007860244846626972, + "grad_norm": 0.11245547980070114, + "learning_rate": 4.999883879782132e-05, + "loss": 1.7333, + "step": 50 + }, + { + "epoch": 0.008017449743559512, + "grad_norm": 0.1150551363825798, + "learning_rate": 4.999877848603626e-05, + "loss": 1.7036, + "step": 51 + }, + { + "epoch": 0.008174654640492052, + "grad_norm": 0.10856381803750992, + "learning_rate": 4.999871664742093e-05, + "loss": 1.7493, + "step": 52 + }, + { + "epoch": 0.008331859537424591, + "grad_norm": 0.10760089010000229, + "learning_rate": 4.9998653281979095e-05, + "loss": 1.6292, + "step": 53 + }, + { + "epoch": 0.00848906443435713, + "grad_norm": 0.0932115837931633, + "learning_rate": 4.9998588389714634e-05, + "loss": 1.6608, + "step": 54 + }, + { + "epoch": 0.00864626933128967, + "grad_norm": 0.09837482124567032, + "learning_rate": 4.9998521970631504e-05, + "loss": 1.7834, + "step": 55 + }, + { + "epoch": 0.008803474228222209, + "grad_norm": 0.08872833847999573, + "learning_rate": 4.9998454024733775e-05, + "loss": 1.6484, + "step": 56 + }, + { + "epoch": 0.008960679125154749, + "grad_norm": 0.08829163759946823, + "learning_rate": 4.9998384552025577e-05, + "loss": 1.5913, + "step": 57 + }, + { + "epoch": 0.009117884022087288, + "grad_norm": 0.09087682515382767, + "learning_rate": 4.999831355251117e-05, + "loss": 1.6809, + "step": 58 + }, + { + "epoch": 0.009275088919019828, + "grad_norm": 0.08675853163003922, + "learning_rate": 4.9998241026194884e-05, + "loss": 1.6519, + "step": 59 + }, + { + "epoch": 0.009432293815952368, + "grad_norm": 0.08463481813669205, + "learning_rate": 4.999816697308114e-05, + "loss": 1.6234, + "step": 60 + }, + { + "epoch": 0.009589498712884906, + "grad_norm": 0.08403950184583664, + "learning_rate": 4.999809139317448e-05, + "loss": 1.6533, + "step": 61 + }, + { + "epoch": 0.009746703609817445, + "grad_norm": 0.08155622333288193, + "learning_rate": 4.99980142864795e-05, + "loss": 1.6726, + "step": 62 + }, + { + "epoch": 0.009903908506749985, + "grad_norm": 0.08056480437517166, + "learning_rate": 4.999793565300093e-05, + "loss": 1.5881, + "step": 63 + }, + { + "epoch": 0.010061113403682525, + "grad_norm": 0.07879023998975754, + "learning_rate": 4.999785549274355e-05, + "loss": 1.5568, + "step": 64 + }, + { + "epoch": 0.010218318300615065, + "grad_norm": 0.07828455418348312, + "learning_rate": 4.9997773805712265e-05, + "loss": 1.6464, + "step": 65 + }, + { + "epoch": 0.010375523197547604, + "grad_norm": 0.08054805546998978, + "learning_rate": 4.9997690591912075e-05, + "loss": 1.6213, + "step": 66 + }, + { + "epoch": 0.010532728094480142, + "grad_norm": 0.07610727101564407, + "learning_rate": 4.999760585134805e-05, + "loss": 1.5729, + "step": 67 + }, + { + "epoch": 0.010689932991412682, + "grad_norm": 0.07693428546190262, + "learning_rate": 4.999751958402537e-05, + "loss": 1.5444, + "step": 68 + }, + { + "epoch": 0.010847137888345222, + "grad_norm": 0.0810319185256958, + "learning_rate": 4.99974317899493e-05, + "loss": 1.7045, + "step": 69 + }, + { + "epoch": 0.011004342785277762, + "grad_norm": 0.07729896157979965, + "learning_rate": 4.9997342469125205e-05, + "loss": 1.6268, + "step": 70 + }, + { + "epoch": 0.011161547682210301, + "grad_norm": 0.07730107754468918, + "learning_rate": 4.999725162155855e-05, + "loss": 1.658, + "step": 71 + }, + { + "epoch": 0.011318752579142841, + "grad_norm": 0.08072328567504883, + "learning_rate": 4.9997159247254864e-05, + "loss": 1.5045, + "step": 72 + }, + { + "epoch": 0.011475957476075379, + "grad_norm": 0.08120577782392502, + "learning_rate": 4.9997065346219805e-05, + "loss": 1.568, + "step": 73 + }, + { + "epoch": 0.011633162373007919, + "grad_norm": 0.08131498098373413, + "learning_rate": 4.99969699184591e-05, + "loss": 1.6035, + "step": 74 + }, + { + "epoch": 0.011790367269940458, + "grad_norm": 0.08395873010158539, + "learning_rate": 4.9996872963978584e-05, + "loss": 1.5844, + "step": 75 + }, + { + "epoch": 0.011947572166872998, + "grad_norm": 0.08502068370580673, + "learning_rate": 4.999677448278417e-05, + "loss": 1.6661, + "step": 76 + }, + { + "epoch": 0.012104777063805538, + "grad_norm": 0.08467952907085419, + "learning_rate": 4.999667447488188e-05, + "loss": 1.5537, + "step": 77 + }, + { + "epoch": 0.012261981960738078, + "grad_norm": 0.19682182371616364, + "learning_rate": 4.999657294027782e-05, + "loss": 1.5051, + "step": 78 + }, + { + "epoch": 0.012419186857670617, + "grad_norm": 0.08586428314447403, + "learning_rate": 4.999646987897818e-05, + "loss": 1.565, + "step": 79 + }, + { + "epoch": 0.012576391754603155, + "grad_norm": 0.08156823366880417, + "learning_rate": 4.999636529098928e-05, + "loss": 1.6627, + "step": 80 + }, + { + "epoch": 0.012733596651535695, + "grad_norm": 0.08715341240167618, + "learning_rate": 4.9996259176317486e-05, + "loss": 1.5862, + "step": 81 + }, + { + "epoch": 0.012890801548468235, + "grad_norm": 0.09664586186408997, + "learning_rate": 4.999615153496928e-05, + "loss": 1.5741, + "step": 82 + }, + { + "epoch": 0.013048006445400774, + "grad_norm": 0.08438891172409058, + "learning_rate": 4.999604236695125e-05, + "loss": 1.5933, + "step": 83 + }, + { + "epoch": 0.013205211342333314, + "grad_norm": 0.08333732932806015, + "learning_rate": 4.999593167227006e-05, + "loss": 1.5904, + "step": 84 + }, + { + "epoch": 0.013362416239265854, + "grad_norm": 0.07945791631937027, + "learning_rate": 4.9995819450932455e-05, + "loss": 1.5763, + "step": 85 + }, + { + "epoch": 0.013519621136198392, + "grad_norm": 0.07682961225509644, + "learning_rate": 4.9995705702945304e-05, + "loss": 1.5197, + "step": 86 + }, + { + "epoch": 0.013676826033130932, + "grad_norm": 0.07547677308320999, + "learning_rate": 4.999559042831555e-05, + "loss": 1.6825, + "step": 87 + }, + { + "epoch": 0.013834030930063471, + "grad_norm": 0.07293456047773361, + "learning_rate": 4.999547362705025e-05, + "loss": 1.5466, + "step": 88 + }, + { + "epoch": 0.013991235826996011, + "grad_norm": 0.07730914652347565, + "learning_rate": 4.999535529915651e-05, + "loss": 1.5775, + "step": 89 + }, + { + "epoch": 0.01414844072392855, + "grad_norm": 0.07689664512872696, + "learning_rate": 4.9995235444641565e-05, + "loss": 1.5881, + "step": 90 + }, + { + "epoch": 0.01430564562086109, + "grad_norm": 0.07754997909069061, + "learning_rate": 4.999511406351275e-05, + "loss": 1.5037, + "step": 91 + }, + { + "epoch": 0.014462850517793628, + "grad_norm": 0.07229866087436676, + "learning_rate": 4.999499115577746e-05, + "loss": 1.5077, + "step": 92 + }, + { + "epoch": 0.014620055414726168, + "grad_norm": 0.07491567730903625, + "learning_rate": 4.9994866721443215e-05, + "loss": 1.5461, + "step": 93 + }, + { + "epoch": 0.014777260311658708, + "grad_norm": 0.07258685678243637, + "learning_rate": 4.9994740760517605e-05, + "loss": 1.5516, + "step": 94 + }, + { + "epoch": 0.014934465208591248, + "grad_norm": 0.07643327116966248, + "learning_rate": 4.9994613273008334e-05, + "loss": 1.6223, + "step": 95 + }, + { + "epoch": 0.015091670105523787, + "grad_norm": 0.0740588903427124, + "learning_rate": 4.999448425892318e-05, + "loss": 1.5322, + "step": 96 + }, + { + "epoch": 0.015248875002456327, + "grad_norm": 0.44172239303588867, + "learning_rate": 4.999435371827003e-05, + "loss": 1.5498, + "step": 97 + }, + { + "epoch": 0.015406079899388867, + "grad_norm": 0.0756363570690155, + "learning_rate": 4.999422165105684e-05, + "loss": 1.559, + "step": 98 + }, + { + "epoch": 0.015563284796321405, + "grad_norm": 0.07251248508691788, + "learning_rate": 4.99940880572917e-05, + "loss": 1.5903, + "step": 99 + }, + { + "epoch": 0.015720489693253945, + "grad_norm": 0.06931837648153305, + "learning_rate": 4.999395293698275e-05, + "loss": 1.4849, + "step": 100 + }, + { + "epoch": 0.015877694590186484, + "grad_norm": 0.07403590530157089, + "learning_rate": 4.9993816290138254e-05, + "loss": 1.5191, + "step": 101 + }, + { + "epoch": 0.016034899487119024, + "grad_norm": 0.07027724385261536, + "learning_rate": 4.999367811676655e-05, + "loss": 1.5655, + "step": 102 + }, + { + "epoch": 0.016192104384051564, + "grad_norm": 0.07320379465818405, + "learning_rate": 4.9993538416876093e-05, + "loss": 1.4869, + "step": 103 + }, + { + "epoch": 0.016349309280984103, + "grad_norm": 0.0726180374622345, + "learning_rate": 4.9993397190475396e-05, + "loss": 1.4629, + "step": 104 + }, + { + "epoch": 0.016506514177916643, + "grad_norm": 0.07542011886835098, + "learning_rate": 4.999325443757309e-05, + "loss": 1.5976, + "step": 105 + }, + { + "epoch": 0.016663719074849183, + "grad_norm": 0.07440067082643509, + "learning_rate": 4.9993110158177895e-05, + "loss": 1.5469, + "step": 106 + }, + { + "epoch": 0.016820923971781723, + "grad_norm": 0.07547372579574585, + "learning_rate": 4.999296435229863e-05, + "loss": 1.5328, + "step": 107 + }, + { + "epoch": 0.01697812886871426, + "grad_norm": 0.07532137632369995, + "learning_rate": 4.999281701994419e-05, + "loss": 1.6742, + "step": 108 + }, + { + "epoch": 0.0171353337656468, + "grad_norm": 0.07249438762664795, + "learning_rate": 4.999266816112358e-05, + "loss": 1.4799, + "step": 109 + }, + { + "epoch": 0.01729253866257934, + "grad_norm": 0.07399806380271912, + "learning_rate": 4.999251777584589e-05, + "loss": 1.5438, + "step": 110 + }, + { + "epoch": 0.017449743559511878, + "grad_norm": 0.08135057240724564, + "learning_rate": 4.99923658641203e-05, + "loss": 1.5608, + "step": 111 + }, + { + "epoch": 0.017606948456444418, + "grad_norm": 0.07508935779333115, + "learning_rate": 4.99922124259561e-05, + "loss": 1.5894, + "step": 112 + }, + { + "epoch": 0.017764153353376957, + "grad_norm": 0.07432372123003006, + "learning_rate": 4.999205746136265e-05, + "loss": 1.4818, + "step": 113 + }, + { + "epoch": 0.017921358250309497, + "grad_norm": 0.07694194465875626, + "learning_rate": 4.999190097034942e-05, + "loss": 1.5629, + "step": 114 + }, + { + "epoch": 0.018078563147242037, + "grad_norm": 0.07384433597326279, + "learning_rate": 4.999174295292597e-05, + "loss": 1.4829, + "step": 115 + }, + { + "epoch": 0.018235768044174577, + "grad_norm": 0.07152919471263885, + "learning_rate": 4.999158340910195e-05, + "loss": 1.4748, + "step": 116 + }, + { + "epoch": 0.018392972941107116, + "grad_norm": 0.07719701528549194, + "learning_rate": 4.999142233888709e-05, + "loss": 1.5524, + "step": 117 + }, + { + "epoch": 0.018550177838039656, + "grad_norm": 0.07540587335824966, + "learning_rate": 4.999125974229125e-05, + "loss": 1.4661, + "step": 118 + }, + { + "epoch": 0.018707382734972196, + "grad_norm": 0.0787581130862236, + "learning_rate": 4.9991095619324344e-05, + "loss": 1.6455, + "step": 119 + }, + { + "epoch": 0.018864587631904736, + "grad_norm": 0.07454577833414078, + "learning_rate": 4.999092996999641e-05, + "loss": 1.5083, + "step": 120 + }, + { + "epoch": 0.019021792528837272, + "grad_norm": 0.0751076266169548, + "learning_rate": 4.9990762794317545e-05, + "loss": 1.4874, + "step": 121 + }, + { + "epoch": 0.01917899742576981, + "grad_norm": 0.07733119279146194, + "learning_rate": 4.999059409229798e-05, + "loss": 1.6308, + "step": 122 + }, + { + "epoch": 0.01933620232270235, + "grad_norm": 0.07897089421749115, + "learning_rate": 4.999042386394802e-05, + "loss": 1.5906, + "step": 123 + }, + { + "epoch": 0.01949340721963489, + "grad_norm": 0.07758141309022903, + "learning_rate": 4.999025210927804e-05, + "loss": 1.5604, + "step": 124 + }, + { + "epoch": 0.01965061211656743, + "grad_norm": 0.07845707982778549, + "learning_rate": 4.9990078828298544e-05, + "loss": 1.5901, + "step": 125 + }, + { + "epoch": 0.01980781701349997, + "grad_norm": 0.0772818773984909, + "learning_rate": 4.998990402102012e-05, + "loss": 1.4516, + "step": 126 + }, + { + "epoch": 0.01996502191043251, + "grad_norm": 0.07795504480600357, + "learning_rate": 4.998972768745344e-05, + "loss": 1.4642, + "step": 127 + }, + { + "epoch": 0.02012222680736505, + "grad_norm": 0.0784008800983429, + "learning_rate": 4.998954982760926e-05, + "loss": 1.5936, + "step": 128 + }, + { + "epoch": 0.02027943170429759, + "grad_norm": 0.07791212201118469, + "learning_rate": 4.9989370441498465e-05, + "loss": 1.4705, + "step": 129 + }, + { + "epoch": 0.02043663660123013, + "grad_norm": 0.07785367220640182, + "learning_rate": 4.9989189529132004e-05, + "loss": 1.5085, + "step": 130 + }, + { + "epoch": 0.02059384149816267, + "grad_norm": 0.07916689664125443, + "learning_rate": 4.9989007090520925e-05, + "loss": 1.5365, + "step": 131 + }, + { + "epoch": 0.02075104639509521, + "grad_norm": 0.0775083601474762, + "learning_rate": 4.9988823125676367e-05, + "loss": 1.5286, + "step": 132 + }, + { + "epoch": 0.020908251292027745, + "grad_norm": 0.08110442757606506, + "learning_rate": 4.998863763460956e-05, + "loss": 1.5779, + "step": 133 + }, + { + "epoch": 0.021065456188960285, + "grad_norm": 0.0814640000462532, + "learning_rate": 4.998845061733185e-05, + "loss": 1.4778, + "step": 134 + }, + { + "epoch": 0.021222661085892824, + "grad_norm": 0.08069492131471634, + "learning_rate": 4.998826207385465e-05, + "loss": 1.5317, + "step": 135 + }, + { + "epoch": 0.021379865982825364, + "grad_norm": 0.07377774268388748, + "learning_rate": 4.998807200418948e-05, + "loss": 1.5258, + "step": 136 + }, + { + "epoch": 0.021537070879757904, + "grad_norm": 0.0787922590970993, + "learning_rate": 4.9987880408347945e-05, + "loss": 1.5185, + "step": 137 + }, + { + "epoch": 0.021694275776690444, + "grad_norm": 0.07662995159626007, + "learning_rate": 4.9987687286341745e-05, + "loss": 1.4637, + "step": 138 + }, + { + "epoch": 0.021851480673622983, + "grad_norm": 0.08528955280780792, + "learning_rate": 4.9987492638182676e-05, + "loss": 1.4776, + "step": 139 + }, + { + "epoch": 0.022008685570555523, + "grad_norm": 0.08089053630828857, + "learning_rate": 4.9987296463882626e-05, + "loss": 1.5885, + "step": 140 + }, + { + "epoch": 0.022165890467488063, + "grad_norm": 0.08029694855213165, + "learning_rate": 4.998709876345358e-05, + "loss": 1.4557, + "step": 141 + }, + { + "epoch": 0.022323095364420602, + "grad_norm": 0.07918502390384674, + "learning_rate": 4.9986899536907614e-05, + "loss": 1.4285, + "step": 142 + }, + { + "epoch": 0.022480300261353142, + "grad_norm": 0.0813126415014267, + "learning_rate": 4.998669878425689e-05, + "loss": 1.5958, + "step": 143 + }, + { + "epoch": 0.022637505158285682, + "grad_norm": 0.07935188710689545, + "learning_rate": 4.998649650551368e-05, + "loss": 1.5249, + "step": 144 + }, + { + "epoch": 0.02279471005521822, + "grad_norm": 0.08163304626941681, + "learning_rate": 4.9986292700690324e-05, + "loss": 1.483, + "step": 145 + }, + { + "epoch": 0.022951914952150758, + "grad_norm": 0.08277447521686554, + "learning_rate": 4.998608736979928e-05, + "loss": 1.6212, + "step": 146 + }, + { + "epoch": 0.023109119849083298, + "grad_norm": 0.08285827934741974, + "learning_rate": 4.9985880512853076e-05, + "loss": 1.4495, + "step": 147 + }, + { + "epoch": 0.023266324746015837, + "grad_norm": 0.082750603556633, + "learning_rate": 4.998567212986437e-05, + "loss": 1.4335, + "step": 148 + }, + { + "epoch": 0.023423529642948377, + "grad_norm": 0.07986058294773102, + "learning_rate": 4.998546222084587e-05, + "loss": 1.4704, + "step": 149 + }, + { + "epoch": 0.023580734539880917, + "grad_norm": 0.08105576783418655, + "learning_rate": 4.9985250785810396e-05, + "loss": 1.5183, + "step": 150 + }, + { + "epoch": 0.023737939436813457, + "grad_norm": 0.08202917128801346, + "learning_rate": 4.9985037824770866e-05, + "loss": 1.5423, + "step": 151 + }, + { + "epoch": 0.023895144333745996, + "grad_norm": 0.08937894552946091, + "learning_rate": 4.998482333774029e-05, + "loss": 1.5731, + "step": 152 + }, + { + "epoch": 0.024052349230678536, + "grad_norm": 0.08333728462457657, + "learning_rate": 4.9984607324731766e-05, + "loss": 1.5133, + "step": 153 + }, + { + "epoch": 0.024209554127611076, + "grad_norm": 0.08529175072908401, + "learning_rate": 4.998438978575849e-05, + "loss": 1.516, + "step": 154 + }, + { + "epoch": 0.024366759024543615, + "grad_norm": 0.08508963882923126, + "learning_rate": 4.998417072083374e-05, + "loss": 1.5646, + "step": 155 + }, + { + "epoch": 0.024523963921476155, + "grad_norm": 0.08971578627824783, + "learning_rate": 4.99839501299709e-05, + "loss": 1.4714, + "step": 156 + }, + { + "epoch": 0.024681168818408695, + "grad_norm": 0.08380109816789627, + "learning_rate": 4.998372801318345e-05, + "loss": 1.4476, + "step": 157 + }, + { + "epoch": 0.024838373715341235, + "grad_norm": 0.08533143252134323, + "learning_rate": 4.9983504370484945e-05, + "loss": 1.4866, + "step": 158 + }, + { + "epoch": 0.02499557861227377, + "grad_norm": 0.08318709582090378, + "learning_rate": 4.998327920188905e-05, + "loss": 1.5274, + "step": 159 + }, + { + "epoch": 0.02515278350920631, + "grad_norm": 0.08486370742321014, + "learning_rate": 4.9983052507409525e-05, + "loss": 1.4713, + "step": 160 + }, + { + "epoch": 0.02515278350920631, + "eval_loss": 1.5136528015136719, + "eval_runtime": 2318.8971, + "eval_samples_per_second": 3.992, + "eval_steps_per_second": 1.996, + "step": 160 + }, + { + "epoch": 0.02530998840613885, + "grad_norm": 0.08242359757423401, + "learning_rate": 4.9982824287060195e-05, + "loss": 1.5069, + "step": 161 + }, + { + "epoch": 0.02546719330307139, + "grad_norm": 0.08547423779964447, + "learning_rate": 4.9982594540855014e-05, + "loss": 1.4973, + "step": 162 + }, + { + "epoch": 0.02562439820000393, + "grad_norm": 0.08345580101013184, + "learning_rate": 4.9982363268808016e-05, + "loss": 1.5078, + "step": 163 + }, + { + "epoch": 0.02578160309693647, + "grad_norm": 0.0830339640378952, + "learning_rate": 4.9982130470933316e-05, + "loss": 1.4098, + "step": 164 + }, + { + "epoch": 0.02593880799386901, + "grad_norm": 0.08568515628576279, + "learning_rate": 4.998189614724514e-05, + "loss": 1.4628, + "step": 165 + }, + { + "epoch": 0.02609601289080155, + "grad_norm": 0.08261829614639282, + "learning_rate": 4.998166029775779e-05, + "loss": 1.4492, + "step": 166 + }, + { + "epoch": 0.02625321778773409, + "grad_norm": 0.08944887667894363, + "learning_rate": 4.998142292248569e-05, + "loss": 1.5633, + "step": 167 + }, + { + "epoch": 0.02641042268466663, + "grad_norm": 0.08632911741733551, + "learning_rate": 4.998118402144332e-05, + "loss": 1.5106, + "step": 168 + }, + { + "epoch": 0.026567627581599168, + "grad_norm": 0.08733859658241272, + "learning_rate": 4.998094359464528e-05, + "loss": 1.5607, + "step": 169 + }, + { + "epoch": 0.026724832478531708, + "grad_norm": 0.08667927235364914, + "learning_rate": 4.9980701642106245e-05, + "loss": 1.4544, + "step": 170 + }, + { + "epoch": 0.026882037375464244, + "grad_norm": 0.08655022084712982, + "learning_rate": 4.9980458163841006e-05, + "loss": 1.5264, + "step": 171 + }, + { + "epoch": 0.027039242272396784, + "grad_norm": 0.08899988234043121, + "learning_rate": 4.9980213159864426e-05, + "loss": 1.4778, + "step": 172 + }, + { + "epoch": 0.027196447169329323, + "grad_norm": 0.09411856532096863, + "learning_rate": 4.997996663019147e-05, + "loss": 1.5269, + "step": 173 + }, + { + "epoch": 0.027353652066261863, + "grad_norm": 0.087191641330719, + "learning_rate": 4.997971857483719e-05, + "loss": 1.5166, + "step": 174 + }, + { + "epoch": 0.027510856963194403, + "grad_norm": 0.08959636092185974, + "learning_rate": 4.997946899381675e-05, + "loss": 1.5503, + "step": 175 + }, + { + "epoch": 0.027668061860126943, + "grad_norm": 0.0951187014579773, + "learning_rate": 4.997921788714537e-05, + "loss": 1.4879, + "step": 176 + }, + { + "epoch": 0.027825266757059482, + "grad_norm": 0.09324768930673599, + "learning_rate": 4.997896525483841e-05, + "loss": 1.5714, + "step": 177 + }, + { + "epoch": 0.027982471653992022, + "grad_norm": 0.08633986115455627, + "learning_rate": 4.997871109691129e-05, + "loss": 1.4198, + "step": 178 + }, + { + "epoch": 0.028139676550924562, + "grad_norm": 0.08947525173425674, + "learning_rate": 4.9978455413379535e-05, + "loss": 1.4702, + "step": 179 + }, + { + "epoch": 0.0282968814478571, + "grad_norm": 0.09275490790605545, + "learning_rate": 4.9978198204258766e-05, + "loss": 1.5252, + "step": 180 + }, + { + "epoch": 0.02845408634478964, + "grad_norm": 0.08761609345674515, + "learning_rate": 4.9977939469564676e-05, + "loss": 1.505, + "step": 181 + }, + { + "epoch": 0.02861129124172218, + "grad_norm": 0.08683087676763535, + "learning_rate": 4.997767920931308e-05, + "loss": 1.5059, + "step": 182 + }, + { + "epoch": 0.02876849613865472, + "grad_norm": 0.08931361883878708, + "learning_rate": 4.997741742351988e-05, + "loss": 1.5003, + "step": 183 + }, + { + "epoch": 0.028925701035587257, + "grad_norm": 0.08820109069347382, + "learning_rate": 4.997715411220105e-05, + "loss": 1.5132, + "step": 184 + }, + { + "epoch": 0.029082905932519797, + "grad_norm": 0.09284964948892593, + "learning_rate": 4.997688927537268e-05, + "loss": 1.4561, + "step": 185 + }, + { + "epoch": 0.029240110829452336, + "grad_norm": 0.09472864121198654, + "learning_rate": 4.997662291305094e-05, + "loss": 1.4729, + "step": 186 + }, + { + "epoch": 0.029397315726384876, + "grad_norm": 0.08725330233573914, + "learning_rate": 4.997635502525211e-05, + "loss": 1.3994, + "step": 187 + }, + { + "epoch": 0.029554520623317416, + "grad_norm": 0.09085626900196075, + "learning_rate": 4.9976085611992536e-05, + "loss": 1.4695, + "step": 188 + }, + { + "epoch": 0.029711725520249956, + "grad_norm": 0.09322400391101837, + "learning_rate": 4.9975814673288684e-05, + "loss": 1.4753, + "step": 189 + }, + { + "epoch": 0.029868930417182495, + "grad_norm": 0.08927160501480103, + "learning_rate": 4.99755422091571e-05, + "loss": 1.4465, + "step": 190 + }, + { + "epoch": 0.030026135314115035, + "grad_norm": 0.09317070990800858, + "learning_rate": 4.997526821961442e-05, + "loss": 1.5124, + "step": 191 + }, + { + "epoch": 0.030183340211047575, + "grad_norm": 0.08911167085170746, + "learning_rate": 4.9974992704677385e-05, + "loss": 1.4515, + "step": 192 + }, + { + "epoch": 0.030340545107980114, + "grad_norm": 0.09432853013277054, + "learning_rate": 4.997471566436282e-05, + "loss": 1.4623, + "step": 193 + }, + { + "epoch": 0.030497750004912654, + "grad_norm": 0.09417332708835602, + "learning_rate": 4.997443709868764e-05, + "loss": 1.5103, + "step": 194 + }, + { + "epoch": 0.030654954901845194, + "grad_norm": 0.09564542025327682, + "learning_rate": 4.997415700766887e-05, + "loss": 1.4929, + "step": 195 + }, + { + "epoch": 0.030812159798777734, + "grad_norm": 0.09101004898548126, + "learning_rate": 4.997387539132361e-05, + "loss": 1.4225, + "step": 196 + }, + { + "epoch": 0.03096936469571027, + "grad_norm": 0.09196274727582932, + "learning_rate": 4.997359224966906e-05, + "loss": 1.4701, + "step": 197 + }, + { + "epoch": 0.03112656959264281, + "grad_norm": 0.09573279321193695, + "learning_rate": 4.997330758272251e-05, + "loss": 1.4425, + "step": 198 + }, + { + "epoch": 0.03128377448957535, + "grad_norm": 0.09180758893489838, + "learning_rate": 4.9973021390501354e-05, + "loss": 1.4426, + "step": 199 + }, + { + "epoch": 0.03144097938650789, + "grad_norm": 0.09583238512277603, + "learning_rate": 4.997273367302306e-05, + "loss": 1.5158, + "step": 200 + }, + { + "epoch": 0.03159818428344043, + "grad_norm": 0.09394747018814087, + "learning_rate": 4.997244443030521e-05, + "loss": 1.4306, + "step": 201 + }, + { + "epoch": 0.03175538918037297, + "grad_norm": 0.09470199793577194, + "learning_rate": 4.9972153662365474e-05, + "loss": 1.5286, + "step": 202 + }, + { + "epoch": 0.031912594077305505, + "grad_norm": 0.09274959564208984, + "learning_rate": 4.997186136922161e-05, + "loss": 1.4803, + "step": 203 + }, + { + "epoch": 0.03206979897423805, + "grad_norm": 0.09344369918107986, + "learning_rate": 4.997156755089145e-05, + "loss": 1.5449, + "step": 204 + }, + { + "epoch": 0.032227003871170584, + "grad_norm": 0.09794919937849045, + "learning_rate": 4.997127220739296e-05, + "loss": 1.4383, + "step": 205 + }, + { + "epoch": 0.03238420876810313, + "grad_norm": 0.09698093682527542, + "learning_rate": 4.997097533874418e-05, + "loss": 1.4462, + "step": 206 + }, + { + "epoch": 0.032541413665035664, + "grad_norm": 0.09690559655427933, + "learning_rate": 4.997067694496323e-05, + "loss": 1.4735, + "step": 207 + }, + { + "epoch": 0.03269861856196821, + "grad_norm": 0.09657544642686844, + "learning_rate": 4.9970377026068336e-05, + "loss": 1.5672, + "step": 208 + }, + { + "epoch": 0.03285582345890074, + "grad_norm": 0.09483659267425537, + "learning_rate": 4.9970075582077825e-05, + "loss": 1.4931, + "step": 209 + }, + { + "epoch": 0.033013028355833286, + "grad_norm": 0.09744243323802948, + "learning_rate": 4.9969772613010104e-05, + "loss": 1.4638, + "step": 210 + }, + { + "epoch": 0.03317023325276582, + "grad_norm": 0.09521006047725677, + "learning_rate": 4.9969468118883665e-05, + "loss": 1.4127, + "step": 211 + }, + { + "epoch": 0.033327438149698366, + "grad_norm": 0.09646004438400269, + "learning_rate": 4.996916209971713e-05, + "loss": 1.5139, + "step": 212 + }, + { + "epoch": 0.0334846430466309, + "grad_norm": 0.09292810410261154, + "learning_rate": 4.996885455552916e-05, + "loss": 1.4399, + "step": 213 + }, + { + "epoch": 0.033641847943563445, + "grad_norm": 0.09986516088247299, + "learning_rate": 4.996854548633857e-05, + "loss": 1.4637, + "step": 214 + }, + { + "epoch": 0.03379905284049598, + "grad_norm": 0.09723702073097229, + "learning_rate": 4.996823489216421e-05, + "loss": 1.5673, + "step": 215 + }, + { + "epoch": 0.03395625773742852, + "grad_norm": 0.09608977288007736, + "learning_rate": 4.996792277302507e-05, + "loss": 1.4428, + "step": 216 + }, + { + "epoch": 0.03411346263436106, + "grad_norm": 0.09329380095005035, + "learning_rate": 4.99676091289402e-05, + "loss": 1.3892, + "step": 217 + }, + { + "epoch": 0.0342706675312936, + "grad_norm": 0.0959913358092308, + "learning_rate": 4.996729395992875e-05, + "loss": 1.5219, + "step": 218 + }, + { + "epoch": 0.03442787242822614, + "grad_norm": 0.09832671284675598, + "learning_rate": 4.996697726600999e-05, + "loss": 1.5259, + "step": 219 + }, + { + "epoch": 0.03458507732515868, + "grad_norm": 0.10061636567115784, + "learning_rate": 4.996665904720325e-05, + "loss": 1.5216, + "step": 220 + }, + { + "epoch": 0.03474228222209122, + "grad_norm": 0.09742400050163269, + "learning_rate": 4.9966339303527965e-05, + "loss": 1.3819, + "step": 221 + }, + { + "epoch": 0.034899487119023756, + "grad_norm": 0.09629969298839569, + "learning_rate": 4.996601803500367e-05, + "loss": 1.5341, + "step": 222 + }, + { + "epoch": 0.0350566920159563, + "grad_norm": 0.09776200354099274, + "learning_rate": 4.996569524164998e-05, + "loss": 1.5054, + "step": 223 + }, + { + "epoch": 0.035213896912888835, + "grad_norm": 0.1008530780673027, + "learning_rate": 4.996537092348661e-05, + "loss": 1.5333, + "step": 224 + }, + { + "epoch": 0.03537110180982138, + "grad_norm": 0.09749735891819, + "learning_rate": 4.996504508053338e-05, + "loss": 1.3899, + "step": 225 + }, + { + "epoch": 0.035528306706753915, + "grad_norm": 0.10522401332855225, + "learning_rate": 4.9964717712810175e-05, + "loss": 1.5413, + "step": 226 + }, + { + "epoch": 0.03568551160368646, + "grad_norm": 0.09566272795200348, + "learning_rate": 4.9964388820336996e-05, + "loss": 1.435, + "step": 227 + }, + { + "epoch": 0.035842716500618994, + "grad_norm": 0.10133984684944153, + "learning_rate": 4.996405840313393e-05, + "loss": 1.445, + "step": 228 + }, + { + "epoch": 0.03599992139755153, + "grad_norm": 0.09702739119529724, + "learning_rate": 4.996372646122116e-05, + "loss": 1.4287, + "step": 229 + }, + { + "epoch": 0.036157126294484074, + "grad_norm": 0.1012992411851883, + "learning_rate": 4.996339299461896e-05, + "loss": 1.382, + "step": 230 + }, + { + "epoch": 0.03631433119141661, + "grad_norm": 0.09877166152000427, + "learning_rate": 4.99630580033477e-05, + "loss": 1.5729, + "step": 231 + }, + { + "epoch": 0.03647153608834915, + "grad_norm": 0.1033129170536995, + "learning_rate": 4.996272148742783e-05, + "loss": 1.4754, + "step": 232 + }, + { + "epoch": 0.03662874098528169, + "grad_norm": 0.09901215881109238, + "learning_rate": 4.9962383446879914e-05, + "loss": 1.5153, + "step": 233 + }, + { + "epoch": 0.03678594588221423, + "grad_norm": 0.10241983830928802, + "learning_rate": 4.996204388172458e-05, + "loss": 1.5131, + "step": 234 + }, + { + "epoch": 0.03694315077914677, + "grad_norm": 0.09574593603610992, + "learning_rate": 4.9961702791982594e-05, + "loss": 1.5285, + "step": 235 + }, + { + "epoch": 0.03710035567607931, + "grad_norm": 0.10309838503599167, + "learning_rate": 4.996136017767477e-05, + "loss": 1.5751, + "step": 236 + }, + { + "epoch": 0.03725756057301185, + "grad_norm": 0.09928470849990845, + "learning_rate": 4.996101603882204e-05, + "loss": 1.5108, + "step": 237 + }, + { + "epoch": 0.03741476546994439, + "grad_norm": 0.10514767467975616, + "learning_rate": 4.996067037544542e-05, + "loss": 1.4206, + "step": 238 + }, + { + "epoch": 0.03757197036687693, + "grad_norm": 0.10411518812179565, + "learning_rate": 4.996032318756601e-05, + "loss": 1.5628, + "step": 239 + }, + { + "epoch": 0.03772917526380947, + "grad_norm": 0.0989808738231659, + "learning_rate": 4.9959974475205045e-05, + "loss": 1.4444, + "step": 240 + }, + { + "epoch": 0.03788638016074201, + "grad_norm": 0.10069911926984787, + "learning_rate": 4.9959624238383804e-05, + "loss": 1.4805, + "step": 241 + }, + { + "epoch": 0.038043585057674544, + "grad_norm": 0.10637518763542175, + "learning_rate": 4.995927247712367e-05, + "loss": 1.5289, + "step": 242 + }, + { + "epoch": 0.03820078995460709, + "grad_norm": 0.10085684061050415, + "learning_rate": 4.995891919144614e-05, + "loss": 1.5288, + "step": 243 + }, + { + "epoch": 0.03835799485153962, + "grad_norm": 0.09989017248153687, + "learning_rate": 4.995856438137279e-05, + "loss": 1.5444, + "step": 244 + }, + { + "epoch": 0.038515199748472166, + "grad_norm": 0.10382463037967682, + "learning_rate": 4.9958208046925294e-05, + "loss": 1.4621, + "step": 245 + }, + { + "epoch": 0.0386724046454047, + "grad_norm": 0.10208063572645187, + "learning_rate": 4.99578501881254e-05, + "loss": 1.5003, + "step": 246 + }, + { + "epoch": 0.038829609542337246, + "grad_norm": 0.1028011366724968, + "learning_rate": 4.9957490804994977e-05, + "loss": 1.516, + "step": 247 + }, + { + "epoch": 0.03898681443926978, + "grad_norm": 0.10475701838731766, + "learning_rate": 4.995712989755598e-05, + "loss": 1.5333, + "step": 248 + }, + { + "epoch": 0.039144019336202325, + "grad_norm": 0.1038154736161232, + "learning_rate": 4.995676746583044e-05, + "loss": 1.4779, + "step": 249 + }, + { + "epoch": 0.03930122423313486, + "grad_norm": 0.10413440316915512, + "learning_rate": 4.99564035098405e-05, + "loss": 1.5241, + "step": 250 + }, + { + "epoch": 0.039458429130067404, + "grad_norm": 0.09869382530450821, + "learning_rate": 4.995603802960838e-05, + "loss": 1.442, + "step": 251 + }, + { + "epoch": 0.03961563402699994, + "grad_norm": 0.10138234496116638, + "learning_rate": 4.995567102515641e-05, + "loss": 1.5393, + "step": 252 + }, + { + "epoch": 0.039772838923932484, + "grad_norm": 0.10225867480039597, + "learning_rate": 4.995530249650701e-05, + "loss": 1.4516, + "step": 253 + }, + { + "epoch": 0.03993004382086502, + "grad_norm": 0.09942895174026489, + "learning_rate": 4.995493244368268e-05, + "loss": 1.4543, + "step": 254 + }, + { + "epoch": 0.040087248717797556, + "grad_norm": 0.11218860745429993, + "learning_rate": 4.995456086670602e-05, + "loss": 1.4985, + "step": 255 + }, + { + "epoch": 0.0402444536147301, + "grad_norm": 0.10839337855577469, + "learning_rate": 4.9954187765599736e-05, + "loss": 1.4805, + "step": 256 + }, + { + "epoch": 0.040401658511662636, + "grad_norm": 0.10317599028348923, + "learning_rate": 4.9953813140386595e-05, + "loss": 1.4412, + "step": 257 + }, + { + "epoch": 0.04055886340859518, + "grad_norm": 0.10285656154155731, + "learning_rate": 4.99534369910895e-05, + "loss": 1.476, + "step": 258 + }, + { + "epoch": 0.040716068305527715, + "grad_norm": 0.10330680012702942, + "learning_rate": 4.995305931773141e-05, + "loss": 1.5157, + "step": 259 + }, + { + "epoch": 0.04087327320246026, + "grad_norm": 0.1086694598197937, + "learning_rate": 4.99526801203354e-05, + "loss": 1.4999, + "step": 260 + }, + { + "epoch": 0.041030478099392795, + "grad_norm": 0.10800144821405411, + "learning_rate": 4.995229939892464e-05, + "loss": 1.4764, + "step": 261 + }, + { + "epoch": 0.04118768299632534, + "grad_norm": 0.10645303875207901, + "learning_rate": 4.9951917153522355e-05, + "loss": 1.4404, + "step": 262 + }, + { + "epoch": 0.041344887893257874, + "grad_norm": 0.10440964996814728, + "learning_rate": 4.9951533384151906e-05, + "loss": 1.3678, + "step": 263 + }, + { + "epoch": 0.04150209279019042, + "grad_norm": 0.10993078351020813, + "learning_rate": 4.995114809083673e-05, + "loss": 1.5064, + "step": 264 + }, + { + "epoch": 0.041659297687122954, + "grad_norm": 0.10710245370864868, + "learning_rate": 4.9950761273600366e-05, + "loss": 1.4134, + "step": 265 + }, + { + "epoch": 0.04181650258405549, + "grad_norm": 0.11030582338571548, + "learning_rate": 4.995037293246644e-05, + "loss": 1.5299, + "step": 266 + }, + { + "epoch": 0.04197370748098803, + "grad_norm": 0.1058267131447792, + "learning_rate": 4.994998306745866e-05, + "loss": 1.3654, + "step": 267 + }, + { + "epoch": 0.04213091237792057, + "grad_norm": 0.10541702806949615, + "learning_rate": 4.994959167860084e-05, + "loss": 1.4297, + "step": 268 + }, + { + "epoch": 0.04228811727485311, + "grad_norm": 0.11085420846939087, + "learning_rate": 4.994919876591689e-05, + "loss": 1.4876, + "step": 269 + }, + { + "epoch": 0.04244532217178565, + "grad_norm": 0.11054470390081406, + "learning_rate": 4.994880432943081e-05, + "loss": 1.574, + "step": 270 + }, + { + "epoch": 0.04260252706871819, + "grad_norm": 0.11234510689973831, + "learning_rate": 4.994840836916668e-05, + "loss": 1.5079, + "step": 271 + }, + { + "epoch": 0.04275973196565073, + "grad_norm": 0.11040106415748596, + "learning_rate": 4.994801088514869e-05, + "loss": 1.5091, + "step": 272 + }, + { + "epoch": 0.04291693686258327, + "grad_norm": 0.10639887303113937, + "learning_rate": 4.994761187740111e-05, + "loss": 1.4495, + "step": 273 + }, + { + "epoch": 0.04307414175951581, + "grad_norm": 0.11268071085214615, + "learning_rate": 4.994721134594833e-05, + "loss": 1.5057, + "step": 274 + }, + { + "epoch": 0.04323134665644835, + "grad_norm": 0.10079260170459747, + "learning_rate": 4.994680929081479e-05, + "loss": 1.4145, + "step": 275 + }, + { + "epoch": 0.04338855155338089, + "grad_norm": 0.11474710702896118, + "learning_rate": 4.994640571202506e-05, + "loss": 1.5061, + "step": 276 + }, + { + "epoch": 0.04354575645031343, + "grad_norm": 0.10946876555681229, + "learning_rate": 4.994600060960377e-05, + "loss": 1.5306, + "step": 277 + }, + { + "epoch": 0.04370296134724597, + "grad_norm": 0.11192137002944946, + "learning_rate": 4.994559398357569e-05, + "loss": 1.5347, + "step": 278 + }, + { + "epoch": 0.0438601662441785, + "grad_norm": 0.10744784027338028, + "learning_rate": 4.994518583396564e-05, + "loss": 1.4686, + "step": 279 + }, + { + "epoch": 0.044017371141111046, + "grad_norm": 0.11113352328538895, + "learning_rate": 4.9944776160798544e-05, + "loss": 1.4101, + "step": 280 + }, + { + "epoch": 0.04417457603804358, + "grad_norm": 0.11456230282783508, + "learning_rate": 4.994436496409943e-05, + "loss": 1.4036, + "step": 281 + }, + { + "epoch": 0.044331780934976125, + "grad_norm": 0.11608672887086868, + "learning_rate": 4.994395224389342e-05, + "loss": 1.4949, + "step": 282 + }, + { + "epoch": 0.04448898583190866, + "grad_norm": 0.1232326403260231, + "learning_rate": 4.9943538000205705e-05, + "loss": 1.5501, + "step": 283 + }, + { + "epoch": 0.044646190728841205, + "grad_norm": 0.11791515350341797, + "learning_rate": 4.994312223306159e-05, + "loss": 1.4542, + "step": 284 + }, + { + "epoch": 0.04480339562577374, + "grad_norm": 0.11657550930976868, + "learning_rate": 4.9942704942486476e-05, + "loss": 1.4724, + "step": 285 + }, + { + "epoch": 0.044960600522706284, + "grad_norm": 0.11560262739658356, + "learning_rate": 4.994228612850584e-05, + "loss": 1.4036, + "step": 286 + }, + { + "epoch": 0.04511780541963882, + "grad_norm": 0.10999175906181335, + "learning_rate": 4.994186579114527e-05, + "loss": 1.4489, + "step": 287 + }, + { + "epoch": 0.045275010316571364, + "grad_norm": 0.11586826294660568, + "learning_rate": 4.9941443930430436e-05, + "loss": 1.5486, + "step": 288 + }, + { + "epoch": 0.0454322152135039, + "grad_norm": 0.11349951475858688, + "learning_rate": 4.994102054638711e-05, + "loss": 1.5698, + "step": 289 + }, + { + "epoch": 0.04558942011043644, + "grad_norm": 0.11978698521852493, + "learning_rate": 4.9940595639041134e-05, + "loss": 1.3933, + "step": 290 + }, + { + "epoch": 0.04574662500736898, + "grad_norm": 0.11438622325658798, + "learning_rate": 4.994016920841846e-05, + "loss": 1.5005, + "step": 291 + }, + { + "epoch": 0.045903829904301516, + "grad_norm": 0.11395915597677231, + "learning_rate": 4.9939741254545155e-05, + "loss": 1.4521, + "step": 292 + }, + { + "epoch": 0.04606103480123406, + "grad_norm": 0.11659599095582962, + "learning_rate": 4.993931177744734e-05, + "loss": 1.5166, + "step": 293 + }, + { + "epoch": 0.046218239698166595, + "grad_norm": 0.11053171753883362, + "learning_rate": 4.9938880777151254e-05, + "loss": 1.4459, + "step": 294 + }, + { + "epoch": 0.04637544459509914, + "grad_norm": 0.11428084224462509, + "learning_rate": 4.993844825368321e-05, + "loss": 1.4448, + "step": 295 + }, + { + "epoch": 0.046532649492031675, + "grad_norm": 0.10734150558710098, + "learning_rate": 4.993801420706964e-05, + "loss": 1.3388, + "step": 296 + }, + { + "epoch": 0.04668985438896422, + "grad_norm": 0.11137369275093079, + "learning_rate": 4.993757863733703e-05, + "loss": 1.4155, + "step": 297 + }, + { + "epoch": 0.046847059285896754, + "grad_norm": 0.1221408098936081, + "learning_rate": 4.993714154451202e-05, + "loss": 1.4884, + "step": 298 + }, + { + "epoch": 0.0470042641828293, + "grad_norm": 0.11707969009876251, + "learning_rate": 4.993670292862127e-05, + "loss": 1.4605, + "step": 299 + }, + { + "epoch": 0.047161469079761834, + "grad_norm": 0.11751751601696014, + "learning_rate": 4.993626278969158e-05, + "loss": 1.5538, + "step": 300 + }, + { + "epoch": 0.04731867397669438, + "grad_norm": 0.11617731302976608, + "learning_rate": 4.993582112774984e-05, + "loss": 1.438, + "step": 301 + }, + { + "epoch": 0.04747587887362691, + "grad_norm": 0.15164637565612793, + "learning_rate": 4.993537794282302e-05, + "loss": 1.4607, + "step": 302 + }, + { + "epoch": 0.047633083770559456, + "grad_norm": 0.12434446811676025, + "learning_rate": 4.9934933234938193e-05, + "loss": 1.4167, + "step": 303 + }, + { + "epoch": 0.04779028866749199, + "grad_norm": 0.12518739700317383, + "learning_rate": 4.993448700412251e-05, + "loss": 1.4003, + "step": 304 + }, + { + "epoch": 0.04794749356442453, + "grad_norm": 0.11146944761276245, + "learning_rate": 4.993403925040323e-05, + "loss": 1.3913, + "step": 305 + }, + { + "epoch": 0.04810469846135707, + "grad_norm": 0.11682326346635818, + "learning_rate": 4.993358997380771e-05, + "loss": 1.3415, + "step": 306 + }, + { + "epoch": 0.04826190335828961, + "grad_norm": 0.1197504773736, + "learning_rate": 4.993313917436336e-05, + "loss": 1.515, + "step": 307 + }, + { + "epoch": 0.04841910825522215, + "grad_norm": 0.14647473394870758, + "learning_rate": 4.993268685209775e-05, + "loss": 1.4529, + "step": 308 + }, + { + "epoch": 0.04857631315215469, + "grad_norm": 0.12431525439023972, + "learning_rate": 4.9932233007038484e-05, + "loss": 1.5426, + "step": 309 + }, + { + "epoch": 0.04873351804908723, + "grad_norm": 0.11715538799762726, + "learning_rate": 4.9931777639213284e-05, + "loss": 1.4615, + "step": 310 + }, + { + "epoch": 0.04889072294601977, + "grad_norm": 0.12391876429319382, + "learning_rate": 4.993132074864997e-05, + "loss": 1.4138, + "step": 311 + }, + { + "epoch": 0.04904792784295231, + "grad_norm": 0.11894181370735168, + "learning_rate": 4.9930862335376444e-05, + "loss": 1.4383, + "step": 312 + }, + { + "epoch": 0.049205132739884846, + "grad_norm": 0.1225295439362526, + "learning_rate": 4.9930402399420695e-05, + "loss": 1.3847, + "step": 313 + }, + { + "epoch": 0.04936233763681739, + "grad_norm": 0.11435995995998383, + "learning_rate": 4.9929940940810825e-05, + "loss": 1.4254, + "step": 314 + }, + { + "epoch": 0.049519542533749926, + "grad_norm": 0.11988761276006699, + "learning_rate": 4.9929477959575024e-05, + "loss": 1.4787, + "step": 315 + }, + { + "epoch": 0.04967674743068247, + "grad_norm": 0.11983373016119003, + "learning_rate": 4.992901345574155e-05, + "loss": 1.4341, + "step": 316 + }, + { + "epoch": 0.049833952327615005, + "grad_norm": 0.13395054638385773, + "learning_rate": 4.992854742933878e-05, + "loss": 1.4315, + "step": 317 + }, + { + "epoch": 0.04999115722454754, + "grad_norm": 0.12578143179416656, + "learning_rate": 4.9928079880395186e-05, + "loss": 1.4143, + "step": 318 + }, + { + "epoch": 0.050148362121480085, + "grad_norm": 0.1401878446340561, + "learning_rate": 4.992761080893932e-05, + "loss": 1.4665, + "step": 319 + }, + { + "epoch": 0.05030556701841262, + "grad_norm": 0.13048145174980164, + "learning_rate": 4.9927140214999826e-05, + "loss": 1.4266, + "step": 320 + }, + { + "epoch": 0.05030556701841262, + "eval_loss": 1.450086236000061, + "eval_runtime": 2316.1877, + "eval_samples_per_second": 3.997, + "eval_steps_per_second": 1.999, + "step": 320 + }, + { + "epoch": 0.050462771915345164, + "grad_norm": 0.13121232390403748, + "learning_rate": 4.992666809860545e-05, + "loss": 1.4946, + "step": 321 + }, + { + "epoch": 0.0506199768122777, + "grad_norm": 0.13547195494174957, + "learning_rate": 4.9926194459785015e-05, + "loss": 1.5532, + "step": 322 + }, + { + "epoch": 0.050777181709210244, + "grad_norm": 0.11797169595956802, + "learning_rate": 4.992571929856747e-05, + "loss": 1.4118, + "step": 323 + }, + { + "epoch": 0.05093438660614278, + "grad_norm": 0.12734922766685486, + "learning_rate": 4.992524261498183e-05, + "loss": 1.4427, + "step": 324 + }, + { + "epoch": 0.05109159150307532, + "grad_norm": 0.12444902211427689, + "learning_rate": 4.99247644090572e-05, + "loss": 1.4369, + "step": 325 + }, + { + "epoch": 0.05124879640000786, + "grad_norm": 0.12244518846273422, + "learning_rate": 4.99242846808228e-05, + "loss": 1.4587, + "step": 326 + }, + { + "epoch": 0.0514060012969404, + "grad_norm": 0.12424397468566895, + "learning_rate": 4.9923803430307916e-05, + "loss": 1.3949, + "step": 327 + }, + { + "epoch": 0.05156320619387294, + "grad_norm": 0.1352718621492386, + "learning_rate": 4.9923320657541944e-05, + "loss": 1.504, + "step": 328 + }, + { + "epoch": 0.05172041109080548, + "grad_norm": 0.12855666875839233, + "learning_rate": 4.992283636255438e-05, + "loss": 1.4271, + "step": 329 + }, + { + "epoch": 0.05187761598773802, + "grad_norm": 0.129829540848732, + "learning_rate": 4.99223505453748e-05, + "loss": 1.455, + "step": 330 + }, + { + "epoch": 0.052034820884670555, + "grad_norm": 0.12780050933361053, + "learning_rate": 4.992186320603286e-05, + "loss": 1.4045, + "step": 331 + }, + { + "epoch": 0.0521920257816031, + "grad_norm": 0.13515712320804596, + "learning_rate": 4.992137434455834e-05, + "loss": 1.4335, + "step": 332 + }, + { + "epoch": 0.052349230678535634, + "grad_norm": 0.15026766061782837, + "learning_rate": 4.99208839609811e-05, + "loss": 1.5386, + "step": 333 + }, + { + "epoch": 0.05250643557546818, + "grad_norm": 0.13422101736068726, + "learning_rate": 4.992039205533108e-05, + "loss": 1.454, + "step": 334 + }, + { + "epoch": 0.05266364047240071, + "grad_norm": 0.13735777139663696, + "learning_rate": 4.991989862763833e-05, + "loss": 1.4415, + "step": 335 + }, + { + "epoch": 0.05282084536933326, + "grad_norm": 0.12985137104988098, + "learning_rate": 4.9919403677932994e-05, + "loss": 1.385, + "step": 336 + }, + { + "epoch": 0.05297805026626579, + "grad_norm": 0.1301167607307434, + "learning_rate": 4.9918907206245285e-05, + "loss": 1.4364, + "step": 337 + }, + { + "epoch": 0.053135255163198336, + "grad_norm": 0.1407599002122879, + "learning_rate": 4.991840921260553e-05, + "loss": 1.4454, + "step": 338 + }, + { + "epoch": 0.05329246006013087, + "grad_norm": 0.12763133645057678, + "learning_rate": 4.9917909697044164e-05, + "loss": 1.4008, + "step": 339 + }, + { + "epoch": 0.053449664957063416, + "grad_norm": 0.1443052589893341, + "learning_rate": 4.991740865959167e-05, + "loss": 1.5184, + "step": 340 + }, + { + "epoch": 0.05360686985399595, + "grad_norm": 0.13496418297290802, + "learning_rate": 4.991690610027866e-05, + "loss": 1.3888, + "step": 341 + }, + { + "epoch": 0.05376407475092849, + "grad_norm": 0.12681293487548828, + "learning_rate": 4.991640201913583e-05, + "loss": 1.42, + "step": 342 + }, + { + "epoch": 0.05392127964786103, + "grad_norm": 0.13178062438964844, + "learning_rate": 4.9915896416193965e-05, + "loss": 1.4178, + "step": 343 + }, + { + "epoch": 0.05407848454479357, + "grad_norm": 0.14452503621578217, + "learning_rate": 4.991538929148394e-05, + "loss": 1.4248, + "step": 344 + }, + { + "epoch": 0.05423568944172611, + "grad_norm": 0.1352955400943756, + "learning_rate": 4.991488064503674e-05, + "loss": 1.4304, + "step": 345 + }, + { + "epoch": 0.05439289433865865, + "grad_norm": 0.14846469461917877, + "learning_rate": 4.991437047688343e-05, + "loss": 1.4784, + "step": 346 + }, + { + "epoch": 0.05455009923559119, + "grad_norm": 0.12475849688053131, + "learning_rate": 4.9913858787055156e-05, + "loss": 1.4131, + "step": 347 + }, + { + "epoch": 0.054707304132523726, + "grad_norm": 0.13835409283638, + "learning_rate": 4.991334557558318e-05, + "loss": 1.4913, + "step": 348 + }, + { + "epoch": 0.05486450902945627, + "grad_norm": 0.13921529054641724, + "learning_rate": 4.991283084249885e-05, + "loss": 1.3713, + "step": 349 + }, + { + "epoch": 0.055021713926388806, + "grad_norm": 0.13188250362873077, + "learning_rate": 4.9912314587833586e-05, + "loss": 1.3608, + "step": 350 + }, + { + "epoch": 0.05517891882332135, + "grad_norm": 0.12457428872585297, + "learning_rate": 4.991179681161895e-05, + "loss": 1.4427, + "step": 351 + }, + { + "epoch": 0.055336123720253885, + "grad_norm": 0.12452542781829834, + "learning_rate": 4.9911277513886535e-05, + "loss": 1.4179, + "step": 352 + }, + { + "epoch": 0.05549332861718643, + "grad_norm": 0.14799195528030396, + "learning_rate": 4.9910756694668074e-05, + "loss": 1.4532, + "step": 353 + }, + { + "epoch": 0.055650533514118965, + "grad_norm": 0.13485541939735413, + "learning_rate": 4.991023435399538e-05, + "loss": 1.4114, + "step": 354 + }, + { + "epoch": 0.0558077384110515, + "grad_norm": 0.1422443389892578, + "learning_rate": 4.990971049190034e-05, + "loss": 1.377, + "step": 355 + }, + { + "epoch": 0.055964943307984044, + "grad_norm": 0.12994804978370667, + "learning_rate": 4.990918510841496e-05, + "loss": 1.4474, + "step": 356 + }, + { + "epoch": 0.05612214820491658, + "grad_norm": 0.1429785192012787, + "learning_rate": 4.990865820357133e-05, + "loss": 1.4435, + "step": 357 + }, + { + "epoch": 0.056279353101849124, + "grad_norm": 0.12979790568351746, + "learning_rate": 4.9908129777401625e-05, + "loss": 1.4039, + "step": 358 + }, + { + "epoch": 0.05643655799878166, + "grad_norm": 0.1332644671201706, + "learning_rate": 4.990759982993812e-05, + "loss": 1.4377, + "step": 359 + }, + { + "epoch": 0.0565937628957142, + "grad_norm": 0.13796579837799072, + "learning_rate": 4.99070683612132e-05, + "loss": 1.3951, + "step": 360 + }, + { + "epoch": 0.05675096779264674, + "grad_norm": 0.14315246045589447, + "learning_rate": 4.9906535371259294e-05, + "loss": 1.4042, + "step": 361 + }, + { + "epoch": 0.05690817268957928, + "grad_norm": 0.1463768631219864, + "learning_rate": 4.9906000860108974e-05, + "loss": 1.461, + "step": 362 + }, + { + "epoch": 0.05706537758651182, + "grad_norm": 0.14041170477867126, + "learning_rate": 4.9905464827794884e-05, + "loss": 1.4147, + "step": 363 + }, + { + "epoch": 0.05722258248344436, + "grad_norm": 0.19242697954177856, + "learning_rate": 4.990492727434976e-05, + "loss": 1.3435, + "step": 364 + }, + { + "epoch": 0.0573797873803769, + "grad_norm": 0.1556611955165863, + "learning_rate": 4.990438819980644e-05, + "loss": 1.4075, + "step": 365 + }, + { + "epoch": 0.05753699227730944, + "grad_norm": 0.13157570362091064, + "learning_rate": 4.990384760419784e-05, + "loss": 1.3334, + "step": 366 + }, + { + "epoch": 0.05769419717424198, + "grad_norm": 0.17953743040561676, + "learning_rate": 4.990330548755698e-05, + "loss": 1.4609, + "step": 367 + }, + { + "epoch": 0.057851402071174514, + "grad_norm": 0.14179491996765137, + "learning_rate": 4.990276184991697e-05, + "loss": 1.4344, + "step": 368 + }, + { + "epoch": 0.05800860696810706, + "grad_norm": 0.16522593796253204, + "learning_rate": 4.9902216691311024e-05, + "loss": 1.3794, + "step": 369 + }, + { + "epoch": 0.05816581186503959, + "grad_norm": 0.12736016511917114, + "learning_rate": 4.9901670011772425e-05, + "loss": 1.4167, + "step": 370 + }, + { + "epoch": 0.058323016761972137, + "grad_norm": 0.15869787335395813, + "learning_rate": 4.990112181133456e-05, + "loss": 1.4293, + "step": 371 + }, + { + "epoch": 0.05848022165890467, + "grad_norm": 0.14410504698753357, + "learning_rate": 4.990057209003093e-05, + "loss": 1.4357, + "step": 372 + }, + { + "epoch": 0.058637426555837216, + "grad_norm": 0.1567080020904541, + "learning_rate": 4.9900020847895086e-05, + "loss": 1.4146, + "step": 373 + }, + { + "epoch": 0.05879463145276975, + "grad_norm": 0.1430107057094574, + "learning_rate": 4.989946808496071e-05, + "loss": 1.3415, + "step": 374 + }, + { + "epoch": 0.058951836349702295, + "grad_norm": 0.146332785487175, + "learning_rate": 4.989891380126156e-05, + "loss": 1.496, + "step": 375 + }, + { + "epoch": 0.05910904124663483, + "grad_norm": 0.13674487173557281, + "learning_rate": 4.989835799683149e-05, + "loss": 1.3611, + "step": 376 + }, + { + "epoch": 0.059266246143567375, + "grad_norm": 0.1321984827518463, + "learning_rate": 4.989780067170444e-05, + "loss": 1.4695, + "step": 377 + }, + { + "epoch": 0.05942345104049991, + "grad_norm": 0.1535942554473877, + "learning_rate": 4.9897241825914464e-05, + "loss": 1.3564, + "step": 378 + }, + { + "epoch": 0.059580655937432454, + "grad_norm": 0.1538037806749344, + "learning_rate": 4.989668145949568e-05, + "loss": 1.3502, + "step": 379 + }, + { + "epoch": 0.05973786083436499, + "grad_norm": 0.15744829177856445, + "learning_rate": 4.989611957248232e-05, + "loss": 1.4318, + "step": 380 + }, + { + "epoch": 0.05989506573129753, + "grad_norm": 0.17178332805633545, + "learning_rate": 4.98955561649087e-05, + "loss": 1.4643, + "step": 381 + }, + { + "epoch": 0.06005227062823007, + "grad_norm": 0.15913072228431702, + "learning_rate": 4.989499123680923e-05, + "loss": 1.487, + "step": 382 + }, + { + "epoch": 0.060209475525162606, + "grad_norm": 0.15134060382843018, + "learning_rate": 4.9894424788218415e-05, + "loss": 1.4705, + "step": 383 + }, + { + "epoch": 0.06036668042209515, + "grad_norm": 0.13704389333724976, + "learning_rate": 4.989385681917085e-05, + "loss": 1.4756, + "step": 384 + }, + { + "epoch": 0.060523885319027686, + "grad_norm": 0.14025503396987915, + "learning_rate": 4.989328732970122e-05, + "loss": 1.443, + "step": 385 + }, + { + "epoch": 0.06068109021596023, + "grad_norm": 0.1822325438261032, + "learning_rate": 4.9892716319844325e-05, + "loss": 1.3996, + "step": 386 + }, + { + "epoch": 0.060838295112892765, + "grad_norm": 0.15639656782150269, + "learning_rate": 4.989214378963502e-05, + "loss": 1.3656, + "step": 387 + }, + { + "epoch": 0.06099550000982531, + "grad_norm": 0.15097728371620178, + "learning_rate": 4.989156973910828e-05, + "loss": 1.4055, + "step": 388 + }, + { + "epoch": 0.061152704906757845, + "grad_norm": 0.18977142870426178, + "learning_rate": 4.989099416829917e-05, + "loss": 1.4472, + "step": 389 + }, + { + "epoch": 0.06130990980369039, + "grad_norm": 0.1596304178237915, + "learning_rate": 4.989041707724284e-05, + "loss": 1.4373, + "step": 390 + }, + { + "epoch": 0.061467114700622924, + "grad_norm": 0.171820729970932, + "learning_rate": 4.988983846597454e-05, + "loss": 1.468, + "step": 391 + }, + { + "epoch": 0.06162431959755547, + "grad_norm": 0.14266176521778107, + "learning_rate": 4.98892583345296e-05, + "loss": 1.4037, + "step": 392 + }, + { + "epoch": 0.061781524494488003, + "grad_norm": 0.13375528156757355, + "learning_rate": 4.988867668294346e-05, + "loss": 1.437, + "step": 393 + }, + { + "epoch": 0.06193872939142054, + "grad_norm": 0.13332228362560272, + "learning_rate": 4.988809351125165e-05, + "loss": 1.3892, + "step": 394 + }, + { + "epoch": 0.06209593428835308, + "grad_norm": 0.17180980741977692, + "learning_rate": 4.988750881948977e-05, + "loss": 1.3494, + "step": 395 + }, + { + "epoch": 0.06225313918528562, + "grad_norm": 0.1419111043214798, + "learning_rate": 4.988692260769355e-05, + "loss": 1.3748, + "step": 396 + }, + { + "epoch": 0.06241034408221816, + "grad_norm": 0.17256620526313782, + "learning_rate": 4.9886334875898776e-05, + "loss": 1.3549, + "step": 397 + }, + { + "epoch": 0.0625675489791507, + "grad_norm": 0.2243422418832779, + "learning_rate": 4.988574562414137e-05, + "loss": 1.4465, + "step": 398 + }, + { + "epoch": 0.06272475387608324, + "grad_norm": 0.15700723230838776, + "learning_rate": 4.9885154852457294e-05, + "loss": 1.4477, + "step": 399 + }, + { + "epoch": 0.06288195877301578, + "grad_norm": 0.14497259259223938, + "learning_rate": 4.988456256088264e-05, + "loss": 1.3861, + "step": 400 + }, + { + "epoch": 0.06303916366994831, + "grad_norm": 0.14747034013271332, + "learning_rate": 4.988396874945359e-05, + "loss": 1.4206, + "step": 401 + }, + { + "epoch": 0.06319636856688086, + "grad_norm": 0.17671054601669312, + "learning_rate": 4.98833734182064e-05, + "loss": 1.2475, + "step": 402 + }, + { + "epoch": 0.0633535734638134, + "grad_norm": 0.16974316537380219, + "learning_rate": 4.9882776567177446e-05, + "loss": 1.4955, + "step": 403 + }, + { + "epoch": 0.06351077836074594, + "grad_norm": 0.15419775247573853, + "learning_rate": 4.988217819640317e-05, + "loss": 1.4209, + "step": 404 + }, + { + "epoch": 0.06366798325767847, + "grad_norm": 0.13987664878368378, + "learning_rate": 4.988157830592012e-05, + "loss": 1.456, + "step": 405 + }, + { + "epoch": 0.06382518815461101, + "grad_norm": 0.24560455977916718, + "learning_rate": 4.988097689576493e-05, + "loss": 1.3567, + "step": 406 + }, + { + "epoch": 0.06398239305154356, + "grad_norm": 0.13870076835155487, + "learning_rate": 4.9880373965974334e-05, + "loss": 1.3752, + "step": 407 + }, + { + "epoch": 0.0641395979484761, + "grad_norm": 0.16167718172073364, + "learning_rate": 4.987976951658517e-05, + "loss": 1.4766, + "step": 408 + }, + { + "epoch": 0.06429680284540863, + "grad_norm": 0.1700398474931717, + "learning_rate": 4.9879163547634346e-05, + "loss": 1.427, + "step": 409 + }, + { + "epoch": 0.06445400774234117, + "grad_norm": 0.15502458810806274, + "learning_rate": 4.987855605915887e-05, + "loss": 1.3965, + "step": 410 + }, + { + "epoch": 0.06461121263927372, + "grad_norm": 0.14834032952785492, + "learning_rate": 4.987794705119584e-05, + "loss": 1.4399, + "step": 411 + }, + { + "epoch": 0.06476841753620625, + "grad_norm": 0.22443649172782898, + "learning_rate": 4.987733652378246e-05, + "loss": 1.3736, + "step": 412 + }, + { + "epoch": 0.06492562243313879, + "grad_norm": 0.14396560192108154, + "learning_rate": 4.9876724476956015e-05, + "loss": 1.4648, + "step": 413 + }, + { + "epoch": 0.06508282733007133, + "grad_norm": 0.15352006256580353, + "learning_rate": 4.987611091075389e-05, + "loss": 1.4988, + "step": 414 + }, + { + "epoch": 0.06524003222700388, + "grad_norm": 0.13210074603557587, + "learning_rate": 4.987549582521356e-05, + "loss": 1.3705, + "step": 415 + }, + { + "epoch": 0.06539723712393641, + "grad_norm": 0.16056782007217407, + "learning_rate": 4.98748792203726e-05, + "loss": 1.3388, + "step": 416 + }, + { + "epoch": 0.06555444202086895, + "grad_norm": 0.18992343544960022, + "learning_rate": 4.9874261096268647e-05, + "loss": 1.3842, + "step": 417 + }, + { + "epoch": 0.06571164691780149, + "grad_norm": 0.1789916455745697, + "learning_rate": 4.9873641452939466e-05, + "loss": 1.3622, + "step": 418 + }, + { + "epoch": 0.06586885181473402, + "grad_norm": 0.21043789386749268, + "learning_rate": 4.9873020290422915e-05, + "loss": 1.3477, + "step": 419 + }, + { + "epoch": 0.06602605671166657, + "grad_norm": 0.15355254709720612, + "learning_rate": 4.987239760875691e-05, + "loss": 1.3643, + "step": 420 + }, + { + "epoch": 0.06618326160859911, + "grad_norm": 0.1433190107345581, + "learning_rate": 4.9871773407979496e-05, + "loss": 1.3753, + "step": 421 + }, + { + "epoch": 0.06634046650553165, + "grad_norm": 0.17479249835014343, + "learning_rate": 4.987114768812879e-05, + "loss": 1.3809, + "step": 422 + }, + { + "epoch": 0.06649767140246418, + "grad_norm": 0.186944842338562, + "learning_rate": 4.987052044924302e-05, + "loss": 1.3616, + "step": 423 + }, + { + "epoch": 0.06665487629939673, + "grad_norm": 0.15202952921390533, + "learning_rate": 4.986989169136048e-05, + "loss": 1.4479, + "step": 424 + }, + { + "epoch": 0.06681208119632927, + "grad_norm": 0.16295532882213593, + "learning_rate": 4.9869261414519575e-05, + "loss": 1.3713, + "step": 425 + }, + { + "epoch": 0.0669692860932618, + "grad_norm": 0.19577625393867493, + "learning_rate": 4.986862961875881e-05, + "loss": 1.4199, + "step": 426 + }, + { + "epoch": 0.06712649099019434, + "grad_norm": 0.22768542170524597, + "learning_rate": 4.986799630411677e-05, + "loss": 1.3529, + "step": 427 + }, + { + "epoch": 0.06728369588712689, + "grad_norm": 0.25184011459350586, + "learning_rate": 4.986736147063212e-05, + "loss": 1.3944, + "step": 428 + }, + { + "epoch": 0.06744090078405943, + "grad_norm": 0.15565118193626404, + "learning_rate": 4.986672511834366e-05, + "loss": 1.4505, + "step": 429 + }, + { + "epoch": 0.06759810568099196, + "grad_norm": 0.16559922695159912, + "learning_rate": 4.986608724729024e-05, + "loss": 1.3742, + "step": 430 + }, + { + "epoch": 0.0677553105779245, + "grad_norm": 0.14826242625713348, + "learning_rate": 4.986544785751081e-05, + "loss": 1.4008, + "step": 431 + }, + { + "epoch": 0.06791251547485704, + "grad_norm": 0.16543184220790863, + "learning_rate": 4.986480694904444e-05, + "loss": 1.3433, + "step": 432 + }, + { + "epoch": 0.06806972037178959, + "grad_norm": 0.15332931280136108, + "learning_rate": 4.986416452193027e-05, + "loss": 1.4459, + "step": 433 + }, + { + "epoch": 0.06822692526872212, + "grad_norm": 0.18880733847618103, + "learning_rate": 4.986352057620752e-05, + "loss": 1.3902, + "step": 434 + }, + { + "epoch": 0.06838413016565466, + "grad_norm": 0.1513829231262207, + "learning_rate": 4.986287511191554e-05, + "loss": 1.3485, + "step": 435 + }, + { + "epoch": 0.0685413350625872, + "grad_norm": 0.15241704881191254, + "learning_rate": 4.9862228129093745e-05, + "loss": 1.3051, + "step": 436 + }, + { + "epoch": 0.06869853995951974, + "grad_norm": 0.1956702321767807, + "learning_rate": 4.986157962778165e-05, + "loss": 1.4647, + "step": 437 + }, + { + "epoch": 0.06885574485645228, + "grad_norm": 0.2027936428785324, + "learning_rate": 4.9860929608018866e-05, + "loss": 1.3602, + "step": 438 + }, + { + "epoch": 0.06901294975338482, + "grad_norm": 0.1623186320066452, + "learning_rate": 4.986027806984509e-05, + "loss": 1.4154, + "step": 439 + }, + { + "epoch": 0.06917015465031735, + "grad_norm": 0.16111283004283905, + "learning_rate": 4.985962501330011e-05, + "loss": 1.4311, + "step": 440 + }, + { + "epoch": 0.0693273595472499, + "grad_norm": 0.16754299402236938, + "learning_rate": 4.985897043842382e-05, + "loss": 1.349, + "step": 441 + }, + { + "epoch": 0.06948456444418244, + "grad_norm": 0.1766330897808075, + "learning_rate": 4.985831434525621e-05, + "loss": 1.3714, + "step": 442 + }, + { + "epoch": 0.06964176934111498, + "grad_norm": 0.1742810308933258, + "learning_rate": 4.985765673383733e-05, + "loss": 1.4161, + "step": 443 + }, + { + "epoch": 0.06979897423804751, + "grad_norm": 0.17025281488895416, + "learning_rate": 4.985699760420736e-05, + "loss": 1.3925, + "step": 444 + }, + { + "epoch": 0.06995617913498005, + "grad_norm": 0.19201375544071198, + "learning_rate": 4.985633695640655e-05, + "loss": 1.4158, + "step": 445 + }, + { + "epoch": 0.0701133840319126, + "grad_norm": 0.1636267751455307, + "learning_rate": 4.985567479047524e-05, + "loss": 1.4071, + "step": 446 + }, + { + "epoch": 0.07027058892884513, + "grad_norm": 0.19676333665847778, + "learning_rate": 4.9855011106453894e-05, + "loss": 1.3449, + "step": 447 + }, + { + "epoch": 0.07042779382577767, + "grad_norm": 0.17712907493114471, + "learning_rate": 4.985434590438303e-05, + "loss": 1.3421, + "step": 448 + }, + { + "epoch": 0.07058499872271021, + "grad_norm": 0.18515101075172424, + "learning_rate": 4.985367918430329e-05, + "loss": 1.4051, + "step": 449 + }, + { + "epoch": 0.07074220361964276, + "grad_norm": 0.17168915271759033, + "learning_rate": 4.985301094625538e-05, + "loss": 1.3093, + "step": 450 + }, + { + "epoch": 0.0708994085165753, + "grad_norm": 0.1891397386789322, + "learning_rate": 4.9852341190280127e-05, + "loss": 1.3075, + "step": 451 + }, + { + "epoch": 0.07105661341350783, + "grad_norm": 0.17731457948684692, + "learning_rate": 4.985166991641843e-05, + "loss": 1.3986, + "step": 452 + }, + { + "epoch": 0.07121381831044037, + "grad_norm": 0.18817296624183655, + "learning_rate": 4.985099712471129e-05, + "loss": 1.3531, + "step": 453 + }, + { + "epoch": 0.07137102320737292, + "grad_norm": 0.1782791018486023, + "learning_rate": 4.9850322815199795e-05, + "loss": 1.4064, + "step": 454 + }, + { + "epoch": 0.07152822810430545, + "grad_norm": 0.18053874373435974, + "learning_rate": 4.984964698792514e-05, + "loss": 1.4607, + "step": 455 + }, + { + "epoch": 0.07168543300123799, + "grad_norm": 0.286338746547699, + "learning_rate": 4.984896964292858e-05, + "loss": 1.3036, + "step": 456 + }, + { + "epoch": 0.07184263789817052, + "grad_norm": 0.2560707926750183, + "learning_rate": 4.98482907802515e-05, + "loss": 1.3428, + "step": 457 + }, + { + "epoch": 0.07199984279510306, + "grad_norm": 0.19296897947788239, + "learning_rate": 4.984761039993537e-05, + "loss": 1.3502, + "step": 458 + }, + { + "epoch": 0.07215704769203561, + "grad_norm": 0.19685949385166168, + "learning_rate": 4.9846928502021725e-05, + "loss": 1.4015, + "step": 459 + }, + { + "epoch": 0.07231425258896815, + "grad_norm": 0.1548481583595276, + "learning_rate": 4.984624508655223e-05, + "loss": 1.3698, + "step": 460 + }, + { + "epoch": 0.07247145748590068, + "grad_norm": 0.16076034307479858, + "learning_rate": 4.984556015356862e-05, + "loss": 1.3627, + "step": 461 + }, + { + "epoch": 0.07262866238283322, + "grad_norm": 0.18571603298187256, + "learning_rate": 4.9844873703112726e-05, + "loss": 1.3506, + "step": 462 + }, + { + "epoch": 0.07278586727976577, + "grad_norm": 0.1540035605430603, + "learning_rate": 4.984418573522648e-05, + "loss": 1.4483, + "step": 463 + }, + { + "epoch": 0.0729430721766983, + "grad_norm": 0.1730145364999771, + "learning_rate": 4.984349624995188e-05, + "loss": 1.3678, + "step": 464 + }, + { + "epoch": 0.07310027707363084, + "grad_norm": 0.26254212856292725, + "learning_rate": 4.984280524733107e-05, + "loss": 1.401, + "step": 465 + }, + { + "epoch": 0.07325748197056338, + "grad_norm": 0.2079063057899475, + "learning_rate": 4.984211272740623e-05, + "loss": 1.3655, + "step": 466 + }, + { + "epoch": 0.07341468686749593, + "grad_norm": 0.21711499989032745, + "learning_rate": 4.9841418690219653e-05, + "loss": 1.4011, + "step": 467 + }, + { + "epoch": 0.07357189176442847, + "grad_norm": 0.18226252496242523, + "learning_rate": 4.984072313581375e-05, + "loss": 1.4213, + "step": 468 + }, + { + "epoch": 0.073729096661361, + "grad_norm": 0.1463780552148819, + "learning_rate": 4.9840026064230984e-05, + "loss": 1.4519, + "step": 469 + }, + { + "epoch": 0.07388630155829354, + "grad_norm": 0.18232892453670502, + "learning_rate": 4.983932747551394e-05, + "loss": 1.3657, + "step": 470 + }, + { + "epoch": 0.07404350645522607, + "grad_norm": 0.19644559919834137, + "learning_rate": 4.9838627369705285e-05, + "loss": 1.3988, + "step": 471 + }, + { + "epoch": 0.07420071135215862, + "grad_norm": 0.16292576491832733, + "learning_rate": 4.983792574684776e-05, + "loss": 1.4369, + "step": 472 + }, + { + "epoch": 0.07435791624909116, + "grad_norm": 0.2244543433189392, + "learning_rate": 4.983722260698425e-05, + "loss": 1.4269, + "step": 473 + }, + { + "epoch": 0.0745151211460237, + "grad_norm": 0.2582489848136902, + "learning_rate": 4.9836517950157666e-05, + "loss": 1.3986, + "step": 474 + }, + { + "epoch": 0.07467232604295623, + "grad_norm": 0.15564194321632385, + "learning_rate": 4.983581177641108e-05, + "loss": 1.3871, + "step": 475 + }, + { + "epoch": 0.07482953093988878, + "grad_norm": 0.2301008552312851, + "learning_rate": 4.9835104085787596e-05, + "loss": 1.3572, + "step": 476 + }, + { + "epoch": 0.07498673583682132, + "grad_norm": 0.21603424847126007, + "learning_rate": 4.9834394878330444e-05, + "loss": 1.3803, + "step": 477 + }, + { + "epoch": 0.07514394073375386, + "grad_norm": 0.16744717955589294, + "learning_rate": 4.9833684154082937e-05, + "loss": 1.4233, + "step": 478 + }, + { + "epoch": 0.07530114563068639, + "grad_norm": 0.23016415536403656, + "learning_rate": 4.98329719130885e-05, + "loss": 1.3962, + "step": 479 + }, + { + "epoch": 0.07545835052761894, + "grad_norm": 0.19687114655971527, + "learning_rate": 4.983225815539061e-05, + "loss": 1.3667, + "step": 480 + }, + { + "epoch": 0.07545835052761894, + "eval_loss": 1.3748993873596191, + "eval_runtime": 2315.5952, + "eval_samples_per_second": 3.998, + "eval_steps_per_second": 1.999, + "step": 480 + }, + { + "epoch": 0.07561555542455148, + "grad_norm": 0.1833205670118332, + "learning_rate": 4.9831542881032884e-05, + "loss": 1.4365, + "step": 481 + }, + { + "epoch": 0.07577276032148401, + "grad_norm": 0.17124423384666443, + "learning_rate": 4.983082609005899e-05, + "loss": 1.3641, + "step": 482 + }, + { + "epoch": 0.07592996521841655, + "grad_norm": 0.17352670431137085, + "learning_rate": 4.9830107782512715e-05, + "loss": 1.3415, + "step": 483 + }, + { + "epoch": 0.07608717011534909, + "grad_norm": 0.20768220722675323, + "learning_rate": 4.982938795843793e-05, + "loss": 1.3261, + "step": 484 + }, + { + "epoch": 0.07624437501228164, + "grad_norm": 0.21459853649139404, + "learning_rate": 4.982866661787859e-05, + "loss": 1.4185, + "step": 485 + }, + { + "epoch": 0.07640157990921417, + "grad_norm": 0.26912233233451843, + "learning_rate": 4.982794376087877e-05, + "loss": 1.3941, + "step": 486 + }, + { + "epoch": 0.07655878480614671, + "grad_norm": 0.28497114777565, + "learning_rate": 4.982721938748261e-05, + "loss": 1.3201, + "step": 487 + }, + { + "epoch": 0.07671598970307925, + "grad_norm": 0.15378472208976746, + "learning_rate": 4.982649349773435e-05, + "loss": 1.3615, + "step": 488 + }, + { + "epoch": 0.0768731946000118, + "grad_norm": 0.16169893741607666, + "learning_rate": 4.982576609167831e-05, + "loss": 1.3342, + "step": 489 + }, + { + "epoch": 0.07703039949694433, + "grad_norm": 0.24693650007247925, + "learning_rate": 4.982503716935896e-05, + "loss": 1.3788, + "step": 490 + }, + { + "epoch": 0.07718760439387687, + "grad_norm": 0.1769181787967682, + "learning_rate": 4.982430673082077e-05, + "loss": 1.3664, + "step": 491 + }, + { + "epoch": 0.0773448092908094, + "grad_norm": 0.26325106620788574, + "learning_rate": 4.982357477610839e-05, + "loss": 1.3173, + "step": 492 + }, + { + "epoch": 0.07750201418774195, + "grad_norm": 0.2063319832086563, + "learning_rate": 4.9822841305266506e-05, + "loss": 1.4125, + "step": 493 + }, + { + "epoch": 0.07765921908467449, + "grad_norm": 0.29141879081726074, + "learning_rate": 4.982210631833992e-05, + "loss": 1.3596, + "step": 494 + }, + { + "epoch": 0.07781642398160703, + "grad_norm": 0.18967591226100922, + "learning_rate": 4.982136981537352e-05, + "loss": 1.4128, + "step": 495 + }, + { + "epoch": 0.07797362887853956, + "grad_norm": 0.2291795313358307, + "learning_rate": 4.9820631796412287e-05, + "loss": 1.3772, + "step": 496 + }, + { + "epoch": 0.0781308337754721, + "grad_norm": 0.200834721326828, + "learning_rate": 4.98198922615013e-05, + "loss": 1.369, + "step": 497 + }, + { + "epoch": 0.07828803867240465, + "grad_norm": 0.22960609197616577, + "learning_rate": 4.9819151210685736e-05, + "loss": 1.3979, + "step": 498 + }, + { + "epoch": 0.07844524356933719, + "grad_norm": 0.17247427999973297, + "learning_rate": 4.981840864401084e-05, + "loss": 1.3927, + "step": 499 + }, + { + "epoch": 0.07860244846626972, + "grad_norm": 0.2623608112335205, + "learning_rate": 4.981766456152198e-05, + "loss": 1.3919, + "step": 500 + }, + { + "epoch": 0.07875965336320226, + "grad_norm": 0.19911788403987885, + "learning_rate": 4.981691896326459e-05, + "loss": 1.3925, + "step": 501 + }, + { + "epoch": 0.07891685826013481, + "grad_norm": 0.24869734048843384, + "learning_rate": 4.9816171849284205e-05, + "loss": 1.3562, + "step": 502 + }, + { + "epoch": 0.07907406315706735, + "grad_norm": 0.31372350454330444, + "learning_rate": 4.981542321962647e-05, + "loss": 1.3211, + "step": 503 + }, + { + "epoch": 0.07923126805399988, + "grad_norm": 0.21760910749435425, + "learning_rate": 4.981467307433709e-05, + "loss": 1.3042, + "step": 504 + }, + { + "epoch": 0.07938847295093242, + "grad_norm": 0.2469843477010727, + "learning_rate": 4.9813921413461906e-05, + "loss": 1.2831, + "step": 505 + }, + { + "epoch": 0.07954567784786497, + "grad_norm": 0.24319148063659668, + "learning_rate": 4.981316823704681e-05, + "loss": 1.2703, + "step": 506 + }, + { + "epoch": 0.0797028827447975, + "grad_norm": 0.19718031585216522, + "learning_rate": 4.98124135451378e-05, + "loss": 1.3258, + "step": 507 + }, + { + "epoch": 0.07986008764173004, + "grad_norm": 0.17459236085414886, + "learning_rate": 4.981165733778098e-05, + "loss": 1.4248, + "step": 508 + }, + { + "epoch": 0.08001729253866258, + "grad_norm": 0.17684616148471832, + "learning_rate": 4.981089961502253e-05, + "loss": 1.3939, + "step": 509 + }, + { + "epoch": 0.08017449743559511, + "grad_norm": 0.17499729990959167, + "learning_rate": 4.981014037690874e-05, + "loss": 1.4156, + "step": 510 + }, + { + "epoch": 0.08033170233252766, + "grad_norm": 0.1901170015335083, + "learning_rate": 4.9809379623485964e-05, + "loss": 1.4209, + "step": 511 + }, + { + "epoch": 0.0804889072294602, + "grad_norm": 0.18230682611465454, + "learning_rate": 4.980861735480067e-05, + "loss": 1.4607, + "step": 512 + }, + { + "epoch": 0.08064611212639274, + "grad_norm": 0.22843636572360992, + "learning_rate": 4.9807853570899427e-05, + "loss": 1.3671, + "step": 513 + }, + { + "epoch": 0.08080331702332527, + "grad_norm": 0.2288489192724228, + "learning_rate": 4.980708827182887e-05, + "loss": 1.3657, + "step": 514 + }, + { + "epoch": 0.08096052192025782, + "grad_norm": 0.19647593796253204, + "learning_rate": 4.980632145763575e-05, + "loss": 1.4079, + "step": 515 + }, + { + "epoch": 0.08111772681719036, + "grad_norm": 0.20980435609817505, + "learning_rate": 4.98055531283669e-05, + "loss": 1.3746, + "step": 516 + }, + { + "epoch": 0.0812749317141229, + "grad_norm": 0.19381123781204224, + "learning_rate": 4.980478328406923e-05, + "loss": 1.3986, + "step": 517 + }, + { + "epoch": 0.08143213661105543, + "grad_norm": 0.2224361151456833, + "learning_rate": 4.980401192478979e-05, + "loss": 1.3082, + "step": 518 + }, + { + "epoch": 0.08158934150798797, + "grad_norm": 0.20567384362220764, + "learning_rate": 4.9803239050575664e-05, + "loss": 1.4417, + "step": 519 + }, + { + "epoch": 0.08174654640492052, + "grad_norm": 0.22890503704547882, + "learning_rate": 4.9802464661474074e-05, + "loss": 1.3034, + "step": 520 + }, + { + "epoch": 0.08190375130185305, + "grad_norm": 0.23220910131931305, + "learning_rate": 4.9801688757532304e-05, + "loss": 1.3705, + "step": 521 + }, + { + "epoch": 0.08206095619878559, + "grad_norm": 0.29084959626197815, + "learning_rate": 4.980091133879775e-05, + "loss": 1.3246, + "step": 522 + }, + { + "epoch": 0.08221816109571813, + "grad_norm": 0.15776456892490387, + "learning_rate": 4.9800132405317895e-05, + "loss": 1.4311, + "step": 523 + }, + { + "epoch": 0.08237536599265068, + "grad_norm": 0.2636071443557739, + "learning_rate": 4.9799351957140314e-05, + "loss": 1.3265, + "step": 524 + }, + { + "epoch": 0.08253257088958321, + "grad_norm": 0.20042134821414948, + "learning_rate": 4.979856999431266e-05, + "loss": 1.3257, + "step": 525 + }, + { + "epoch": 0.08268977578651575, + "grad_norm": 0.24039289355278015, + "learning_rate": 4.9797786516882714e-05, + "loss": 1.3999, + "step": 526 + }, + { + "epoch": 0.08284698068344828, + "grad_norm": 0.16932524740695953, + "learning_rate": 4.9797001524898315e-05, + "loss": 1.4113, + "step": 527 + }, + { + "epoch": 0.08300418558038083, + "grad_norm": 0.2101370096206665, + "learning_rate": 4.97962150184074e-05, + "loss": 1.3973, + "step": 528 + }, + { + "epoch": 0.08316139047731337, + "grad_norm": 0.20983585715293884, + "learning_rate": 4.979542699745803e-05, + "loss": 1.3255, + "step": 529 + }, + { + "epoch": 0.08331859537424591, + "grad_norm": 0.20477800071239471, + "learning_rate": 4.97946374620983e-05, + "loss": 1.4349, + "step": 530 + }, + { + "epoch": 0.08347580027117844, + "grad_norm": 0.22637289762496948, + "learning_rate": 4.979384641237647e-05, + "loss": 1.3263, + "step": 531 + }, + { + "epoch": 0.08363300516811098, + "grad_norm": 0.20332221686840057, + "learning_rate": 4.9793053848340835e-05, + "loss": 1.3411, + "step": 532 + }, + { + "epoch": 0.08379021006504353, + "grad_norm": 0.22744616866111755, + "learning_rate": 4.979225977003979e-05, + "loss": 1.4042, + "step": 533 + }, + { + "epoch": 0.08394741496197607, + "grad_norm": 0.20091576874256134, + "learning_rate": 4.979146417752185e-05, + "loss": 1.3218, + "step": 534 + }, + { + "epoch": 0.0841046198589086, + "grad_norm": 0.2225920408964157, + "learning_rate": 4.9790667070835604e-05, + "loss": 1.4223, + "step": 535 + }, + { + "epoch": 0.08426182475584114, + "grad_norm": 0.20447570085525513, + "learning_rate": 4.9789868450029745e-05, + "loss": 1.3884, + "step": 536 + }, + { + "epoch": 0.08441902965277369, + "grad_norm": 0.22765719890594482, + "learning_rate": 4.9789068315153035e-05, + "loss": 1.3575, + "step": 537 + }, + { + "epoch": 0.08457623454970623, + "grad_norm": 0.18886259198188782, + "learning_rate": 4.9788266666254343e-05, + "loss": 1.2737, + "step": 538 + }, + { + "epoch": 0.08473343944663876, + "grad_norm": 0.26551586389541626, + "learning_rate": 4.978746350338264e-05, + "loss": 1.3867, + "step": 539 + }, + { + "epoch": 0.0848906443435713, + "grad_norm": 0.29268744587898254, + "learning_rate": 4.9786658826586975e-05, + "loss": 1.4266, + "step": 540 + }, + { + "epoch": 0.08504784924050385, + "grad_norm": 0.2537211775779724, + "learning_rate": 4.97858526359165e-05, + "loss": 1.3402, + "step": 541 + }, + { + "epoch": 0.08520505413743638, + "grad_norm": 0.20287925004959106, + "learning_rate": 4.978504493142045e-05, + "loss": 1.3148, + "step": 542 + }, + { + "epoch": 0.08536225903436892, + "grad_norm": 0.18584851920604706, + "learning_rate": 4.978423571314814e-05, + "loss": 1.3293, + "step": 543 + }, + { + "epoch": 0.08551946393130146, + "grad_norm": 0.1944153755903244, + "learning_rate": 4.978342498114903e-05, + "loss": 1.4084, + "step": 544 + }, + { + "epoch": 0.08567666882823399, + "grad_norm": 0.18139739334583282, + "learning_rate": 4.978261273547261e-05, + "loss": 1.2734, + "step": 545 + }, + { + "epoch": 0.08583387372516654, + "grad_norm": 0.20824116468429565, + "learning_rate": 4.97817989761685e-05, + "loss": 1.3346, + "step": 546 + }, + { + "epoch": 0.08599107862209908, + "grad_norm": 0.16180047392845154, + "learning_rate": 4.978098370328639e-05, + "loss": 1.4547, + "step": 547 + }, + { + "epoch": 0.08614828351903162, + "grad_norm": 0.17156392335891724, + "learning_rate": 4.978016691687609e-05, + "loss": 1.366, + "step": 548 + }, + { + "epoch": 0.08630548841596415, + "grad_norm": 0.17913401126861572, + "learning_rate": 4.977934861698746e-05, + "loss": 1.2771, + "step": 549 + }, + { + "epoch": 0.0864626933128967, + "grad_norm": 0.17393502593040466, + "learning_rate": 4.977852880367051e-05, + "loss": 1.3061, + "step": 550 + }, + { + "epoch": 0.08661989820982924, + "grad_norm": 0.21741637587547302, + "learning_rate": 4.97777074769753e-05, + "loss": 1.3232, + "step": 551 + }, + { + "epoch": 0.08677710310676177, + "grad_norm": 0.26123344898223877, + "learning_rate": 4.977688463695198e-05, + "loss": 1.2678, + "step": 552 + }, + { + "epoch": 0.08693430800369431, + "grad_norm": 0.2508600354194641, + "learning_rate": 4.9776060283650826e-05, + "loss": 1.4543, + "step": 553 + }, + { + "epoch": 0.08709151290062686, + "grad_norm": 0.18527132272720337, + "learning_rate": 4.977523441712217e-05, + "loss": 1.3359, + "step": 554 + }, + { + "epoch": 0.0872487177975594, + "grad_norm": 0.24495406448841095, + "learning_rate": 4.977440703741646e-05, + "loss": 1.2892, + "step": 555 + }, + { + "epoch": 0.08740592269449193, + "grad_norm": 0.22759339213371277, + "learning_rate": 4.9773578144584235e-05, + "loss": 1.2212, + "step": 556 + }, + { + "epoch": 0.08756312759142447, + "grad_norm": 0.1627693474292755, + "learning_rate": 4.977274773867611e-05, + "loss": 1.3461, + "step": 557 + }, + { + "epoch": 0.087720332488357, + "grad_norm": 0.2068985551595688, + "learning_rate": 4.9771915819742804e-05, + "loss": 1.3348, + "step": 558 + }, + { + "epoch": 0.08787753738528956, + "grad_norm": 0.19731195271015167, + "learning_rate": 4.9771082387835135e-05, + "loss": 1.3727, + "step": 559 + }, + { + "epoch": 0.08803474228222209, + "grad_norm": 0.26571184396743774, + "learning_rate": 4.977024744300399e-05, + "loss": 1.3911, + "step": 560 + }, + { + "epoch": 0.08819194717915463, + "grad_norm": 0.23141519725322723, + "learning_rate": 4.976941098530039e-05, + "loss": 1.3978, + "step": 561 + }, + { + "epoch": 0.08834915207608716, + "grad_norm": 0.2507224380970001, + "learning_rate": 4.97685730147754e-05, + "loss": 1.3017, + "step": 562 + }, + { + "epoch": 0.08850635697301971, + "grad_norm": 0.2453109323978424, + "learning_rate": 4.976773353148022e-05, + "loss": 1.2977, + "step": 563 + }, + { + "epoch": 0.08866356186995225, + "grad_norm": 0.2600953280925751, + "learning_rate": 4.9766892535466105e-05, + "loss": 1.4015, + "step": 564 + }, + { + "epoch": 0.08882076676688479, + "grad_norm": 0.19863371551036835, + "learning_rate": 4.9766050026784416e-05, + "loss": 1.3593, + "step": 565 + }, + { + "epoch": 0.08897797166381732, + "grad_norm": 0.2115338146686554, + "learning_rate": 4.976520600548663e-05, + "loss": 1.2928, + "step": 566 + }, + { + "epoch": 0.08913517656074987, + "grad_norm": 0.18994684517383575, + "learning_rate": 4.976436047162429e-05, + "loss": 1.3506, + "step": 567 + }, + { + "epoch": 0.08929238145768241, + "grad_norm": 0.22891771793365479, + "learning_rate": 4.976351342524903e-05, + "loss": 1.4449, + "step": 568 + }, + { + "epoch": 0.08944958635461495, + "grad_norm": 0.19313135743141174, + "learning_rate": 4.976266486641259e-05, + "loss": 1.2916, + "step": 569 + }, + { + "epoch": 0.08960679125154748, + "grad_norm": 0.17697346210479736, + "learning_rate": 4.976181479516679e-05, + "loss": 1.3696, + "step": 570 + }, + { + "epoch": 0.08976399614848002, + "grad_norm": 0.22902925312519073, + "learning_rate": 4.976096321156356e-05, + "loss": 1.3688, + "step": 571 + }, + { + "epoch": 0.08992120104541257, + "grad_norm": 0.25305554270744324, + "learning_rate": 4.97601101156549e-05, + "loss": 1.3057, + "step": 572 + }, + { + "epoch": 0.0900784059423451, + "grad_norm": 0.23255370557308197, + "learning_rate": 4.975925550749293e-05, + "loss": 1.3571, + "step": 573 + }, + { + "epoch": 0.09023561083927764, + "grad_norm": 0.25259101390838623, + "learning_rate": 4.9758399387129834e-05, + "loss": 1.3152, + "step": 574 + }, + { + "epoch": 0.09039281573621018, + "grad_norm": 0.26062390208244324, + "learning_rate": 4.97575417546179e-05, + "loss": 1.3042, + "step": 575 + }, + { + "epoch": 0.09055002063314273, + "grad_norm": 0.16536732017993927, + "learning_rate": 4.9756682610009515e-05, + "loss": 1.2797, + "step": 576 + }, + { + "epoch": 0.09070722553007526, + "grad_norm": 0.19088499248027802, + "learning_rate": 4.9755821953357144e-05, + "loss": 1.3774, + "step": 577 + }, + { + "epoch": 0.0908644304270078, + "grad_norm": 0.2181147336959839, + "learning_rate": 4.975495978471336e-05, + "loss": 1.3364, + "step": 578 + }, + { + "epoch": 0.09102163532394034, + "grad_norm": 0.18012750148773193, + "learning_rate": 4.975409610413082e-05, + "loss": 1.3852, + "step": 579 + }, + { + "epoch": 0.09117884022087289, + "grad_norm": 0.18108834326267242, + "learning_rate": 4.975323091166227e-05, + "loss": 1.3214, + "step": 580 + }, + { + "epoch": 0.09133604511780542, + "grad_norm": 0.25102898478507996, + "learning_rate": 4.975236420736056e-05, + "loss": 1.3199, + "step": 581 + }, + { + "epoch": 0.09149325001473796, + "grad_norm": 0.20121383666992188, + "learning_rate": 4.9751495991278626e-05, + "loss": 1.3328, + "step": 582 + }, + { + "epoch": 0.0916504549116705, + "grad_norm": 0.24183815717697144, + "learning_rate": 4.975062626346948e-05, + "loss": 1.3881, + "step": 583 + }, + { + "epoch": 0.09180765980860303, + "grad_norm": 0.23274902999401093, + "learning_rate": 4.974975502398626e-05, + "loss": 1.3674, + "step": 584 + }, + { + "epoch": 0.09196486470553558, + "grad_norm": 0.224375382065773, + "learning_rate": 4.9748882272882165e-05, + "loss": 1.362, + "step": 585 + }, + { + "epoch": 0.09212206960246812, + "grad_norm": 0.2743482291698456, + "learning_rate": 4.97480080102105e-05, + "loss": 1.3028, + "step": 586 + }, + { + "epoch": 0.09227927449940065, + "grad_norm": 0.30631452798843384, + "learning_rate": 4.974713223602467e-05, + "loss": 1.3541, + "step": 587 + }, + { + "epoch": 0.09243647939633319, + "grad_norm": 0.1999395489692688, + "learning_rate": 4.9746254950378166e-05, + "loss": 1.3515, + "step": 588 + }, + { + "epoch": 0.09259368429326574, + "grad_norm": 0.3005799353122711, + "learning_rate": 4.974537615332455e-05, + "loss": 1.3872, + "step": 589 + }, + { + "epoch": 0.09275088919019828, + "grad_norm": 0.21795117855072021, + "learning_rate": 4.9744495844917524e-05, + "loss": 1.2804, + "step": 590 + }, + { + "epoch": 0.09290809408713081, + "grad_norm": 0.2832283675670624, + "learning_rate": 4.9743614025210825e-05, + "loss": 1.3209, + "step": 591 + }, + { + "epoch": 0.09306529898406335, + "grad_norm": 0.21391350030899048, + "learning_rate": 4.9742730694258334e-05, + "loss": 1.3041, + "step": 592 + }, + { + "epoch": 0.0932225038809959, + "grad_norm": 0.21651242673397064, + "learning_rate": 4.974184585211399e-05, + "loss": 1.2529, + "step": 593 + }, + { + "epoch": 0.09337970877792844, + "grad_norm": 0.22796374559402466, + "learning_rate": 4.974095949883183e-05, + "loss": 1.3999, + "step": 594 + }, + { + "epoch": 0.09353691367486097, + "grad_norm": 0.21013247966766357, + "learning_rate": 4.9740071634466e-05, + "loss": 1.3626, + "step": 595 + }, + { + "epoch": 0.09369411857179351, + "grad_norm": 0.31589969992637634, + "learning_rate": 4.973918225907073e-05, + "loss": 1.4096, + "step": 596 + }, + { + "epoch": 0.09385132346872604, + "grad_norm": 0.2923184931278229, + "learning_rate": 4.973829137270033e-05, + "loss": 1.2116, + "step": 597 + }, + { + "epoch": 0.0940085283656586, + "grad_norm": 0.2147187739610672, + "learning_rate": 4.9737398975409224e-05, + "loss": 1.3909, + "step": 598 + }, + { + "epoch": 0.09416573326259113, + "grad_norm": 0.20287127792835236, + "learning_rate": 4.9736505067251896e-05, + "loss": 1.3621, + "step": 599 + }, + { + "epoch": 0.09432293815952367, + "grad_norm": 0.24703876674175262, + "learning_rate": 4.9735609648282965e-05, + "loss": 1.3525, + "step": 600 + }, + { + "epoch": 0.0944801430564562, + "grad_norm": 0.25060412287712097, + "learning_rate": 4.97347127185571e-05, + "loss": 1.36, + "step": 601 + }, + { + "epoch": 0.09463734795338875, + "grad_norm": 0.214557945728302, + "learning_rate": 4.9733814278129096e-05, + "loss": 1.4372, + "step": 602 + }, + { + "epoch": 0.09479455285032129, + "grad_norm": 0.1984785795211792, + "learning_rate": 4.9732914327053825e-05, + "loss": 1.3191, + "step": 603 + }, + { + "epoch": 0.09495175774725383, + "grad_norm": 0.2099440097808838, + "learning_rate": 4.9732012865386244e-05, + "loss": 1.313, + "step": 604 + }, + { + "epoch": 0.09510896264418636, + "grad_norm": 0.20393683016300201, + "learning_rate": 4.9731109893181423e-05, + "loss": 1.3465, + "step": 605 + }, + { + "epoch": 0.09526616754111891, + "grad_norm": 0.25346165895462036, + "learning_rate": 4.97302054104945e-05, + "loss": 1.3379, + "step": 606 + }, + { + "epoch": 0.09542337243805145, + "grad_norm": 0.21876423060894012, + "learning_rate": 4.9729299417380725e-05, + "loss": 1.2746, + "step": 607 + }, + { + "epoch": 0.09558057733498398, + "grad_norm": 0.21032990515232086, + "learning_rate": 4.9728391913895436e-05, + "loss": 1.3215, + "step": 608 + }, + { + "epoch": 0.09573778223191652, + "grad_norm": 0.2550762891769409, + "learning_rate": 4.9727482900094044e-05, + "loss": 1.3239, + "step": 609 + }, + { + "epoch": 0.09589498712884906, + "grad_norm": 0.31706327199935913, + "learning_rate": 4.972657237603208e-05, + "loss": 1.3467, + "step": 610 + }, + { + "epoch": 0.09605219202578161, + "grad_norm": 0.17176879942417145, + "learning_rate": 4.972566034176516e-05, + "loss": 1.3815, + "step": 611 + }, + { + "epoch": 0.09620939692271414, + "grad_norm": 0.22620820999145508, + "learning_rate": 4.972474679734898e-05, + "loss": 1.2593, + "step": 612 + }, + { + "epoch": 0.09636660181964668, + "grad_norm": 0.18735802173614502, + "learning_rate": 4.9723831742839334e-05, + "loss": 1.424, + "step": 613 + }, + { + "epoch": 0.09652380671657922, + "grad_norm": 0.2582910656929016, + "learning_rate": 4.972291517829211e-05, + "loss": 1.2741, + "step": 614 + }, + { + "epoch": 0.09668101161351177, + "grad_norm": 0.19907522201538086, + "learning_rate": 4.97219971037633e-05, + "loss": 1.2045, + "step": 615 + }, + { + "epoch": 0.0968382165104443, + "grad_norm": 0.20451949536800385, + "learning_rate": 4.972107751930896e-05, + "loss": 1.3026, + "step": 616 + }, + { + "epoch": 0.09699542140737684, + "grad_norm": 0.29682090878486633, + "learning_rate": 4.972015642498527e-05, + "loss": 1.3789, + "step": 617 + }, + { + "epoch": 0.09715262630430938, + "grad_norm": 0.27210530638694763, + "learning_rate": 4.9719233820848476e-05, + "loss": 1.3968, + "step": 618 + }, + { + "epoch": 0.09730983120124193, + "grad_norm": 0.24241842329502106, + "learning_rate": 4.971830970695493e-05, + "loss": 1.2763, + "step": 619 + }, + { + "epoch": 0.09746703609817446, + "grad_norm": 0.2535828649997711, + "learning_rate": 4.9717384083361075e-05, + "loss": 1.3463, + "step": 620 + }, + { + "epoch": 0.097624240995107, + "grad_norm": 0.22121217846870422, + "learning_rate": 4.971645695012344e-05, + "loss": 1.3384, + "step": 621 + }, + { + "epoch": 0.09778144589203953, + "grad_norm": 0.28840744495391846, + "learning_rate": 4.971552830729866e-05, + "loss": 1.2418, + "step": 622 + }, + { + "epoch": 0.09793865078897207, + "grad_norm": 0.1682664453983307, + "learning_rate": 4.971459815494345e-05, + "loss": 1.3658, + "step": 623 + }, + { + "epoch": 0.09809585568590462, + "grad_norm": 0.24955761432647705, + "learning_rate": 4.971366649311461e-05, + "loss": 1.2372, + "step": 624 + }, + { + "epoch": 0.09825306058283716, + "grad_norm": 0.2756117582321167, + "learning_rate": 4.971273332186906e-05, + "loss": 1.3212, + "step": 625 + }, + { + "epoch": 0.09841026547976969, + "grad_norm": 0.2370867133140564, + "learning_rate": 4.971179864126377e-05, + "loss": 1.2879, + "step": 626 + }, + { + "epoch": 0.09856747037670223, + "grad_norm": 0.20566895604133606, + "learning_rate": 4.9710862451355846e-05, + "loss": 1.4243, + "step": 627 + }, + { + "epoch": 0.09872467527363478, + "grad_norm": 0.1923399120569229, + "learning_rate": 4.970992475220246e-05, + "loss": 1.2639, + "step": 628 + }, + { + "epoch": 0.09888188017056732, + "grad_norm": 0.17972147464752197, + "learning_rate": 4.9708985543860896e-05, + "loss": 1.3366, + "step": 629 + }, + { + "epoch": 0.09903908506749985, + "grad_norm": 0.1936875432729721, + "learning_rate": 4.97080448263885e-05, + "loss": 1.3496, + "step": 630 + }, + { + "epoch": 0.09919628996443239, + "grad_norm": 0.24409984052181244, + "learning_rate": 4.9707102599842735e-05, + "loss": 1.3268, + "step": 631 + }, + { + "epoch": 0.09935349486136494, + "grad_norm": 0.21084928512573242, + "learning_rate": 4.970615886428115e-05, + "loss": 1.3421, + "step": 632 + }, + { + "epoch": 0.09951069975829747, + "grad_norm": 0.21201804280281067, + "learning_rate": 4.970521361976138e-05, + "loss": 1.3189, + "step": 633 + }, + { + "epoch": 0.09966790465523001, + "grad_norm": 0.2698107063770294, + "learning_rate": 4.9704266866341156e-05, + "loss": 1.2193, + "step": 634 + }, + { + "epoch": 0.09982510955216255, + "grad_norm": 0.27072674036026, + "learning_rate": 4.970331860407831e-05, + "loss": 1.2694, + "step": 635 + }, + { + "epoch": 0.09998231444909508, + "grad_norm": 0.26514896750450134, + "learning_rate": 4.9702368833030754e-05, + "loss": 1.2175, + "step": 636 + }, + { + "epoch": 0.10013951934602763, + "grad_norm": 0.21645940840244293, + "learning_rate": 4.970141755325649e-05, + "loss": 1.3099, + "step": 637 + }, + { + "epoch": 0.10029672424296017, + "grad_norm": 0.27035385370254517, + "learning_rate": 4.970046476481363e-05, + "loss": 1.2723, + "step": 638 + }, + { + "epoch": 0.1004539291398927, + "grad_norm": 0.20999298989772797, + "learning_rate": 4.969951046776036e-05, + "loss": 1.369, + "step": 639 + }, + { + "epoch": 0.10061113403682524, + "grad_norm": 0.18554192781448364, + "learning_rate": 4.969855466215497e-05, + "loss": 1.3483, + "step": 640 + }, + { + "epoch": 0.10061113403682524, + "eval_loss": 1.314468502998352, + "eval_runtime": 2275.7115, + "eval_samples_per_second": 4.068, + "eval_steps_per_second": 2.034, + "step": 640 + }, + { + "epoch": 0.10076833893375779, + "grad_norm": 0.19117292761802673, + "learning_rate": 4.969759734805582e-05, + "loss": 1.3538, + "step": 641 + }, + { + "epoch": 0.10092554383069033, + "grad_norm": 0.21971918642520905, + "learning_rate": 4.969663852552141e-05, + "loss": 1.2827, + "step": 642 + }, + { + "epoch": 0.10108274872762286, + "grad_norm": 0.2663845121860504, + "learning_rate": 4.969567819461027e-05, + "loss": 1.3332, + "step": 643 + }, + { + "epoch": 0.1012399536245554, + "grad_norm": 0.23752686381340027, + "learning_rate": 4.9694716355381076e-05, + "loss": 1.2675, + "step": 644 + }, + { + "epoch": 0.10139715852148795, + "grad_norm": 0.1558876782655716, + "learning_rate": 4.9693753007892565e-05, + "loss": 1.3356, + "step": 645 + }, + { + "epoch": 0.10155436341842049, + "grad_norm": 0.2064114212989807, + "learning_rate": 4.969278815220356e-05, + "loss": 1.3261, + "step": 646 + }, + { + "epoch": 0.10171156831535302, + "grad_norm": 0.2371819168329239, + "learning_rate": 4.969182178837302e-05, + "loss": 1.2706, + "step": 647 + }, + { + "epoch": 0.10186877321228556, + "grad_norm": 0.22757107019424438, + "learning_rate": 4.969085391645994e-05, + "loss": 1.4035, + "step": 648 + }, + { + "epoch": 0.1020259781092181, + "grad_norm": 0.16831007599830627, + "learning_rate": 4.968988453652345e-05, + "loss": 1.3006, + "step": 649 + }, + { + "epoch": 0.10218318300615065, + "grad_norm": 0.1719575822353363, + "learning_rate": 4.968891364862275e-05, + "loss": 1.2439, + "step": 650 + }, + { + "epoch": 0.10234038790308318, + "grad_norm": 0.27235090732574463, + "learning_rate": 4.9687941252817144e-05, + "loss": 1.3065, + "step": 651 + }, + { + "epoch": 0.10249759280001572, + "grad_norm": 0.25622984766960144, + "learning_rate": 4.968696734916601e-05, + "loss": 1.2908, + "step": 652 + }, + { + "epoch": 0.10265479769694826, + "grad_norm": 0.22526390850543976, + "learning_rate": 4.968599193772885e-05, + "loss": 1.3081, + "step": 653 + }, + { + "epoch": 0.1028120025938808, + "grad_norm": 0.2552133798599243, + "learning_rate": 4.968501501856522e-05, + "loss": 1.3292, + "step": 654 + }, + { + "epoch": 0.10296920749081334, + "grad_norm": 0.26533272862434387, + "learning_rate": 4.96840365917348e-05, + "loss": 1.3571, + "step": 655 + }, + { + "epoch": 0.10312641238774588, + "grad_norm": 0.29065170884132385, + "learning_rate": 4.968305665729732e-05, + "loss": 1.2799, + "step": 656 + }, + { + "epoch": 0.10328361728467841, + "grad_norm": 0.27552661299705505, + "learning_rate": 4.968207521531267e-05, + "loss": 1.2262, + "step": 657 + }, + { + "epoch": 0.10344082218161096, + "grad_norm": 1.929308533668518, + "learning_rate": 4.9681092265840775e-05, + "loss": 1.2027, + "step": 658 + }, + { + "epoch": 0.1035980270785435, + "grad_norm": 0.2610799968242645, + "learning_rate": 4.968010780894167e-05, + "loss": 1.3527, + "step": 659 + }, + { + "epoch": 0.10375523197547604, + "grad_norm": 0.28388604521751404, + "learning_rate": 4.967912184467547e-05, + "loss": 1.2989, + "step": 660 + }, + { + "epoch": 0.10391243687240857, + "grad_norm": 0.21056891977787018, + "learning_rate": 4.9678134373102415e-05, + "loss": 1.2748, + "step": 661 + }, + { + "epoch": 0.10406964176934111, + "grad_norm": 0.268331378698349, + "learning_rate": 4.967714539428281e-05, + "loss": 1.3712, + "step": 662 + }, + { + "epoch": 0.10422684666627366, + "grad_norm": 0.28430554270744324, + "learning_rate": 4.967615490827705e-05, + "loss": 1.3641, + "step": 663 + }, + { + "epoch": 0.1043840515632062, + "grad_norm": 0.254165917634964, + "learning_rate": 4.9675162915145636e-05, + "loss": 1.3042, + "step": 664 + }, + { + "epoch": 0.10454125646013873, + "grad_norm": 0.19123367965221405, + "learning_rate": 4.967416941494914e-05, + "loss": 1.3613, + "step": 665 + }, + { + "epoch": 0.10469846135707127, + "grad_norm": 0.20710323750972748, + "learning_rate": 4.967317440774828e-05, + "loss": 1.2815, + "step": 666 + }, + { + "epoch": 0.10485566625400382, + "grad_norm": 0.2143716812133789, + "learning_rate": 4.967217789360379e-05, + "loss": 1.3136, + "step": 667 + }, + { + "epoch": 0.10501287115093635, + "grad_norm": 0.2556392550468445, + "learning_rate": 4.967117987257654e-05, + "loss": 1.384, + "step": 668 + }, + { + "epoch": 0.10517007604786889, + "grad_norm": 0.28254854679107666, + "learning_rate": 4.9670180344727505e-05, + "loss": 1.3218, + "step": 669 + }, + { + "epoch": 0.10532728094480143, + "grad_norm": 0.24643027782440186, + "learning_rate": 4.9669179310117706e-05, + "loss": 1.278, + "step": 670 + }, + { + "epoch": 0.10548448584173396, + "grad_norm": 0.34323665499687195, + "learning_rate": 4.9668176768808304e-05, + "loss": 1.2511, + "step": 671 + }, + { + "epoch": 0.10564169073866651, + "grad_norm": 0.2499508410692215, + "learning_rate": 4.966717272086052e-05, + "loss": 1.338, + "step": 672 + }, + { + "epoch": 0.10579889563559905, + "grad_norm": 0.2145325094461441, + "learning_rate": 4.966616716633567e-05, + "loss": 1.3304, + "step": 673 + }, + { + "epoch": 0.10595610053253159, + "grad_norm": 0.19230923056602478, + "learning_rate": 4.9665160105295185e-05, + "loss": 1.3535, + "step": 674 + }, + { + "epoch": 0.10611330542946412, + "grad_norm": 0.20243465900421143, + "learning_rate": 4.966415153780056e-05, + "loss": 1.3118, + "step": 675 + }, + { + "epoch": 0.10627051032639667, + "grad_norm": 0.24927914142608643, + "learning_rate": 4.966314146391341e-05, + "loss": 1.3136, + "step": 676 + }, + { + "epoch": 0.10642771522332921, + "grad_norm": 0.21791934967041016, + "learning_rate": 4.9662129883695406e-05, + "loss": 1.3314, + "step": 677 + }, + { + "epoch": 0.10658492012026174, + "grad_norm": 0.24318841099739075, + "learning_rate": 4.966111679720835e-05, + "loss": 1.3929, + "step": 678 + }, + { + "epoch": 0.10674212501719428, + "grad_norm": 0.2829376757144928, + "learning_rate": 4.966010220451411e-05, + "loss": 1.3232, + "step": 679 + }, + { + "epoch": 0.10689932991412683, + "grad_norm": 0.2353716641664505, + "learning_rate": 4.965908610567465e-05, + "loss": 1.2851, + "step": 680 + }, + { + "epoch": 0.10705653481105937, + "grad_norm": 0.2615984380245209, + "learning_rate": 4.965806850075203e-05, + "loss": 1.2552, + "step": 681 + }, + { + "epoch": 0.1072137397079919, + "grad_norm": 0.23773109912872314, + "learning_rate": 4.965704938980841e-05, + "loss": 1.2961, + "step": 682 + }, + { + "epoch": 0.10737094460492444, + "grad_norm": 0.2622957229614258, + "learning_rate": 4.9656028772906014e-05, + "loss": 1.3073, + "step": 683 + }, + { + "epoch": 0.10752814950185698, + "grad_norm": 0.24974018335342407, + "learning_rate": 4.965500665010721e-05, + "loss": 1.2774, + "step": 684 + }, + { + "epoch": 0.10768535439878953, + "grad_norm": 0.17124338448047638, + "learning_rate": 4.9653983021474395e-05, + "loss": 1.4159, + "step": 685 + }, + { + "epoch": 0.10784255929572206, + "grad_norm": 0.16673363745212555, + "learning_rate": 4.96529578870701e-05, + "loss": 1.3748, + "step": 686 + }, + { + "epoch": 0.1079997641926546, + "grad_norm": 0.25368422269821167, + "learning_rate": 4.965193124695693e-05, + "loss": 1.3958, + "step": 687 + }, + { + "epoch": 0.10815696908958713, + "grad_norm": 0.22910015285015106, + "learning_rate": 4.96509031011976e-05, + "loss": 1.3259, + "step": 688 + }, + { + "epoch": 0.10831417398651969, + "grad_norm": 0.277851939201355, + "learning_rate": 4.96498734498549e-05, + "loss": 1.3254, + "step": 689 + }, + { + "epoch": 0.10847137888345222, + "grad_norm": 0.32443082332611084, + "learning_rate": 4.964884229299172e-05, + "loss": 1.3007, + "step": 690 + }, + { + "epoch": 0.10862858378038476, + "grad_norm": 0.20710885524749756, + "learning_rate": 4.964780963067102e-05, + "loss": 1.3297, + "step": 691 + }, + { + "epoch": 0.1087857886773173, + "grad_norm": 0.25522252917289734, + "learning_rate": 4.96467754629559e-05, + "loss": 1.2487, + "step": 692 + }, + { + "epoch": 0.10894299357424984, + "grad_norm": 0.3286147713661194, + "learning_rate": 4.9645739789909504e-05, + "loss": 1.2255, + "step": 693 + }, + { + "epoch": 0.10910019847118238, + "grad_norm": 0.3795601725578308, + "learning_rate": 4.964470261159509e-05, + "loss": 1.2725, + "step": 694 + }, + { + "epoch": 0.10925740336811492, + "grad_norm": 0.3112131655216217, + "learning_rate": 4.964366392807602e-05, + "loss": 1.252, + "step": 695 + }, + { + "epoch": 0.10941460826504745, + "grad_norm": 0.2891729176044464, + "learning_rate": 4.964262373941571e-05, + "loss": 1.3377, + "step": 696 + }, + { + "epoch": 0.10957181316197999, + "grad_norm": 0.26973745226860046, + "learning_rate": 4.96415820456777e-05, + "loss": 1.3186, + "step": 697 + }, + { + "epoch": 0.10972901805891254, + "grad_norm": 0.2832094430923462, + "learning_rate": 4.964053884692562e-05, + "loss": 1.3248, + "step": 698 + }, + { + "epoch": 0.10988622295584508, + "grad_norm": 0.2840999960899353, + "learning_rate": 4.963949414322318e-05, + "loss": 1.2677, + "step": 699 + }, + { + "epoch": 0.11004342785277761, + "grad_norm": 0.2891542911529541, + "learning_rate": 4.963844793463418e-05, + "loss": 1.3274, + "step": 700 + }, + { + "epoch": 0.11020063274971015, + "grad_norm": 0.23569005727767944, + "learning_rate": 4.963740022122252e-05, + "loss": 1.2259, + "step": 701 + }, + { + "epoch": 0.1103578376466427, + "grad_norm": 0.2174285501241684, + "learning_rate": 4.963635100305221e-05, + "loss": 1.2785, + "step": 702 + }, + { + "epoch": 0.11051504254357523, + "grad_norm": 0.2753438651561737, + "learning_rate": 4.96353002801873e-05, + "loss": 1.3263, + "step": 703 + }, + { + "epoch": 0.11067224744050777, + "grad_norm": 0.21094419062137604, + "learning_rate": 4.963424805269198e-05, + "loss": 1.2439, + "step": 704 + }, + { + "epoch": 0.1108294523374403, + "grad_norm": 0.20501388609409332, + "learning_rate": 4.963319432063052e-05, + "loss": 1.3091, + "step": 705 + }, + { + "epoch": 0.11098665723437286, + "grad_norm": 0.2041424810886383, + "learning_rate": 4.963213908406728e-05, + "loss": 1.2951, + "step": 706 + }, + { + "epoch": 0.1111438621313054, + "grad_norm": 0.24955442547798157, + "learning_rate": 4.963108234306669e-05, + "loss": 1.2208, + "step": 707 + }, + { + "epoch": 0.11130106702823793, + "grad_norm": 0.39431118965148926, + "learning_rate": 4.9630024097693314e-05, + "loss": 1.306, + "step": 708 + }, + { + "epoch": 0.11145827192517047, + "grad_norm": 0.24803434312343597, + "learning_rate": 4.962896434801178e-05, + "loss": 1.2951, + "step": 709 + }, + { + "epoch": 0.111615476822103, + "grad_norm": 0.2736116349697113, + "learning_rate": 4.962790309408681e-05, + "loss": 1.3245, + "step": 710 + }, + { + "epoch": 0.11177268171903555, + "grad_norm": 0.24502034485340118, + "learning_rate": 4.9626840335983215e-05, + "loss": 1.2961, + "step": 711 + }, + { + "epoch": 0.11192988661596809, + "grad_norm": 0.24158692359924316, + "learning_rate": 4.962577607376592e-05, + "loss": 1.2387, + "step": 712 + }, + { + "epoch": 0.11208709151290062, + "grad_norm": 0.24977251887321472, + "learning_rate": 4.962471030749991e-05, + "loss": 1.2976, + "step": 713 + }, + { + "epoch": 0.11224429640983316, + "grad_norm": 0.15401019155979156, + "learning_rate": 4.962364303725029e-05, + "loss": 1.2684, + "step": 714 + }, + { + "epoch": 0.11240150130676571, + "grad_norm": 0.2611544132232666, + "learning_rate": 4.962257426308224e-05, + "loss": 1.2928, + "step": 715 + }, + { + "epoch": 0.11255870620369825, + "grad_norm": 0.434600830078125, + "learning_rate": 4.962150398506103e-05, + "loss": 1.3657, + "step": 716 + }, + { + "epoch": 0.11271591110063078, + "grad_norm": 0.2896519601345062, + "learning_rate": 4.9620432203252045e-05, + "loss": 1.3055, + "step": 717 + }, + { + "epoch": 0.11287311599756332, + "grad_norm": 0.1891547590494156, + "learning_rate": 4.961935891772073e-05, + "loss": 1.3355, + "step": 718 + }, + { + "epoch": 0.11303032089449587, + "grad_norm": 0.2223133146762848, + "learning_rate": 4.9618284128532644e-05, + "loss": 1.2939, + "step": 719 + }, + { + "epoch": 0.1131875257914284, + "grad_norm": 0.27313077449798584, + "learning_rate": 4.961720783575343e-05, + "loss": 1.2596, + "step": 720 + }, + { + "epoch": 0.11334473068836094, + "grad_norm": 0.24807053804397583, + "learning_rate": 4.961613003944883e-05, + "loss": 1.2851, + "step": 721 + }, + { + "epoch": 0.11350193558529348, + "grad_norm": 0.2343195378780365, + "learning_rate": 4.9615050739684656e-05, + "loss": 1.2899, + "step": 722 + }, + { + "epoch": 0.11365914048222601, + "grad_norm": 0.229730024933815, + "learning_rate": 4.961396993652684e-05, + "loss": 1.3118, + "step": 723 + }, + { + "epoch": 0.11381634537915856, + "grad_norm": 0.2397170215845108, + "learning_rate": 4.9612887630041394e-05, + "loss": 1.2148, + "step": 724 + }, + { + "epoch": 0.1139735502760911, + "grad_norm": 0.2167958915233612, + "learning_rate": 4.9611803820294414e-05, + "loss": 1.2597, + "step": 725 + }, + { + "epoch": 0.11413075517302364, + "grad_norm": 0.21318721771240234, + "learning_rate": 4.961071850735209e-05, + "loss": 1.3949, + "step": 726 + }, + { + "epoch": 0.11428796006995617, + "grad_norm": 0.21988382935523987, + "learning_rate": 4.960963169128073e-05, + "loss": 1.3196, + "step": 727 + }, + { + "epoch": 0.11444516496688872, + "grad_norm": 0.17555692791938782, + "learning_rate": 4.96085433721467e-05, + "loss": 1.3661, + "step": 728 + }, + { + "epoch": 0.11460236986382126, + "grad_norm": 0.3545222282409668, + "learning_rate": 4.960745355001647e-05, + "loss": 1.2659, + "step": 729 + }, + { + "epoch": 0.1147595747607538, + "grad_norm": 0.3196569085121155, + "learning_rate": 4.960636222495659e-05, + "loss": 1.2893, + "step": 730 + }, + { + "epoch": 0.11491677965768633, + "grad_norm": 0.2241334766149521, + "learning_rate": 4.960526939703374e-05, + "loss": 1.2155, + "step": 731 + }, + { + "epoch": 0.11507398455461888, + "grad_norm": 0.26543980836868286, + "learning_rate": 4.960417506631465e-05, + "loss": 1.3615, + "step": 732 + }, + { + "epoch": 0.11523118945155142, + "grad_norm": 0.21146585047245026, + "learning_rate": 4.960307923286616e-05, + "loss": 1.3516, + "step": 733 + }, + { + "epoch": 0.11538839434848396, + "grad_norm": 0.18095079064369202, + "learning_rate": 4.960198189675519e-05, + "loss": 1.3581, + "step": 734 + }, + { + "epoch": 0.11554559924541649, + "grad_norm": 0.26687100529670715, + "learning_rate": 4.9600883058048775e-05, + "loss": 1.1971, + "step": 735 + }, + { + "epoch": 0.11570280414234903, + "grad_norm": 0.2271047830581665, + "learning_rate": 4.959978271681402e-05, + "loss": 1.1867, + "step": 736 + }, + { + "epoch": 0.11586000903928158, + "grad_norm": 0.2102867215871811, + "learning_rate": 4.959868087311814e-05, + "loss": 1.2749, + "step": 737 + }, + { + "epoch": 0.11601721393621411, + "grad_norm": 0.2752761244773865, + "learning_rate": 4.9597577527028424e-05, + "loss": 1.1753, + "step": 738 + }, + { + "epoch": 0.11617441883314665, + "grad_norm": 0.22385725378990173, + "learning_rate": 4.959647267861226e-05, + "loss": 1.343, + "step": 739 + }, + { + "epoch": 0.11633162373007919, + "grad_norm": 0.2597412168979645, + "learning_rate": 4.959536632793712e-05, + "loss": 1.2539, + "step": 740 + }, + { + "epoch": 0.11648882862701174, + "grad_norm": 0.27975237369537354, + "learning_rate": 4.959425847507059e-05, + "loss": 1.2883, + "step": 741 + }, + { + "epoch": 0.11664603352394427, + "grad_norm": 0.29127049446105957, + "learning_rate": 4.959314912008033e-05, + "loss": 1.3139, + "step": 742 + }, + { + "epoch": 0.11680323842087681, + "grad_norm": 0.19929318130016327, + "learning_rate": 4.9592038263034094e-05, + "loss": 1.271, + "step": 743 + }, + { + "epoch": 0.11696044331780935, + "grad_norm": 0.23164550960063934, + "learning_rate": 4.9590925903999716e-05, + "loss": 1.3359, + "step": 744 + }, + { + "epoch": 0.1171176482147419, + "grad_norm": 0.27876612544059753, + "learning_rate": 4.958981204304516e-05, + "loss": 1.2568, + "step": 745 + }, + { + "epoch": 0.11727485311167443, + "grad_norm": 0.2459796965122223, + "learning_rate": 4.9588696680238435e-05, + "loss": 1.2426, + "step": 746 + }, + { + "epoch": 0.11743205800860697, + "grad_norm": 0.2039456069469452, + "learning_rate": 4.958757981564767e-05, + "loss": 1.2681, + "step": 747 + }, + { + "epoch": 0.1175892629055395, + "grad_norm": 0.24796408414840698, + "learning_rate": 4.958646144934108e-05, + "loss": 1.257, + "step": 748 + }, + { + "epoch": 0.11774646780247204, + "grad_norm": 0.2779620289802551, + "learning_rate": 4.958534158138697e-05, + "loss": 1.2933, + "step": 749 + }, + { + "epoch": 0.11790367269940459, + "grad_norm": 0.20878851413726807, + "learning_rate": 4.9584220211853735e-05, + "loss": 1.2902, + "step": 750 + }, + { + "epoch": 0.11806087759633713, + "grad_norm": 0.24720412492752075, + "learning_rate": 4.958309734080987e-05, + "loss": 1.203, + "step": 751 + }, + { + "epoch": 0.11821808249326966, + "grad_norm": 0.287654846906662, + "learning_rate": 4.9581972968323956e-05, + "loss": 1.3141, + "step": 752 + }, + { + "epoch": 0.1183752873902022, + "grad_norm": 0.23071719706058502, + "learning_rate": 4.958084709446466e-05, + "loss": 1.3145, + "step": 753 + }, + { + "epoch": 0.11853249228713475, + "grad_norm": 0.21027110517024994, + "learning_rate": 4.9579719719300746e-05, + "loss": 1.2893, + "step": 754 + }, + { + "epoch": 0.11868969718406729, + "grad_norm": 0.17173202335834503, + "learning_rate": 4.9578590842901066e-05, + "loss": 1.2618, + "step": 755 + }, + { + "epoch": 0.11884690208099982, + "grad_norm": 0.24606984853744507, + "learning_rate": 4.957746046533457e-05, + "loss": 1.1904, + "step": 756 + }, + { + "epoch": 0.11900410697793236, + "grad_norm": 0.248653382062912, + "learning_rate": 4.957632858667031e-05, + "loss": 1.331, + "step": 757 + }, + { + "epoch": 0.11916131187486491, + "grad_norm": 0.1904144436120987, + "learning_rate": 4.9575195206977406e-05, + "loss": 1.3303, + "step": 758 + }, + { + "epoch": 0.11931851677179744, + "grad_norm": 0.39540621638298035, + "learning_rate": 4.9574060326325075e-05, + "loss": 1.3455, + "step": 759 + }, + { + "epoch": 0.11947572166872998, + "grad_norm": 0.20992301404476166, + "learning_rate": 4.957292394478265e-05, + "loss": 1.2911, + "step": 760 + }, + { + "epoch": 0.11963292656566252, + "grad_norm": 0.23418502509593964, + "learning_rate": 4.957178606241951e-05, + "loss": 1.35, + "step": 761 + }, + { + "epoch": 0.11979013146259505, + "grad_norm": 0.24480225145816803, + "learning_rate": 4.957064667930517e-05, + "loss": 1.2138, + "step": 762 + }, + { + "epoch": 0.1199473363595276, + "grad_norm": 0.22909322381019592, + "learning_rate": 4.956950579550922e-05, + "loss": 1.1915, + "step": 763 + }, + { + "epoch": 0.12010454125646014, + "grad_norm": 0.16839763522148132, + "learning_rate": 4.956836341110134e-05, + "loss": 1.234, + "step": 764 + }, + { + "epoch": 0.12026174615339268, + "grad_norm": 0.2291131466627121, + "learning_rate": 4.956721952615129e-05, + "loss": 1.2964, + "step": 765 + }, + { + "epoch": 0.12041895105032521, + "grad_norm": 0.2606765329837799, + "learning_rate": 4.956607414072895e-05, + "loss": 1.2785, + "step": 766 + }, + { + "epoch": 0.12057615594725776, + "grad_norm": 0.24100011587142944, + "learning_rate": 4.956492725490426e-05, + "loss": 1.2389, + "step": 767 + }, + { + "epoch": 0.1207333608441903, + "grad_norm": 0.2868693172931671, + "learning_rate": 4.956377886874729e-05, + "loss": 1.3852, + "step": 768 + }, + { + "epoch": 0.12089056574112284, + "grad_norm": 0.29049259424209595, + "learning_rate": 4.956262898232816e-05, + "loss": 1.1511, + "step": 769 + }, + { + "epoch": 0.12104777063805537, + "grad_norm": 0.31396448612213135, + "learning_rate": 4.9561477595717106e-05, + "loss": 1.2687, + "step": 770 + }, + { + "epoch": 0.12120497553498792, + "grad_norm": 0.3348733186721802, + "learning_rate": 4.956032470898445e-05, + "loss": 1.1933, + "step": 771 + }, + { + "epoch": 0.12136218043192046, + "grad_norm": 0.2009342461824417, + "learning_rate": 4.955917032220061e-05, + "loss": 1.3299, + "step": 772 + }, + { + "epoch": 0.121519385328853, + "grad_norm": 0.2037377655506134, + "learning_rate": 4.9558014435436084e-05, + "loss": 1.3208, + "step": 773 + }, + { + "epoch": 0.12167659022578553, + "grad_norm": 0.3118877410888672, + "learning_rate": 4.955685704876147e-05, + "loss": 1.1927, + "step": 774 + }, + { + "epoch": 0.12183379512271807, + "grad_norm": 0.21884632110595703, + "learning_rate": 4.955569816224747e-05, + "loss": 1.2661, + "step": 775 + }, + { + "epoch": 0.12199100001965062, + "grad_norm": 0.25817862153053284, + "learning_rate": 4.9554537775964846e-05, + "loss": 1.3077, + "step": 776 + }, + { + "epoch": 0.12214820491658315, + "grad_norm": 0.27827751636505127, + "learning_rate": 4.955337588998449e-05, + "loss": 1.2709, + "step": 777 + }, + { + "epoch": 0.12230540981351569, + "grad_norm": 0.30520737171173096, + "learning_rate": 4.955221250437735e-05, + "loss": 1.2407, + "step": 778 + }, + { + "epoch": 0.12246261471044823, + "grad_norm": 0.21729423105716705, + "learning_rate": 4.9551047619214473e-05, + "loss": 1.3392, + "step": 779 + }, + { + "epoch": 0.12261981960738078, + "grad_norm": 0.2408866286277771, + "learning_rate": 4.954988123456703e-05, + "loss": 1.215, + "step": 780 + }, + { + "epoch": 0.12277702450431331, + "grad_norm": 0.23833869397640228, + "learning_rate": 4.954871335050625e-05, + "loss": 1.3607, + "step": 781 + }, + { + "epoch": 0.12293422940124585, + "grad_norm": 0.27017349004745483, + "learning_rate": 4.954754396710345e-05, + "loss": 1.2662, + "step": 782 + }, + { + "epoch": 0.12309143429817838, + "grad_norm": 0.21869684755802155, + "learning_rate": 4.954637308443007e-05, + "loss": 1.2384, + "step": 783 + }, + { + "epoch": 0.12324863919511093, + "grad_norm": 0.18912911415100098, + "learning_rate": 4.9545200702557615e-05, + "loss": 1.2958, + "step": 784 + }, + { + "epoch": 0.12340584409204347, + "grad_norm": 0.27320876717567444, + "learning_rate": 4.954402682155768e-05, + "loss": 1.2546, + "step": 785 + }, + { + "epoch": 0.12356304898897601, + "grad_norm": 0.2938046455383301, + "learning_rate": 4.954285144150198e-05, + "loss": 1.3451, + "step": 786 + }, + { + "epoch": 0.12372025388590854, + "grad_norm": 0.18271508812904358, + "learning_rate": 4.954167456246229e-05, + "loss": 1.2239, + "step": 787 + }, + { + "epoch": 0.12387745878284108, + "grad_norm": 0.21799346804618835, + "learning_rate": 4.9540496184510495e-05, + "loss": 1.2471, + "step": 788 + }, + { + "epoch": 0.12403466367977363, + "grad_norm": 0.21574997901916504, + "learning_rate": 4.9539316307718564e-05, + "loss": 1.3137, + "step": 789 + }, + { + "epoch": 0.12419186857670617, + "grad_norm": 0.21586358547210693, + "learning_rate": 4.953813493215855e-05, + "loss": 1.2763, + "step": 790 + }, + { + "epoch": 0.1243490734736387, + "grad_norm": 0.2723408043384552, + "learning_rate": 4.953695205790262e-05, + "loss": 1.4148, + "step": 791 + }, + { + "epoch": 0.12450627837057124, + "grad_norm": 0.29501527547836304, + "learning_rate": 4.9535767685023026e-05, + "loss": 1.3093, + "step": 792 + }, + { + "epoch": 0.12466348326750379, + "grad_norm": 0.2884112000465393, + "learning_rate": 4.9534581813592086e-05, + "loss": 1.3276, + "step": 793 + }, + { + "epoch": 0.12482068816443632, + "grad_norm": 0.24246759712696075, + "learning_rate": 4.9533394443682234e-05, + "loss": 1.3203, + "step": 794 + }, + { + "epoch": 0.12497789306136886, + "grad_norm": 0.23493270576000214, + "learning_rate": 4.9532205575365995e-05, + "loss": 1.2567, + "step": 795 + }, + { + "epoch": 0.1251350979583014, + "grad_norm": 0.26456305384635925, + "learning_rate": 4.953101520871598e-05, + "loss": 1.3194, + "step": 796 + }, + { + "epoch": 0.12529230285523393, + "grad_norm": 0.18891221284866333, + "learning_rate": 4.952982334380489e-05, + "loss": 1.3041, + "step": 797 + }, + { + "epoch": 0.12544950775216648, + "grad_norm": 0.21460258960723877, + "learning_rate": 4.952862998070552e-05, + "loss": 1.2274, + "step": 798 + }, + { + "epoch": 0.125606712649099, + "grad_norm": 0.2832646667957306, + "learning_rate": 4.9527435119490753e-05, + "loss": 1.2009, + "step": 799 + }, + { + "epoch": 0.12576391754603156, + "grad_norm": 0.22183702886104584, + "learning_rate": 4.9526238760233576e-05, + "loss": 1.31, + "step": 800 + }, + { + "epoch": 0.12576391754603156, + "eval_loss": 1.2745521068572998, + "eval_runtime": 2292.1003, + "eval_samples_per_second": 4.039, + "eval_steps_per_second": 2.02, + "step": 800 + }, + { + "epoch": 0.1259211224429641, + "grad_norm": 0.21697020530700684, + "learning_rate": 4.9525040903007046e-05, + "loss": 1.3197, + "step": 801 + }, + { + "epoch": 0.12607832733989663, + "grad_norm": 0.36354196071624756, + "learning_rate": 4.952384154788433e-05, + "loss": 1.1926, + "step": 802 + }, + { + "epoch": 0.12623553223682918, + "grad_norm": 0.27054232358932495, + "learning_rate": 4.952264069493868e-05, + "loss": 1.3199, + "step": 803 + }, + { + "epoch": 0.12639273713376173, + "grad_norm": 0.2425469011068344, + "learning_rate": 4.952143834424344e-05, + "loss": 1.2906, + "step": 804 + }, + { + "epoch": 0.12654994203069425, + "grad_norm": 0.1988941729068756, + "learning_rate": 4.952023449587205e-05, + "loss": 1.3183, + "step": 805 + }, + { + "epoch": 0.1267071469276268, + "grad_norm": 0.2429157942533493, + "learning_rate": 4.951902914989802e-05, + "loss": 1.2497, + "step": 806 + }, + { + "epoch": 0.12686435182455932, + "grad_norm": 0.2704293727874756, + "learning_rate": 4.951782230639499e-05, + "loss": 1.3113, + "step": 807 + }, + { + "epoch": 0.12702155672149187, + "grad_norm": 0.31801360845565796, + "learning_rate": 4.951661396543664e-05, + "loss": 1.2354, + "step": 808 + }, + { + "epoch": 0.12717876161842442, + "grad_norm": 0.21358463168144226, + "learning_rate": 4.951540412709681e-05, + "loss": 1.3512, + "step": 809 + }, + { + "epoch": 0.12733596651535695, + "grad_norm": 0.24300484359264374, + "learning_rate": 4.951419279144936e-05, + "loss": 1.213, + "step": 810 + }, + { + "epoch": 0.1274931714122895, + "grad_norm": 0.39550015330314636, + "learning_rate": 4.951297995856828e-05, + "loss": 1.1872, + "step": 811 + }, + { + "epoch": 0.12765037630922202, + "grad_norm": 0.20150414109230042, + "learning_rate": 4.951176562852765e-05, + "loss": 1.3469, + "step": 812 + }, + { + "epoch": 0.12780758120615457, + "grad_norm": 0.2050725519657135, + "learning_rate": 4.951054980140164e-05, + "loss": 1.259, + "step": 813 + }, + { + "epoch": 0.12796478610308712, + "grad_norm": 0.23815183341503143, + "learning_rate": 4.950933247726451e-05, + "loss": 1.2961, + "step": 814 + }, + { + "epoch": 0.12812199100001964, + "grad_norm": 0.28224676847457886, + "learning_rate": 4.95081136561906e-05, + "loss": 1.2631, + "step": 815 + }, + { + "epoch": 0.1282791958969522, + "grad_norm": 0.294791042804718, + "learning_rate": 4.9506893338254353e-05, + "loss": 1.1834, + "step": 816 + }, + { + "epoch": 0.12843640079388474, + "grad_norm": 0.29148972034454346, + "learning_rate": 4.9505671523530306e-05, + "loss": 1.2573, + "step": 817 + }, + { + "epoch": 0.12859360569081726, + "grad_norm": 0.29371243715286255, + "learning_rate": 4.950444821209308e-05, + "loss": 1.4532, + "step": 818 + }, + { + "epoch": 0.12875081058774981, + "grad_norm": 0.2303713709115982, + "learning_rate": 4.9503223404017396e-05, + "loss": 1.2828, + "step": 819 + }, + { + "epoch": 0.12890801548468234, + "grad_norm": 0.24906295537948608, + "learning_rate": 4.9501997099378046e-05, + "loss": 1.2759, + "step": 820 + }, + { + "epoch": 0.1290652203816149, + "grad_norm": 0.1983998864889145, + "learning_rate": 4.950076929824994e-05, + "loss": 1.3111, + "step": 821 + }, + { + "epoch": 0.12922242527854744, + "grad_norm": 0.2079075276851654, + "learning_rate": 4.9499540000708064e-05, + "loss": 1.3416, + "step": 822 + }, + { + "epoch": 0.12937963017547996, + "grad_norm": 0.22548237442970276, + "learning_rate": 4.94983092068275e-05, + "loss": 1.3879, + "step": 823 + }, + { + "epoch": 0.1295368350724125, + "grad_norm": 0.2052278220653534, + "learning_rate": 4.949707691668343e-05, + "loss": 1.3347, + "step": 824 + }, + { + "epoch": 0.12969403996934503, + "grad_norm": 0.21978795528411865, + "learning_rate": 4.949584313035109e-05, + "loss": 1.1345, + "step": 825 + }, + { + "epoch": 0.12985124486627758, + "grad_norm": 0.18930193781852722, + "learning_rate": 4.9494607847905863e-05, + "loss": 1.319, + "step": 826 + }, + { + "epoch": 0.13000844976321013, + "grad_norm": 0.24538543820381165, + "learning_rate": 4.9493371069423176e-05, + "loss": 1.3103, + "step": 827 + }, + { + "epoch": 0.13016565466014265, + "grad_norm": 0.2874930799007416, + "learning_rate": 4.9492132794978586e-05, + "loss": 1.3388, + "step": 828 + }, + { + "epoch": 0.1303228595570752, + "grad_norm": 0.23338377475738525, + "learning_rate": 4.949089302464771e-05, + "loss": 1.2793, + "step": 829 + }, + { + "epoch": 0.13048006445400775, + "grad_norm": 0.23670902848243713, + "learning_rate": 4.948965175850626e-05, + "loss": 1.2808, + "step": 830 + }, + { + "epoch": 0.13063726935094028, + "grad_norm": 0.2617732584476471, + "learning_rate": 4.9488408996630066e-05, + "loss": 1.2641, + "step": 831 + }, + { + "epoch": 0.13079447424787283, + "grad_norm": 0.24584044516086578, + "learning_rate": 4.948716473909502e-05, + "loss": 1.2462, + "step": 832 + }, + { + "epoch": 0.13095167914480535, + "grad_norm": 0.2507297098636627, + "learning_rate": 4.948591898597712e-05, + "loss": 1.2211, + "step": 833 + }, + { + "epoch": 0.1311088840417379, + "grad_norm": 0.25439611077308655, + "learning_rate": 4.948467173735245e-05, + "loss": 1.2762, + "step": 834 + }, + { + "epoch": 0.13126608893867045, + "grad_norm": 0.19934779405593872, + "learning_rate": 4.948342299329719e-05, + "loss": 1.1798, + "step": 835 + }, + { + "epoch": 0.13142329383560297, + "grad_norm": 0.24154123663902283, + "learning_rate": 4.948217275388761e-05, + "loss": 1.2608, + "step": 836 + }, + { + "epoch": 0.13158049873253552, + "grad_norm": 0.2484877109527588, + "learning_rate": 4.948092101920006e-05, + "loss": 1.2466, + "step": 837 + }, + { + "epoch": 0.13173770362946804, + "grad_norm": 0.28683343529701233, + "learning_rate": 4.9479667789311e-05, + "loss": 1.1915, + "step": 838 + }, + { + "epoch": 0.1318949085264006, + "grad_norm": 0.21289369463920593, + "learning_rate": 4.9478413064296976e-05, + "loss": 1.2642, + "step": 839 + }, + { + "epoch": 0.13205211342333315, + "grad_norm": 0.22933778166770935, + "learning_rate": 4.947715684423461e-05, + "loss": 1.2182, + "step": 840 + }, + { + "epoch": 0.13220931832026567, + "grad_norm": 0.2507724463939667, + "learning_rate": 4.9475899129200635e-05, + "loss": 1.3089, + "step": 841 + }, + { + "epoch": 0.13236652321719822, + "grad_norm": 0.251770943403244, + "learning_rate": 4.947463991927187e-05, + "loss": 1.3194, + "step": 842 + }, + { + "epoch": 0.13252372811413077, + "grad_norm": 0.2533280849456787, + "learning_rate": 4.947337921452521e-05, + "loss": 1.2141, + "step": 843 + }, + { + "epoch": 0.1326809330110633, + "grad_norm": 0.26309993863105774, + "learning_rate": 4.9472117015037664e-05, + "loss": 1.2265, + "step": 844 + }, + { + "epoch": 0.13283813790799584, + "grad_norm": 0.29711806774139404, + "learning_rate": 4.9470853320886335e-05, + "loss": 1.2538, + "step": 845 + }, + { + "epoch": 0.13299534280492836, + "grad_norm": 0.24551883339881897, + "learning_rate": 4.9469588132148373e-05, + "loss": 1.2927, + "step": 846 + }, + { + "epoch": 0.1331525477018609, + "grad_norm": 0.28027257323265076, + "learning_rate": 4.946832144890108e-05, + "loss": 1.2712, + "step": 847 + }, + { + "epoch": 0.13330975259879346, + "grad_norm": 0.22099149227142334, + "learning_rate": 4.9467053271221804e-05, + "loss": 1.2095, + "step": 848 + }, + { + "epoch": 0.13346695749572599, + "grad_norm": 0.19661381840705872, + "learning_rate": 4.946578359918801e-05, + "loss": 1.2855, + "step": 849 + }, + { + "epoch": 0.13362416239265854, + "grad_norm": 0.22767631709575653, + "learning_rate": 4.946451243287723e-05, + "loss": 1.2932, + "step": 850 + }, + { + "epoch": 0.13378136728959106, + "grad_norm": 0.28008589148521423, + "learning_rate": 4.946323977236712e-05, + "loss": 1.2335, + "step": 851 + }, + { + "epoch": 0.1339385721865236, + "grad_norm": 0.2091825157403946, + "learning_rate": 4.94619656177354e-05, + "loss": 1.3151, + "step": 852 + }, + { + "epoch": 0.13409577708345616, + "grad_norm": 0.1978277713060379, + "learning_rate": 4.946068996905989e-05, + "loss": 1.3359, + "step": 853 + }, + { + "epoch": 0.13425298198038868, + "grad_norm": 0.21397674083709717, + "learning_rate": 4.9459412826418505e-05, + "loss": 1.2998, + "step": 854 + }, + { + "epoch": 0.13441018687732123, + "grad_norm": 0.30490776896476746, + "learning_rate": 4.945813418988925e-05, + "loss": 1.2607, + "step": 855 + }, + { + "epoch": 0.13456739177425378, + "grad_norm": 0.2896914780139923, + "learning_rate": 4.945685405955021e-05, + "loss": 1.2329, + "step": 856 + }, + { + "epoch": 0.1347245966711863, + "grad_norm": 0.1988048106431961, + "learning_rate": 4.945557243547958e-05, + "loss": 1.2877, + "step": 857 + }, + { + "epoch": 0.13488180156811885, + "grad_norm": 0.17888212203979492, + "learning_rate": 4.945428931775563e-05, + "loss": 1.2543, + "step": 858 + }, + { + "epoch": 0.13503900646505138, + "grad_norm": 0.2748056650161743, + "learning_rate": 4.945300470645673e-05, + "loss": 1.3461, + "step": 859 + }, + { + "epoch": 0.13519621136198393, + "grad_norm": 0.23218591511249542, + "learning_rate": 4.945171860166135e-05, + "loss": 1.2878, + "step": 860 + }, + { + "epoch": 0.13535341625891648, + "grad_norm": 0.33142325282096863, + "learning_rate": 4.9450431003448015e-05, + "loss": 1.294, + "step": 861 + }, + { + "epoch": 0.135510621155849, + "grad_norm": 0.2330816686153412, + "learning_rate": 4.944914191189539e-05, + "loss": 1.3593, + "step": 862 + }, + { + "epoch": 0.13566782605278155, + "grad_norm": 0.23989921808242798, + "learning_rate": 4.9447851327082204e-05, + "loss": 1.2879, + "step": 863 + }, + { + "epoch": 0.13582503094971407, + "grad_norm": 0.21358944475650787, + "learning_rate": 4.944655924908727e-05, + "loss": 1.222, + "step": 864 + }, + { + "epoch": 0.13598223584664662, + "grad_norm": 0.30434924364089966, + "learning_rate": 4.9445265677989515e-05, + "loss": 1.273, + "step": 865 + }, + { + "epoch": 0.13613944074357917, + "grad_norm": 0.22028383612632751, + "learning_rate": 4.944397061386794e-05, + "loss": 1.2494, + "step": 866 + }, + { + "epoch": 0.1362966456405117, + "grad_norm": 0.2354927659034729, + "learning_rate": 4.944267405680164e-05, + "loss": 1.1469, + "step": 867 + }, + { + "epoch": 0.13645385053744424, + "grad_norm": 0.28941988945007324, + "learning_rate": 4.944137600686981e-05, + "loss": 1.1678, + "step": 868 + }, + { + "epoch": 0.1366110554343768, + "grad_norm": 0.2538214325904846, + "learning_rate": 4.944007646415172e-05, + "loss": 1.2636, + "step": 869 + }, + { + "epoch": 0.13676826033130932, + "grad_norm": 0.3719157576560974, + "learning_rate": 4.943877542872676e-05, + "loss": 1.2901, + "step": 870 + }, + { + "epoch": 0.13692546522824187, + "grad_norm": 0.2994091212749481, + "learning_rate": 4.943747290067438e-05, + "loss": 1.2209, + "step": 871 + }, + { + "epoch": 0.1370826701251744, + "grad_norm": 0.23586580157279968, + "learning_rate": 4.9436168880074115e-05, + "loss": 1.2989, + "step": 872 + }, + { + "epoch": 0.13723987502210694, + "grad_norm": 0.193126380443573, + "learning_rate": 4.943486336700564e-05, + "loss": 1.204, + "step": 873 + }, + { + "epoch": 0.1373970799190395, + "grad_norm": 0.18505080044269562, + "learning_rate": 4.943355636154868e-05, + "loss": 1.3247, + "step": 874 + }, + { + "epoch": 0.137554284815972, + "grad_norm": 0.2586881220340729, + "learning_rate": 4.9432247863783064e-05, + "loss": 1.3315, + "step": 875 + }, + { + "epoch": 0.13771148971290456, + "grad_norm": 0.2904506027698517, + "learning_rate": 4.943093787378871e-05, + "loss": 1.2593, + "step": 876 + }, + { + "epoch": 0.13786869460983708, + "grad_norm": 0.2971174120903015, + "learning_rate": 4.9429626391645615e-05, + "loss": 1.2241, + "step": 877 + }, + { + "epoch": 0.13802589950676963, + "grad_norm": 0.42521703243255615, + "learning_rate": 4.9428313417433894e-05, + "loss": 1.2638, + "step": 878 + }, + { + "epoch": 0.13818310440370218, + "grad_norm": 0.2515777349472046, + "learning_rate": 4.9426998951233735e-05, + "loss": 1.3111, + "step": 879 + }, + { + "epoch": 0.1383403093006347, + "grad_norm": 0.25959545373916626, + "learning_rate": 4.942568299312541e-05, + "loss": 1.2505, + "step": 880 + }, + { + "epoch": 0.13849751419756726, + "grad_norm": 0.28090932965278625, + "learning_rate": 4.942436554318931e-05, + "loss": 1.1604, + "step": 881 + }, + { + "epoch": 0.1386547190944998, + "grad_norm": 0.21833541989326477, + "learning_rate": 4.942304660150588e-05, + "loss": 1.2246, + "step": 882 + }, + { + "epoch": 0.13881192399143233, + "grad_norm": 0.26167765259742737, + "learning_rate": 4.9421726168155704e-05, + "loss": 1.2399, + "step": 883 + }, + { + "epoch": 0.13896912888836488, + "grad_norm": 0.23778817057609558, + "learning_rate": 4.9420404243219395e-05, + "loss": 1.2692, + "step": 884 + }, + { + "epoch": 0.1391263337852974, + "grad_norm": 0.43253734707832336, + "learning_rate": 4.941908082677773e-05, + "loss": 1.2302, + "step": 885 + }, + { + "epoch": 0.13928353868222995, + "grad_norm": 0.2448786050081253, + "learning_rate": 4.94177559189115e-05, + "loss": 1.3163, + "step": 886 + }, + { + "epoch": 0.1394407435791625, + "grad_norm": 0.24711786210536957, + "learning_rate": 4.941642951970165e-05, + "loss": 1.2756, + "step": 887 + }, + { + "epoch": 0.13959794847609502, + "grad_norm": 0.22932004928588867, + "learning_rate": 4.941510162922917e-05, + "loss": 1.3087, + "step": 888 + }, + { + "epoch": 0.13975515337302757, + "grad_norm": 0.24999158084392548, + "learning_rate": 4.941377224757518e-05, + "loss": 1.3328, + "step": 889 + }, + { + "epoch": 0.1399123582699601, + "grad_norm": 0.21222981810569763, + "learning_rate": 4.941244137482088e-05, + "loss": 1.3177, + "step": 890 + }, + { + "epoch": 0.14006956316689265, + "grad_norm": 0.22691656649112701, + "learning_rate": 4.941110901104754e-05, + "loss": 1.2937, + "step": 891 + }, + { + "epoch": 0.1402267680638252, + "grad_norm": 0.3120933771133423, + "learning_rate": 4.940977515633653e-05, + "loss": 1.1604, + "step": 892 + }, + { + "epoch": 0.14038397296075772, + "grad_norm": 0.24279998242855072, + "learning_rate": 4.940843981076934e-05, + "loss": 1.3234, + "step": 893 + }, + { + "epoch": 0.14054117785769027, + "grad_norm": 0.25406959652900696, + "learning_rate": 4.940710297442751e-05, + "loss": 1.3216, + "step": 894 + }, + { + "epoch": 0.14069838275462282, + "grad_norm": 0.29678472876548767, + "learning_rate": 4.940576464739269e-05, + "loss": 1.2706, + "step": 895 + }, + { + "epoch": 0.14085558765155534, + "grad_norm": 0.25185081362724304, + "learning_rate": 4.9404424829746634e-05, + "loss": 1.2456, + "step": 896 + }, + { + "epoch": 0.1410127925484879, + "grad_norm": 0.2171952873468399, + "learning_rate": 4.940308352157115e-05, + "loss": 1.2943, + "step": 897 + }, + { + "epoch": 0.14116999744542041, + "grad_norm": 0.21498677134513855, + "learning_rate": 4.940174072294818e-05, + "loss": 1.3466, + "step": 898 + }, + { + "epoch": 0.14132720234235296, + "grad_norm": 0.2881999611854553, + "learning_rate": 4.940039643395972e-05, + "loss": 1.2322, + "step": 899 + }, + { + "epoch": 0.14148440723928551, + "grad_norm": 0.2709384858608246, + "learning_rate": 4.939905065468789e-05, + "loss": 1.2228, + "step": 900 + }, + { + "epoch": 0.14164161213621804, + "grad_norm": 0.2723088562488556, + "learning_rate": 4.9397703385214875e-05, + "loss": 1.1937, + "step": 901 + }, + { + "epoch": 0.1417988170331506, + "grad_norm": 0.4296363294124603, + "learning_rate": 4.939635462562297e-05, + "loss": 1.2043, + "step": 902 + }, + { + "epoch": 0.1419560219300831, + "grad_norm": 0.3255182206630707, + "learning_rate": 4.939500437599454e-05, + "loss": 1.1563, + "step": 903 + }, + { + "epoch": 0.14211322682701566, + "grad_norm": 0.33772897720336914, + "learning_rate": 4.939365263641206e-05, + "loss": 1.3019, + "step": 904 + }, + { + "epoch": 0.1422704317239482, + "grad_norm": 0.18991219997406006, + "learning_rate": 4.93922994069581e-05, + "loss": 1.3575, + "step": 905 + }, + { + "epoch": 0.14242763662088073, + "grad_norm": 0.23950403928756714, + "learning_rate": 4.939094468771529e-05, + "loss": 1.2512, + "step": 906 + }, + { + "epoch": 0.14258484151781328, + "grad_norm": 0.29783302545547485, + "learning_rate": 4.938958847876637e-05, + "loss": 1.3033, + "step": 907 + }, + { + "epoch": 0.14274204641474583, + "grad_norm": 0.3168744742870331, + "learning_rate": 4.93882307801942e-05, + "loss": 1.1934, + "step": 908 + }, + { + "epoch": 0.14289925131167835, + "grad_norm": 0.22578391432762146, + "learning_rate": 4.9386871592081675e-05, + "loss": 1.3307, + "step": 909 + }, + { + "epoch": 0.1430564562086109, + "grad_norm": 0.32671108841896057, + "learning_rate": 4.9385510914511824e-05, + "loss": 1.2436, + "step": 910 + }, + { + "epoch": 0.14321366110554343, + "grad_norm": 0.2524665296077728, + "learning_rate": 4.938414874756774e-05, + "loss": 1.2611, + "step": 911 + }, + { + "epoch": 0.14337086600247598, + "grad_norm": 0.3576960563659668, + "learning_rate": 4.9382785091332625e-05, + "loss": 1.3721, + "step": 912 + }, + { + "epoch": 0.14352807089940853, + "grad_norm": 0.2915900945663452, + "learning_rate": 4.9381419945889776e-05, + "loss": 1.3539, + "step": 913 + }, + { + "epoch": 0.14368527579634105, + "grad_norm": 0.3168608844280243, + "learning_rate": 4.938005331132256e-05, + "loss": 1.224, + "step": 914 + }, + { + "epoch": 0.1438424806932736, + "grad_norm": 0.24886426329612732, + "learning_rate": 4.937868518771445e-05, + "loss": 1.2299, + "step": 915 + }, + { + "epoch": 0.14399968559020612, + "grad_norm": 0.26588642597198486, + "learning_rate": 4.9377315575149e-05, + "loss": 1.1947, + "step": 916 + }, + { + "epoch": 0.14415689048713867, + "grad_norm": 0.28032201528549194, + "learning_rate": 4.937594447370986e-05, + "loss": 1.3756, + "step": 917 + }, + { + "epoch": 0.14431409538407122, + "grad_norm": 0.3017072081565857, + "learning_rate": 4.937457188348078e-05, + "loss": 1.2723, + "step": 918 + }, + { + "epoch": 0.14447130028100374, + "grad_norm": 0.2926197648048401, + "learning_rate": 4.937319780454559e-05, + "loss": 1.2716, + "step": 919 + }, + { + "epoch": 0.1446285051779363, + "grad_norm": 0.24066713452339172, + "learning_rate": 4.937182223698821e-05, + "loss": 1.2828, + "step": 920 + }, + { + "epoch": 0.14478571007486885, + "grad_norm": 0.30001577734947205, + "learning_rate": 4.937044518089266e-05, + "loss": 1.2407, + "step": 921 + }, + { + "epoch": 0.14494291497180137, + "grad_norm": 0.25927406549453735, + "learning_rate": 4.9369066636343044e-05, + "loss": 1.3004, + "step": 922 + }, + { + "epoch": 0.14510011986873392, + "grad_norm": 0.2542930543422699, + "learning_rate": 4.936768660342355e-05, + "loss": 1.3312, + "step": 923 + }, + { + "epoch": 0.14525732476566644, + "grad_norm": 0.25233832001686096, + "learning_rate": 4.936630508221847e-05, + "loss": 1.1879, + "step": 924 + }, + { + "epoch": 0.145414529662599, + "grad_norm": 0.22136953473091125, + "learning_rate": 4.9364922072812185e-05, + "loss": 1.2649, + "step": 925 + }, + { + "epoch": 0.14557173455953154, + "grad_norm": 0.21759863197803497, + "learning_rate": 4.936353757528916e-05, + "loss": 1.2467, + "step": 926 + }, + { + "epoch": 0.14572893945646406, + "grad_norm": 0.27614825963974, + "learning_rate": 4.936215158973396e-05, + "loss": 1.1901, + "step": 927 + }, + { + "epoch": 0.1458861443533966, + "grad_norm": 0.2502923309803009, + "learning_rate": 4.936076411623124e-05, + "loss": 1.3358, + "step": 928 + }, + { + "epoch": 0.14604334925032914, + "grad_norm": 0.2419285923242569, + "learning_rate": 4.935937515486573e-05, + "loss": 1.24, + "step": 929 + }, + { + "epoch": 0.14620055414726169, + "grad_norm": 0.35315272212028503, + "learning_rate": 4.935798470572226e-05, + "loss": 1.2452, + "step": 930 + }, + { + "epoch": 0.14635775904419424, + "grad_norm": 0.28915464878082275, + "learning_rate": 4.935659276888577e-05, + "loss": 1.3369, + "step": 931 + }, + { + "epoch": 0.14651496394112676, + "grad_norm": 0.23898139595985413, + "learning_rate": 4.9355199344441254e-05, + "loss": 1.2328, + "step": 932 + }, + { + "epoch": 0.1466721688380593, + "grad_norm": 0.25197896361351013, + "learning_rate": 4.935380443247384e-05, + "loss": 1.2826, + "step": 933 + }, + { + "epoch": 0.14682937373499186, + "grad_norm": 0.26547369360923767, + "learning_rate": 4.9352408033068695e-05, + "loss": 1.2284, + "step": 934 + }, + { + "epoch": 0.14698657863192438, + "grad_norm": 0.22031289339065552, + "learning_rate": 4.935101014631114e-05, + "loss": 1.2918, + "step": 935 + }, + { + "epoch": 0.14714378352885693, + "grad_norm": 0.2603214979171753, + "learning_rate": 4.9349610772286525e-05, + "loss": 1.1767, + "step": 936 + }, + { + "epoch": 0.14730098842578945, + "grad_norm": 0.29469192028045654, + "learning_rate": 4.934820991108032e-05, + "loss": 1.2845, + "step": 937 + }, + { + "epoch": 0.147458193322722, + "grad_norm": 0.30825692415237427, + "learning_rate": 4.934680756277811e-05, + "loss": 1.1999, + "step": 938 + }, + { + "epoch": 0.14761539821965455, + "grad_norm": 0.25342094898223877, + "learning_rate": 4.934540372746552e-05, + "loss": 1.2285, + "step": 939 + }, + { + "epoch": 0.14777260311658708, + "grad_norm": 0.26036733388900757, + "learning_rate": 4.9343998405228295e-05, + "loss": 1.2367, + "step": 940 + }, + { + "epoch": 0.14792980801351963, + "grad_norm": 0.27401411533355713, + "learning_rate": 4.934259159615228e-05, + "loss": 1.1985, + "step": 941 + }, + { + "epoch": 0.14808701291045215, + "grad_norm": 0.23039095103740692, + "learning_rate": 4.934118330032338e-05, + "loss": 1.2649, + "step": 942 + }, + { + "epoch": 0.1482442178073847, + "grad_norm": 0.29547953605651855, + "learning_rate": 4.933977351782761e-05, + "loss": 1.1345, + "step": 943 + }, + { + "epoch": 0.14840142270431725, + "grad_norm": 0.22598884999752045, + "learning_rate": 4.933836224875109e-05, + "loss": 1.2965, + "step": 944 + }, + { + "epoch": 0.14855862760124977, + "grad_norm": 0.31008240580558777, + "learning_rate": 4.9336949493180006e-05, + "loss": 1.1144, + "step": 945 + }, + { + "epoch": 0.14871583249818232, + "grad_norm": 0.28397658467292786, + "learning_rate": 4.9335535251200636e-05, + "loss": 1.266, + "step": 946 + }, + { + "epoch": 0.14887303739511487, + "grad_norm": 0.2284776121377945, + "learning_rate": 4.933411952289937e-05, + "loss": 1.2164, + "step": 947 + }, + { + "epoch": 0.1490302422920474, + "grad_norm": 0.2107551246881485, + "learning_rate": 4.9332702308362665e-05, + "loss": 1.2719, + "step": 948 + }, + { + "epoch": 0.14918744718897994, + "grad_norm": 0.26652616262435913, + "learning_rate": 4.933128360767709e-05, + "loss": 1.2304, + "step": 949 + }, + { + "epoch": 0.14934465208591247, + "grad_norm": 0.22624680399894714, + "learning_rate": 4.932986342092928e-05, + "loss": 1.2999, + "step": 950 + }, + { + "epoch": 0.14950185698284502, + "grad_norm": 0.20410288870334625, + "learning_rate": 4.932844174820598e-05, + "loss": 1.2269, + "step": 951 + }, + { + "epoch": 0.14965906187977757, + "grad_norm": 0.24987919628620148, + "learning_rate": 4.932701858959403e-05, + "loss": 1.3042, + "step": 952 + }, + { + "epoch": 0.1498162667767101, + "grad_norm": 0.191947340965271, + "learning_rate": 4.932559394518033e-05, + "loss": 1.2803, + "step": 953 + }, + { + "epoch": 0.14997347167364264, + "grad_norm": 0.3396085798740387, + "learning_rate": 4.932416781505191e-05, + "loss": 1.2014, + "step": 954 + }, + { + "epoch": 0.15013067657057516, + "grad_norm": 0.22375932335853577, + "learning_rate": 4.932274019929587e-05, + "loss": 1.242, + "step": 955 + }, + { + "epoch": 0.1502878814675077, + "grad_norm": 0.281097412109375, + "learning_rate": 4.93213110979994e-05, + "loss": 1.2742, + "step": 956 + }, + { + "epoch": 0.15044508636444026, + "grad_norm": 0.24049919843673706, + "learning_rate": 4.931988051124979e-05, + "loss": 1.3166, + "step": 957 + }, + { + "epoch": 0.15060229126137278, + "grad_norm": 0.24433936178684235, + "learning_rate": 4.93184484391344e-05, + "loss": 1.2933, + "step": 958 + }, + { + "epoch": 0.15075949615830533, + "grad_norm": 0.3671477138996124, + "learning_rate": 4.9317014881740706e-05, + "loss": 1.1731, + "step": 959 + }, + { + "epoch": 0.15091670105523788, + "grad_norm": 0.22575189173221588, + "learning_rate": 4.931557983915627e-05, + "loss": 1.2509, + "step": 960 + }, + { + "epoch": 0.15091670105523788, + "eval_loss": 1.2455451488494873, + "eval_runtime": 2308.563, + "eval_samples_per_second": 4.01, + "eval_steps_per_second": 2.005, + "step": 960 + }, + { + "epoch": 0.1510739059521704, + "grad_norm": 0.25838157534599304, + "learning_rate": 4.931414331146873e-05, + "loss": 1.3554, + "step": 961 + }, + { + "epoch": 0.15123111084910296, + "grad_norm": 0.3163435459136963, + "learning_rate": 4.931270529876583e-05, + "loss": 1.3133, + "step": 962 + }, + { + "epoch": 0.15138831574603548, + "grad_norm": 0.30024880170822144, + "learning_rate": 4.9311265801135384e-05, + "loss": 1.2303, + "step": 963 + }, + { + "epoch": 0.15154552064296803, + "grad_norm": 0.3424816429615021, + "learning_rate": 4.9309824818665325e-05, + "loss": 1.1929, + "step": 964 + }, + { + "epoch": 0.15170272553990058, + "grad_norm": 0.27401861548423767, + "learning_rate": 4.930838235144366e-05, + "loss": 1.2198, + "step": 965 + }, + { + "epoch": 0.1518599304368331, + "grad_norm": 0.24110247194766998, + "learning_rate": 4.930693839955848e-05, + "loss": 1.2381, + "step": 966 + }, + { + "epoch": 0.15201713533376565, + "grad_norm": 0.159100741147995, + "learning_rate": 4.9305492963098e-05, + "loss": 1.2666, + "step": 967 + }, + { + "epoch": 0.15217434023069817, + "grad_norm": 0.23810729384422302, + "learning_rate": 4.9304046042150474e-05, + "loss": 1.2592, + "step": 968 + }, + { + "epoch": 0.15233154512763072, + "grad_norm": 0.19887159764766693, + "learning_rate": 4.930259763680429e-05, + "loss": 1.3179, + "step": 969 + }, + { + "epoch": 0.15248875002456327, + "grad_norm": 0.22060149908065796, + "learning_rate": 4.930114774714791e-05, + "loss": 1.2712, + "step": 970 + }, + { + "epoch": 0.1526459549214958, + "grad_norm": 0.2443406730890274, + "learning_rate": 4.929969637326989e-05, + "loss": 1.2021, + "step": 971 + }, + { + "epoch": 0.15280315981842835, + "grad_norm": 0.2185499370098114, + "learning_rate": 4.9298243515258855e-05, + "loss": 1.3017, + "step": 972 + }, + { + "epoch": 0.1529603647153609, + "grad_norm": 0.23159849643707275, + "learning_rate": 4.929678917320357e-05, + "loss": 1.2122, + "step": 973 + }, + { + "epoch": 0.15311756961229342, + "grad_norm": 0.3281627595424652, + "learning_rate": 4.929533334719284e-05, + "loss": 1.1151, + "step": 974 + }, + { + "epoch": 0.15327477450922597, + "grad_norm": 0.24456332623958588, + "learning_rate": 4.929387603731558e-05, + "loss": 1.2107, + "step": 975 + }, + { + "epoch": 0.1534319794061585, + "grad_norm": 0.3623602092266083, + "learning_rate": 4.9292417243660814e-05, + "loss": 1.3041, + "step": 976 + }, + { + "epoch": 0.15358918430309104, + "grad_norm": 0.2319325953722, + "learning_rate": 4.929095696631763e-05, + "loss": 1.2331, + "step": 977 + }, + { + "epoch": 0.1537463892000236, + "grad_norm": 0.324660986661911, + "learning_rate": 4.92894952053752e-05, + "loss": 1.2511, + "step": 978 + }, + { + "epoch": 0.15390359409695611, + "grad_norm": 0.23866824805736542, + "learning_rate": 4.9288031960922834e-05, + "loss": 1.2709, + "step": 979 + }, + { + "epoch": 0.15406079899388866, + "grad_norm": 0.21922123432159424, + "learning_rate": 4.928656723304989e-05, + "loss": 1.3108, + "step": 980 + }, + { + "epoch": 0.1542180038908212, + "grad_norm": 0.2919687032699585, + "learning_rate": 4.92851010218458e-05, + "loss": 1.0465, + "step": 981 + }, + { + "epoch": 0.15437520878775374, + "grad_norm": 0.26380711793899536, + "learning_rate": 4.9283633327400156e-05, + "loss": 1.2066, + "step": 982 + }, + { + "epoch": 0.1545324136846863, + "grad_norm": 0.2607104182243347, + "learning_rate": 4.9282164149802576e-05, + "loss": 1.1227, + "step": 983 + }, + { + "epoch": 0.1546896185816188, + "grad_norm": 0.32302606105804443, + "learning_rate": 4.92806934891428e-05, + "loss": 1.2066, + "step": 984 + }, + { + "epoch": 0.15484682347855136, + "grad_norm": 0.28476232290267944, + "learning_rate": 4.927922134551065e-05, + "loss": 1.1447, + "step": 985 + }, + { + "epoch": 0.1550040283754839, + "grad_norm": 0.253738671541214, + "learning_rate": 4.9277747718996036e-05, + "loss": 1.2438, + "step": 986 + }, + { + "epoch": 0.15516123327241643, + "grad_norm": 0.29659610986709595, + "learning_rate": 4.927627260968896e-05, + "loss": 1.1946, + "step": 987 + }, + { + "epoch": 0.15531843816934898, + "grad_norm": 0.27436473965644836, + "learning_rate": 4.927479601767952e-05, + "loss": 1.2783, + "step": 988 + }, + { + "epoch": 0.1554756430662815, + "grad_norm": 0.21257497370243073, + "learning_rate": 4.9273317943057896e-05, + "loss": 1.204, + "step": 989 + }, + { + "epoch": 0.15563284796321405, + "grad_norm": 0.2546120584011078, + "learning_rate": 4.927183838591437e-05, + "loss": 1.2224, + "step": 990 + }, + { + "epoch": 0.1557900528601466, + "grad_norm": 0.2336018979549408, + "learning_rate": 4.92703573463393e-05, + "loss": 1.2903, + "step": 991 + }, + { + "epoch": 0.15594725775707913, + "grad_norm": 0.2048567533493042, + "learning_rate": 4.926887482442315e-05, + "loss": 1.2529, + "step": 992 + }, + { + "epoch": 0.15610446265401168, + "grad_norm": 0.20950450003147125, + "learning_rate": 4.926739082025646e-05, + "loss": 1.4247, + "step": 993 + }, + { + "epoch": 0.1562616675509442, + "grad_norm": 0.22965438663959503, + "learning_rate": 4.926590533392987e-05, + "loss": 1.2573, + "step": 994 + }, + { + "epoch": 0.15641887244787675, + "grad_norm": 0.25654879212379456, + "learning_rate": 4.9264418365534105e-05, + "loss": 1.2115, + "step": 995 + }, + { + "epoch": 0.1565760773448093, + "grad_norm": 0.26419493556022644, + "learning_rate": 4.9262929915159995e-05, + "loss": 1.2994, + "step": 996 + }, + { + "epoch": 0.15673328224174182, + "grad_norm": 0.3215327560901642, + "learning_rate": 4.926143998289843e-05, + "loss": 1.2268, + "step": 997 + }, + { + "epoch": 0.15689048713867437, + "grad_norm": 0.3190024495124817, + "learning_rate": 4.925994856884042e-05, + "loss": 1.2747, + "step": 998 + }, + { + "epoch": 0.15704769203560692, + "grad_norm": 0.26742681860923767, + "learning_rate": 4.9258455673077065e-05, + "loss": 1.2791, + "step": 999 + }, + { + "epoch": 0.15720489693253945, + "grad_norm": 0.21026930212974548, + "learning_rate": 4.925696129569953e-05, + "loss": 1.3694, + "step": 1000 + }, + { + "epoch": 0.157362101829472, + "grad_norm": 0.2838928699493408, + "learning_rate": 4.925546543679909e-05, + "loss": 1.2589, + "step": 1001 + }, + { + "epoch": 0.15751930672640452, + "grad_norm": 0.3007844090461731, + "learning_rate": 4.9253968096467104e-05, + "loss": 1.2488, + "step": 1002 + }, + { + "epoch": 0.15767651162333707, + "grad_norm": 0.2473154515028, + "learning_rate": 4.925246927479503e-05, + "loss": 1.2288, + "step": 1003 + }, + { + "epoch": 0.15783371652026962, + "grad_norm": 0.2360457479953766, + "learning_rate": 4.925096897187441e-05, + "loss": 1.3148, + "step": 1004 + }, + { + "epoch": 0.15799092141720214, + "grad_norm": 0.3671962320804596, + "learning_rate": 4.924946718779687e-05, + "loss": 1.1999, + "step": 1005 + }, + { + "epoch": 0.1581481263141347, + "grad_norm": 0.28391456604003906, + "learning_rate": 4.924796392265414e-05, + "loss": 1.2296, + "step": 1006 + }, + { + "epoch": 0.1583053312110672, + "grad_norm": 0.2550790011882782, + "learning_rate": 4.924645917653802e-05, + "loss": 1.1923, + "step": 1007 + }, + { + "epoch": 0.15846253610799976, + "grad_norm": 0.301740825176239, + "learning_rate": 4.924495294954044e-05, + "loss": 1.2363, + "step": 1008 + }, + { + "epoch": 0.1586197410049323, + "grad_norm": 0.2608097195625305, + "learning_rate": 4.9243445241753374e-05, + "loss": 1.298, + "step": 1009 + }, + { + "epoch": 0.15877694590186484, + "grad_norm": 0.31052571535110474, + "learning_rate": 4.924193605326891e-05, + "loss": 1.3037, + "step": 1010 + }, + { + "epoch": 0.15893415079879739, + "grad_norm": 0.2420274317264557, + "learning_rate": 4.924042538417923e-05, + "loss": 1.2541, + "step": 1011 + }, + { + "epoch": 0.15909135569572994, + "grad_norm": 0.21056459844112396, + "learning_rate": 4.92389132345766e-05, + "loss": 1.3229, + "step": 1012 + }, + { + "epoch": 0.15924856059266246, + "grad_norm": 0.3283076286315918, + "learning_rate": 4.923739960455337e-05, + "loss": 1.1036, + "step": 1013 + }, + { + "epoch": 0.159405765489595, + "grad_norm": 0.2461448460817337, + "learning_rate": 4.9235884494201987e-05, + "loss": 1.2503, + "step": 1014 + }, + { + "epoch": 0.15956297038652753, + "grad_norm": 0.2919829189777374, + "learning_rate": 4.923436790361499e-05, + "loss": 1.2108, + "step": 1015 + }, + { + "epoch": 0.15972017528346008, + "grad_norm": 0.22029368579387665, + "learning_rate": 4.923284983288501e-05, + "loss": 1.2369, + "step": 1016 + }, + { + "epoch": 0.15987738018039263, + "grad_norm": 0.27863630652427673, + "learning_rate": 4.9231330282104756e-05, + "loss": 1.2555, + "step": 1017 + }, + { + "epoch": 0.16003458507732515, + "grad_norm": 0.28123825788497925, + "learning_rate": 4.9229809251367055e-05, + "loss": 1.1519, + "step": 1018 + }, + { + "epoch": 0.1601917899742577, + "grad_norm": 0.231231227517128, + "learning_rate": 4.922828674076478e-05, + "loss": 1.3488, + "step": 1019 + }, + { + "epoch": 0.16034899487119023, + "grad_norm": 0.2704828977584839, + "learning_rate": 4.9226762750390944e-05, + "loss": 1.1102, + "step": 1020 + }, + { + "epoch": 0.16050619976812278, + "grad_norm": 0.17970627546310425, + "learning_rate": 4.922523728033861e-05, + "loss": 1.2237, + "step": 1021 + }, + { + "epoch": 0.16066340466505533, + "grad_norm": 0.22386445105075836, + "learning_rate": 4.9223710330700956e-05, + "loss": 1.2564, + "step": 1022 + }, + { + "epoch": 0.16082060956198785, + "grad_norm": 0.21347114443778992, + "learning_rate": 4.922218190157124e-05, + "loss": 1.2433, + "step": 1023 + }, + { + "epoch": 0.1609778144589204, + "grad_norm": 0.23873676359653473, + "learning_rate": 4.9220651993042813e-05, + "loss": 1.2018, + "step": 1024 + }, + { + "epoch": 0.16113501935585295, + "grad_norm": 0.26954975724220276, + "learning_rate": 4.921912060520912e-05, + "loss": 1.2118, + "step": 1025 + }, + { + "epoch": 0.16129222425278547, + "grad_norm": 0.3023718595504761, + "learning_rate": 4.9217587738163686e-05, + "loss": 1.2717, + "step": 1026 + }, + { + "epoch": 0.16144942914971802, + "grad_norm": 0.31107062101364136, + "learning_rate": 4.921605339200013e-05, + "loss": 1.2017, + "step": 1027 + }, + { + "epoch": 0.16160663404665054, + "grad_norm": 0.2795855402946472, + "learning_rate": 4.921451756681217e-05, + "loss": 1.3492, + "step": 1028 + }, + { + "epoch": 0.1617638389435831, + "grad_norm": 0.24515774846076965, + "learning_rate": 4.921298026269361e-05, + "loss": 1.2763, + "step": 1029 + }, + { + "epoch": 0.16192104384051564, + "grad_norm": 0.2603877782821655, + "learning_rate": 4.921144147973834e-05, + "loss": 1.3196, + "step": 1030 + }, + { + "epoch": 0.16207824873744817, + "grad_norm": 0.265697181224823, + "learning_rate": 4.9209901218040335e-05, + "loss": 1.3067, + "step": 1031 + }, + { + "epoch": 0.16223545363438072, + "grad_norm": 0.2554314136505127, + "learning_rate": 4.9208359477693686e-05, + "loss": 1.2347, + "step": 1032 + }, + { + "epoch": 0.16239265853131324, + "grad_norm": 0.3272973597049713, + "learning_rate": 4.920681625879254e-05, + "loss": 1.2104, + "step": 1033 + }, + { + "epoch": 0.1625498634282458, + "grad_norm": 0.21259547770023346, + "learning_rate": 4.9205271561431166e-05, + "loss": 1.2857, + "step": 1034 + }, + { + "epoch": 0.16270706832517834, + "grad_norm": 0.2505529820919037, + "learning_rate": 4.92037253857039e-05, + "loss": 1.1988, + "step": 1035 + }, + { + "epoch": 0.16286427322211086, + "grad_norm": 0.2369750738143921, + "learning_rate": 4.920217773170517e-05, + "loss": 1.2384, + "step": 1036 + }, + { + "epoch": 0.1630214781190434, + "grad_norm": 0.26577889919281006, + "learning_rate": 4.920062859952951e-05, + "loss": 1.1521, + "step": 1037 + }, + { + "epoch": 0.16317868301597593, + "grad_norm": 0.2224215418100357, + "learning_rate": 4.919907798927153e-05, + "loss": 1.3065, + "step": 1038 + }, + { + "epoch": 0.16333588791290848, + "grad_norm": 0.1882622092962265, + "learning_rate": 4.9197525901025944e-05, + "loss": 1.2472, + "step": 1039 + }, + { + "epoch": 0.16349309280984103, + "grad_norm": 0.25392916798591614, + "learning_rate": 4.919597233488754e-05, + "loss": 1.3387, + "step": 1040 + }, + { + "epoch": 0.16365029770677356, + "grad_norm": 0.3185995817184448, + "learning_rate": 4.91944172909512e-05, + "loss": 1.161, + "step": 1041 + }, + { + "epoch": 0.1638075026037061, + "grad_norm": 0.26924118399620056, + "learning_rate": 4.919286076931191e-05, + "loss": 1.0659, + "step": 1042 + }, + { + "epoch": 0.16396470750063866, + "grad_norm": 0.2224770486354828, + "learning_rate": 4.919130277006473e-05, + "loss": 1.2303, + "step": 1043 + }, + { + "epoch": 0.16412191239757118, + "grad_norm": 0.24008037149906158, + "learning_rate": 4.918974329330482e-05, + "loss": 1.2762, + "step": 1044 + }, + { + "epoch": 0.16427911729450373, + "grad_norm": 0.2728358209133148, + "learning_rate": 4.918818233912742e-05, + "loss": 1.1582, + "step": 1045 + }, + { + "epoch": 0.16443632219143625, + "grad_norm": 0.24911800026893616, + "learning_rate": 4.918661990762788e-05, + "loss": 1.2155, + "step": 1046 + }, + { + "epoch": 0.1645935270883688, + "grad_norm": 0.2444472759962082, + "learning_rate": 4.918505599890162e-05, + "loss": 1.2838, + "step": 1047 + }, + { + "epoch": 0.16475073198530135, + "grad_norm": 0.2379113882780075, + "learning_rate": 4.918349061304416e-05, + "loss": 1.3043, + "step": 1048 + }, + { + "epoch": 0.16490793688223387, + "grad_norm": 0.3085183799266815, + "learning_rate": 4.9181923750151095e-05, + "loss": 1.2568, + "step": 1049 + }, + { + "epoch": 0.16506514177916642, + "grad_norm": 0.2629674971103668, + "learning_rate": 4.918035541031814e-05, + "loss": 1.2171, + "step": 1050 + }, + { + "epoch": 0.16522234667609895, + "grad_norm": 0.2707282602787018, + "learning_rate": 4.917878559364107e-05, + "loss": 1.1597, + "step": 1051 + }, + { + "epoch": 0.1653795515730315, + "grad_norm": 0.27305370569229126, + "learning_rate": 4.9177214300215784e-05, + "loss": 1.36, + "step": 1052 + }, + { + "epoch": 0.16553675646996405, + "grad_norm": 0.20558474957942963, + "learning_rate": 4.9175641530138226e-05, + "loss": 1.2225, + "step": 1053 + }, + { + "epoch": 0.16569396136689657, + "grad_norm": 0.23680521547794342, + "learning_rate": 4.917406728350448e-05, + "loss": 1.2149, + "step": 1054 + }, + { + "epoch": 0.16585116626382912, + "grad_norm": 0.2101297229528427, + "learning_rate": 4.917249156041066e-05, + "loss": 1.2313, + "step": 1055 + }, + { + "epoch": 0.16600837116076167, + "grad_norm": 0.2601447105407715, + "learning_rate": 4.917091436095304e-05, + "loss": 1.1907, + "step": 1056 + }, + { + "epoch": 0.1661655760576942, + "grad_norm": 0.2113189995288849, + "learning_rate": 4.916933568522793e-05, + "loss": 1.2852, + "step": 1057 + }, + { + "epoch": 0.16632278095462674, + "grad_norm": 0.21135227382183075, + "learning_rate": 4.916775553333176e-05, + "loss": 1.2852, + "step": 1058 + }, + { + "epoch": 0.16647998585155926, + "grad_norm": 0.2743116617202759, + "learning_rate": 4.916617390536102e-05, + "loss": 1.2032, + "step": 1059 + }, + { + "epoch": 0.16663719074849181, + "grad_norm": 0.2520056664943695, + "learning_rate": 4.916459080141234e-05, + "loss": 1.3038, + "step": 1060 + }, + { + "epoch": 0.16679439564542436, + "grad_norm": 0.21614307165145874, + "learning_rate": 4.916300622158239e-05, + "loss": 1.2216, + "step": 1061 + }, + { + "epoch": 0.1669516005423569, + "grad_norm": 0.28615444898605347, + "learning_rate": 4.9161420165967956e-05, + "loss": 1.2162, + "step": 1062 + }, + { + "epoch": 0.16710880543928944, + "grad_norm": 0.33522137999534607, + "learning_rate": 4.91598326346659e-05, + "loss": 1.1458, + "step": 1063 + }, + { + "epoch": 0.16726601033622196, + "grad_norm": 0.30597957968711853, + "learning_rate": 4.9158243627773194e-05, + "loss": 1.2623, + "step": 1064 + }, + { + "epoch": 0.1674232152331545, + "grad_norm": 0.2643260061740875, + "learning_rate": 4.915665314538688e-05, + "loss": 1.2092, + "step": 1065 + }, + { + "epoch": 0.16758042013008706, + "grad_norm": 0.3190208673477173, + "learning_rate": 4.91550611876041e-05, + "loss": 1.0543, + "step": 1066 + }, + { + "epoch": 0.16773762502701958, + "grad_norm": 0.3194860816001892, + "learning_rate": 4.9153467754522095e-05, + "loss": 1.1393, + "step": 1067 + }, + { + "epoch": 0.16789482992395213, + "grad_norm": 0.2031661570072174, + "learning_rate": 4.915187284623817e-05, + "loss": 1.2136, + "step": 1068 + }, + { + "epoch": 0.16805203482088468, + "grad_norm": 0.2047189325094223, + "learning_rate": 4.915027646284974e-05, + "loss": 1.2962, + "step": 1069 + }, + { + "epoch": 0.1682092397178172, + "grad_norm": 0.31565096974372864, + "learning_rate": 4.9148678604454325e-05, + "loss": 1.1979, + "step": 1070 + }, + { + "epoch": 0.16836644461474976, + "grad_norm": 0.23875312507152557, + "learning_rate": 4.914707927114949e-05, + "loss": 1.3002, + "step": 1071 + }, + { + "epoch": 0.16852364951168228, + "grad_norm": 0.24424909055233002, + "learning_rate": 4.9145478463032924e-05, + "loss": 1.1491, + "step": 1072 + }, + { + "epoch": 0.16868085440861483, + "grad_norm": 0.28339752554893494, + "learning_rate": 4.91438761802024e-05, + "loss": 1.2398, + "step": 1073 + }, + { + "epoch": 0.16883805930554738, + "grad_norm": 0.2435888648033142, + "learning_rate": 4.9142272422755786e-05, + "loss": 1.3292, + "step": 1074 + }, + { + "epoch": 0.1689952642024799, + "grad_norm": 0.21540984511375427, + "learning_rate": 4.9140667190791026e-05, + "loss": 1.3665, + "step": 1075 + }, + { + "epoch": 0.16915246909941245, + "grad_norm": 0.2556820809841156, + "learning_rate": 4.913906048440617e-05, + "loss": 1.2557, + "step": 1076 + }, + { + "epoch": 0.16930967399634497, + "grad_norm": 0.23769475519657135, + "learning_rate": 4.913745230369934e-05, + "loss": 1.2163, + "step": 1077 + }, + { + "epoch": 0.16946687889327752, + "grad_norm": 0.31578120589256287, + "learning_rate": 4.913584264876875e-05, + "loss": 1.3176, + "step": 1078 + }, + { + "epoch": 0.16962408379021007, + "grad_norm": 0.22278232872486115, + "learning_rate": 4.913423151971273e-05, + "loss": 1.2206, + "step": 1079 + }, + { + "epoch": 0.1697812886871426, + "grad_norm": 0.31810736656188965, + "learning_rate": 4.913261891662967e-05, + "loss": 1.2254, + "step": 1080 + }, + { + "epoch": 0.16993849358407515, + "grad_norm": 0.22623823583126068, + "learning_rate": 4.913100483961807e-05, + "loss": 1.208, + "step": 1081 + }, + { + "epoch": 0.1700956984810077, + "grad_norm": 0.27108776569366455, + "learning_rate": 4.9129389288776504e-05, + "loss": 1.2989, + "step": 1082 + }, + { + "epoch": 0.17025290337794022, + "grad_norm": 0.194550558924675, + "learning_rate": 4.912777226420365e-05, + "loss": 1.3849, + "step": 1083 + }, + { + "epoch": 0.17041010827487277, + "grad_norm": 0.20856213569641113, + "learning_rate": 4.912615376599826e-05, + "loss": 1.2736, + "step": 1084 + }, + { + "epoch": 0.1705673131718053, + "grad_norm": 0.19355203211307526, + "learning_rate": 4.91245337942592e-05, + "loss": 1.256, + "step": 1085 + }, + { + "epoch": 0.17072451806873784, + "grad_norm": 0.21832303702831268, + "learning_rate": 4.9122912349085395e-05, + "loss": 1.1987, + "step": 1086 + }, + { + "epoch": 0.1708817229656704, + "grad_norm": 0.22642913460731506, + "learning_rate": 4.912128943057589e-05, + "loss": 1.3043, + "step": 1087 + }, + { + "epoch": 0.1710389278626029, + "grad_norm": 0.22713351249694824, + "learning_rate": 4.911966503882981e-05, + "loss": 1.1951, + "step": 1088 + }, + { + "epoch": 0.17119613275953546, + "grad_norm": 0.29707837104797363, + "learning_rate": 4.911803917394634e-05, + "loss": 1.2674, + "step": 1089 + }, + { + "epoch": 0.17135333765646799, + "grad_norm": 0.27017104625701904, + "learning_rate": 4.911641183602481e-05, + "loss": 1.1727, + "step": 1090 + }, + { + "epoch": 0.17151054255340054, + "grad_norm": 0.23867738246917725, + "learning_rate": 4.911478302516461e-05, + "loss": 1.2061, + "step": 1091 + }, + { + "epoch": 0.17166774745033309, + "grad_norm": 0.26270055770874023, + "learning_rate": 4.911315274146521e-05, + "loss": 1.2735, + "step": 1092 + }, + { + "epoch": 0.1718249523472656, + "grad_norm": 0.25167661905288696, + "learning_rate": 4.911152098502617e-05, + "loss": 1.2643, + "step": 1093 + }, + { + "epoch": 0.17198215724419816, + "grad_norm": 0.46500882506370544, + "learning_rate": 4.9109887755947185e-05, + "loss": 1.1743, + "step": 1094 + }, + { + "epoch": 0.1721393621411307, + "grad_norm": 0.2189512848854065, + "learning_rate": 4.910825305432798e-05, + "loss": 1.1232, + "step": 1095 + }, + { + "epoch": 0.17229656703806323, + "grad_norm": 0.21188926696777344, + "learning_rate": 4.9106616880268405e-05, + "loss": 1.2031, + "step": 1096 + }, + { + "epoch": 0.17245377193499578, + "grad_norm": 0.2135314792394638, + "learning_rate": 4.910497923386839e-05, + "loss": 1.2547, + "step": 1097 + }, + { + "epoch": 0.1726109768319283, + "grad_norm": 0.27527931332588196, + "learning_rate": 4.910334011522796e-05, + "loss": 1.119, + "step": 1098 + }, + { + "epoch": 0.17276818172886085, + "grad_norm": 0.23516559600830078, + "learning_rate": 4.910169952444722e-05, + "loss": 1.3006, + "step": 1099 + }, + { + "epoch": 0.1729253866257934, + "grad_norm": 0.23166057467460632, + "learning_rate": 4.910005746162637e-05, + "loss": 1.281, + "step": 1100 + }, + { + "epoch": 0.17308259152272593, + "grad_norm": 0.22281822562217712, + "learning_rate": 4.9098413926865714e-05, + "loss": 1.1526, + "step": 1101 + }, + { + "epoch": 0.17323979641965848, + "grad_norm": 0.32062655687332153, + "learning_rate": 4.909676892026563e-05, + "loss": 1.2388, + "step": 1102 + }, + { + "epoch": 0.173397001316591, + "grad_norm": 0.29868969321250916, + "learning_rate": 4.909512244192657e-05, + "loss": 1.2303, + "step": 1103 + }, + { + "epoch": 0.17355420621352355, + "grad_norm": 0.23143270611763, + "learning_rate": 4.90934744919491e-05, + "loss": 1.2137, + "step": 1104 + }, + { + "epoch": 0.1737114111104561, + "grad_norm": 0.2830474376678467, + "learning_rate": 4.909182507043389e-05, + "loss": 1.2178, + "step": 1105 + }, + { + "epoch": 0.17386861600738862, + "grad_norm": 0.22427986562252045, + "learning_rate": 4.909017417748166e-05, + "loss": 1.3153, + "step": 1106 + }, + { + "epoch": 0.17402582090432117, + "grad_norm": 0.2587423622608185, + "learning_rate": 4.908852181319326e-05, + "loss": 1.2669, + "step": 1107 + }, + { + "epoch": 0.17418302580125372, + "grad_norm": 0.24905993044376373, + "learning_rate": 4.9086867977669594e-05, + "loss": 1.2549, + "step": 1108 + }, + { + "epoch": 0.17434023069818624, + "grad_norm": 0.26877379417419434, + "learning_rate": 4.908521267101167e-05, + "loss": 1.2694, + "step": 1109 + }, + { + "epoch": 0.1744974355951188, + "grad_norm": 0.2501152753829956, + "learning_rate": 4.9083555893320596e-05, + "loss": 1.2241, + "step": 1110 + }, + { + "epoch": 0.17465464049205132, + "grad_norm": 0.27815014123916626, + "learning_rate": 4.908189764469757e-05, + "loss": 1.2152, + "step": 1111 + }, + { + "epoch": 0.17481184538898387, + "grad_norm": 0.32891881465911865, + "learning_rate": 4.9080237925243856e-05, + "loss": 1.2638, + "step": 1112 + }, + { + "epoch": 0.17496905028591642, + "grad_norm": 0.2137015461921692, + "learning_rate": 4.9078576735060825e-05, + "loss": 1.2041, + "step": 1113 + }, + { + "epoch": 0.17512625518284894, + "grad_norm": 0.17862486839294434, + "learning_rate": 4.907691407424995e-05, + "loss": 1.3349, + "step": 1114 + }, + { + "epoch": 0.1752834600797815, + "grad_norm": 0.25791284441947937, + "learning_rate": 4.907524994291276e-05, + "loss": 1.2337, + "step": 1115 + }, + { + "epoch": 0.175440664976714, + "grad_norm": 0.24266491830348969, + "learning_rate": 4.90735843411509e-05, + "loss": 1.0939, + "step": 1116 + }, + { + "epoch": 0.17559786987364656, + "grad_norm": 0.2618250250816345, + "learning_rate": 4.9071917269066114e-05, + "loss": 1.2855, + "step": 1117 + }, + { + "epoch": 0.1757550747705791, + "grad_norm": 0.2477390021085739, + "learning_rate": 4.9070248726760206e-05, + "loss": 1.1675, + "step": 1118 + }, + { + "epoch": 0.17591227966751163, + "grad_norm": 0.29105502367019653, + "learning_rate": 4.906857871433508e-05, + "loss": 1.183, + "step": 1119 + }, + { + "epoch": 0.17606948456444418, + "grad_norm": 0.2923283874988556, + "learning_rate": 4.906690723189275e-05, + "loss": 1.1386, + "step": 1120 + }, + { + "epoch": 0.17606948456444418, + "eval_loss": 1.219694972038269, + "eval_runtime": 2300.2931, + "eval_samples_per_second": 4.025, + "eval_steps_per_second": 2.012, + "step": 1120 + }, + { + "epoch": 0.17622668946137673, + "grad_norm": 0.3278633952140808, + "learning_rate": 4.906523427953529e-05, + "loss": 1.1738, + "step": 1121 + }, + { + "epoch": 0.17638389435830926, + "grad_norm": 0.31546783447265625, + "learning_rate": 4.906355985736488e-05, + "loss": 1.0894, + "step": 1122 + }, + { + "epoch": 0.1765410992552418, + "grad_norm": 0.28350481390953064, + "learning_rate": 4.906188396548379e-05, + "loss": 1.2774, + "step": 1123 + }, + { + "epoch": 0.17669830415217433, + "grad_norm": 0.21374982595443726, + "learning_rate": 4.9060206603994385e-05, + "loss": 1.37, + "step": 1124 + }, + { + "epoch": 0.17685550904910688, + "grad_norm": 0.2343566119670868, + "learning_rate": 4.9058527772999095e-05, + "loss": 1.2065, + "step": 1125 + }, + { + "epoch": 0.17701271394603943, + "grad_norm": 0.29571887850761414, + "learning_rate": 4.905684747260047e-05, + "loss": 1.1967, + "step": 1126 + }, + { + "epoch": 0.17716991884297195, + "grad_norm": 0.2689303457736969, + "learning_rate": 4.905516570290113e-05, + "loss": 1.2337, + "step": 1127 + }, + { + "epoch": 0.1773271237399045, + "grad_norm": 0.22743673622608185, + "learning_rate": 4.90534824640038e-05, + "loss": 1.1673, + "step": 1128 + }, + { + "epoch": 0.17748432863683702, + "grad_norm": 0.36731019616127014, + "learning_rate": 4.905179775601126e-05, + "loss": 1.1397, + "step": 1129 + }, + { + "epoch": 0.17764153353376957, + "grad_norm": 0.2571149468421936, + "learning_rate": 4.905011157902645e-05, + "loss": 1.1166, + "step": 1130 + }, + { + "epoch": 0.17779873843070212, + "grad_norm": 0.2615256905555725, + "learning_rate": 4.904842393315231e-05, + "loss": 1.2095, + "step": 1131 + }, + { + "epoch": 0.17795594332763465, + "grad_norm": 0.28919360041618347, + "learning_rate": 4.904673481849194e-05, + "loss": 1.0976, + "step": 1132 + }, + { + "epoch": 0.1781131482245672, + "grad_norm": 0.3858489990234375, + "learning_rate": 4.90450442351485e-05, + "loss": 1.1934, + "step": 1133 + }, + { + "epoch": 0.17827035312149975, + "grad_norm": 0.2448245733976364, + "learning_rate": 4.904335218322524e-05, + "loss": 1.1604, + "step": 1134 + }, + { + "epoch": 0.17842755801843227, + "grad_norm": 0.2626294195652008, + "learning_rate": 4.9041658662825514e-05, + "loss": 1.1301, + "step": 1135 + }, + { + "epoch": 0.17858476291536482, + "grad_norm": 0.3016091287136078, + "learning_rate": 4.903996367405275e-05, + "loss": 1.2579, + "step": 1136 + }, + { + "epoch": 0.17874196781229734, + "grad_norm": 0.28168612718582153, + "learning_rate": 4.9038267217010455e-05, + "loss": 1.1471, + "step": 1137 + }, + { + "epoch": 0.1788991727092299, + "grad_norm": 0.29256439208984375, + "learning_rate": 4.903656929180228e-05, + "loss": 1.1598, + "step": 1138 + }, + { + "epoch": 0.17905637760616244, + "grad_norm": 0.19786624610424042, + "learning_rate": 4.9034869898531895e-05, + "loss": 1.2115, + "step": 1139 + }, + { + "epoch": 0.17921358250309496, + "grad_norm": 0.17216260731220245, + "learning_rate": 4.9033169037303106e-05, + "loss": 1.2471, + "step": 1140 + }, + { + "epoch": 0.17937078740002751, + "grad_norm": 0.22571730613708496, + "learning_rate": 4.9031466708219785e-05, + "loss": 1.2226, + "step": 1141 + }, + { + "epoch": 0.17952799229696004, + "grad_norm": 0.25510528683662415, + "learning_rate": 4.9029762911385915e-05, + "loss": 1.1428, + "step": 1142 + }, + { + "epoch": 0.1796851971938926, + "grad_norm": 0.19014020264148712, + "learning_rate": 4.902805764690556e-05, + "loss": 1.2268, + "step": 1143 + }, + { + "epoch": 0.17984240209082514, + "grad_norm": 0.25155729055404663, + "learning_rate": 4.902635091488286e-05, + "loss": 1.1943, + "step": 1144 + }, + { + "epoch": 0.17999960698775766, + "grad_norm": 0.3109387159347534, + "learning_rate": 4.902464271542206e-05, + "loss": 1.176, + "step": 1145 + }, + { + "epoch": 0.1801568118846902, + "grad_norm": 0.2269504815340042, + "learning_rate": 4.9022933048627496e-05, + "loss": 1.2166, + "step": 1146 + }, + { + "epoch": 0.18031401678162276, + "grad_norm": 0.20270425081253052, + "learning_rate": 4.902122191460358e-05, + "loss": 1.235, + "step": 1147 + }, + { + "epoch": 0.18047122167855528, + "grad_norm": 0.2519841194152832, + "learning_rate": 4.901950931345481e-05, + "loss": 1.2418, + "step": 1148 + }, + { + "epoch": 0.18062842657548783, + "grad_norm": 0.1967516988515854, + "learning_rate": 4.901779524528582e-05, + "loss": 1.2979, + "step": 1149 + }, + { + "epoch": 0.18078563147242036, + "grad_norm": 0.21120384335517883, + "learning_rate": 4.901607971020127e-05, + "loss": 1.1557, + "step": 1150 + }, + { + "epoch": 0.1809428363693529, + "grad_norm": 0.31649792194366455, + "learning_rate": 4.9014362708305944e-05, + "loss": 1.3237, + "step": 1151 + }, + { + "epoch": 0.18110004126628546, + "grad_norm": 0.24945318698883057, + "learning_rate": 4.901264423970471e-05, + "loss": 1.2099, + "step": 1152 + }, + { + "epoch": 0.18125724616321798, + "grad_norm": 0.30652904510498047, + "learning_rate": 4.901092430450254e-05, + "loss": 1.1918, + "step": 1153 + }, + { + "epoch": 0.18141445106015053, + "grad_norm": 0.2480253279209137, + "learning_rate": 4.900920290280446e-05, + "loss": 1.2675, + "step": 1154 + }, + { + "epoch": 0.18157165595708305, + "grad_norm": 0.3034304976463318, + "learning_rate": 4.900748003471561e-05, + "loss": 1.2012, + "step": 1155 + }, + { + "epoch": 0.1817288608540156, + "grad_norm": 0.2113679200410843, + "learning_rate": 4.900575570034124e-05, + "loss": 1.2824, + "step": 1156 + }, + { + "epoch": 0.18188606575094815, + "grad_norm": 0.34726831316947937, + "learning_rate": 4.9004029899786627e-05, + "loss": 1.1426, + "step": 1157 + }, + { + "epoch": 0.18204327064788067, + "grad_norm": 0.20344194769859314, + "learning_rate": 4.900230263315722e-05, + "loss": 1.2096, + "step": 1158 + }, + { + "epoch": 0.18220047554481322, + "grad_norm": 0.28635072708129883, + "learning_rate": 4.900057390055847e-05, + "loss": 1.166, + "step": 1159 + }, + { + "epoch": 0.18235768044174577, + "grad_norm": 0.21670344471931458, + "learning_rate": 4.8998843702095995e-05, + "loss": 1.2103, + "step": 1160 + }, + { + "epoch": 0.1825148853386783, + "grad_norm": 0.31661516427993774, + "learning_rate": 4.899711203787545e-05, + "loss": 1.2345, + "step": 1161 + }, + { + "epoch": 0.18267209023561085, + "grad_norm": 0.30255556106567383, + "learning_rate": 4.899537890800261e-05, + "loss": 1.2342, + "step": 1162 + }, + { + "epoch": 0.18282929513254337, + "grad_norm": 0.23636944591999054, + "learning_rate": 4.899364431258332e-05, + "loss": 1.1685, + "step": 1163 + }, + { + "epoch": 0.18298650002947592, + "grad_norm": 0.27452319860458374, + "learning_rate": 4.8991908251723524e-05, + "loss": 1.1263, + "step": 1164 + }, + { + "epoch": 0.18314370492640847, + "grad_norm": 0.28636041283607483, + "learning_rate": 4.899017072552926e-05, + "loss": 1.1961, + "step": 1165 + }, + { + "epoch": 0.183300909823341, + "grad_norm": 0.29220953583717346, + "learning_rate": 4.8988431734106635e-05, + "loss": 1.2414, + "step": 1166 + }, + { + "epoch": 0.18345811472027354, + "grad_norm": 0.20738068222999573, + "learning_rate": 4.898669127756188e-05, + "loss": 1.1499, + "step": 1167 + }, + { + "epoch": 0.18361531961720606, + "grad_norm": 0.19913551211357117, + "learning_rate": 4.898494935600127e-05, + "loss": 1.3538, + "step": 1168 + }, + { + "epoch": 0.1837725245141386, + "grad_norm": 0.256979763507843, + "learning_rate": 4.8983205969531234e-05, + "loss": 1.1979, + "step": 1169 + }, + { + "epoch": 0.18392972941107116, + "grad_norm": 0.26307129859924316, + "learning_rate": 4.898146111825821e-05, + "loss": 1.2054, + "step": 1170 + }, + { + "epoch": 0.18408693430800369, + "grad_norm": 0.2451772540807724, + "learning_rate": 4.897971480228879e-05, + "loss": 1.1901, + "step": 1171 + }, + { + "epoch": 0.18424413920493624, + "grad_norm": 0.3223975896835327, + "learning_rate": 4.897796702172962e-05, + "loss": 1.1825, + "step": 1172 + }, + { + "epoch": 0.18440134410186879, + "grad_norm": 0.34991317987442017, + "learning_rate": 4.897621777668746e-05, + "loss": 1.1371, + "step": 1173 + }, + { + "epoch": 0.1845585489988013, + "grad_norm": 0.2680002748966217, + "learning_rate": 4.897446706726915e-05, + "loss": 1.2179, + "step": 1174 + }, + { + "epoch": 0.18471575389573386, + "grad_norm": 0.21509090065956116, + "learning_rate": 4.897271489358159e-05, + "loss": 1.1284, + "step": 1175 + }, + { + "epoch": 0.18487295879266638, + "grad_norm": 0.20545831322669983, + "learning_rate": 4.8970961255731826e-05, + "loss": 1.2188, + "step": 1176 + }, + { + "epoch": 0.18503016368959893, + "grad_norm": 0.23479585349559784, + "learning_rate": 4.896920615382695e-05, + "loss": 1.2947, + "step": 1177 + }, + { + "epoch": 0.18518736858653148, + "grad_norm": 0.2880757749080658, + "learning_rate": 4.896744958797417e-05, + "loss": 1.1443, + "step": 1178 + }, + { + "epoch": 0.185344573483464, + "grad_norm": 0.2431318610906601, + "learning_rate": 4.8965691558280744e-05, + "loss": 1.1123, + "step": 1179 + }, + { + "epoch": 0.18550177838039655, + "grad_norm": 0.21252453327178955, + "learning_rate": 4.896393206485407e-05, + "loss": 1.326, + "step": 1180 + }, + { + "epoch": 0.18565898327732908, + "grad_norm": 0.28821709752082825, + "learning_rate": 4.8962171107801596e-05, + "loss": 1.1508, + "step": 1181 + }, + { + "epoch": 0.18581618817426163, + "grad_norm": 0.2636358141899109, + "learning_rate": 4.8960408687230886e-05, + "loss": 1.1061, + "step": 1182 + }, + { + "epoch": 0.18597339307119418, + "grad_norm": 0.23121225833892822, + "learning_rate": 4.895864480324957e-05, + "loss": 1.2486, + "step": 1183 + }, + { + "epoch": 0.1861305979681267, + "grad_norm": 0.29034245014190674, + "learning_rate": 4.895687945596539e-05, + "loss": 1.186, + "step": 1184 + }, + { + "epoch": 0.18628780286505925, + "grad_norm": 0.3220363259315491, + "learning_rate": 4.895511264548617e-05, + "loss": 1.1727, + "step": 1185 + }, + { + "epoch": 0.1864450077619918, + "grad_norm": 0.2863159477710724, + "learning_rate": 4.89533443719198e-05, + "loss": 1.1946, + "step": 1186 + }, + { + "epoch": 0.18660221265892432, + "grad_norm": 0.27671483159065247, + "learning_rate": 4.89515746353743e-05, + "loss": 1.2271, + "step": 1187 + }, + { + "epoch": 0.18675941755585687, + "grad_norm": 0.2535041570663452, + "learning_rate": 4.894980343595775e-05, + "loss": 1.2437, + "step": 1188 + }, + { + "epoch": 0.1869166224527894, + "grad_norm": 0.34405645728111267, + "learning_rate": 4.894803077377833e-05, + "loss": 1.1397, + "step": 1189 + }, + { + "epoch": 0.18707382734972194, + "grad_norm": 0.28299692273139954, + "learning_rate": 4.8946256648944307e-05, + "loss": 1.1215, + "step": 1190 + }, + { + "epoch": 0.1872310322466545, + "grad_norm": 0.1962118297815323, + "learning_rate": 4.8944481061564035e-05, + "loss": 1.1908, + "step": 1191 + }, + { + "epoch": 0.18738823714358702, + "grad_norm": 0.24563154578208923, + "learning_rate": 4.894270401174597e-05, + "loss": 1.2265, + "step": 1192 + }, + { + "epoch": 0.18754544204051957, + "grad_norm": 0.22452424466609955, + "learning_rate": 4.894092549959862e-05, + "loss": 1.1673, + "step": 1193 + }, + { + "epoch": 0.1877026469374521, + "grad_norm": 0.1847248673439026, + "learning_rate": 4.8939145525230646e-05, + "loss": 1.2706, + "step": 1194 + }, + { + "epoch": 0.18785985183438464, + "grad_norm": 0.2578265964984894, + "learning_rate": 4.893736408875075e-05, + "loss": 1.2011, + "step": 1195 + }, + { + "epoch": 0.1880170567313172, + "grad_norm": 0.2686786353588104, + "learning_rate": 4.893558119026772e-05, + "loss": 1.3191, + "step": 1196 + }, + { + "epoch": 0.1881742616282497, + "grad_norm": 0.27492383122444153, + "learning_rate": 4.893379682989047e-05, + "loss": 1.1755, + "step": 1197 + }, + { + "epoch": 0.18833146652518226, + "grad_norm": 0.2544412612915039, + "learning_rate": 4.8932011007727965e-05, + "loss": 1.1842, + "step": 1198 + }, + { + "epoch": 0.1884886714221148, + "grad_norm": 0.24790935218334198, + "learning_rate": 4.893022372388928e-05, + "loss": 1.2408, + "step": 1199 + }, + { + "epoch": 0.18864587631904733, + "grad_norm": 0.2788006067276001, + "learning_rate": 4.892843497848358e-05, + "loss": 1.2671, + "step": 1200 + }, + { + "epoch": 0.18880308121597988, + "grad_norm": 0.2571476101875305, + "learning_rate": 4.892664477162012e-05, + "loss": 1.1894, + "step": 1201 + }, + { + "epoch": 0.1889602861129124, + "grad_norm": 0.22788426280021667, + "learning_rate": 4.892485310340822e-05, + "loss": 1.2261, + "step": 1202 + }, + { + "epoch": 0.18911749100984496, + "grad_norm": 0.2010507732629776, + "learning_rate": 4.892305997395733e-05, + "loss": 1.2399, + "step": 1203 + }, + { + "epoch": 0.1892746959067775, + "grad_norm": 0.23946425318717957, + "learning_rate": 4.892126538337696e-05, + "loss": 1.2727, + "step": 1204 + }, + { + "epoch": 0.18943190080371003, + "grad_norm": 0.2885929346084595, + "learning_rate": 4.8919469331776714e-05, + "loss": 1.2376, + "step": 1205 + }, + { + "epoch": 0.18958910570064258, + "grad_norm": 0.31879860162734985, + "learning_rate": 4.891767181926629e-05, + "loss": 1.22, + "step": 1206 + }, + { + "epoch": 0.1897463105975751, + "grad_norm": 0.2895459532737732, + "learning_rate": 4.891587284595546e-05, + "loss": 1.2387, + "step": 1207 + }, + { + "epoch": 0.18990351549450765, + "grad_norm": 0.27507272362709045, + "learning_rate": 4.891407241195412e-05, + "loss": 1.1723, + "step": 1208 + }, + { + "epoch": 0.1900607203914402, + "grad_norm": 0.26780039072036743, + "learning_rate": 4.8912270517372224e-05, + "loss": 1.1549, + "step": 1209 + }, + { + "epoch": 0.19021792528837272, + "grad_norm": 0.1915176510810852, + "learning_rate": 4.8910467162319826e-05, + "loss": 1.109, + "step": 1210 + }, + { + "epoch": 0.19037513018530527, + "grad_norm": 0.25054261088371277, + "learning_rate": 4.8908662346907064e-05, + "loss": 1.1197, + "step": 1211 + }, + { + "epoch": 0.19053233508223782, + "grad_norm": 0.24239963293075562, + "learning_rate": 4.8906856071244176e-05, + "loss": 1.2614, + "step": 1212 + }, + { + "epoch": 0.19068953997917035, + "grad_norm": 0.21543578803539276, + "learning_rate": 4.890504833544147e-05, + "loss": 1.3804, + "step": 1213 + }, + { + "epoch": 0.1908467448761029, + "grad_norm": 0.2045610100030899, + "learning_rate": 4.8903239139609376e-05, + "loss": 1.2108, + "step": 1214 + }, + { + "epoch": 0.19100394977303542, + "grad_norm": 0.2209930568933487, + "learning_rate": 4.890142848385838e-05, + "loss": 1.2329, + "step": 1215 + }, + { + "epoch": 0.19116115466996797, + "grad_norm": 0.24921675026416779, + "learning_rate": 4.889961636829906e-05, + "loss": 1.2009, + "step": 1216 + }, + { + "epoch": 0.19131835956690052, + "grad_norm": 0.2356979250907898, + "learning_rate": 4.8897802793042115e-05, + "loss": 1.211, + "step": 1217 + }, + { + "epoch": 0.19147556446383304, + "grad_norm": 0.20199252665042877, + "learning_rate": 4.88959877581983e-05, + "loss": 1.18, + "step": 1218 + }, + { + "epoch": 0.1916327693607656, + "grad_norm": 0.24907195568084717, + "learning_rate": 4.889417126387846e-05, + "loss": 1.2438, + "step": 1219 + }, + { + "epoch": 0.19178997425769811, + "grad_norm": 0.2976427674293518, + "learning_rate": 4.889235331019356e-05, + "loss": 1.1526, + "step": 1220 + }, + { + "epoch": 0.19194717915463066, + "grad_norm": 0.25074872374534607, + "learning_rate": 4.889053389725463e-05, + "loss": 1.1805, + "step": 1221 + }, + { + "epoch": 0.19210438405156322, + "grad_norm": 0.2157672792673111, + "learning_rate": 4.8888713025172776e-05, + "loss": 1.2103, + "step": 1222 + }, + { + "epoch": 0.19226158894849574, + "grad_norm": 0.24573171138763428, + "learning_rate": 4.888689069405923e-05, + "loss": 1.1981, + "step": 1223 + }, + { + "epoch": 0.1924187938454283, + "grad_norm": 0.294160932302475, + "learning_rate": 4.888506690402528e-05, + "loss": 1.2667, + "step": 1224 + }, + { + "epoch": 0.19257599874236084, + "grad_norm": 0.8444136381149292, + "learning_rate": 4.8883241655182314e-05, + "loss": 1.1977, + "step": 1225 + }, + { + "epoch": 0.19273320363929336, + "grad_norm": 0.4191160798072815, + "learning_rate": 4.888141494764182e-05, + "loss": 1.1981, + "step": 1226 + }, + { + "epoch": 0.1928904085362259, + "grad_norm": 0.31621554493904114, + "learning_rate": 4.8879586781515376e-05, + "loss": 1.2224, + "step": 1227 + }, + { + "epoch": 0.19304761343315843, + "grad_norm": 0.2715776860713959, + "learning_rate": 4.887775715691462e-05, + "loss": 1.1029, + "step": 1228 + }, + { + "epoch": 0.19320481833009098, + "grad_norm": 0.2641848623752594, + "learning_rate": 4.88759260739513e-05, + "loss": 1.1738, + "step": 1229 + }, + { + "epoch": 0.19336202322702353, + "grad_norm": 0.2537270188331604, + "learning_rate": 4.887409353273727e-05, + "loss": 1.2847, + "step": 1230 + }, + { + "epoch": 0.19351922812395606, + "grad_norm": 0.2998782694339752, + "learning_rate": 4.8872259533384423e-05, + "loss": 1.1814, + "step": 1231 + }, + { + "epoch": 0.1936764330208886, + "grad_norm": 0.2254815697669983, + "learning_rate": 4.8870424076004806e-05, + "loss": 1.2004, + "step": 1232 + }, + { + "epoch": 0.19383363791782113, + "grad_norm": 0.3711993396282196, + "learning_rate": 4.88685871607105e-05, + "loss": 1.1502, + "step": 1233 + }, + { + "epoch": 0.19399084281475368, + "grad_norm": 0.24783778190612793, + "learning_rate": 4.886674878761371e-05, + "loss": 1.1185, + "step": 1234 + }, + { + "epoch": 0.19414804771168623, + "grad_norm": 0.1896362453699112, + "learning_rate": 4.88649089568267e-05, + "loss": 1.1856, + "step": 1235 + }, + { + "epoch": 0.19430525260861875, + "grad_norm": 0.28106558322906494, + "learning_rate": 4.886306766846187e-05, + "loss": 1.2196, + "step": 1236 + }, + { + "epoch": 0.1944624575055513, + "grad_norm": 0.3023208975791931, + "learning_rate": 4.8861224922631645e-05, + "loss": 1.1836, + "step": 1237 + }, + { + "epoch": 0.19461966240248385, + "grad_norm": 0.36752450466156006, + "learning_rate": 4.8859380719448596e-05, + "loss": 1.1831, + "step": 1238 + }, + { + "epoch": 0.19477686729941637, + "grad_norm": 0.2593975365161896, + "learning_rate": 4.885753505902535e-05, + "loss": 1.1955, + "step": 1239 + }, + { + "epoch": 0.19493407219634892, + "grad_norm": 0.2952882647514343, + "learning_rate": 4.885568794147463e-05, + "loss": 1.108, + "step": 1240 + }, + { + "epoch": 0.19509127709328145, + "grad_norm": 0.2335767149925232, + "learning_rate": 4.885383936690926e-05, + "loss": 1.2389, + "step": 1241 + }, + { + "epoch": 0.195248481990214, + "grad_norm": 0.3618619441986084, + "learning_rate": 4.885198933544214e-05, + "loss": 1.0247, + "step": 1242 + }, + { + "epoch": 0.19540568688714655, + "grad_norm": 0.26691627502441406, + "learning_rate": 4.885013784718626e-05, + "loss": 1.1516, + "step": 1243 + }, + { + "epoch": 0.19556289178407907, + "grad_norm": 0.2977723777294159, + "learning_rate": 4.8848284902254705e-05, + "loss": 1.1617, + "step": 1244 + }, + { + "epoch": 0.19572009668101162, + "grad_norm": 0.33515632152557373, + "learning_rate": 4.884643050076064e-05, + "loss": 1.1789, + "step": 1245 + }, + { + "epoch": 0.19587730157794414, + "grad_norm": 0.275840163230896, + "learning_rate": 4.8844574642817334e-05, + "loss": 1.1103, + "step": 1246 + }, + { + "epoch": 0.1960345064748767, + "grad_norm": 0.26756566762924194, + "learning_rate": 4.884271732853813e-05, + "loss": 1.2101, + "step": 1247 + }, + { + "epoch": 0.19619171137180924, + "grad_norm": 0.20770548284053802, + "learning_rate": 4.884085855803647e-05, + "loss": 1.2506, + "step": 1248 + }, + { + "epoch": 0.19634891626874176, + "grad_norm": 0.2700664699077606, + "learning_rate": 4.883899833142588e-05, + "loss": 1.2034, + "step": 1249 + }, + { + "epoch": 0.1965061211656743, + "grad_norm": 0.2403496950864792, + "learning_rate": 4.883713664881997e-05, + "loss": 1.1622, + "step": 1250 + }, + { + "epoch": 0.19666332606260686, + "grad_norm": 0.2710270881652832, + "learning_rate": 4.883527351033245e-05, + "loss": 1.0679, + "step": 1251 + }, + { + "epoch": 0.19682053095953939, + "grad_norm": 0.2600773870944977, + "learning_rate": 4.8833408916077104e-05, + "loss": 1.3343, + "step": 1252 + }, + { + "epoch": 0.19697773585647194, + "grad_norm": 0.25740665197372437, + "learning_rate": 4.883154286616783e-05, + "loss": 1.2206, + "step": 1253 + }, + { + "epoch": 0.19713494075340446, + "grad_norm": 0.3393601179122925, + "learning_rate": 4.8829675360718585e-05, + "loss": 1.1518, + "step": 1254 + }, + { + "epoch": 0.197292145650337, + "grad_norm": 0.2968616783618927, + "learning_rate": 4.8827806399843444e-05, + "loss": 1.2547, + "step": 1255 + }, + { + "epoch": 0.19744935054726956, + "grad_norm": 0.24990178644657135, + "learning_rate": 4.8825935983656535e-05, + "loss": 1.2733, + "step": 1256 + }, + { + "epoch": 0.19760655544420208, + "grad_norm": 0.31955957412719727, + "learning_rate": 4.882406411227212e-05, + "loss": 1.2138, + "step": 1257 + }, + { + "epoch": 0.19776376034113463, + "grad_norm": 0.22445374727249146, + "learning_rate": 4.88221907858045e-05, + "loss": 1.1845, + "step": 1258 + }, + { + "epoch": 0.19792096523806715, + "grad_norm": 0.32888510823249817, + "learning_rate": 4.8820316004368116e-05, + "loss": 1.2339, + "step": 1259 + }, + { + "epoch": 0.1980781701349997, + "grad_norm": 0.29760921001434326, + "learning_rate": 4.8818439768077456e-05, + "loss": 1.2216, + "step": 1260 + }, + { + "epoch": 0.19823537503193225, + "grad_norm": 0.19965974986553192, + "learning_rate": 4.881656207704712e-05, + "loss": 1.2608, + "step": 1261 + }, + { + "epoch": 0.19839257992886478, + "grad_norm": 0.2538587749004364, + "learning_rate": 4.881468293139179e-05, + "loss": 1.1989, + "step": 1262 + }, + { + "epoch": 0.19854978482579733, + "grad_norm": 0.35299167037010193, + "learning_rate": 4.8812802331226224e-05, + "loss": 1.1426, + "step": 1263 + }, + { + "epoch": 0.19870698972272988, + "grad_norm": 0.3230816423892975, + "learning_rate": 4.8810920276665306e-05, + "loss": 1.2546, + "step": 1264 + }, + { + "epoch": 0.1988641946196624, + "grad_norm": 0.3077559769153595, + "learning_rate": 4.880903676782397e-05, + "loss": 1.1661, + "step": 1265 + }, + { + "epoch": 0.19902139951659495, + "grad_norm": 0.32157936692237854, + "learning_rate": 4.8807151804817254e-05, + "loss": 1.2141, + "step": 1266 + }, + { + "epoch": 0.19917860441352747, + "grad_norm": 0.32653504610061646, + "learning_rate": 4.880526538776029e-05, + "loss": 1.0623, + "step": 1267 + }, + { + "epoch": 0.19933580931046002, + "grad_norm": 0.2675210237503052, + "learning_rate": 4.880337751676828e-05, + "loss": 1.1408, + "step": 1268 + }, + { + "epoch": 0.19949301420739257, + "grad_norm": 0.28380653262138367, + "learning_rate": 4.880148819195654e-05, + "loss": 1.223, + "step": 1269 + }, + { + "epoch": 0.1996502191043251, + "grad_norm": 0.2532847821712494, + "learning_rate": 4.8799597413440466e-05, + "loss": 1.2133, + "step": 1270 + }, + { + "epoch": 0.19980742400125764, + "grad_norm": 0.2972438633441925, + "learning_rate": 4.8797705181335526e-05, + "loss": 1.2806, + "step": 1271 + }, + { + "epoch": 0.19996462889819017, + "grad_norm": 0.2725450098514557, + "learning_rate": 4.8795811495757306e-05, + "loss": 1.1627, + "step": 1272 + }, + { + "epoch": 0.20012183379512272, + "grad_norm": 0.2451506108045578, + "learning_rate": 4.879391635682145e-05, + "loss": 1.3242, + "step": 1273 + }, + { + "epoch": 0.20027903869205527, + "grad_norm": 0.22880415618419647, + "learning_rate": 4.8792019764643714e-05, + "loss": 1.1535, + "step": 1274 + }, + { + "epoch": 0.2004362435889878, + "grad_norm": 0.22470681369304657, + "learning_rate": 4.8790121719339935e-05, + "loss": 1.268, + "step": 1275 + }, + { + "epoch": 0.20059344848592034, + "grad_norm": 0.2413133829832077, + "learning_rate": 4.878822222102604e-05, + "loss": 1.2291, + "step": 1276 + }, + { + "epoch": 0.2007506533828529, + "grad_norm": 0.23373375833034515, + "learning_rate": 4.878632126981804e-05, + "loss": 1.1007, + "step": 1277 + }, + { + "epoch": 0.2009078582797854, + "grad_norm": 0.3018023371696472, + "learning_rate": 4.878441886583203e-05, + "loss": 1.2393, + "step": 1278 + }, + { + "epoch": 0.20106506317671796, + "grad_norm": 0.2107972353696823, + "learning_rate": 4.878251500918421e-05, + "loss": 1.3164, + "step": 1279 + }, + { + "epoch": 0.20122226807365048, + "grad_norm": 0.24787524342536926, + "learning_rate": 4.878060969999087e-05, + "loss": 1.217, + "step": 1280 + }, + { + "epoch": 0.20122226807365048, + "eval_loss": 1.2021143436431885, + "eval_runtime": 2276.1827, + "eval_samples_per_second": 4.067, + "eval_steps_per_second": 2.034, + "step": 1280 + }, + { + "epoch": 0.20137947297058303, + "grad_norm": 0.22325897216796875, + "learning_rate": 4.877870293836837e-05, + "loss": 1.2739, + "step": 1281 + }, + { + "epoch": 0.20153667786751558, + "grad_norm": 0.20739248394966125, + "learning_rate": 4.877679472443315e-05, + "loss": 1.2458, + "step": 1282 + }, + { + "epoch": 0.2016938827644481, + "grad_norm": 0.38787081837654114, + "learning_rate": 4.877488505830179e-05, + "loss": 1.2039, + "step": 1283 + }, + { + "epoch": 0.20185108766138066, + "grad_norm": 0.2838675379753113, + "learning_rate": 4.8772973940090895e-05, + "loss": 1.1647, + "step": 1284 + }, + { + "epoch": 0.20200829255831318, + "grad_norm": 0.2516341805458069, + "learning_rate": 4.877106136991721e-05, + "loss": 1.1952, + "step": 1285 + }, + { + "epoch": 0.20216549745524573, + "grad_norm": 0.24181115627288818, + "learning_rate": 4.8769147347897535e-05, + "loss": 1.1822, + "step": 1286 + }, + { + "epoch": 0.20232270235217828, + "grad_norm": 0.3560396730899811, + "learning_rate": 4.876723187414878e-05, + "loss": 1.1863, + "step": 1287 + }, + { + "epoch": 0.2024799072491108, + "grad_norm": 0.41868817806243896, + "learning_rate": 4.8765314948787934e-05, + "loss": 1.1446, + "step": 1288 + }, + { + "epoch": 0.20263711214604335, + "grad_norm": 0.21799515187740326, + "learning_rate": 4.8763396571932066e-05, + "loss": 1.155, + "step": 1289 + }, + { + "epoch": 0.2027943170429759, + "grad_norm": 0.24254827201366425, + "learning_rate": 4.876147674369834e-05, + "loss": 1.1363, + "step": 1290 + }, + { + "epoch": 0.20295152193990842, + "grad_norm": 0.24996933341026306, + "learning_rate": 4.875955546420404e-05, + "loss": 1.1773, + "step": 1291 + }, + { + "epoch": 0.20310872683684097, + "grad_norm": 0.33059096336364746, + "learning_rate": 4.8757632733566484e-05, + "loss": 1.2217, + "step": 1292 + }, + { + "epoch": 0.2032659317337735, + "grad_norm": 0.3126905858516693, + "learning_rate": 4.875570855190311e-05, + "loss": 1.2031, + "step": 1293 + }, + { + "epoch": 0.20342313663070605, + "grad_norm": 0.26368340849876404, + "learning_rate": 4.8753782919331436e-05, + "loss": 1.2348, + "step": 1294 + }, + { + "epoch": 0.2035803415276386, + "grad_norm": 0.2591840922832489, + "learning_rate": 4.875185583596909e-05, + "loss": 1.2303, + "step": 1295 + }, + { + "epoch": 0.20373754642457112, + "grad_norm": 0.3512473702430725, + "learning_rate": 4.874992730193375e-05, + "loss": 1.149, + "step": 1296 + }, + { + "epoch": 0.20389475132150367, + "grad_norm": 0.28538331389427185, + "learning_rate": 4.874799731734322e-05, + "loss": 1.2177, + "step": 1297 + }, + { + "epoch": 0.2040519562184362, + "grad_norm": 0.27414706349372864, + "learning_rate": 4.8746065882315375e-05, + "loss": 1.1767, + "step": 1298 + }, + { + "epoch": 0.20420916111536874, + "grad_norm": 0.2481449991464615, + "learning_rate": 4.874413299696816e-05, + "loss": 1.1928, + "step": 1299 + }, + { + "epoch": 0.2043663660123013, + "grad_norm": 0.26623374223709106, + "learning_rate": 4.8742198661419646e-05, + "loss": 1.2455, + "step": 1300 + }, + { + "epoch": 0.20452357090923381, + "grad_norm": 0.22189855575561523, + "learning_rate": 4.874026287578798e-05, + "loss": 1.1934, + "step": 1301 + }, + { + "epoch": 0.20468077580616637, + "grad_norm": 0.2549070119857788, + "learning_rate": 4.873832564019137e-05, + "loss": 1.0798, + "step": 1302 + }, + { + "epoch": 0.20483798070309892, + "grad_norm": 0.2875777781009674, + "learning_rate": 4.873638695474816e-05, + "loss": 1.2025, + "step": 1303 + }, + { + "epoch": 0.20499518560003144, + "grad_norm": 0.26281726360321045, + "learning_rate": 4.873444681957674e-05, + "loss": 1.2533, + "step": 1304 + }, + { + "epoch": 0.205152390496964, + "grad_norm": 0.294112890958786, + "learning_rate": 4.873250523479561e-05, + "loss": 1.177, + "step": 1305 + }, + { + "epoch": 0.2053095953938965, + "grad_norm": 0.24901710450649261, + "learning_rate": 4.873056220052336e-05, + "loss": 1.2624, + "step": 1306 + }, + { + "epoch": 0.20546680029082906, + "grad_norm": 0.2856816351413727, + "learning_rate": 4.8728617716878664e-05, + "loss": 1.0575, + "step": 1307 + }, + { + "epoch": 0.2056240051877616, + "grad_norm": 0.34814971685409546, + "learning_rate": 4.872667178398027e-05, + "loss": 1.1829, + "step": 1308 + }, + { + "epoch": 0.20578121008469413, + "grad_norm": 0.3625463545322418, + "learning_rate": 4.872472440194704e-05, + "loss": 1.1988, + "step": 1309 + }, + { + "epoch": 0.20593841498162668, + "grad_norm": 0.2561318576335907, + "learning_rate": 4.8722775570897915e-05, + "loss": 1.185, + "step": 1310 + }, + { + "epoch": 0.2060956198785592, + "grad_norm": 0.25931569933891296, + "learning_rate": 4.872082529095191e-05, + "loss": 1.15, + "step": 1311 + }, + { + "epoch": 0.20625282477549176, + "grad_norm": 0.21291913092136383, + "learning_rate": 4.871887356222815e-05, + "loss": 1.2019, + "step": 1312 + }, + { + "epoch": 0.2064100296724243, + "grad_norm": 0.21931886672973633, + "learning_rate": 4.8716920384845844e-05, + "loss": 1.3054, + "step": 1313 + }, + { + "epoch": 0.20656723456935683, + "grad_norm": 0.22694003582000732, + "learning_rate": 4.8714965758924276e-05, + "loss": 1.1884, + "step": 1314 + }, + { + "epoch": 0.20672443946628938, + "grad_norm": 0.25568726658821106, + "learning_rate": 4.871300968458282e-05, + "loss": 1.2516, + "step": 1315 + }, + { + "epoch": 0.20688164436322193, + "grad_norm": 0.24557702243328094, + "learning_rate": 4.871105216194096e-05, + "loss": 1.3418, + "step": 1316 + }, + { + "epoch": 0.20703884926015445, + "grad_norm": 0.2368367463350296, + "learning_rate": 4.870909319111825e-05, + "loss": 1.2088, + "step": 1317 + }, + { + "epoch": 0.207196054157087, + "grad_norm": 0.3659820556640625, + "learning_rate": 4.870713277223434e-05, + "loss": 1.0446, + "step": 1318 + }, + { + "epoch": 0.20735325905401952, + "grad_norm": 0.2582785189151764, + "learning_rate": 4.870517090540896e-05, + "loss": 1.3072, + "step": 1319 + }, + { + "epoch": 0.20751046395095207, + "grad_norm": 0.27062729001045227, + "learning_rate": 4.870320759076192e-05, + "loss": 1.2977, + "step": 1320 + }, + { + "epoch": 0.20766766884788462, + "grad_norm": 0.2483299970626831, + "learning_rate": 4.870124282841316e-05, + "loss": 1.2001, + "step": 1321 + }, + { + "epoch": 0.20782487374481715, + "grad_norm": 0.17987532913684845, + "learning_rate": 4.869927661848266e-05, + "loss": 1.253, + "step": 1322 + }, + { + "epoch": 0.2079820786417497, + "grad_norm": 0.24004274606704712, + "learning_rate": 4.869730896109051e-05, + "loss": 1.2242, + "step": 1323 + }, + { + "epoch": 0.20813928353868222, + "grad_norm": 0.259755402803421, + "learning_rate": 4.869533985635689e-05, + "loss": 1.2338, + "step": 1324 + }, + { + "epoch": 0.20829648843561477, + "grad_norm": 0.2481742948293686, + "learning_rate": 4.869336930440207e-05, + "loss": 1.0983, + "step": 1325 + }, + { + "epoch": 0.20845369333254732, + "grad_norm": 0.2746635675430298, + "learning_rate": 4.8691397305346404e-05, + "loss": 1.2491, + "step": 1326 + }, + { + "epoch": 0.20861089822947984, + "grad_norm": 0.3291498124599457, + "learning_rate": 4.868942385931032e-05, + "loss": 1.2045, + "step": 1327 + }, + { + "epoch": 0.2087681031264124, + "grad_norm": 0.2693649232387543, + "learning_rate": 4.8687448966414376e-05, + "loss": 1.2367, + "step": 1328 + }, + { + "epoch": 0.2089253080233449, + "grad_norm": 0.3101809620857239, + "learning_rate": 4.868547262677916e-05, + "loss": 1.1759, + "step": 1329 + }, + { + "epoch": 0.20908251292027746, + "grad_norm": 0.22869786620140076, + "learning_rate": 4.86834948405254e-05, + "loss": 1.2219, + "step": 1330 + }, + { + "epoch": 0.20923971781721, + "grad_norm": 0.32914999127388, + "learning_rate": 4.868151560777388e-05, + "loss": 1.1465, + "step": 1331 + }, + { + "epoch": 0.20939692271414254, + "grad_norm": 0.2585611343383789, + "learning_rate": 4.867953492864549e-05, + "loss": 1.202, + "step": 1332 + }, + { + "epoch": 0.2095541276110751, + "grad_norm": 0.22913898527622223, + "learning_rate": 4.8677552803261203e-05, + "loss": 1.1182, + "step": 1333 + }, + { + "epoch": 0.20971133250800764, + "grad_norm": 0.19015748798847198, + "learning_rate": 4.867556923174208e-05, + "loss": 1.2997, + "step": 1334 + }, + { + "epoch": 0.20986853740494016, + "grad_norm": 0.28327012062072754, + "learning_rate": 4.867358421420927e-05, + "loss": 1.2135, + "step": 1335 + }, + { + "epoch": 0.2100257423018727, + "grad_norm": 0.20216205716133118, + "learning_rate": 4.8671597750784006e-05, + "loss": 1.0898, + "step": 1336 + }, + { + "epoch": 0.21018294719880523, + "grad_norm": 0.23876157402992249, + "learning_rate": 4.8669609841587607e-05, + "loss": 1.1954, + "step": 1337 + }, + { + "epoch": 0.21034015209573778, + "grad_norm": 0.22727316617965698, + "learning_rate": 4.86676204867415e-05, + "loss": 1.1578, + "step": 1338 + }, + { + "epoch": 0.21049735699267033, + "grad_norm": 0.23092162609100342, + "learning_rate": 4.8665629686367185e-05, + "loss": 1.2883, + "step": 1339 + }, + { + "epoch": 0.21065456188960285, + "grad_norm": 0.5870768427848816, + "learning_rate": 4.8663637440586255e-05, + "loss": 1.1947, + "step": 1340 + }, + { + "epoch": 0.2108117667865354, + "grad_norm": 0.20369422435760498, + "learning_rate": 4.866164374952038e-05, + "loss": 1.1551, + "step": 1341 + }, + { + "epoch": 0.21096897168346793, + "grad_norm": 0.21872729063034058, + "learning_rate": 4.865964861329133e-05, + "loss": 1.195, + "step": 1342 + }, + { + "epoch": 0.21112617658040048, + "grad_norm": 0.23386871814727783, + "learning_rate": 4.8657652032020965e-05, + "loss": 1.1417, + "step": 1343 + }, + { + "epoch": 0.21128338147733303, + "grad_norm": 0.25174540281295776, + "learning_rate": 4.865565400583123e-05, + "loss": 1.2826, + "step": 1344 + }, + { + "epoch": 0.21144058637426555, + "grad_norm": 0.2607194781303406, + "learning_rate": 4.865365453484415e-05, + "loss": 1.1529, + "step": 1345 + }, + { + "epoch": 0.2115977912711981, + "grad_norm": 0.28137052059173584, + "learning_rate": 4.8651653619181835e-05, + "loss": 1.2041, + "step": 1346 + }, + { + "epoch": 0.21175499616813065, + "grad_norm": 0.3423405587673187, + "learning_rate": 4.864965125896652e-05, + "loss": 1.217, + "step": 1347 + }, + { + "epoch": 0.21191220106506317, + "grad_norm": 0.35149091482162476, + "learning_rate": 4.864764745432048e-05, + "loss": 1.234, + "step": 1348 + }, + { + "epoch": 0.21206940596199572, + "grad_norm": 0.19579124450683594, + "learning_rate": 4.864564220536611e-05, + "loss": 1.1335, + "step": 1349 + }, + { + "epoch": 0.21222661085892824, + "grad_norm": 0.2690495550632477, + "learning_rate": 4.8643635512225874e-05, + "loss": 1.2253, + "step": 1350 + }, + { + "epoch": 0.2123838157558608, + "grad_norm": 0.2389325350522995, + "learning_rate": 4.8641627375022346e-05, + "loss": 1.2362, + "step": 1351 + }, + { + "epoch": 0.21254102065279334, + "grad_norm": 0.2795645296573639, + "learning_rate": 4.863961779387817e-05, + "loss": 1.0509, + "step": 1352 + }, + { + "epoch": 0.21269822554972587, + "grad_norm": 0.1987501084804535, + "learning_rate": 4.863760676891608e-05, + "loss": 1.0825, + "step": 1353 + }, + { + "epoch": 0.21285543044665842, + "grad_norm": 0.2520771324634552, + "learning_rate": 4.8635594300258905e-05, + "loss": 1.2879, + "step": 1354 + }, + { + "epoch": 0.21301263534359094, + "grad_norm": 0.3053792119026184, + "learning_rate": 4.863358038802955e-05, + "loss": 1.1737, + "step": 1355 + }, + { + "epoch": 0.2131698402405235, + "grad_norm": 0.24323543906211853, + "learning_rate": 4.863156503235102e-05, + "loss": 1.3105, + "step": 1356 + }, + { + "epoch": 0.21332704513745604, + "grad_norm": 0.2622387111186981, + "learning_rate": 4.862954823334643e-05, + "loss": 1.1817, + "step": 1357 + }, + { + "epoch": 0.21348425003438856, + "grad_norm": 0.28524765372276306, + "learning_rate": 4.862752999113893e-05, + "loss": 1.1191, + "step": 1358 + }, + { + "epoch": 0.2136414549313211, + "grad_norm": 0.2917231619358063, + "learning_rate": 4.8625510305851784e-05, + "loss": 1.1717, + "step": 1359 + }, + { + "epoch": 0.21379865982825366, + "grad_norm": 0.23248536884784698, + "learning_rate": 4.862348917760837e-05, + "loss": 1.1472, + "step": 1360 + }, + { + "epoch": 0.21395586472518618, + "grad_norm": 0.30648741126060486, + "learning_rate": 4.862146660653212e-05, + "loss": 1.1325, + "step": 1361 + }, + { + "epoch": 0.21411306962211873, + "grad_norm": 0.26287996768951416, + "learning_rate": 4.8619442592746554e-05, + "loss": 1.1891, + "step": 1362 + }, + { + "epoch": 0.21427027451905126, + "grad_norm": 0.2846413254737854, + "learning_rate": 4.861741713637531e-05, + "loss": 1.2429, + "step": 1363 + }, + { + "epoch": 0.2144274794159838, + "grad_norm": 0.26600465178489685, + "learning_rate": 4.861539023754208e-05, + "loss": 1.2825, + "step": 1364 + }, + { + "epoch": 0.21458468431291636, + "grad_norm": 0.23721352219581604, + "learning_rate": 4.861336189637066e-05, + "loss": 1.16, + "step": 1365 + }, + { + "epoch": 0.21474188920984888, + "grad_norm": 0.21250367164611816, + "learning_rate": 4.8611332112984946e-05, + "loss": 1.1917, + "step": 1366 + }, + { + "epoch": 0.21489909410678143, + "grad_norm": 0.2471015751361847, + "learning_rate": 4.86093008875089e-05, + "loss": 1.2403, + "step": 1367 + }, + { + "epoch": 0.21505629900371395, + "grad_norm": 0.2969186007976532, + "learning_rate": 4.860726822006659e-05, + "loss": 1.2443, + "step": 1368 + }, + { + "epoch": 0.2152135039006465, + "grad_norm": 0.24633657932281494, + "learning_rate": 4.860523411078215e-05, + "loss": 1.2468, + "step": 1369 + }, + { + "epoch": 0.21537070879757905, + "grad_norm": 0.24867349863052368, + "learning_rate": 4.860319855977982e-05, + "loss": 1.194, + "step": 1370 + }, + { + "epoch": 0.21552791369451157, + "grad_norm": 0.27883896231651306, + "learning_rate": 4.8601161567183925e-05, + "loss": 1.2181, + "step": 1371 + }, + { + "epoch": 0.21568511859144412, + "grad_norm": 0.20830753445625305, + "learning_rate": 4.859912313311888e-05, + "loss": 1.1579, + "step": 1372 + }, + { + "epoch": 0.21584232348837668, + "grad_norm": 0.25265562534332275, + "learning_rate": 4.8597083257709194e-05, + "loss": 1.1455, + "step": 1373 + }, + { + "epoch": 0.2159995283853092, + "grad_norm": 0.23922070860862732, + "learning_rate": 4.859504194107943e-05, + "loss": 1.1434, + "step": 1374 + }, + { + "epoch": 0.21615673328224175, + "grad_norm": 0.1661684215068817, + "learning_rate": 4.859299918335428e-05, + "loss": 1.2426, + "step": 1375 + }, + { + "epoch": 0.21631393817917427, + "grad_norm": 0.20215146243572235, + "learning_rate": 4.859095498465851e-05, + "loss": 1.2485, + "step": 1376 + }, + { + "epoch": 0.21647114307610682, + "grad_norm": 0.2150086909532547, + "learning_rate": 4.858890934511697e-05, + "loss": 1.2034, + "step": 1377 + }, + { + "epoch": 0.21662834797303937, + "grad_norm": 0.24220801889896393, + "learning_rate": 4.8586862264854595e-05, + "loss": 1.2063, + "step": 1378 + }, + { + "epoch": 0.2167855528699719, + "grad_norm": 0.3015029728412628, + "learning_rate": 4.85848137439964e-05, + "loss": 1.2371, + "step": 1379 + }, + { + "epoch": 0.21694275776690444, + "grad_norm": 0.32718077301979065, + "learning_rate": 4.8582763782667534e-05, + "loss": 1.1915, + "step": 1380 + }, + { + "epoch": 0.21709996266383697, + "grad_norm": 0.2721043527126312, + "learning_rate": 4.858071238099318e-05, + "loss": 1.1692, + "step": 1381 + }, + { + "epoch": 0.21725716756076952, + "grad_norm": 0.21820539236068726, + "learning_rate": 4.857865953909862e-05, + "loss": 1.2529, + "step": 1382 + }, + { + "epoch": 0.21741437245770207, + "grad_norm": 0.24213218688964844, + "learning_rate": 4.857660525710927e-05, + "loss": 1.2273, + "step": 1383 + }, + { + "epoch": 0.2175715773546346, + "grad_norm": 0.25169044733047485, + "learning_rate": 4.857454953515055e-05, + "loss": 1.1917, + "step": 1384 + }, + { + "epoch": 0.21772878225156714, + "grad_norm": 0.1900857836008072, + "learning_rate": 4.8572492373348055e-05, + "loss": 1.1924, + "step": 1385 + }, + { + "epoch": 0.2178859871484997, + "grad_norm": 0.27372950315475464, + "learning_rate": 4.857043377182741e-05, + "loss": 1.2062, + "step": 1386 + }, + { + "epoch": 0.2180431920454322, + "grad_norm": 0.2589268386363983, + "learning_rate": 4.8568373730714344e-05, + "loss": 1.1294, + "step": 1387 + }, + { + "epoch": 0.21820039694236476, + "grad_norm": 0.32234373688697815, + "learning_rate": 4.856631225013468e-05, + "loss": 1.0772, + "step": 1388 + }, + { + "epoch": 0.21835760183929728, + "grad_norm": 0.34723344445228577, + "learning_rate": 4.8564249330214337e-05, + "loss": 1.1989, + "step": 1389 + }, + { + "epoch": 0.21851480673622983, + "grad_norm": 0.23347902297973633, + "learning_rate": 4.85621849710793e-05, + "loss": 1.3154, + "step": 1390 + }, + { + "epoch": 0.21867201163316238, + "grad_norm": 0.19097594916820526, + "learning_rate": 4.856011917285565e-05, + "loss": 1.241, + "step": 1391 + }, + { + "epoch": 0.2188292165300949, + "grad_norm": 0.17387251555919647, + "learning_rate": 4.855805193566956e-05, + "loss": 1.2396, + "step": 1392 + }, + { + "epoch": 0.21898642142702746, + "grad_norm": 0.25310957431793213, + "learning_rate": 4.85559832596473e-05, + "loss": 1.1169, + "step": 1393 + }, + { + "epoch": 0.21914362632395998, + "grad_norm": 0.20254233479499817, + "learning_rate": 4.85539131449152e-05, + "loss": 1.2233, + "step": 1394 + }, + { + "epoch": 0.21930083122089253, + "grad_norm": 0.22399556636810303, + "learning_rate": 4.8551841591599696e-05, + "loss": 1.1257, + "step": 1395 + }, + { + "epoch": 0.21945803611782508, + "grad_norm": 0.24220795929431915, + "learning_rate": 4.854976859982732e-05, + "loss": 1.1364, + "step": 1396 + }, + { + "epoch": 0.2196152410147576, + "grad_norm": 0.2996869683265686, + "learning_rate": 4.854769416972468e-05, + "loss": 1.277, + "step": 1397 + }, + { + "epoch": 0.21977244591169015, + "grad_norm": 0.22607775032520294, + "learning_rate": 4.854561830141848e-05, + "loss": 1.1538, + "step": 1398 + }, + { + "epoch": 0.2199296508086227, + "grad_norm": 0.3184898793697357, + "learning_rate": 4.854354099503549e-05, + "loss": 1.2933, + "step": 1399 + }, + { + "epoch": 0.22008685570555522, + "grad_norm": 0.26133978366851807, + "learning_rate": 4.8541462250702595e-05, + "loss": 1.1539, + "step": 1400 + }, + { + "epoch": 0.22024406060248777, + "grad_norm": 0.2325352430343628, + "learning_rate": 4.853938206854676e-05, + "loss": 1.1242, + "step": 1401 + }, + { + "epoch": 0.2204012654994203, + "grad_norm": 0.20906803011894226, + "learning_rate": 4.853730044869503e-05, + "loss": 1.1166, + "step": 1402 + }, + { + "epoch": 0.22055847039635285, + "grad_norm": 0.27185148000717163, + "learning_rate": 4.853521739127453e-05, + "loss": 1.1067, + "step": 1403 + }, + { + "epoch": 0.2207156752932854, + "grad_norm": 0.2759348452091217, + "learning_rate": 4.8533132896412514e-05, + "loss": 1.1237, + "step": 1404 + }, + { + "epoch": 0.22087288019021792, + "grad_norm": 0.21420961618423462, + "learning_rate": 4.853104696423627e-05, + "loss": 1.1362, + "step": 1405 + }, + { + "epoch": 0.22103008508715047, + "grad_norm": 0.22797361016273499, + "learning_rate": 4.852895959487321e-05, + "loss": 1.1114, + "step": 1406 + }, + { + "epoch": 0.221187289984083, + "grad_norm": 0.303501695394516, + "learning_rate": 4.8526870788450816e-05, + "loss": 1.1836, + "step": 1407 + }, + { + "epoch": 0.22134449488101554, + "grad_norm": 0.22205421328544617, + "learning_rate": 4.852478054509667e-05, + "loss": 1.2281, + "step": 1408 + }, + { + "epoch": 0.2215016997779481, + "grad_norm": 0.31351590156555176, + "learning_rate": 4.852268886493844e-05, + "loss": 1.161, + "step": 1409 + }, + { + "epoch": 0.2216589046748806, + "grad_norm": 0.2639506757259369, + "learning_rate": 4.852059574810386e-05, + "loss": 1.1397, + "step": 1410 + }, + { + "epoch": 0.22181610957181316, + "grad_norm": 0.2192612588405609, + "learning_rate": 4.851850119472079e-05, + "loss": 1.1476, + "step": 1411 + }, + { + "epoch": 0.2219733144687457, + "grad_norm": 0.27032819390296936, + "learning_rate": 4.851640520491715e-05, + "loss": 1.2416, + "step": 1412 + }, + { + "epoch": 0.22213051936567824, + "grad_norm": 0.3159463405609131, + "learning_rate": 4.851430777882095e-05, + "loss": 1.1131, + "step": 1413 + }, + { + "epoch": 0.2222877242626108, + "grad_norm": 0.20352302491664886, + "learning_rate": 4.85122089165603e-05, + "loss": 1.2406, + "step": 1414 + }, + { + "epoch": 0.2224449291595433, + "grad_norm": 0.2544812858104706, + "learning_rate": 4.8510108618263385e-05, + "loss": 1.2213, + "step": 1415 + }, + { + "epoch": 0.22260213405647586, + "grad_norm": 0.38868314027786255, + "learning_rate": 4.8508006884058485e-05, + "loss": 1.1821, + "step": 1416 + }, + { + "epoch": 0.2227593389534084, + "grad_norm": 0.293169766664505, + "learning_rate": 4.850590371407397e-05, + "loss": 1.2964, + "step": 1417 + }, + { + "epoch": 0.22291654385034093, + "grad_norm": 0.282044917345047, + "learning_rate": 4.850379910843829e-05, + "loss": 1.2502, + "step": 1418 + }, + { + "epoch": 0.22307374874727348, + "grad_norm": 0.2392999529838562, + "learning_rate": 4.850169306727999e-05, + "loss": 1.1388, + "step": 1419 + }, + { + "epoch": 0.223230953644206, + "grad_norm": 0.18563714623451233, + "learning_rate": 4.849958559072768e-05, + "loss": 1.2458, + "step": 1420 + }, + { + "epoch": 0.22338815854113855, + "grad_norm": 0.21055997908115387, + "learning_rate": 4.84974766789101e-05, + "loss": 1.1713, + "step": 1421 + }, + { + "epoch": 0.2235453634380711, + "grad_norm": 0.21380756795406342, + "learning_rate": 4.849536633195606e-05, + "loss": 1.2113, + "step": 1422 + }, + { + "epoch": 0.22370256833500363, + "grad_norm": 0.2441507875919342, + "learning_rate": 4.849325454999443e-05, + "loss": 1.275, + "step": 1423 + }, + { + "epoch": 0.22385977323193618, + "grad_norm": 0.2958972454071045, + "learning_rate": 4.849114133315419e-05, + "loss": 1.1454, + "step": 1424 + }, + { + "epoch": 0.22401697812886873, + "grad_norm": 0.30271342396736145, + "learning_rate": 4.848902668156442e-05, + "loss": 1.1151, + "step": 1425 + }, + { + "epoch": 0.22417418302580125, + "grad_norm": 0.2217937856912613, + "learning_rate": 4.848691059535427e-05, + "loss": 1.2406, + "step": 1426 + }, + { + "epoch": 0.2243313879227338, + "grad_norm": 0.22862625122070312, + "learning_rate": 4.848479307465299e-05, + "loss": 1.1971, + "step": 1427 + }, + { + "epoch": 0.22448859281966632, + "grad_norm": 0.2594766616821289, + "learning_rate": 4.8482674119589896e-05, + "loss": 1.1845, + "step": 1428 + }, + { + "epoch": 0.22464579771659887, + "grad_norm": 0.21012337505817413, + "learning_rate": 4.848055373029441e-05, + "loss": 1.1644, + "step": 1429 + }, + { + "epoch": 0.22480300261353142, + "grad_norm": 0.3061239421367645, + "learning_rate": 4.847843190689605e-05, + "loss": 1.1177, + "step": 1430 + }, + { + "epoch": 0.22496020751046394, + "grad_norm": 0.27125445008277893, + "learning_rate": 4.847630864952439e-05, + "loss": 1.2566, + "step": 1431 + }, + { + "epoch": 0.2251174124073965, + "grad_norm": 0.16465838253498077, + "learning_rate": 4.847418395830911e-05, + "loss": 1.2477, + "step": 1432 + }, + { + "epoch": 0.22527461730432902, + "grad_norm": 0.2064242959022522, + "learning_rate": 4.8472057833380005e-05, + "loss": 1.2777, + "step": 1433 + }, + { + "epoch": 0.22543182220126157, + "grad_norm": 0.1920788437128067, + "learning_rate": 4.846993027486691e-05, + "loss": 1.0682, + "step": 1434 + }, + { + "epoch": 0.22558902709819412, + "grad_norm": 0.2445499151945114, + "learning_rate": 4.8467801282899775e-05, + "loss": 1.1468, + "step": 1435 + }, + { + "epoch": 0.22574623199512664, + "grad_norm": 0.2358809858560562, + "learning_rate": 4.846567085760861e-05, + "loss": 1.2001, + "step": 1436 + }, + { + "epoch": 0.2259034368920592, + "grad_norm": 0.2340790331363678, + "learning_rate": 4.846353899912356e-05, + "loss": 1.1401, + "step": 1437 + }, + { + "epoch": 0.22606064178899174, + "grad_norm": 0.23034712672233582, + "learning_rate": 4.8461405707574824e-05, + "loss": 1.1617, + "step": 1438 + }, + { + "epoch": 0.22621784668592426, + "grad_norm": 0.2453261762857437, + "learning_rate": 4.8459270983092686e-05, + "loss": 1.2889, + "step": 1439 + }, + { + "epoch": 0.2263750515828568, + "grad_norm": 0.23590409755706787, + "learning_rate": 4.8457134825807535e-05, + "loss": 1.1807, + "step": 1440 + }, + { + "epoch": 0.2263750515828568, + "eval_loss": 1.1835801601409912, + "eval_runtime": 2320.0135, + "eval_samples_per_second": 3.99, + "eval_steps_per_second": 1.995, + "step": 1440 + }, + { + "epoch": 0.22653225647978933, + "grad_norm": 0.2932775318622589, + "learning_rate": 4.845499723584984e-05, + "loss": 1.1899, + "step": 1441 + }, + { + "epoch": 0.22668946137672188, + "grad_norm": 0.23739026486873627, + "learning_rate": 4.845285821335015e-05, + "loss": 1.2037, + "step": 1442 + }, + { + "epoch": 0.22684666627365443, + "grad_norm": 0.1993498057126999, + "learning_rate": 4.8450717758439115e-05, + "loss": 1.1681, + "step": 1443 + }, + { + "epoch": 0.22700387117058696, + "grad_norm": 0.23352783918380737, + "learning_rate": 4.8448575871247465e-05, + "loss": 1.2268, + "step": 1444 + }, + { + "epoch": 0.2271610760675195, + "grad_norm": 0.22201289236545563, + "learning_rate": 4.844643255190602e-05, + "loss": 1.2883, + "step": 1445 + }, + { + "epoch": 0.22731828096445203, + "grad_norm": 0.2467186152935028, + "learning_rate": 4.8444287800545676e-05, + "loss": 1.1602, + "step": 1446 + }, + { + "epoch": 0.22747548586138458, + "grad_norm": 0.20474953949451447, + "learning_rate": 4.844214161729743e-05, + "loss": 1.3125, + "step": 1447 + }, + { + "epoch": 0.22763269075831713, + "grad_norm": 0.1738600730895996, + "learning_rate": 4.843999400229238e-05, + "loss": 1.275, + "step": 1448 + }, + { + "epoch": 0.22778989565524965, + "grad_norm": 0.23264817893505096, + "learning_rate": 4.843784495566166e-05, + "loss": 1.2151, + "step": 1449 + }, + { + "epoch": 0.2279471005521822, + "grad_norm": 0.2865363657474518, + "learning_rate": 4.843569447753656e-05, + "loss": 1.2071, + "step": 1450 + }, + { + "epoch": 0.22810430544911475, + "grad_norm": 0.2803146243095398, + "learning_rate": 4.84335425680484e-05, + "loss": 1.207, + "step": 1451 + }, + { + "epoch": 0.22826151034604727, + "grad_norm": 0.22393058240413666, + "learning_rate": 4.843138922732863e-05, + "loss": 1.1584, + "step": 1452 + }, + { + "epoch": 0.22841871524297983, + "grad_norm": 0.22326746582984924, + "learning_rate": 4.8429234455508746e-05, + "loss": 1.2231, + "step": 1453 + }, + { + "epoch": 0.22857592013991235, + "grad_norm": 0.21326550841331482, + "learning_rate": 4.8427078252720366e-05, + "loss": 1.138, + "step": 1454 + }, + { + "epoch": 0.2287331250368449, + "grad_norm": 0.19931691884994507, + "learning_rate": 4.842492061909518e-05, + "loss": 1.2628, + "step": 1455 + }, + { + "epoch": 0.22889032993377745, + "grad_norm": 0.4217057228088379, + "learning_rate": 4.8422761554764974e-05, + "loss": 1.0622, + "step": 1456 + }, + { + "epoch": 0.22904753483070997, + "grad_norm": 0.2697439193725586, + "learning_rate": 4.8420601059861605e-05, + "loss": 1.1374, + "step": 1457 + }, + { + "epoch": 0.22920473972764252, + "grad_norm": 0.25051379203796387, + "learning_rate": 4.841843913451703e-05, + "loss": 1.1349, + "step": 1458 + }, + { + "epoch": 0.22936194462457504, + "grad_norm": 0.30390244722366333, + "learning_rate": 4.841627577886329e-05, + "loss": 1.1929, + "step": 1459 + }, + { + "epoch": 0.2295191495215076, + "grad_norm": 0.2657618820667267, + "learning_rate": 4.8414110993032535e-05, + "loss": 1.1913, + "step": 1460 + }, + { + "epoch": 0.22967635441844014, + "grad_norm": 0.27086779475212097, + "learning_rate": 4.841194477715696e-05, + "loss": 1.2208, + "step": 1461 + }, + { + "epoch": 0.22983355931537267, + "grad_norm": 0.2512415647506714, + "learning_rate": 4.840977713136887e-05, + "loss": 1.1451, + "step": 1462 + }, + { + "epoch": 0.22999076421230522, + "grad_norm": 0.2690669298171997, + "learning_rate": 4.8407608055800656e-05, + "loss": 1.2148, + "step": 1463 + }, + { + "epoch": 0.23014796910923777, + "grad_norm": 0.2143964320421219, + "learning_rate": 4.8405437550584816e-05, + "loss": 1.2793, + "step": 1464 + }, + { + "epoch": 0.2303051740061703, + "grad_norm": 0.21806931495666504, + "learning_rate": 4.8403265615853894e-05, + "loss": 1.1906, + "step": 1465 + }, + { + "epoch": 0.23046237890310284, + "grad_norm": 0.23820914328098297, + "learning_rate": 4.8401092251740555e-05, + "loss": 1.3061, + "step": 1466 + }, + { + "epoch": 0.23061958380003536, + "grad_norm": 0.2485547661781311, + "learning_rate": 4.839891745837753e-05, + "loss": 1.2976, + "step": 1467 + }, + { + "epoch": 0.2307767886969679, + "grad_norm": 0.2677574157714844, + "learning_rate": 4.8396741235897655e-05, + "loss": 1.0837, + "step": 1468 + }, + { + "epoch": 0.23093399359390046, + "grad_norm": 0.21473652124404907, + "learning_rate": 4.839456358443385e-05, + "loss": 1.1982, + "step": 1469 + }, + { + "epoch": 0.23109119849083298, + "grad_norm": 0.27341189980506897, + "learning_rate": 4.8392384504119116e-05, + "loss": 1.1508, + "step": 1470 + }, + { + "epoch": 0.23124840338776553, + "grad_norm": 0.20952042937278748, + "learning_rate": 4.8390203995086525e-05, + "loss": 1.2806, + "step": 1471 + }, + { + "epoch": 0.23140560828469806, + "grad_norm": 0.19671006500720978, + "learning_rate": 4.838802205746927e-05, + "loss": 1.1397, + "step": 1472 + }, + { + "epoch": 0.2315628131816306, + "grad_norm": 0.2122461348772049, + "learning_rate": 4.838583869140063e-05, + "loss": 1.2206, + "step": 1473 + }, + { + "epoch": 0.23172001807856316, + "grad_norm": 0.2837271988391876, + "learning_rate": 4.838365389701392e-05, + "loss": 1.1932, + "step": 1474 + }, + { + "epoch": 0.23187722297549568, + "grad_norm": 0.20707464218139648, + "learning_rate": 4.838146767444261e-05, + "loss": 1.1619, + "step": 1475 + }, + { + "epoch": 0.23203442787242823, + "grad_norm": 0.2600030303001404, + "learning_rate": 4.837928002382021e-05, + "loss": 1.1239, + "step": 1476 + }, + { + "epoch": 0.23219163276936078, + "grad_norm": 0.1838095337152481, + "learning_rate": 4.837709094528035e-05, + "loss": 1.2329, + "step": 1477 + }, + { + "epoch": 0.2323488376662933, + "grad_norm": 0.21165348589420319, + "learning_rate": 4.83749004389567e-05, + "loss": 1.2103, + "step": 1478 + }, + { + "epoch": 0.23250604256322585, + "grad_norm": 0.21954618394374847, + "learning_rate": 4.837270850498308e-05, + "loss": 1.1957, + "step": 1479 + }, + { + "epoch": 0.23266324746015837, + "grad_norm": 0.1750314086675644, + "learning_rate": 4.8370515143493346e-05, + "loss": 1.2116, + "step": 1480 + }, + { + "epoch": 0.23282045235709092, + "grad_norm": 0.24269573390483856, + "learning_rate": 4.8368320354621474e-05, + "loss": 1.1123, + "step": 1481 + }, + { + "epoch": 0.23297765725402347, + "grad_norm": 0.2614949345588684, + "learning_rate": 4.83661241385015e-05, + "loss": 1.2208, + "step": 1482 + }, + { + "epoch": 0.233134862150956, + "grad_norm": 0.252847284078598, + "learning_rate": 4.836392649526756e-05, + "loss": 1.1886, + "step": 1483 + }, + { + "epoch": 0.23329206704788855, + "grad_norm": 0.26073166728019714, + "learning_rate": 4.8361727425053895e-05, + "loss": 1.1645, + "step": 1484 + }, + { + "epoch": 0.23344927194482107, + "grad_norm": 0.33702126145362854, + "learning_rate": 4.83595269279948e-05, + "loss": 1.278, + "step": 1485 + }, + { + "epoch": 0.23360647684175362, + "grad_norm": 0.23756952583789825, + "learning_rate": 4.8357325004224675e-05, + "loss": 1.2537, + "step": 1486 + }, + { + "epoch": 0.23376368173868617, + "grad_norm": 0.2630425989627838, + "learning_rate": 4.835512165387801e-05, + "loss": 1.1478, + "step": 1487 + }, + { + "epoch": 0.2339208866356187, + "grad_norm": 0.2218979150056839, + "learning_rate": 4.835291687708937e-05, + "loss": 1.1887, + "step": 1488 + }, + { + "epoch": 0.23407809153255124, + "grad_norm": 0.23264755308628082, + "learning_rate": 4.8350710673993425e-05, + "loss": 1.1516, + "step": 1489 + }, + { + "epoch": 0.2342352964294838, + "grad_norm": 0.2717369794845581, + "learning_rate": 4.834850304472491e-05, + "loss": 1.1529, + "step": 1490 + }, + { + "epoch": 0.2343925013264163, + "grad_norm": 0.19794031977653503, + "learning_rate": 4.8346293989418666e-05, + "loss": 1.2004, + "step": 1491 + }, + { + "epoch": 0.23454970622334886, + "grad_norm": 0.21382425725460052, + "learning_rate": 4.8344083508209614e-05, + "loss": 1.1859, + "step": 1492 + }, + { + "epoch": 0.2347069111202814, + "grad_norm": 0.3373514413833618, + "learning_rate": 4.834187160123276e-05, + "loss": 1.1786, + "step": 1493 + }, + { + "epoch": 0.23486411601721394, + "grad_norm": 0.2665463089942932, + "learning_rate": 4.83396582686232e-05, + "loss": 1.2605, + "step": 1494 + }, + { + "epoch": 0.2350213209141465, + "grad_norm": 0.2265530377626419, + "learning_rate": 4.833744351051611e-05, + "loss": 1.2043, + "step": 1495 + }, + { + "epoch": 0.235178525811079, + "grad_norm": 0.23344869911670685, + "learning_rate": 4.833522732704677e-05, + "loss": 1.2518, + "step": 1496 + }, + { + "epoch": 0.23533573070801156, + "grad_norm": 0.2448417842388153, + "learning_rate": 4.833300971835053e-05, + "loss": 1.2219, + "step": 1497 + }, + { + "epoch": 0.23549293560494408, + "grad_norm": 0.21578192710876465, + "learning_rate": 4.8330790684562827e-05, + "loss": 1.1285, + "step": 1498 + }, + { + "epoch": 0.23565014050187663, + "grad_norm": 0.25885578989982605, + "learning_rate": 4.8328570225819195e-05, + "loss": 1.1466, + "step": 1499 + }, + { + "epoch": 0.23580734539880918, + "grad_norm": 0.22060716152191162, + "learning_rate": 4.832634834225526e-05, + "loss": 1.126, + "step": 1500 + }, + { + "epoch": 0.2359645502957417, + "grad_norm": 0.23449023067951202, + "learning_rate": 4.8324125034006715e-05, + "loss": 1.1628, + "step": 1501 + }, + { + "epoch": 0.23612175519267425, + "grad_norm": 0.2863908112049103, + "learning_rate": 4.832190030120936e-05, + "loss": 1.1403, + "step": 1502 + }, + { + "epoch": 0.2362789600896068, + "grad_norm": 0.25917142629623413, + "learning_rate": 4.8319674143999063e-05, + "loss": 1.1563, + "step": 1503 + }, + { + "epoch": 0.23643616498653933, + "grad_norm": 0.2577166259288788, + "learning_rate": 4.83174465625118e-05, + "loss": 1.2036, + "step": 1504 + }, + { + "epoch": 0.23659336988347188, + "grad_norm": 0.26499852538108826, + "learning_rate": 4.831521755688361e-05, + "loss": 1.2324, + "step": 1505 + }, + { + "epoch": 0.2367505747804044, + "grad_norm": 0.32207560539245605, + "learning_rate": 4.831298712725065e-05, + "loss": 1.109, + "step": 1506 + }, + { + "epoch": 0.23690777967733695, + "grad_norm": 0.2242748886346817, + "learning_rate": 4.831075527374913e-05, + "loss": 1.0943, + "step": 1507 + }, + { + "epoch": 0.2370649845742695, + "grad_norm": 0.23681138455867767, + "learning_rate": 4.830852199651537e-05, + "loss": 1.2074, + "step": 1508 + }, + { + "epoch": 0.23722218947120202, + "grad_norm": 0.23508530855178833, + "learning_rate": 4.830628729568577e-05, + "loss": 1.1592, + "step": 1509 + }, + { + "epoch": 0.23737939436813457, + "grad_norm": 0.24309656023979187, + "learning_rate": 4.8304051171396815e-05, + "loss": 1.1721, + "step": 1510 + }, + { + "epoch": 0.2375365992650671, + "grad_norm": 0.22660185396671295, + "learning_rate": 4.830181362378509e-05, + "loss": 1.2004, + "step": 1511 + }, + { + "epoch": 0.23769380416199964, + "grad_norm": 0.19725121557712555, + "learning_rate": 4.8299574652987236e-05, + "loss": 1.142, + "step": 1512 + }, + { + "epoch": 0.2378510090589322, + "grad_norm": 0.251643568277359, + "learning_rate": 4.8297334259140015e-05, + "loss": 1.24, + "step": 1513 + }, + { + "epoch": 0.23800821395586472, + "grad_norm": 0.21497240662574768, + "learning_rate": 4.829509244238026e-05, + "loss": 1.0919, + "step": 1514 + }, + { + "epoch": 0.23816541885279727, + "grad_norm": 0.20723997056484222, + "learning_rate": 4.829284920284488e-05, + "loss": 1.2857, + "step": 1515 + }, + { + "epoch": 0.23832262374972982, + "grad_norm": 0.1896023154258728, + "learning_rate": 4.82906045406709e-05, + "loss": 1.1827, + "step": 1516 + }, + { + "epoch": 0.23847982864666234, + "grad_norm": 0.2979254126548767, + "learning_rate": 4.828835845599542e-05, + "loss": 1.1791, + "step": 1517 + }, + { + "epoch": 0.2386370335435949, + "grad_norm": 0.22216519713401794, + "learning_rate": 4.82861109489556e-05, + "loss": 1.1883, + "step": 1518 + }, + { + "epoch": 0.2387942384405274, + "grad_norm": 0.21972821652889252, + "learning_rate": 4.828386201968873e-05, + "loss": 1.1213, + "step": 1519 + }, + { + "epoch": 0.23895144333745996, + "grad_norm": 0.22813616693019867, + "learning_rate": 4.828161166833215e-05, + "loss": 1.1776, + "step": 1520 + }, + { + "epoch": 0.2391086482343925, + "grad_norm": 0.2273959070444107, + "learning_rate": 4.827935989502331e-05, + "loss": 1.0856, + "step": 1521 + }, + { + "epoch": 0.23926585313132503, + "grad_norm": 0.219630166888237, + "learning_rate": 4.827710669989974e-05, + "loss": 1.1141, + "step": 1522 + }, + { + "epoch": 0.23942305802825758, + "grad_norm": 0.278255432844162, + "learning_rate": 4.8274852083099065e-05, + "loss": 1.1454, + "step": 1523 + }, + { + "epoch": 0.2395802629251901, + "grad_norm": 0.27673086524009705, + "learning_rate": 4.8272596044758974e-05, + "loss": 1.1366, + "step": 1524 + }, + { + "epoch": 0.23973746782212266, + "grad_norm": 0.32165762782096863, + "learning_rate": 4.827033858501726e-05, + "loss": 1.2367, + "step": 1525 + }, + { + "epoch": 0.2398946727190552, + "grad_norm": 0.20993465185165405, + "learning_rate": 4.82680797040118e-05, + "loss": 1.2368, + "step": 1526 + }, + { + "epoch": 0.24005187761598773, + "grad_norm": 0.23632824420928955, + "learning_rate": 4.8265819401880575e-05, + "loss": 1.2395, + "step": 1527 + }, + { + "epoch": 0.24020908251292028, + "grad_norm": 0.2364589422941208, + "learning_rate": 4.826355767876161e-05, + "loss": 1.1977, + "step": 1528 + }, + { + "epoch": 0.24036628740985283, + "grad_norm": 0.2299450784921646, + "learning_rate": 4.826129453479306e-05, + "loss": 1.2261, + "step": 1529 + }, + { + "epoch": 0.24052349230678535, + "grad_norm": 0.2673666477203369, + "learning_rate": 4.825902997011314e-05, + "loss": 1.2938, + "step": 1530 + }, + { + "epoch": 0.2406806972037179, + "grad_norm": 0.19846846163272858, + "learning_rate": 4.8256763984860164e-05, + "loss": 1.1954, + "step": 1531 + }, + { + "epoch": 0.24083790210065043, + "grad_norm": 0.23329487442970276, + "learning_rate": 4.825449657917253e-05, + "loss": 1.2483, + "step": 1532 + }, + { + "epoch": 0.24099510699758298, + "grad_norm": 0.22006340324878693, + "learning_rate": 4.825222775318872e-05, + "loss": 1.14, + "step": 1533 + }, + { + "epoch": 0.24115231189451553, + "grad_norm": 0.2518123984336853, + "learning_rate": 4.8249957507047315e-05, + "loss": 1.041, + "step": 1534 + }, + { + "epoch": 0.24130951679144805, + "grad_norm": 0.26999431848526, + "learning_rate": 4.824768584088696e-05, + "loss": 1.1871, + "step": 1535 + }, + { + "epoch": 0.2414667216883806, + "grad_norm": 0.26676446199417114, + "learning_rate": 4.824541275484641e-05, + "loss": 1.1527, + "step": 1536 + }, + { + "epoch": 0.24162392658531312, + "grad_norm": 0.30466800928115845, + "learning_rate": 4.824313824906449e-05, + "loss": 1.0792, + "step": 1537 + }, + { + "epoch": 0.24178113148224567, + "grad_norm": 0.23601631820201874, + "learning_rate": 4.824086232368011e-05, + "loss": 1.1099, + "step": 1538 + }, + { + "epoch": 0.24193833637917822, + "grad_norm": 0.21866478025913239, + "learning_rate": 4.82385849788323e-05, + "loss": 1.2394, + "step": 1539 + }, + { + "epoch": 0.24209554127611074, + "grad_norm": 0.2272699475288391, + "learning_rate": 4.823630621466013e-05, + "loss": 1.2503, + "step": 1540 + }, + { + "epoch": 0.2422527461730433, + "grad_norm": 0.2118367850780487, + "learning_rate": 4.823402603130279e-05, + "loss": 1.2348, + "step": 1541 + }, + { + "epoch": 0.24240995106997584, + "grad_norm": 0.2032533437013626, + "learning_rate": 4.823174442889953e-05, + "loss": 1.205, + "step": 1542 + }, + { + "epoch": 0.24256715596690837, + "grad_norm": 0.22420114278793335, + "learning_rate": 4.822946140758972e-05, + "loss": 1.1653, + "step": 1543 + }, + { + "epoch": 0.24272436086384092, + "grad_norm": 0.22077587246894836, + "learning_rate": 4.8227176967512785e-05, + "loss": 1.0768, + "step": 1544 + }, + { + "epoch": 0.24288156576077344, + "grad_norm": 0.25682517886161804, + "learning_rate": 4.8224891108808255e-05, + "loss": 1.1728, + "step": 1545 + }, + { + "epoch": 0.243038770657706, + "grad_norm": 0.28335657715797424, + "learning_rate": 4.8222603831615744e-05, + "loss": 1.1615, + "step": 1546 + }, + { + "epoch": 0.24319597555463854, + "grad_norm": 0.2960416376590729, + "learning_rate": 4.8220315136074946e-05, + "loss": 1.1786, + "step": 1547 + }, + { + "epoch": 0.24335318045157106, + "grad_norm": 0.24929380416870117, + "learning_rate": 4.821802502232565e-05, + "loss": 1.1626, + "step": 1548 + }, + { + "epoch": 0.2435103853485036, + "grad_norm": 0.2701495587825775, + "learning_rate": 4.821573349050772e-05, + "loss": 1.1638, + "step": 1549 + }, + { + "epoch": 0.24366759024543613, + "grad_norm": 0.21584048867225647, + "learning_rate": 4.821344054076111e-05, + "loss": 1.083, + "step": 1550 + }, + { + "epoch": 0.24382479514236868, + "grad_norm": 0.20484817028045654, + "learning_rate": 4.8211146173225884e-05, + "loss": 1.1552, + "step": 1551 + }, + { + "epoch": 0.24398200003930123, + "grad_norm": 0.2720612585544586, + "learning_rate": 4.8208850388042166e-05, + "loss": 1.2093, + "step": 1552 + }, + { + "epoch": 0.24413920493623376, + "grad_norm": 0.2705625295639038, + "learning_rate": 4.820655318535017e-05, + "loss": 1.2524, + "step": 1553 + }, + { + "epoch": 0.2442964098331663, + "grad_norm": 0.2290465533733368, + "learning_rate": 4.820425456529019e-05, + "loss": 1.2138, + "step": 1554 + }, + { + "epoch": 0.24445361473009886, + "grad_norm": 0.20924489200115204, + "learning_rate": 4.8201954528002634e-05, + "loss": 1.215, + "step": 1555 + }, + { + "epoch": 0.24461081962703138, + "grad_norm": 0.4101350009441376, + "learning_rate": 4.819965307362797e-05, + "loss": 1.1646, + "step": 1556 + }, + { + "epoch": 0.24476802452396393, + "grad_norm": 0.3153195381164551, + "learning_rate": 4.819735020230677e-05, + "loss": 1.0581, + "step": 1557 + }, + { + "epoch": 0.24492522942089645, + "grad_norm": 0.2894124686717987, + "learning_rate": 4.819504591417967e-05, + "loss": 1.1619, + "step": 1558 + }, + { + "epoch": 0.245082434317829, + "grad_norm": 0.2255009263753891, + "learning_rate": 4.8192740209387425e-05, + "loss": 1.2306, + "step": 1559 + }, + { + "epoch": 0.24523963921476155, + "grad_norm": 0.23586878180503845, + "learning_rate": 4.819043308807085e-05, + "loss": 1.0897, + "step": 1560 + }, + { + "epoch": 0.24539684411169407, + "grad_norm": 0.19656193256378174, + "learning_rate": 4.818812455037086e-05, + "loss": 1.2547, + "step": 1561 + }, + { + "epoch": 0.24555404900862662, + "grad_norm": 0.19186031818389893, + "learning_rate": 4.818581459642844e-05, + "loss": 1.2, + "step": 1562 + }, + { + "epoch": 0.24571125390555915, + "grad_norm": 0.30889350175857544, + "learning_rate": 4.8183503226384685e-05, + "loss": 1.0726, + "step": 1563 + }, + { + "epoch": 0.2458684588024917, + "grad_norm": 0.28955504298210144, + "learning_rate": 4.8181190440380755e-05, + "loss": 1.0853, + "step": 1564 + }, + { + "epoch": 0.24602566369942425, + "grad_norm": 0.20687651634216309, + "learning_rate": 4.817887623855792e-05, + "loss": 1.1026, + "step": 1565 + }, + { + "epoch": 0.24618286859635677, + "grad_norm": 0.20754511654376984, + "learning_rate": 4.817656062105751e-05, + "loss": 1.1793, + "step": 1566 + }, + { + "epoch": 0.24634007349328932, + "grad_norm": 0.20881116390228271, + "learning_rate": 4.817424358802096e-05, + "loss": 1.1812, + "step": 1567 + }, + { + "epoch": 0.24649727839022187, + "grad_norm": 0.22653476893901825, + "learning_rate": 4.8171925139589777e-05, + "loss": 1.1733, + "step": 1568 + }, + { + "epoch": 0.2466544832871544, + "grad_norm": 0.2595761716365814, + "learning_rate": 4.8169605275905574e-05, + "loss": 1.1817, + "step": 1569 + }, + { + "epoch": 0.24681168818408694, + "grad_norm": 0.265379935503006, + "learning_rate": 4.8167283997110044e-05, + "loss": 1.0631, + "step": 1570 + }, + { + "epoch": 0.24696889308101946, + "grad_norm": 0.28662997484207153, + "learning_rate": 4.816496130334494e-05, + "loss": 1.171, + "step": 1571 + }, + { + "epoch": 0.24712609797795201, + "grad_norm": 0.28659316897392273, + "learning_rate": 4.8162637194752146e-05, + "loss": 1.158, + "step": 1572 + }, + { + "epoch": 0.24728330287488456, + "grad_norm": 0.25767526030540466, + "learning_rate": 4.8160311671473596e-05, + "loss": 1.1179, + "step": 1573 + }, + { + "epoch": 0.2474405077718171, + "grad_norm": 0.2345789521932602, + "learning_rate": 4.815798473365133e-05, + "loss": 1.0722, + "step": 1574 + }, + { + "epoch": 0.24759771266874964, + "grad_norm": 0.20178478956222534, + "learning_rate": 4.8155656381427464e-05, + "loss": 1.1837, + "step": 1575 + }, + { + "epoch": 0.24775491756568216, + "grad_norm": 0.2254524976015091, + "learning_rate": 4.815332661494421e-05, + "loss": 1.0697, + "step": 1576 + }, + { + "epoch": 0.2479121224626147, + "grad_norm": 0.24192413687705994, + "learning_rate": 4.815099543434386e-05, + "loss": 1.1321, + "step": 1577 + }, + { + "epoch": 0.24806932735954726, + "grad_norm": 0.24133244156837463, + "learning_rate": 4.814866283976879e-05, + "loss": 0.9985, + "step": 1578 + }, + { + "epoch": 0.24822653225647978, + "grad_norm": 0.25949928164482117, + "learning_rate": 4.814632883136146e-05, + "loss": 1.1197, + "step": 1579 + }, + { + "epoch": 0.24838373715341233, + "grad_norm": 0.2583736479282379, + "learning_rate": 4.8143993409264446e-05, + "loss": 1.1219, + "step": 1580 + }, + { + "epoch": 0.24854094205034488, + "grad_norm": 0.27031365036964417, + "learning_rate": 4.814165657362037e-05, + "loss": 1.1165, + "step": 1581 + }, + { + "epoch": 0.2486981469472774, + "grad_norm": 0.21057547628879547, + "learning_rate": 4.813931832457195e-05, + "loss": 1.2199, + "step": 1582 + }, + { + "epoch": 0.24885535184420995, + "grad_norm": 0.16742344200611115, + "learning_rate": 4.813697866226201e-05, + "loss": 1.1562, + "step": 1583 + }, + { + "epoch": 0.24901255674114248, + "grad_norm": 0.17304782569408417, + "learning_rate": 4.813463758683345e-05, + "loss": 1.28, + "step": 1584 + }, + { + "epoch": 0.24916976163807503, + "grad_norm": 0.22176629304885864, + "learning_rate": 4.813229509842924e-05, + "loss": 1.261, + "step": 1585 + }, + { + "epoch": 0.24932696653500758, + "grad_norm": 0.2836274802684784, + "learning_rate": 4.812995119719246e-05, + "loss": 1.0815, + "step": 1586 + }, + { + "epoch": 0.2494841714319401, + "grad_norm": 0.28422456979751587, + "learning_rate": 4.812760588326627e-05, + "loss": 1.2035, + "step": 1587 + }, + { + "epoch": 0.24964137632887265, + "grad_norm": 0.21732592582702637, + "learning_rate": 4.81252591567939e-05, + "loss": 1.2436, + "step": 1588 + }, + { + "epoch": 0.24979858122580517, + "grad_norm": 0.24617740511894226, + "learning_rate": 4.8122911017918694e-05, + "loss": 1.1604, + "step": 1589 + }, + { + "epoch": 0.24995578612273772, + "grad_norm": 0.22099459171295166, + "learning_rate": 4.8120561466784056e-05, + "loss": 1.2597, + "step": 1590 + }, + { + "epoch": 0.25011299101967027, + "grad_norm": 0.21290400624275208, + "learning_rate": 4.811821050353349e-05, + "loss": 1.2431, + "step": 1591 + }, + { + "epoch": 0.2502701959166028, + "grad_norm": 0.32815465331077576, + "learning_rate": 4.811585812831059e-05, + "loss": 1.202, + "step": 1592 + }, + { + "epoch": 0.2504274008135353, + "grad_norm": 0.19528530538082123, + "learning_rate": 4.811350434125902e-05, + "loss": 1.2669, + "step": 1593 + }, + { + "epoch": 0.25058460571046787, + "grad_norm": 0.2987164855003357, + "learning_rate": 4.8111149142522545e-05, + "loss": 1.1263, + "step": 1594 + }, + { + "epoch": 0.2507418106074004, + "grad_norm": 0.22502346336841583, + "learning_rate": 4.810879253224502e-05, + "loss": 1.0881, + "step": 1595 + }, + { + "epoch": 0.25089901550433297, + "grad_norm": 0.18701261281967163, + "learning_rate": 4.810643451057036e-05, + "loss": 1.1612, + "step": 1596 + }, + { + "epoch": 0.2510562204012655, + "grad_norm": 0.2719084322452545, + "learning_rate": 4.81040750776426e-05, + "loss": 1.0618, + "step": 1597 + }, + { + "epoch": 0.251213425298198, + "grad_norm": 0.2659015655517578, + "learning_rate": 4.8101714233605845e-05, + "loss": 1.2364, + "step": 1598 + }, + { + "epoch": 0.25137063019513056, + "grad_norm": 0.23821905255317688, + "learning_rate": 4.809935197860427e-05, + "loss": 1.2978, + "step": 1599 + }, + { + "epoch": 0.2515278350920631, + "grad_norm": 0.2146797925233841, + "learning_rate": 4.8096988312782174e-05, + "loss": 1.2891, + "step": 1600 + }, + { + "epoch": 0.2515278350920631, + "eval_loss": 1.1705546379089355, + "eval_runtime": 2320.6543, + "eval_samples_per_second": 3.989, + "eval_steps_per_second": 1.995, + "step": 1600 + }, + { + "epoch": 0.25168503998899566, + "grad_norm": 0.2162550389766693, + "learning_rate": 4.80946232362839e-05, + "loss": 1.1259, + "step": 1601 + }, + { + "epoch": 0.2518422448859282, + "grad_norm": 0.24746862053871155, + "learning_rate": 4.809225674925392e-05, + "loss": 1.1532, + "step": 1602 + }, + { + "epoch": 0.25199944978286076, + "grad_norm": 0.21859407424926758, + "learning_rate": 4.808988885183675e-05, + "loss": 1.2823, + "step": 1603 + }, + { + "epoch": 0.25215665467979326, + "grad_norm": 0.24052634835243225, + "learning_rate": 4.808751954417702e-05, + "loss": 1.2075, + "step": 1604 + }, + { + "epoch": 0.2523138595767258, + "grad_norm": 0.3079143464565277, + "learning_rate": 4.808514882641944e-05, + "loss": 1.2258, + "step": 1605 + }, + { + "epoch": 0.25247106447365836, + "grad_norm": 0.2353007048368454, + "learning_rate": 4.8082776698708805e-05, + "loss": 1.0725, + "step": 1606 + }, + { + "epoch": 0.2526282693705909, + "grad_norm": 0.24463996291160583, + "learning_rate": 4.808040316118999e-05, + "loss": 1.1763, + "step": 1607 + }, + { + "epoch": 0.25278547426752346, + "grad_norm": 0.24785065650939941, + "learning_rate": 4.807802821400796e-05, + "loss": 1.146, + "step": 1608 + }, + { + "epoch": 0.25294267916445595, + "grad_norm": 0.3086298704147339, + "learning_rate": 4.8075651857307786e-05, + "loss": 1.1113, + "step": 1609 + }, + { + "epoch": 0.2530998840613885, + "grad_norm": 0.2168322056531906, + "learning_rate": 4.807327409123459e-05, + "loss": 1.2544, + "step": 1610 + }, + { + "epoch": 0.25325708895832105, + "grad_norm": 0.23688393831253052, + "learning_rate": 4.807089491593359e-05, + "loss": 1.2082, + "step": 1611 + }, + { + "epoch": 0.2534142938552536, + "grad_norm": 0.184744194149971, + "learning_rate": 4.8068514331550116e-05, + "loss": 1.2256, + "step": 1612 + }, + { + "epoch": 0.25357149875218615, + "grad_norm": 0.26341819763183594, + "learning_rate": 4.8066132338229564e-05, + "loss": 1.0727, + "step": 1613 + }, + { + "epoch": 0.25372870364911865, + "grad_norm": 0.24953097105026245, + "learning_rate": 4.80637489361174e-05, + "loss": 1.197, + "step": 1614 + }, + { + "epoch": 0.2538859085460512, + "grad_norm": 0.26930662989616394, + "learning_rate": 4.8061364125359204e-05, + "loss": 1.2333, + "step": 1615 + }, + { + "epoch": 0.25404311344298375, + "grad_norm": 0.22073645889759064, + "learning_rate": 4.805897790610063e-05, + "loss": 1.1749, + "step": 1616 + }, + { + "epoch": 0.2542003183399163, + "grad_norm": 0.23651045560836792, + "learning_rate": 4.805659027848742e-05, + "loss": 1.2632, + "step": 1617 + }, + { + "epoch": 0.25435752323684885, + "grad_norm": 0.2211553007364273, + "learning_rate": 4.80542012426654e-05, + "loss": 1.2297, + "step": 1618 + }, + { + "epoch": 0.25451472813378134, + "grad_norm": 0.21516153216362, + "learning_rate": 4.805181079878048e-05, + "loss": 1.2396, + "step": 1619 + }, + { + "epoch": 0.2546719330307139, + "grad_norm": 0.2111159861087799, + "learning_rate": 4.804941894697867e-05, + "loss": 1.0961, + "step": 1620 + }, + { + "epoch": 0.25482913792764644, + "grad_norm": 0.2587776184082031, + "learning_rate": 4.804702568740604e-05, + "loss": 1.1243, + "step": 1621 + }, + { + "epoch": 0.254986342824579, + "grad_norm": 0.21024088561534882, + "learning_rate": 4.804463102020878e-05, + "loss": 1.1276, + "step": 1622 + }, + { + "epoch": 0.25514354772151154, + "grad_norm": 0.2880707383155823, + "learning_rate": 4.8042234945533127e-05, + "loss": 1.0841, + "step": 1623 + }, + { + "epoch": 0.25530075261844404, + "grad_norm": 0.21504652500152588, + "learning_rate": 4.803983746352544e-05, + "loss": 1.0336, + "step": 1624 + }, + { + "epoch": 0.2554579575153766, + "grad_norm": 0.2103573977947235, + "learning_rate": 4.803743857433214e-05, + "loss": 1.2497, + "step": 1625 + }, + { + "epoch": 0.25561516241230914, + "grad_norm": 0.24323992431163788, + "learning_rate": 4.803503827809974e-05, + "loss": 1.1702, + "step": 1626 + }, + { + "epoch": 0.2557723673092417, + "grad_norm": 0.1768578141927719, + "learning_rate": 4.8032636574974845e-05, + "loss": 1.2472, + "step": 1627 + }, + { + "epoch": 0.25592957220617424, + "grad_norm": 0.20027288794517517, + "learning_rate": 4.803023346510415e-05, + "loss": 1.1757, + "step": 1628 + }, + { + "epoch": 0.2560867771031068, + "grad_norm": 0.2526282072067261, + "learning_rate": 4.8027828948634405e-05, + "loss": 1.1338, + "step": 1629 + }, + { + "epoch": 0.2562439820000393, + "grad_norm": 0.2610374689102173, + "learning_rate": 4.802542302571249e-05, + "loss": 1.0715, + "step": 1630 + }, + { + "epoch": 0.25640118689697183, + "grad_norm": 0.2409505993127823, + "learning_rate": 4.802301569648534e-05, + "loss": 1.1915, + "step": 1631 + }, + { + "epoch": 0.2565583917939044, + "grad_norm": 0.3539654314517975, + "learning_rate": 4.8020606961099996e-05, + "loss": 1.1752, + "step": 1632 + }, + { + "epoch": 0.25671559669083693, + "grad_norm": 0.23121047019958496, + "learning_rate": 4.801819681970357e-05, + "loss": 1.1966, + "step": 1633 + }, + { + "epoch": 0.2568728015877695, + "grad_norm": 0.2272186279296875, + "learning_rate": 4.801578527244325e-05, + "loss": 1.1131, + "step": 1634 + }, + { + "epoch": 0.257030006484702, + "grad_norm": 0.183711439371109, + "learning_rate": 4.801337231946633e-05, + "loss": 1.1569, + "step": 1635 + }, + { + "epoch": 0.25718721138163453, + "grad_norm": 0.22497425973415375, + "learning_rate": 4.80109579609202e-05, + "loss": 1.1491, + "step": 1636 + }, + { + "epoch": 0.2573444162785671, + "grad_norm": 0.22534243762493134, + "learning_rate": 4.80085421969523e-05, + "loss": 1.1827, + "step": 1637 + }, + { + "epoch": 0.25750162117549963, + "grad_norm": 0.14900386333465576, + "learning_rate": 4.800612502771019e-05, + "loss": 1.1905, + "step": 1638 + }, + { + "epoch": 0.2576588260724322, + "grad_norm": 0.26234182715415955, + "learning_rate": 4.80037064533415e-05, + "loss": 1.1197, + "step": 1639 + }, + { + "epoch": 0.2578160309693647, + "grad_norm": 0.23205281794071198, + "learning_rate": 4.800128647399393e-05, + "loss": 1.1325, + "step": 1640 + }, + { + "epoch": 0.2579732358662972, + "grad_norm": 0.22982637584209442, + "learning_rate": 4.799886508981531e-05, + "loss": 1.1097, + "step": 1641 + }, + { + "epoch": 0.2581304407632298, + "grad_norm": 0.2268412709236145, + "learning_rate": 4.799644230095351e-05, + "loss": 1.2452, + "step": 1642 + }, + { + "epoch": 0.2582876456601623, + "grad_norm": 0.2071431428194046, + "learning_rate": 4.799401810755651e-05, + "loss": 1.2338, + "step": 1643 + }, + { + "epoch": 0.2584448505570949, + "grad_norm": 0.21589568257331848, + "learning_rate": 4.799159250977237e-05, + "loss": 1.1177, + "step": 1644 + }, + { + "epoch": 0.25860205545402737, + "grad_norm": 0.19412028789520264, + "learning_rate": 4.798916550774924e-05, + "loss": 1.113, + "step": 1645 + }, + { + "epoch": 0.2587592603509599, + "grad_norm": 0.2528839707374573, + "learning_rate": 4.798673710163535e-05, + "loss": 1.2024, + "step": 1646 + }, + { + "epoch": 0.25891646524789247, + "grad_norm": 0.21848973631858826, + "learning_rate": 4.798430729157901e-05, + "loss": 0.9633, + "step": 1647 + }, + { + "epoch": 0.259073670144825, + "grad_norm": 0.2397562563419342, + "learning_rate": 4.7981876077728625e-05, + "loss": 1.14, + "step": 1648 + }, + { + "epoch": 0.25923087504175757, + "grad_norm": 0.19531749188899994, + "learning_rate": 4.7979443460232703e-05, + "loss": 1.1785, + "step": 1649 + }, + { + "epoch": 0.25938807993869006, + "grad_norm": 0.20693683624267578, + "learning_rate": 4.79770094392398e-05, + "loss": 1.2601, + "step": 1650 + }, + { + "epoch": 0.2595452848356226, + "grad_norm": 0.26081717014312744, + "learning_rate": 4.797457401489858e-05, + "loss": 1.1489, + "step": 1651 + }, + { + "epoch": 0.25970248973255516, + "grad_norm": 0.23810921609401703, + "learning_rate": 4.7972137187357795e-05, + "loss": 1.1896, + "step": 1652 + }, + { + "epoch": 0.2598596946294877, + "grad_norm": 0.22295863926410675, + "learning_rate": 4.796969895676627e-05, + "loss": 1.1954, + "step": 1653 + }, + { + "epoch": 0.26001689952642026, + "grad_norm": 0.26917287707328796, + "learning_rate": 4.7967259323272935e-05, + "loss": 1.2985, + "step": 1654 + }, + { + "epoch": 0.2601741044233528, + "grad_norm": 0.20920400321483612, + "learning_rate": 4.796481828702678e-05, + "loss": 1.2217, + "step": 1655 + }, + { + "epoch": 0.2603313093202853, + "grad_norm": 0.24075470864772797, + "learning_rate": 4.79623758481769e-05, + "loss": 1.2349, + "step": 1656 + }, + { + "epoch": 0.26048851421721786, + "grad_norm": 0.2986784279346466, + "learning_rate": 4.795993200687247e-05, + "loss": 0.9704, + "step": 1657 + }, + { + "epoch": 0.2606457191141504, + "grad_norm": 0.21063056588172913, + "learning_rate": 4.795748676326275e-05, + "loss": 1.1764, + "step": 1658 + }, + { + "epoch": 0.26080292401108296, + "grad_norm": 0.17640748620033264, + "learning_rate": 4.7955040117497084e-05, + "loss": 1.1998, + "step": 1659 + }, + { + "epoch": 0.2609601289080155, + "grad_norm": 0.3088374137878418, + "learning_rate": 4.79525920697249e-05, + "loss": 1.108, + "step": 1660 + }, + { + "epoch": 0.261117333804948, + "grad_norm": 0.21907536685466766, + "learning_rate": 4.795014262009573e-05, + "loss": 1.2134, + "step": 1661 + }, + { + "epoch": 0.26127453870188055, + "grad_norm": 0.25505852699279785, + "learning_rate": 4.794769176875917e-05, + "loss": 1.2556, + "step": 1662 + }, + { + "epoch": 0.2614317435988131, + "grad_norm": 0.2881571650505066, + "learning_rate": 4.79452395158649e-05, + "loss": 1.1194, + "step": 1663 + }, + { + "epoch": 0.26158894849574565, + "grad_norm": 0.24344374239444733, + "learning_rate": 4.794278586156271e-05, + "loss": 1.114, + "step": 1664 + }, + { + "epoch": 0.2617461533926782, + "grad_norm": 0.2396748960018158, + "learning_rate": 4.794033080600244e-05, + "loss": 1.0962, + "step": 1665 + }, + { + "epoch": 0.2619033582896107, + "grad_norm": 0.2769325375556946, + "learning_rate": 4.7937874349334056e-05, + "loss": 1.0822, + "step": 1666 + }, + { + "epoch": 0.26206056318654325, + "grad_norm": 0.20145046710968018, + "learning_rate": 4.793541649170757e-05, + "loss": 1.1856, + "step": 1667 + }, + { + "epoch": 0.2622177680834758, + "grad_norm": 0.2373715192079544, + "learning_rate": 4.7932957233273123e-05, + "loss": 1.195, + "step": 1668 + }, + { + "epoch": 0.26237497298040835, + "grad_norm": 0.26741209626197815, + "learning_rate": 4.7930496574180894e-05, + "loss": 1.1812, + "step": 1669 + }, + { + "epoch": 0.2625321778773409, + "grad_norm": 0.18911203742027283, + "learning_rate": 4.7928034514581174e-05, + "loss": 1.1561, + "step": 1670 + }, + { + "epoch": 0.2626893827742734, + "grad_norm": 0.23078587651252747, + "learning_rate": 4.792557105462434e-05, + "loss": 1.1746, + "step": 1671 + }, + { + "epoch": 0.26284658767120594, + "grad_norm": 0.25644704699516296, + "learning_rate": 4.792310619446087e-05, + "loss": 1.1917, + "step": 1672 + }, + { + "epoch": 0.2630037925681385, + "grad_norm": 0.2350175380706787, + "learning_rate": 4.7920639934241274e-05, + "loss": 1.2823, + "step": 1673 + }, + { + "epoch": 0.26316099746507104, + "grad_norm": 0.28728723526000977, + "learning_rate": 4.79181722741162e-05, + "loss": 1.2291, + "step": 1674 + }, + { + "epoch": 0.2633182023620036, + "grad_norm": 0.28967636823654175, + "learning_rate": 4.791570321423637e-05, + "loss": 1.0443, + "step": 1675 + }, + { + "epoch": 0.2634754072589361, + "grad_norm": 0.20903019607067108, + "learning_rate": 4.791323275475257e-05, + "loss": 1.2378, + "step": 1676 + }, + { + "epoch": 0.26363261215586864, + "grad_norm": 0.25918152928352356, + "learning_rate": 4.791076089581569e-05, + "loss": 1.0944, + "step": 1677 + }, + { + "epoch": 0.2637898170528012, + "grad_norm": 0.2330826371908188, + "learning_rate": 4.790828763757671e-05, + "loss": 1.2158, + "step": 1678 + }, + { + "epoch": 0.26394702194973374, + "grad_norm": 0.22032979130744934, + "learning_rate": 4.790581298018667e-05, + "loss": 1.0987, + "step": 1679 + }, + { + "epoch": 0.2641042268466663, + "grad_norm": 0.25354599952697754, + "learning_rate": 4.7903336923796736e-05, + "loss": 1.2191, + "step": 1680 + }, + { + "epoch": 0.26426143174359884, + "grad_norm": 0.29382967948913574, + "learning_rate": 4.7900859468558123e-05, + "loss": 1.1906, + "step": 1681 + }, + { + "epoch": 0.26441863664053133, + "grad_norm": 0.26418671011924744, + "learning_rate": 4.7898380614622144e-05, + "loss": 1.1357, + "step": 1682 + }, + { + "epoch": 0.2645758415374639, + "grad_norm": 0.4071028232574463, + "learning_rate": 4.78959003621402e-05, + "loss": 1.1547, + "step": 1683 + }, + { + "epoch": 0.26473304643439644, + "grad_norm": 0.20212817192077637, + "learning_rate": 4.789341871126378e-05, + "loss": 1.2021, + "step": 1684 + }, + { + "epoch": 0.264890251331329, + "grad_norm": 0.22908912599086761, + "learning_rate": 4.789093566214444e-05, + "loss": 1.3014, + "step": 1685 + }, + { + "epoch": 0.26504745622826154, + "grad_norm": 0.2181553691625595, + "learning_rate": 4.788845121493385e-05, + "loss": 1.2074, + "step": 1686 + }, + { + "epoch": 0.26520466112519403, + "grad_norm": 0.2574876546859741, + "learning_rate": 4.788596536978374e-05, + "loss": 1.1311, + "step": 1687 + }, + { + "epoch": 0.2653618660221266, + "grad_norm": 0.20918451249599457, + "learning_rate": 4.7883478126845945e-05, + "loss": 1.1652, + "step": 1688 + }, + { + "epoch": 0.26551907091905913, + "grad_norm": 0.1706637293100357, + "learning_rate": 4.7880989486272366e-05, + "loss": 1.1742, + "step": 1689 + }, + { + "epoch": 0.2656762758159917, + "grad_norm": 0.22445620596408844, + "learning_rate": 4.787849944821501e-05, + "loss": 1.1579, + "step": 1690 + }, + { + "epoch": 0.26583348071292423, + "grad_norm": 0.25707024335861206, + "learning_rate": 4.787600801282596e-05, + "loss": 1.0905, + "step": 1691 + }, + { + "epoch": 0.2659906856098567, + "grad_norm": 0.22234748303890228, + "learning_rate": 4.787351518025737e-05, + "loss": 1.1657, + "step": 1692 + }, + { + "epoch": 0.2661478905067893, + "grad_norm": 0.26487070322036743, + "learning_rate": 4.78710209506615e-05, + "loss": 1.2234, + "step": 1693 + }, + { + "epoch": 0.2663050954037218, + "grad_norm": 0.26550817489624023, + "learning_rate": 4.786852532419069e-05, + "loss": 1.1627, + "step": 1694 + }, + { + "epoch": 0.2664623003006544, + "grad_norm": 0.24167504906654358, + "learning_rate": 4.786602830099737e-05, + "loss": 1.2216, + "step": 1695 + }, + { + "epoch": 0.2666195051975869, + "grad_norm": 0.30660563707351685, + "learning_rate": 4.786352988123403e-05, + "loss": 1.1593, + "step": 1696 + }, + { + "epoch": 0.2667767100945194, + "grad_norm": 0.222911536693573, + "learning_rate": 4.786103006505328e-05, + "loss": 1.2223, + "step": 1697 + }, + { + "epoch": 0.26693391499145197, + "grad_norm": 0.19984647631645203, + "learning_rate": 4.78585288526078e-05, + "loss": 1.2453, + "step": 1698 + }, + { + "epoch": 0.2670911198883845, + "grad_norm": 0.19021253287792206, + "learning_rate": 4.785602624405034e-05, + "loss": 1.2094, + "step": 1699 + }, + { + "epoch": 0.26724832478531707, + "grad_norm": 0.24404233694076538, + "learning_rate": 4.785352223953376e-05, + "loss": 1.1247, + "step": 1700 + }, + { + "epoch": 0.2674055296822496, + "grad_norm": 0.24491339921951294, + "learning_rate": 4.7851016839210995e-05, + "loss": 1.2667, + "step": 1701 + }, + { + "epoch": 0.2675627345791821, + "grad_norm": 0.2175382375717163, + "learning_rate": 4.7848510043235064e-05, + "loss": 1.1664, + "step": 1702 + }, + { + "epoch": 0.26771993947611467, + "grad_norm": 0.31710314750671387, + "learning_rate": 4.784600185175907e-05, + "loss": 1.1909, + "step": 1703 + }, + { + "epoch": 0.2678771443730472, + "grad_norm": 0.21164441108703613, + "learning_rate": 4.7843492264936214e-05, + "loss": 1.1851, + "step": 1704 + }, + { + "epoch": 0.26803434926997977, + "grad_norm": 0.18407614529132843, + "learning_rate": 4.784098128291976e-05, + "loss": 1.175, + "step": 1705 + }, + { + "epoch": 0.2681915541669123, + "grad_norm": 0.2403915822505951, + "learning_rate": 4.783846890586307e-05, + "loss": 1.1775, + "step": 1706 + }, + { + "epoch": 0.26834875906384487, + "grad_norm": 0.25552019476890564, + "learning_rate": 4.78359551339196e-05, + "loss": 1.1153, + "step": 1707 + }, + { + "epoch": 0.26850596396077736, + "grad_norm": 0.24397552013397217, + "learning_rate": 4.783343996724287e-05, + "loss": 1.1795, + "step": 1708 + }, + { + "epoch": 0.2686631688577099, + "grad_norm": 0.2242836058139801, + "learning_rate": 4.78309234059865e-05, + "loss": 1.1872, + "step": 1709 + }, + { + "epoch": 0.26882037375464246, + "grad_norm": 0.2508406639099121, + "learning_rate": 4.78284054503042e-05, + "loss": 1.177, + "step": 1710 + }, + { + "epoch": 0.268977578651575, + "grad_norm": 0.31523460149765015, + "learning_rate": 4.7825886100349756e-05, + "loss": 1.0777, + "step": 1711 + }, + { + "epoch": 0.26913478354850756, + "grad_norm": 0.2677992582321167, + "learning_rate": 4.782336535627703e-05, + "loss": 1.135, + "step": 1712 + }, + { + "epoch": 0.26929198844544006, + "grad_norm": 0.2804398536682129, + "learning_rate": 4.782084321823998e-05, + "loss": 1.2421, + "step": 1713 + }, + { + "epoch": 0.2694491933423726, + "grad_norm": 0.2114623337984085, + "learning_rate": 4.781831968639266e-05, + "loss": 1.1308, + "step": 1714 + }, + { + "epoch": 0.26960639823930516, + "grad_norm": 0.24072472751140594, + "learning_rate": 4.7815794760889196e-05, + "loss": 1.0372, + "step": 1715 + }, + { + "epoch": 0.2697636031362377, + "grad_norm": 0.25040605664253235, + "learning_rate": 4.7813268441883784e-05, + "loss": 1.2141, + "step": 1716 + }, + { + "epoch": 0.26992080803317026, + "grad_norm": 0.2768096625804901, + "learning_rate": 4.781074072953074e-05, + "loss": 1.1514, + "step": 1717 + }, + { + "epoch": 0.27007801293010275, + "grad_norm": 0.21474453806877136, + "learning_rate": 4.780821162398444e-05, + "loss": 1.1375, + "step": 1718 + }, + { + "epoch": 0.2702352178270353, + "grad_norm": 0.2671203911304474, + "learning_rate": 4.780568112539936e-05, + "loss": 1.1992, + "step": 1719 + }, + { + "epoch": 0.27039242272396785, + "grad_norm": 0.19425953924655914, + "learning_rate": 4.780314923393005e-05, + "loss": 1.1796, + "step": 1720 + }, + { + "epoch": 0.2705496276209004, + "grad_norm": 0.21683478355407715, + "learning_rate": 4.780061594973114e-05, + "loss": 1.102, + "step": 1721 + }, + { + "epoch": 0.27070683251783295, + "grad_norm": 0.22656619548797607, + "learning_rate": 4.779808127295735e-05, + "loss": 1.2064, + "step": 1722 + }, + { + "epoch": 0.27086403741476545, + "grad_norm": 0.27819374203681946, + "learning_rate": 4.779554520376351e-05, + "loss": 1.1881, + "step": 1723 + }, + { + "epoch": 0.271021242311698, + "grad_norm": 0.2076413929462433, + "learning_rate": 4.779300774230449e-05, + "loss": 1.1586, + "step": 1724 + }, + { + "epoch": 0.27117844720863055, + "grad_norm": 0.28731614351272583, + "learning_rate": 4.779046888873529e-05, + "loss": 0.9967, + "step": 1725 + }, + { + "epoch": 0.2713356521055631, + "grad_norm": 0.23616041243076324, + "learning_rate": 4.7787928643210955e-05, + "loss": 1.2109, + "step": 1726 + }, + { + "epoch": 0.27149285700249565, + "grad_norm": 0.3352977931499481, + "learning_rate": 4.778538700588664e-05, + "loss": 1.316, + "step": 1727 + }, + { + "epoch": 0.27165006189942814, + "grad_norm": 0.275420218706131, + "learning_rate": 4.778284397691758e-05, + "loss": 1.2542, + "step": 1728 + }, + { + "epoch": 0.2718072667963607, + "grad_norm": 0.20388716459274292, + "learning_rate": 4.77802995564591e-05, + "loss": 1.187, + "step": 1729 + }, + { + "epoch": 0.27196447169329324, + "grad_norm": 0.24802348017692566, + "learning_rate": 4.777775374466659e-05, + "loss": 1.2225, + "step": 1730 + }, + { + "epoch": 0.2721216765902258, + "grad_norm": 0.2412358671426773, + "learning_rate": 4.777520654169554e-05, + "loss": 1.1717, + "step": 1731 + }, + { + "epoch": 0.27227888148715834, + "grad_norm": 0.24212747812271118, + "learning_rate": 4.777265794770153e-05, + "loss": 1.0965, + "step": 1732 + }, + { + "epoch": 0.2724360863840909, + "grad_norm": 0.2116345465183258, + "learning_rate": 4.7770107962840225e-05, + "loss": 1.0881, + "step": 1733 + }, + { + "epoch": 0.2725932912810234, + "grad_norm": 0.1802130490541458, + "learning_rate": 4.7767556587267356e-05, + "loss": 1.2671, + "step": 1734 + }, + { + "epoch": 0.27275049617795594, + "grad_norm": 0.22544068098068237, + "learning_rate": 4.776500382113875e-05, + "loss": 1.1557, + "step": 1735 + }, + { + "epoch": 0.2729077010748885, + "grad_norm": 0.22627638280391693, + "learning_rate": 4.776244966461034e-05, + "loss": 1.0589, + "step": 1736 + }, + { + "epoch": 0.27306490597182104, + "grad_norm": 0.24201518297195435, + "learning_rate": 4.77598941178381e-05, + "loss": 1.217, + "step": 1737 + }, + { + "epoch": 0.2732221108687536, + "grad_norm": 0.2708778381347656, + "learning_rate": 4.775733718097812e-05, + "loss": 1.1762, + "step": 1738 + }, + { + "epoch": 0.2733793157656861, + "grad_norm": 0.22093307971954346, + "learning_rate": 4.775477885418658e-05, + "loss": 1.1599, + "step": 1739 + }, + { + "epoch": 0.27353652066261863, + "grad_norm": 0.2544403672218323, + "learning_rate": 4.775221913761971e-05, + "loss": 1.1177, + "step": 1740 + }, + { + "epoch": 0.2736937255595512, + "grad_norm": 0.2178761512041092, + "learning_rate": 4.7749658031433873e-05, + "loss": 1.1493, + "step": 1741 + }, + { + "epoch": 0.27385093045648373, + "grad_norm": 0.2969399392604828, + "learning_rate": 4.774709553578548e-05, + "loss": 1.1048, + "step": 1742 + }, + { + "epoch": 0.2740081353534163, + "grad_norm": 0.18794193863868713, + "learning_rate": 4.7744531650831034e-05, + "loss": 1.2954, + "step": 1743 + }, + { + "epoch": 0.2741653402503488, + "grad_norm": 0.19405794143676758, + "learning_rate": 4.774196637672714e-05, + "loss": 1.1211, + "step": 1744 + }, + { + "epoch": 0.2743225451472813, + "grad_norm": 0.24118031561374664, + "learning_rate": 4.773939971363046e-05, + "loss": 1.0933, + "step": 1745 + }, + { + "epoch": 0.2744797500442139, + "grad_norm": 0.24610240757465363, + "learning_rate": 4.7736831661697766e-05, + "loss": 1.3337, + "step": 1746 + }, + { + "epoch": 0.2746369549411464, + "grad_norm": 0.1838586926460266, + "learning_rate": 4.77342622210859e-05, + "loss": 1.1453, + "step": 1747 + }, + { + "epoch": 0.274794159838079, + "grad_norm": 0.26292094588279724, + "learning_rate": 4.77316913919518e-05, + "loss": 1.1206, + "step": 1748 + }, + { + "epoch": 0.27495136473501147, + "grad_norm": 0.15059764683246613, + "learning_rate": 4.7729119174452475e-05, + "loss": 1.2451, + "step": 1749 + }, + { + "epoch": 0.275108569631944, + "grad_norm": 0.2880924940109253, + "learning_rate": 4.772654556874503e-05, + "loss": 1.1765, + "step": 1750 + }, + { + "epoch": 0.2752657745288766, + "grad_norm": 0.20336076617240906, + "learning_rate": 4.7723970574986656e-05, + "loss": 1.1369, + "step": 1751 + }, + { + "epoch": 0.2754229794258091, + "grad_norm": 0.24111486971378326, + "learning_rate": 4.77213941933346e-05, + "loss": 1.0741, + "step": 1752 + }, + { + "epoch": 0.2755801843227417, + "grad_norm": 0.23586514592170715, + "learning_rate": 4.7718816423946256e-05, + "loss": 1.214, + "step": 1753 + }, + { + "epoch": 0.27573738921967417, + "grad_norm": 0.2305273711681366, + "learning_rate": 4.7716237266979036e-05, + "loss": 1.1988, + "step": 1754 + }, + { + "epoch": 0.2758945941166067, + "grad_norm": 0.18287290632724762, + "learning_rate": 4.7713656722590475e-05, + "loss": 1.1478, + "step": 1755 + }, + { + "epoch": 0.27605179901353927, + "grad_norm": 0.28260156512260437, + "learning_rate": 4.7711074790938184e-05, + "loss": 1.0779, + "step": 1756 + }, + { + "epoch": 0.2762090039104718, + "grad_norm": 0.2582015097141266, + "learning_rate": 4.770849147217985e-05, + "loss": 1.1002, + "step": 1757 + }, + { + "epoch": 0.27636620880740437, + "grad_norm": 0.28041109442710876, + "learning_rate": 4.770590676647326e-05, + "loss": 1.0999, + "step": 1758 + }, + { + "epoch": 0.2765234137043369, + "grad_norm": 0.23540420830249786, + "learning_rate": 4.770332067397627e-05, + "loss": 1.1478, + "step": 1759 + }, + { + "epoch": 0.2766806186012694, + "grad_norm": 0.29831403493881226, + "learning_rate": 4.770073319484684e-05, + "loss": 1.0932, + "step": 1760 + }, + { + "epoch": 0.2766806186012694, + "eval_loss": 1.1590473651885986, + "eval_runtime": 2321.8933, + "eval_samples_per_second": 3.987, + "eval_steps_per_second": 1.994, + "step": 1760 + }, + { + "epoch": 0.27683782349820196, + "grad_norm": 0.21759991347789764, + "learning_rate": 4.769814432924299e-05, + "loss": 1.166, + "step": 1761 + }, + { + "epoch": 0.2769950283951345, + "grad_norm": 0.2622128129005432, + "learning_rate": 4.7695554077322845e-05, + "loss": 1.1155, + "step": 1762 + }, + { + "epoch": 0.27715223329206706, + "grad_norm": 0.24021178483963013, + "learning_rate": 4.769296243924462e-05, + "loss": 1.1922, + "step": 1763 + }, + { + "epoch": 0.2773094381889996, + "grad_norm": 0.23425696790218353, + "learning_rate": 4.769036941516658e-05, + "loss": 1.0903, + "step": 1764 + }, + { + "epoch": 0.2774666430859321, + "grad_norm": 0.17257444560527802, + "learning_rate": 4.768777500524711e-05, + "loss": 1.1319, + "step": 1765 + }, + { + "epoch": 0.27762384798286466, + "grad_norm": 0.2458474487066269, + "learning_rate": 4.7685179209644664e-05, + "loss": 1.2144, + "step": 1766 + }, + { + "epoch": 0.2777810528797972, + "grad_norm": 0.18771199882030487, + "learning_rate": 4.7682582028517784e-05, + "loss": 1.1654, + "step": 1767 + }, + { + "epoch": 0.27793825777672976, + "grad_norm": 0.2979085445404053, + "learning_rate": 4.767998346202509e-05, + "loss": 1.1537, + "step": 1768 + }, + { + "epoch": 0.2780954626736623, + "grad_norm": 0.22919511795043945, + "learning_rate": 4.767738351032531e-05, + "loss": 1.15, + "step": 1769 + }, + { + "epoch": 0.2782526675705948, + "grad_norm": 0.24717088043689728, + "learning_rate": 4.7674782173577214e-05, + "loss": 1.1205, + "step": 1770 + }, + { + "epoch": 0.27840987246752735, + "grad_norm": 0.21218983829021454, + "learning_rate": 4.7672179451939704e-05, + "loss": 1.2007, + "step": 1771 + }, + { + "epoch": 0.2785670773644599, + "grad_norm": 0.16957569122314453, + "learning_rate": 4.766957534557173e-05, + "loss": 1.183, + "step": 1772 + }, + { + "epoch": 0.27872428226139245, + "grad_norm": 0.17404644191265106, + "learning_rate": 4.766696985463235e-05, + "loss": 1.0727, + "step": 1773 + }, + { + "epoch": 0.278881487158325, + "grad_norm": 0.22840437293052673, + "learning_rate": 4.766436297928068e-05, + "loss": 1.0331, + "step": 1774 + }, + { + "epoch": 0.2790386920552575, + "grad_norm": 0.18459245562553406, + "learning_rate": 4.766175471967597e-05, + "loss": 1.1837, + "step": 1775 + }, + { + "epoch": 0.27919589695219005, + "grad_norm": 0.1901281177997589, + "learning_rate": 4.7659145075977496e-05, + "loss": 1.2815, + "step": 1776 + }, + { + "epoch": 0.2793531018491226, + "grad_norm": 0.20891423523426056, + "learning_rate": 4.7656534048344656e-05, + "loss": 1.1554, + "step": 1777 + }, + { + "epoch": 0.27951030674605515, + "grad_norm": 0.2291899174451828, + "learning_rate": 4.765392163693691e-05, + "loss": 1.137, + "step": 1778 + }, + { + "epoch": 0.2796675116429877, + "grad_norm": 0.21621862053871155, + "learning_rate": 4.765130784191384e-05, + "loss": 1.1818, + "step": 1779 + }, + { + "epoch": 0.2798247165399202, + "grad_norm": 0.24925227463245392, + "learning_rate": 4.7648692663435054e-05, + "loss": 1.1611, + "step": 1780 + }, + { + "epoch": 0.27998192143685274, + "grad_norm": 0.26903223991394043, + "learning_rate": 4.76460761016603e-05, + "loss": 1.1493, + "step": 1781 + }, + { + "epoch": 0.2801391263337853, + "grad_norm": 0.17821714282035828, + "learning_rate": 4.764345815674937e-05, + "loss": 1.1292, + "step": 1782 + }, + { + "epoch": 0.28029633123071784, + "grad_norm": 0.19402022659778595, + "learning_rate": 4.764083882886218e-05, + "loss": 1.1462, + "step": 1783 + }, + { + "epoch": 0.2804535361276504, + "grad_norm": 0.18578824400901794, + "learning_rate": 4.7638218118158694e-05, + "loss": 1.1745, + "step": 1784 + }, + { + "epoch": 0.2806107410245829, + "grad_norm": 0.2654874920845032, + "learning_rate": 4.763559602479898e-05, + "loss": 1.1627, + "step": 1785 + }, + { + "epoch": 0.28076794592151544, + "grad_norm": 0.3167024850845337, + "learning_rate": 4.763297254894318e-05, + "loss": 1.0536, + "step": 1786 + }, + { + "epoch": 0.280925150818448, + "grad_norm": 0.21574537456035614, + "learning_rate": 4.763034769075153e-05, + "loss": 1.1318, + "step": 1787 + }, + { + "epoch": 0.28108235571538054, + "grad_norm": 0.2255323827266693, + "learning_rate": 4.7627721450384354e-05, + "loss": 1.299, + "step": 1788 + }, + { + "epoch": 0.2812395606123131, + "grad_norm": 0.23979492485523224, + "learning_rate": 4.7625093828002035e-05, + "loss": 1.2133, + "step": 1789 + }, + { + "epoch": 0.28139676550924564, + "grad_norm": 0.26039859652519226, + "learning_rate": 4.762246482376507e-05, + "loss": 1.1472, + "step": 1790 + }, + { + "epoch": 0.28155397040617813, + "grad_norm": 0.2618144452571869, + "learning_rate": 4.761983443783403e-05, + "loss": 1.1851, + "step": 1791 + }, + { + "epoch": 0.2817111753031107, + "grad_norm": 0.2428680956363678, + "learning_rate": 4.7617202670369556e-05, + "loss": 1.1583, + "step": 1792 + }, + { + "epoch": 0.28186838020004323, + "grad_norm": 0.280111700296402, + "learning_rate": 4.76145695215324e-05, + "loss": 1.2112, + "step": 1793 + }, + { + "epoch": 0.2820255850969758, + "grad_norm": 0.24310675263404846, + "learning_rate": 4.761193499148339e-05, + "loss": 1.1552, + "step": 1794 + }, + { + "epoch": 0.28218278999390833, + "grad_norm": 0.28014469146728516, + "learning_rate": 4.7609299080383415e-05, + "loss": 1.2571, + "step": 1795 + }, + { + "epoch": 0.28233999489084083, + "grad_norm": 0.3029899001121521, + "learning_rate": 4.760666178839347e-05, + "loss": 1.1485, + "step": 1796 + }, + { + "epoch": 0.2824971997877734, + "grad_norm": 0.34275612235069275, + "learning_rate": 4.7604023115674644e-05, + "loss": 1.101, + "step": 1797 + }, + { + "epoch": 0.28265440468470593, + "grad_norm": 0.20968905091285706, + "learning_rate": 4.760138306238809e-05, + "loss": 1.0953, + "step": 1798 + }, + { + "epoch": 0.2828116095816385, + "grad_norm": 0.19242046773433685, + "learning_rate": 4.759874162869505e-05, + "loss": 1.1754, + "step": 1799 + }, + { + "epoch": 0.28296881447857103, + "grad_norm": 0.31839972734451294, + "learning_rate": 4.759609881475685e-05, + "loss": 1.0707, + "step": 1800 + }, + { + "epoch": 0.2831260193755035, + "grad_norm": 0.3110107183456421, + "learning_rate": 4.7593454620734914e-05, + "loss": 1.1576, + "step": 1801 + }, + { + "epoch": 0.2832832242724361, + "grad_norm": 0.2405199557542801, + "learning_rate": 4.759080904679072e-05, + "loss": 1.179, + "step": 1802 + }, + { + "epoch": 0.2834404291693686, + "grad_norm": 0.21492940187454224, + "learning_rate": 4.758816209308587e-05, + "loss": 1.1806, + "step": 1803 + }, + { + "epoch": 0.2835976340663012, + "grad_norm": 0.2658292055130005, + "learning_rate": 4.758551375978202e-05, + "loss": 1.0843, + "step": 1804 + }, + { + "epoch": 0.2837548389632337, + "grad_norm": 0.2901574969291687, + "learning_rate": 4.758286404704092e-05, + "loss": 1.1257, + "step": 1805 + }, + { + "epoch": 0.2839120438601662, + "grad_norm": 0.34066644310951233, + "learning_rate": 4.758021295502441e-05, + "loss": 1.1073, + "step": 1806 + }, + { + "epoch": 0.28406924875709877, + "grad_norm": 0.2622152268886566, + "learning_rate": 4.7577560483894406e-05, + "loss": 1.1046, + "step": 1807 + }, + { + "epoch": 0.2842264536540313, + "grad_norm": 0.1930551379919052, + "learning_rate": 4.757490663381291e-05, + "loss": 1.2328, + "step": 1808 + }, + { + "epoch": 0.28438365855096387, + "grad_norm": 0.30099764466285706, + "learning_rate": 4.7572251404942e-05, + "loss": 1.1743, + "step": 1809 + }, + { + "epoch": 0.2845408634478964, + "grad_norm": 0.24297019839286804, + "learning_rate": 4.756959479744386e-05, + "loss": 1.1596, + "step": 1810 + }, + { + "epoch": 0.2846980683448289, + "grad_norm": 0.2792363464832306, + "learning_rate": 4.7566936811480744e-05, + "loss": 1.0061, + "step": 1811 + }, + { + "epoch": 0.28485527324176146, + "grad_norm": 0.2141488492488861, + "learning_rate": 4.756427744721499e-05, + "loss": 1.1381, + "step": 1812 + }, + { + "epoch": 0.285012478138694, + "grad_norm": 0.23667530715465546, + "learning_rate": 4.756161670480902e-05, + "loss": 1.1434, + "step": 1813 + }, + { + "epoch": 0.28516968303562656, + "grad_norm": 0.21142233908176422, + "learning_rate": 4.755895458442534e-05, + "loss": 1.1959, + "step": 1814 + }, + { + "epoch": 0.2853268879325591, + "grad_norm": 0.22952231764793396, + "learning_rate": 4.755629108622655e-05, + "loss": 1.132, + "step": 1815 + }, + { + "epoch": 0.28548409282949166, + "grad_norm": 0.2787313759326935, + "learning_rate": 4.7553626210375326e-05, + "loss": 0.9319, + "step": 1816 + }, + { + "epoch": 0.28564129772642416, + "grad_norm": 0.1895769089460373, + "learning_rate": 4.755095995703441e-05, + "loss": 1.2251, + "step": 1817 + }, + { + "epoch": 0.2857985026233567, + "grad_norm": 0.2436160445213318, + "learning_rate": 4.754829232636667e-05, + "loss": 1.14, + "step": 1818 + }, + { + "epoch": 0.28595570752028926, + "grad_norm": 0.3004104793071747, + "learning_rate": 4.7545623318535024e-05, + "loss": 1.1939, + "step": 1819 + }, + { + "epoch": 0.2861129124172218, + "grad_norm": 0.30581870675086975, + "learning_rate": 4.754295293370248e-05, + "loss": 1.1904, + "step": 1820 + }, + { + "epoch": 0.28627011731415436, + "grad_norm": 0.3518725335597992, + "learning_rate": 4.754028117203215e-05, + "loss": 1.1693, + "step": 1821 + }, + { + "epoch": 0.28642732221108685, + "grad_norm": 0.2542705237865448, + "learning_rate": 4.7537608033687204e-05, + "loss": 1.0971, + "step": 1822 + }, + { + "epoch": 0.2865845271080194, + "grad_norm": 0.19205254316329956, + "learning_rate": 4.7534933518830904e-05, + "loss": 1.0652, + "step": 1823 + }, + { + "epoch": 0.28674173200495195, + "grad_norm": 0.2477174997329712, + "learning_rate": 4.753225762762661e-05, + "loss": 1.2265, + "step": 1824 + }, + { + "epoch": 0.2868989369018845, + "grad_norm": 0.19431249797344208, + "learning_rate": 4.7529580360237744e-05, + "loss": 1.1722, + "step": 1825 + }, + { + "epoch": 0.28705614179881705, + "grad_norm": 0.24031542241573334, + "learning_rate": 4.7526901716827846e-05, + "loss": 1.1593, + "step": 1826 + }, + { + "epoch": 0.28721334669574955, + "grad_norm": 0.3656073808670044, + "learning_rate": 4.752422169756048e-05, + "loss": 1.1655, + "step": 1827 + }, + { + "epoch": 0.2873705515926821, + "grad_norm": 0.24645781517028809, + "learning_rate": 4.752154030259936e-05, + "loss": 1.0771, + "step": 1828 + }, + { + "epoch": 0.28752775648961465, + "grad_norm": 0.22396346926689148, + "learning_rate": 4.7518857532108245e-05, + "loss": 1.2857, + "step": 1829 + }, + { + "epoch": 0.2876849613865472, + "grad_norm": 0.37394481897354126, + "learning_rate": 4.751617338625099e-05, + "loss": 1.1057, + "step": 1830 + }, + { + "epoch": 0.28784216628347975, + "grad_norm": 0.260421484708786, + "learning_rate": 4.751348786519154e-05, + "loss": 1.1764, + "step": 1831 + }, + { + "epoch": 0.28799937118041224, + "grad_norm": 0.23633582890033722, + "learning_rate": 4.7510800969093904e-05, + "loss": 1.2257, + "step": 1832 + }, + { + "epoch": 0.2881565760773448, + "grad_norm": 0.2963007092475891, + "learning_rate": 4.750811269812219e-05, + "loss": 1.1226, + "step": 1833 + }, + { + "epoch": 0.28831378097427734, + "grad_norm": 0.2502879798412323, + "learning_rate": 4.75054230524406e-05, + "loss": 1.1942, + "step": 1834 + }, + { + "epoch": 0.2884709858712099, + "grad_norm": 0.20743811130523682, + "learning_rate": 4.750273203221339e-05, + "loss": 1.2321, + "step": 1835 + }, + { + "epoch": 0.28862819076814245, + "grad_norm": 0.3560789227485657, + "learning_rate": 4.750003963760493e-05, + "loss": 1.1188, + "step": 1836 + }, + { + "epoch": 0.28878539566507494, + "grad_norm": 0.3189667761325836, + "learning_rate": 4.7497345868779644e-05, + "loss": 1.3159, + "step": 1837 + }, + { + "epoch": 0.2889426005620075, + "grad_norm": 0.22662386298179626, + "learning_rate": 4.749465072590208e-05, + "loss": 1.055, + "step": 1838 + }, + { + "epoch": 0.28909980545894004, + "grad_norm": 0.18307331204414368, + "learning_rate": 4.749195420913683e-05, + "loss": 1.3076, + "step": 1839 + }, + { + "epoch": 0.2892570103558726, + "grad_norm": 0.25430959463119507, + "learning_rate": 4.74892563186486e-05, + "loss": 1.2806, + "step": 1840 + }, + { + "epoch": 0.28941421525280514, + "grad_norm": 0.21403716504573822, + "learning_rate": 4.748655705460215e-05, + "loss": 1.1572, + "step": 1841 + }, + { + "epoch": 0.2895714201497377, + "grad_norm": 0.1849280297756195, + "learning_rate": 4.7483856417162365e-05, + "loss": 1.1613, + "step": 1842 + }, + { + "epoch": 0.2897286250466702, + "grad_norm": 0.21544575691223145, + "learning_rate": 4.7481154406494164e-05, + "loss": 1.1786, + "step": 1843 + }, + { + "epoch": 0.28988582994360274, + "grad_norm": 0.2631804943084717, + "learning_rate": 4.7478451022762596e-05, + "loss": 1.1764, + "step": 1844 + }, + { + "epoch": 0.2900430348405353, + "grad_norm": 0.21197448670864105, + "learning_rate": 4.747574626613276e-05, + "loss": 1.1235, + "step": 1845 + }, + { + "epoch": 0.29020023973746784, + "grad_norm": 0.26589202880859375, + "learning_rate": 4.7473040136769855e-05, + "loss": 1.2104, + "step": 1846 + }, + { + "epoch": 0.2903574446344004, + "grad_norm": 0.3811849057674408, + "learning_rate": 4.7470332634839165e-05, + "loss": 1.1747, + "step": 1847 + }, + { + "epoch": 0.2905146495313329, + "grad_norm": 0.23455043137073517, + "learning_rate": 4.7467623760506054e-05, + "loss": 1.1475, + "step": 1848 + }, + { + "epoch": 0.29067185442826543, + "grad_norm": 0.2337242066860199, + "learning_rate": 4.746491351393596e-05, + "loss": 1.1085, + "step": 1849 + }, + { + "epoch": 0.290829059325198, + "grad_norm": 0.30514079332351685, + "learning_rate": 4.746220189529442e-05, + "loss": 1.235, + "step": 1850 + }, + { + "epoch": 0.29098626422213053, + "grad_norm": 0.20872575044631958, + "learning_rate": 4.7459488904747064e-05, + "loss": 1.1893, + "step": 1851 + }, + { + "epoch": 0.2911434691190631, + "grad_norm": 0.23578737676143646, + "learning_rate": 4.745677454245957e-05, + "loss": 1.1986, + "step": 1852 + }, + { + "epoch": 0.2913006740159956, + "grad_norm": 0.26326242089271545, + "learning_rate": 4.745405880859773e-05, + "loss": 1.213, + "step": 1853 + }, + { + "epoch": 0.2914578789129281, + "grad_norm": 0.23654600977897644, + "learning_rate": 4.7451341703327414e-05, + "loss": 1.19, + "step": 1854 + }, + { + "epoch": 0.2916150838098607, + "grad_norm": 0.2641269564628601, + "learning_rate": 4.744862322681457e-05, + "loss": 1.1428, + "step": 1855 + }, + { + "epoch": 0.2917722887067932, + "grad_norm": 0.24482795596122742, + "learning_rate": 4.744590337922522e-05, + "loss": 1.1508, + "step": 1856 + }, + { + "epoch": 0.2919294936037258, + "grad_norm": 0.28008776903152466, + "learning_rate": 4.744318216072551e-05, + "loss": 1.3053, + "step": 1857 + }, + { + "epoch": 0.29208669850065827, + "grad_norm": 0.23641882836818695, + "learning_rate": 4.744045957148161e-05, + "loss": 1.0879, + "step": 1858 + }, + { + "epoch": 0.2922439033975908, + "grad_norm": 0.25675082206726074, + "learning_rate": 4.743773561165982e-05, + "loss": 1.2489, + "step": 1859 + }, + { + "epoch": 0.29240110829452337, + "grad_norm": 0.25265899300575256, + "learning_rate": 4.743501028142652e-05, + "loss": 1.1042, + "step": 1860 + }, + { + "epoch": 0.2925583131914559, + "grad_norm": 0.2375422716140747, + "learning_rate": 4.743228358094814e-05, + "loss": 1.2003, + "step": 1861 + }, + { + "epoch": 0.29271551808838847, + "grad_norm": 0.24216416478157043, + "learning_rate": 4.742955551039123e-05, + "loss": 1.1058, + "step": 1862 + }, + { + "epoch": 0.29287272298532097, + "grad_norm": 0.3152044713497162, + "learning_rate": 4.7426826069922416e-05, + "loss": 1.2503, + "step": 1863 + }, + { + "epoch": 0.2930299278822535, + "grad_norm": 0.19799089431762695, + "learning_rate": 4.7424095259708384e-05, + "loss": 1.0635, + "step": 1864 + }, + { + "epoch": 0.29318713277918607, + "grad_norm": 0.24691304564476013, + "learning_rate": 4.742136307991594e-05, + "loss": 1.232, + "step": 1865 + }, + { + "epoch": 0.2933443376761186, + "grad_norm": 0.24523115158081055, + "learning_rate": 4.741862953071194e-05, + "loss": 1.1678, + "step": 1866 + }, + { + "epoch": 0.29350154257305117, + "grad_norm": 0.26968106627464294, + "learning_rate": 4.7415894612263344e-05, + "loss": 1.194, + "step": 1867 + }, + { + "epoch": 0.2936587474699837, + "grad_norm": 0.23013943433761597, + "learning_rate": 4.7413158324737206e-05, + "loss": 1.1052, + "step": 1868 + }, + { + "epoch": 0.2938159523669162, + "grad_norm": 0.21373358368873596, + "learning_rate": 4.741042066830062e-05, + "loss": 1.1555, + "step": 1869 + }, + { + "epoch": 0.29397315726384876, + "grad_norm": 0.26143181324005127, + "learning_rate": 4.740768164312081e-05, + "loss": 1.0752, + "step": 1870 + }, + { + "epoch": 0.2941303621607813, + "grad_norm": 0.216646209359169, + "learning_rate": 4.7404941249365066e-05, + "loss": 1.1678, + "step": 1871 + }, + { + "epoch": 0.29428756705771386, + "grad_norm": 0.22760845720767975, + "learning_rate": 4.740219948720075e-05, + "loss": 1.0414, + "step": 1872 + }, + { + "epoch": 0.2944447719546464, + "grad_norm": 0.2713901400566101, + "learning_rate": 4.739945635679532e-05, + "loss": 1.2089, + "step": 1873 + }, + { + "epoch": 0.2946019768515789, + "grad_norm": 0.2115233987569809, + "learning_rate": 4.739671185831633e-05, + "loss": 1.2092, + "step": 1874 + }, + { + "epoch": 0.29475918174851146, + "grad_norm": 0.24696362018585205, + "learning_rate": 4.739396599193139e-05, + "loss": 1.1438, + "step": 1875 + }, + { + "epoch": 0.294916386645444, + "grad_norm": 0.29846012592315674, + "learning_rate": 4.739121875780821e-05, + "loss": 1.1604, + "step": 1876 + }, + { + "epoch": 0.29507359154237656, + "grad_norm": 0.22811514139175415, + "learning_rate": 4.7388470156114576e-05, + "loss": 1.2149, + "step": 1877 + }, + { + "epoch": 0.2952307964393091, + "grad_norm": 0.24468787014484406, + "learning_rate": 4.738572018701838e-05, + "loss": 1.0935, + "step": 1878 + }, + { + "epoch": 0.2953880013362416, + "grad_norm": 0.28037217259407043, + "learning_rate": 4.738296885068756e-05, + "loss": 1.0999, + "step": 1879 + }, + { + "epoch": 0.29554520623317415, + "grad_norm": 0.2207103669643402, + "learning_rate": 4.738021614729016e-05, + "loss": 1.242, + "step": 1880 + }, + { + "epoch": 0.2957024111301067, + "grad_norm": 0.16394954919815063, + "learning_rate": 4.7377462076994313e-05, + "loss": 1.1138, + "step": 1881 + }, + { + "epoch": 0.29585961602703925, + "grad_norm": 0.3569471538066864, + "learning_rate": 4.7374706639968224e-05, + "loss": 1.1034, + "step": 1882 + }, + { + "epoch": 0.2960168209239718, + "grad_norm": 0.19428043067455292, + "learning_rate": 4.737194983638018e-05, + "loss": 1.2082, + "step": 1883 + }, + { + "epoch": 0.2961740258209043, + "grad_norm": 0.23990128934383392, + "learning_rate": 4.736919166639856e-05, + "loss": 1.2445, + "step": 1884 + }, + { + "epoch": 0.29633123071783685, + "grad_norm": 0.2282816767692566, + "learning_rate": 4.736643213019183e-05, + "loss": 1.0495, + "step": 1885 + }, + { + "epoch": 0.2964884356147694, + "grad_norm": 0.279305100440979, + "learning_rate": 4.736367122792852e-05, + "loss": 1.0828, + "step": 1886 + }, + { + "epoch": 0.29664564051170195, + "grad_norm": 0.18702171742916107, + "learning_rate": 4.736090895977725e-05, + "loss": 1.2215, + "step": 1887 + }, + { + "epoch": 0.2968028454086345, + "grad_norm": 0.232543483376503, + "learning_rate": 4.735814532590675e-05, + "loss": 1.0841, + "step": 1888 + }, + { + "epoch": 0.296960050305567, + "grad_norm": 0.1895131915807724, + "learning_rate": 4.7355380326485796e-05, + "loss": 1.169, + "step": 1889 + }, + { + "epoch": 0.29711725520249954, + "grad_norm": 0.1745792180299759, + "learning_rate": 4.735261396168327e-05, + "loss": 1.2403, + "step": 1890 + }, + { + "epoch": 0.2972744600994321, + "grad_norm": 0.25837990641593933, + "learning_rate": 4.734984623166813e-05, + "loss": 1.0719, + "step": 1891 + }, + { + "epoch": 0.29743166499636464, + "grad_norm": 0.21535173058509827, + "learning_rate": 4.7347077136609416e-05, + "loss": 1.1058, + "step": 1892 + }, + { + "epoch": 0.2975888698932972, + "grad_norm": 0.25150197744369507, + "learning_rate": 4.7344306676676254e-05, + "loss": 1.1828, + "step": 1893 + }, + { + "epoch": 0.29774607479022974, + "grad_norm": 0.2469971477985382, + "learning_rate": 4.734153485203786e-05, + "loss": 1.0672, + "step": 1894 + }, + { + "epoch": 0.29790327968716224, + "grad_norm": 0.2318231761455536, + "learning_rate": 4.733876166286352e-05, + "loss": 1.1708, + "step": 1895 + }, + { + "epoch": 0.2980604845840948, + "grad_norm": 0.1978289932012558, + "learning_rate": 4.733598710932261e-05, + "loss": 1.2133, + "step": 1896 + }, + { + "epoch": 0.29821768948102734, + "grad_norm": 0.22039441764354706, + "learning_rate": 4.733321119158459e-05, + "loss": 1.1628, + "step": 1897 + }, + { + "epoch": 0.2983748943779599, + "grad_norm": 0.1828751415014267, + "learning_rate": 4.7330433909819e-05, + "loss": 1.0819, + "step": 1898 + }, + { + "epoch": 0.29853209927489244, + "grad_norm": 0.3677425980567932, + "learning_rate": 4.732765526419547e-05, + "loss": 1.2305, + "step": 1899 + }, + { + "epoch": 0.29868930417182493, + "grad_norm": 0.2010020911693573, + "learning_rate": 4.732487525488371e-05, + "loss": 1.1933, + "step": 1900 + }, + { + "epoch": 0.2988465090687575, + "grad_norm": 0.2198139876127243, + "learning_rate": 4.732209388205351e-05, + "loss": 1.1228, + "step": 1901 + }, + { + "epoch": 0.29900371396569003, + "grad_norm": 0.26586952805519104, + "learning_rate": 4.731931114587474e-05, + "loss": 1.1978, + "step": 1902 + }, + { + "epoch": 0.2991609188626226, + "grad_norm": 0.18092826008796692, + "learning_rate": 4.731652704651737e-05, + "loss": 1.2788, + "step": 1903 + }, + { + "epoch": 0.29931812375955513, + "grad_norm": 0.23159582912921906, + "learning_rate": 4.731374158415144e-05, + "loss": 1.0886, + "step": 1904 + }, + { + "epoch": 0.2994753286564876, + "grad_norm": 0.24246764183044434, + "learning_rate": 4.7310954758947066e-05, + "loss": 1.1374, + "step": 1905 + }, + { + "epoch": 0.2996325335534202, + "grad_norm": 0.25246042013168335, + "learning_rate": 4.730816657107446e-05, + "loss": 1.2619, + "step": 1906 + }, + { + "epoch": 0.2997897384503527, + "grad_norm": 0.23286496102809906, + "learning_rate": 4.730537702070393e-05, + "loss": 1.2179, + "step": 1907 + }, + { + "epoch": 0.2999469433472853, + "grad_norm": 0.23141705989837646, + "learning_rate": 4.7302586108005834e-05, + "loss": 1.219, + "step": 1908 + }, + { + "epoch": 0.3001041482442178, + "grad_norm": 0.2506294846534729, + "learning_rate": 4.7299793833150624e-05, + "loss": 1.173, + "step": 1909 + }, + { + "epoch": 0.3002613531411503, + "grad_norm": 0.23536083102226257, + "learning_rate": 4.729700019630886e-05, + "loss": 0.9852, + "step": 1910 + }, + { + "epoch": 0.3004185580380829, + "grad_norm": 0.27178215980529785, + "learning_rate": 4.729420519765115e-05, + "loss": 1.2663, + "step": 1911 + }, + { + "epoch": 0.3005757629350154, + "grad_norm": 0.2641531229019165, + "learning_rate": 4.7291408837348224e-05, + "loss": 1.2179, + "step": 1912 + }, + { + "epoch": 0.300732967831948, + "grad_norm": 0.2041010558605194, + "learning_rate": 4.728861111557085e-05, + "loss": 1.1546, + "step": 1913 + }, + { + "epoch": 0.3008901727288805, + "grad_norm": 0.24100835621356964, + "learning_rate": 4.728581203248992e-05, + "loss": 1.2314, + "step": 1914 + }, + { + "epoch": 0.301047377625813, + "grad_norm": 0.2499173879623413, + "learning_rate": 4.7283011588276374e-05, + "loss": 1.1391, + "step": 1915 + }, + { + "epoch": 0.30120458252274557, + "grad_norm": 0.17696775496006012, + "learning_rate": 4.7280209783101265e-05, + "loss": 1.1424, + "step": 1916 + }, + { + "epoch": 0.3013617874196781, + "grad_norm": 0.2696003019809723, + "learning_rate": 4.727740661713571e-05, + "loss": 1.1785, + "step": 1917 + }, + { + "epoch": 0.30151899231661067, + "grad_norm": 0.19227688014507294, + "learning_rate": 4.727460209055092e-05, + "loss": 1.1431, + "step": 1918 + }, + { + "epoch": 0.3016761972135432, + "grad_norm": 0.2030278742313385, + "learning_rate": 4.7271796203518184e-05, + "loss": 1.1968, + "step": 1919 + }, + { + "epoch": 0.30183340211047577, + "grad_norm": 0.2489650398492813, + "learning_rate": 4.726898895620888e-05, + "loss": 1.1133, + "step": 1920 + }, + { + "epoch": 0.30183340211047577, + "eval_loss": 1.1504688262939453, + "eval_runtime": 2346.6617, + "eval_samples_per_second": 3.945, + "eval_steps_per_second": 1.973, + "step": 1920 + }, + { + "epoch": 0.30199060700740826, + "grad_norm": 0.20980967581272125, + "learning_rate": 4.7266180348794456e-05, + "loss": 1.1863, + "step": 1921 + }, + { + "epoch": 0.3021478119043408, + "grad_norm": 0.25084492564201355, + "learning_rate": 4.726337038144645e-05, + "loss": 1.1693, + "step": 1922 + }, + { + "epoch": 0.30230501680127336, + "grad_norm": 0.24765540659427643, + "learning_rate": 4.726055905433649e-05, + "loss": 1.1415, + "step": 1923 + }, + { + "epoch": 0.3024622216982059, + "grad_norm": 0.21769382059574127, + "learning_rate": 4.725774636763628e-05, + "loss": 1.2113, + "step": 1924 + }, + { + "epoch": 0.30261942659513846, + "grad_norm": 0.18449430167675018, + "learning_rate": 4.725493232151761e-05, + "loss": 1.1187, + "step": 1925 + }, + { + "epoch": 0.30277663149207096, + "grad_norm": 0.2294544279575348, + "learning_rate": 4.725211691615234e-05, + "loss": 1.011, + "step": 1926 + }, + { + "epoch": 0.3029338363890035, + "grad_norm": 0.235177144408226, + "learning_rate": 4.724930015171244e-05, + "loss": 1.1604, + "step": 1927 + }, + { + "epoch": 0.30309104128593606, + "grad_norm": 0.32669633626937866, + "learning_rate": 4.724648202836993e-05, + "loss": 1.1437, + "step": 1928 + }, + { + "epoch": 0.3032482461828686, + "grad_norm": 0.1598648726940155, + "learning_rate": 4.7243662546296954e-05, + "loss": 1.2719, + "step": 1929 + }, + { + "epoch": 0.30340545107980116, + "grad_norm": 0.222189262509346, + "learning_rate": 4.724084170566569e-05, + "loss": 1.1576, + "step": 1930 + }, + { + "epoch": 0.30356265597673365, + "grad_norm": 0.20110036432743073, + "learning_rate": 4.723801950664844e-05, + "loss": 1.1906, + "step": 1931 + }, + { + "epoch": 0.3037198608736662, + "grad_norm": 0.25523391366004944, + "learning_rate": 4.7235195949417564e-05, + "loss": 1.1705, + "step": 1932 + }, + { + "epoch": 0.30387706577059875, + "grad_norm": 0.20221413671970367, + "learning_rate": 4.723237103414553e-05, + "loss": 1.13, + "step": 1933 + }, + { + "epoch": 0.3040342706675313, + "grad_norm": 0.2736821472644806, + "learning_rate": 4.722954476100485e-05, + "loss": 0.9517, + "step": 1934 + }, + { + "epoch": 0.30419147556446385, + "grad_norm": 0.20103007555007935, + "learning_rate": 4.722671713016816e-05, + "loss": 1.2214, + "step": 1935 + }, + { + "epoch": 0.30434868046139635, + "grad_norm": 0.23819975554943085, + "learning_rate": 4.7223888141808156e-05, + "loss": 1.1757, + "step": 1936 + }, + { + "epoch": 0.3045058853583289, + "grad_norm": 0.22529488801956177, + "learning_rate": 4.7221057796097614e-05, + "loss": 1.2031, + "step": 1937 + }, + { + "epoch": 0.30466309025526145, + "grad_norm": 0.2521119713783264, + "learning_rate": 4.7218226093209416e-05, + "loss": 1.1728, + "step": 1938 + }, + { + "epoch": 0.304820295152194, + "grad_norm": 0.23056873679161072, + "learning_rate": 4.72153930333165e-05, + "loss": 1.2113, + "step": 1939 + }, + { + "epoch": 0.30497750004912655, + "grad_norm": 0.29192107915878296, + "learning_rate": 4.7212558616591895e-05, + "loss": 1.0284, + "step": 1940 + }, + { + "epoch": 0.30513470494605904, + "grad_norm": 0.1903986781835556, + "learning_rate": 4.7209722843208725e-05, + "loss": 1.2149, + "step": 1941 + }, + { + "epoch": 0.3052919098429916, + "grad_norm": 0.18553946912288666, + "learning_rate": 4.720688571334019e-05, + "loss": 1.1815, + "step": 1942 + }, + { + "epoch": 0.30544911473992414, + "grad_norm": 0.2598629593849182, + "learning_rate": 4.720404722715957e-05, + "loss": 1.0763, + "step": 1943 + }, + { + "epoch": 0.3056063196368567, + "grad_norm": 0.3054077625274658, + "learning_rate": 4.720120738484022e-05, + "loss": 1.1202, + "step": 1944 + }, + { + "epoch": 0.30576352453378924, + "grad_norm": 0.20831403136253357, + "learning_rate": 4.71983661865556e-05, + "loss": 1.1807, + "step": 1945 + }, + { + "epoch": 0.3059207294307218, + "grad_norm": 0.18264667689800262, + "learning_rate": 4.7195523632479226e-05, + "loss": 1.172, + "step": 1946 + }, + { + "epoch": 0.3060779343276543, + "grad_norm": 0.18661391735076904, + "learning_rate": 4.719267972278472e-05, + "loss": 1.2601, + "step": 1947 + }, + { + "epoch": 0.30623513922458684, + "grad_norm": 0.19430875778198242, + "learning_rate": 4.7189834457645775e-05, + "loss": 1.1476, + "step": 1948 + }, + { + "epoch": 0.3063923441215194, + "grad_norm": 0.23284098505973816, + "learning_rate": 4.718698783723616e-05, + "loss": 1.0859, + "step": 1949 + }, + { + "epoch": 0.30654954901845194, + "grad_norm": 0.19055165350437164, + "learning_rate": 4.7184139861729756e-05, + "loss": 1.1245, + "step": 1950 + }, + { + "epoch": 0.3067067539153845, + "grad_norm": 0.20607997477054596, + "learning_rate": 4.7181290531300496e-05, + "loss": 1.2386, + "step": 1951 + }, + { + "epoch": 0.306863958812317, + "grad_norm": 0.16489629447460175, + "learning_rate": 4.717843984612239e-05, + "loss": 1.2782, + "step": 1952 + }, + { + "epoch": 0.30702116370924953, + "grad_norm": 0.22474288940429688, + "learning_rate": 4.717558780636957e-05, + "loss": 1.219, + "step": 1953 + }, + { + "epoch": 0.3071783686061821, + "grad_norm": 0.22763358056545258, + "learning_rate": 4.7172734412216224e-05, + "loss": 1.0742, + "step": 1954 + }, + { + "epoch": 0.30733557350311463, + "grad_norm": 0.2578442692756653, + "learning_rate": 4.7169879663836614e-05, + "loss": 1.0377, + "step": 1955 + }, + { + "epoch": 0.3074927784000472, + "grad_norm": 0.26955658197402954, + "learning_rate": 4.716702356140511e-05, + "loss": 1.0492, + "step": 1956 + }, + { + "epoch": 0.3076499832969797, + "grad_norm": 0.2685282826423645, + "learning_rate": 4.716416610509614e-05, + "loss": 1.1809, + "step": 1957 + }, + { + "epoch": 0.30780718819391223, + "grad_norm": 0.26675713062286377, + "learning_rate": 4.716130729508424e-05, + "loss": 1.2132, + "step": 1958 + }, + { + "epoch": 0.3079643930908448, + "grad_norm": 0.22978322207927704, + "learning_rate": 4.7158447131544e-05, + "loss": 1.1327, + "step": 1959 + }, + { + "epoch": 0.30812159798777733, + "grad_norm": 0.2514789402484894, + "learning_rate": 4.7155585614650134e-05, + "loss": 1.2075, + "step": 1960 + }, + { + "epoch": 0.3082788028847099, + "grad_norm": 0.2616156339645386, + "learning_rate": 4.715272274457738e-05, + "loss": 1.0809, + "step": 1961 + }, + { + "epoch": 0.3084360077816424, + "grad_norm": 0.3378554880619049, + "learning_rate": 4.7149858521500614e-05, + "loss": 1.1245, + "step": 1962 + }, + { + "epoch": 0.3085932126785749, + "grad_norm": 0.20755833387374878, + "learning_rate": 4.714699294559476e-05, + "loss": 1.2121, + "step": 1963 + }, + { + "epoch": 0.3087504175755075, + "grad_norm": 0.2765822410583496, + "learning_rate": 4.714412601703484e-05, + "loss": 1.0652, + "step": 1964 + }, + { + "epoch": 0.30890762247244, + "grad_norm": 0.2736920416355133, + "learning_rate": 4.714125773599596e-05, + "loss": 1.1376, + "step": 1965 + }, + { + "epoch": 0.3090648273693726, + "grad_norm": 0.15031395852565765, + "learning_rate": 4.7138388102653295e-05, + "loss": 1.1655, + "step": 1966 + }, + { + "epoch": 0.30922203226630507, + "grad_norm": 0.30680006742477417, + "learning_rate": 4.713551711718212e-05, + "loss": 1.1064, + "step": 1967 + }, + { + "epoch": 0.3093792371632376, + "grad_norm": 0.2674984335899353, + "learning_rate": 4.713264477975777e-05, + "loss": 1.2259, + "step": 1968 + }, + { + "epoch": 0.30953644206017017, + "grad_norm": 0.2514277398586273, + "learning_rate": 4.7129771090555694e-05, + "loss": 1.1184, + "step": 1969 + }, + { + "epoch": 0.3096936469571027, + "grad_norm": 0.25297811627388, + "learning_rate": 4.71268960497514e-05, + "loss": 1.0941, + "step": 1970 + }, + { + "epoch": 0.30985085185403527, + "grad_norm": 0.24282206594944, + "learning_rate": 4.712401965752048e-05, + "loss": 1.0437, + "step": 1971 + }, + { + "epoch": 0.3100080567509678, + "grad_norm": 0.17618770897388458, + "learning_rate": 4.712114191403862e-05, + "loss": 1.2034, + "step": 1972 + }, + { + "epoch": 0.3101652616479003, + "grad_norm": 0.2321152687072754, + "learning_rate": 4.7118262819481576e-05, + "loss": 1.1255, + "step": 1973 + }, + { + "epoch": 0.31032246654483286, + "grad_norm": 0.2128259241580963, + "learning_rate": 4.711538237402518e-05, + "loss": 1.2008, + "step": 1974 + }, + { + "epoch": 0.3104796714417654, + "grad_norm": 0.20988841354846954, + "learning_rate": 4.711250057784539e-05, + "loss": 1.2586, + "step": 1975 + }, + { + "epoch": 0.31063687633869796, + "grad_norm": 0.16251863539218903, + "learning_rate": 4.7109617431118195e-05, + "loss": 1.1829, + "step": 1976 + }, + { + "epoch": 0.3107940812356305, + "grad_norm": 0.24834904074668884, + "learning_rate": 4.710673293401968e-05, + "loss": 1.1107, + "step": 1977 + }, + { + "epoch": 0.310951286132563, + "grad_norm": 0.42274773120880127, + "learning_rate": 4.7103847086726026e-05, + "loss": 1.1066, + "step": 1978 + }, + { + "epoch": 0.31110849102949556, + "grad_norm": 0.22393664717674255, + "learning_rate": 4.7100959889413505e-05, + "loss": 1.1203, + "step": 1979 + }, + { + "epoch": 0.3112656959264281, + "grad_norm": 0.2756640315055847, + "learning_rate": 4.709807134225843e-05, + "loss": 1.0559, + "step": 1980 + }, + { + "epoch": 0.31142290082336066, + "grad_norm": 0.3086540997028351, + "learning_rate": 4.709518144543724e-05, + "loss": 1.0609, + "step": 1981 + }, + { + "epoch": 0.3115801057202932, + "grad_norm": 0.22327710688114166, + "learning_rate": 4.7092290199126444e-05, + "loss": 1.1378, + "step": 1982 + }, + { + "epoch": 0.3117373106172257, + "grad_norm": 0.24802890419960022, + "learning_rate": 4.708939760350261e-05, + "loss": 1.3069, + "step": 1983 + }, + { + "epoch": 0.31189451551415825, + "grad_norm": 0.24601417779922485, + "learning_rate": 4.708650365874241e-05, + "loss": 0.9913, + "step": 1984 + }, + { + "epoch": 0.3120517204110908, + "grad_norm": 0.1627042442560196, + "learning_rate": 4.70836083650226e-05, + "loss": 1.1499, + "step": 1985 + }, + { + "epoch": 0.31220892530802336, + "grad_norm": 0.28036314249038696, + "learning_rate": 4.708071172252002e-05, + "loss": 1.2129, + "step": 1986 + }, + { + "epoch": 0.3123661302049559, + "grad_norm": 0.1716984510421753, + "learning_rate": 4.707781373141158e-05, + "loss": 1.0891, + "step": 1987 + }, + { + "epoch": 0.3125233351018884, + "grad_norm": 0.23412878811359406, + "learning_rate": 4.707491439187427e-05, + "loss": 1.1228, + "step": 1988 + }, + { + "epoch": 0.31268053999882095, + "grad_norm": 0.1973845213651657, + "learning_rate": 4.707201370408518e-05, + "loss": 1.0968, + "step": 1989 + }, + { + "epoch": 0.3128377448957535, + "grad_norm": 0.22760836780071259, + "learning_rate": 4.7069111668221476e-05, + "loss": 1.0974, + "step": 1990 + }, + { + "epoch": 0.31299494979268605, + "grad_norm": 0.24710045754909515, + "learning_rate": 4.70662082844604e-05, + "loss": 1.189, + "step": 1991 + }, + { + "epoch": 0.3131521546896186, + "grad_norm": 0.1627720296382904, + "learning_rate": 4.7063303552979275e-05, + "loss": 1.2501, + "step": 1992 + }, + { + "epoch": 0.3133093595865511, + "grad_norm": 0.2823135554790497, + "learning_rate": 4.706039747395552e-05, + "loss": 1.1686, + "step": 1993 + }, + { + "epoch": 0.31346656448348365, + "grad_norm": 0.16645194590091705, + "learning_rate": 4.705749004756662e-05, + "loss": 1.1522, + "step": 1994 + }, + { + "epoch": 0.3136237693804162, + "grad_norm": 0.2954739034175873, + "learning_rate": 4.705458127399015e-05, + "loss": 1.0194, + "step": 1995 + }, + { + "epoch": 0.31378097427734875, + "grad_norm": 0.21758581697940826, + "learning_rate": 4.705167115340376e-05, + "loss": 1.1406, + "step": 1996 + }, + { + "epoch": 0.3139381791742813, + "grad_norm": 0.2450348287820816, + "learning_rate": 4.704875968598521e-05, + "loss": 1.1483, + "step": 1997 + }, + { + "epoch": 0.31409538407121385, + "grad_norm": 0.22131307423114777, + "learning_rate": 4.70458468719123e-05, + "loss": 1.2813, + "step": 1998 + }, + { + "epoch": 0.31425258896814634, + "grad_norm": 0.20907478034496307, + "learning_rate": 4.704293271136294e-05, + "loss": 1.2129, + "step": 1999 + }, + { + "epoch": 0.3144097938650789, + "grad_norm": 0.18173012137413025, + "learning_rate": 4.704001720451513e-05, + "loss": 1.1854, + "step": 2000 + }, + { + "epoch": 0.31456699876201144, + "grad_norm": 0.24892567098140717, + "learning_rate": 4.7037100351546914e-05, + "loss": 1.2041, + "step": 2001 + }, + { + "epoch": 0.314724203658944, + "grad_norm": 0.24656441807746887, + "learning_rate": 4.703418215263646e-05, + "loss": 1.1454, + "step": 2002 + }, + { + "epoch": 0.31488140855587654, + "grad_norm": 0.31545913219451904, + "learning_rate": 4.703126260796199e-05, + "loss": 1.0506, + "step": 2003 + }, + { + "epoch": 0.31503861345280904, + "grad_norm": 0.19431567192077637, + "learning_rate": 4.702834171770183e-05, + "loss": 1.1689, + "step": 2004 + }, + { + "epoch": 0.3151958183497416, + "grad_norm": 0.22574302554130554, + "learning_rate": 4.702541948203436e-05, + "loss": 1.1604, + "step": 2005 + }, + { + "epoch": 0.31535302324667414, + "grad_norm": 0.18993429839611053, + "learning_rate": 4.7022495901138084e-05, + "loss": 1.2154, + "step": 2006 + }, + { + "epoch": 0.3155102281436067, + "grad_norm": 0.21746498346328735, + "learning_rate": 4.7019570975191544e-05, + "loss": 1.1902, + "step": 2007 + }, + { + "epoch": 0.31566743304053924, + "grad_norm": 0.274848610162735, + "learning_rate": 4.701664470437338e-05, + "loss": 1.0607, + "step": 2008 + }, + { + "epoch": 0.31582463793747173, + "grad_norm": 0.2473202347755432, + "learning_rate": 4.701371708886233e-05, + "loss": 1.1909, + "step": 2009 + }, + { + "epoch": 0.3159818428344043, + "grad_norm": 0.22823210060596466, + "learning_rate": 4.701078812883719e-05, + "loss": 1.1499, + "step": 2010 + }, + { + "epoch": 0.31613904773133683, + "grad_norm": 0.21258869767189026, + "learning_rate": 4.700785782447686e-05, + "loss": 1.029, + "step": 2011 + }, + { + "epoch": 0.3162962526282694, + "grad_norm": 0.17091694474220276, + "learning_rate": 4.700492617596032e-05, + "loss": 1.1888, + "step": 2012 + }, + { + "epoch": 0.31645345752520193, + "grad_norm": 0.22551240026950836, + "learning_rate": 4.70019931834666e-05, + "loss": 1.1017, + "step": 2013 + }, + { + "epoch": 0.3166106624221344, + "grad_norm": 0.24327991902828217, + "learning_rate": 4.6999058847174856e-05, + "loss": 1.1317, + "step": 2014 + }, + { + "epoch": 0.316767867319067, + "grad_norm": 0.2276056557893753, + "learning_rate": 4.699612316726429e-05, + "loss": 1.1481, + "step": 2015 + }, + { + "epoch": 0.3169250722159995, + "grad_norm": 0.3077816069126129, + "learning_rate": 4.699318614391422e-05, + "loss": 1.1448, + "step": 2016 + }, + { + "epoch": 0.3170822771129321, + "grad_norm": 0.17493993043899536, + "learning_rate": 4.699024777730402e-05, + "loss": 1.1789, + "step": 2017 + }, + { + "epoch": 0.3172394820098646, + "grad_norm": 0.2110036462545395, + "learning_rate": 4.698730806761314e-05, + "loss": 1.1272, + "step": 2018 + }, + { + "epoch": 0.3173966869067971, + "grad_norm": 0.23337996006011963, + "learning_rate": 4.6984367015021154e-05, + "loss": 1.0113, + "step": 2019 + }, + { + "epoch": 0.31755389180372967, + "grad_norm": 0.23595476150512695, + "learning_rate": 4.698142461970767e-05, + "loss": 1.0771, + "step": 2020 + }, + { + "epoch": 0.3177110967006622, + "grad_norm": 0.3300482928752899, + "learning_rate": 4.697848088185241e-05, + "loss": 1.0853, + "step": 2021 + }, + { + "epoch": 0.31786830159759477, + "grad_norm": 0.26284661889076233, + "learning_rate": 4.6975535801635146e-05, + "loss": 1.1216, + "step": 2022 + }, + { + "epoch": 0.3180255064945273, + "grad_norm": 0.2010652720928192, + "learning_rate": 4.6972589379235775e-05, + "loss": 1.1745, + "step": 2023 + }, + { + "epoch": 0.31818271139145987, + "grad_norm": 0.29553526639938354, + "learning_rate": 4.6969641614834244e-05, + "loss": 1.1722, + "step": 2024 + }, + { + "epoch": 0.31833991628839237, + "grad_norm": 0.2783648669719696, + "learning_rate": 4.6966692508610586e-05, + "loss": 1.1632, + "step": 2025 + }, + { + "epoch": 0.3184971211853249, + "grad_norm": 0.2214052528142929, + "learning_rate": 4.696374206074494e-05, + "loss": 1.2004, + "step": 2026 + }, + { + "epoch": 0.31865432608225747, + "grad_norm": 0.3076881766319275, + "learning_rate": 4.696079027141749e-05, + "loss": 1.1441, + "step": 2027 + }, + { + "epoch": 0.31881153097919, + "grad_norm": 0.2371261715888977, + "learning_rate": 4.695783714080852e-05, + "loss": 1.1661, + "step": 2028 + }, + { + "epoch": 0.31896873587612257, + "grad_norm": 0.20042219758033752, + "learning_rate": 4.695488266909841e-05, + "loss": 1.2247, + "step": 2029 + }, + { + "epoch": 0.31912594077305506, + "grad_norm": 0.1989884376525879, + "learning_rate": 4.69519268564676e-05, + "loss": 1.1628, + "step": 2030 + }, + { + "epoch": 0.3192831456699876, + "grad_norm": 0.16829486191272736, + "learning_rate": 4.6948969703096614e-05, + "loss": 1.0928, + "step": 2031 + }, + { + "epoch": 0.31944035056692016, + "grad_norm": 0.20480427145957947, + "learning_rate": 4.694601120916607e-05, + "loss": 1.3009, + "step": 2032 + }, + { + "epoch": 0.3195975554638527, + "grad_norm": 0.20662841200828552, + "learning_rate": 4.694305137485666e-05, + "loss": 1.1324, + "step": 2033 + }, + { + "epoch": 0.31975476036078526, + "grad_norm": 0.22225816547870636, + "learning_rate": 4.6940090200349165e-05, + "loss": 1.0711, + "step": 2034 + }, + { + "epoch": 0.31991196525771776, + "grad_norm": 0.1969664990901947, + "learning_rate": 4.6937127685824426e-05, + "loss": 1.223, + "step": 2035 + }, + { + "epoch": 0.3200691701546503, + "grad_norm": 0.21913392841815948, + "learning_rate": 4.6934163831463405e-05, + "loss": 1.1268, + "step": 2036 + }, + { + "epoch": 0.32022637505158286, + "grad_norm": 0.21957406401634216, + "learning_rate": 4.69311986374471e-05, + "loss": 1.1839, + "step": 2037 + }, + { + "epoch": 0.3203835799485154, + "grad_norm": 0.189280167222023, + "learning_rate": 4.6928232103956635e-05, + "loss": 1.1528, + "step": 2038 + }, + { + "epoch": 0.32054078484544796, + "grad_norm": 0.26548171043395996, + "learning_rate": 4.6925264231173185e-05, + "loss": 1.1047, + "step": 2039 + }, + { + "epoch": 0.32069798974238045, + "grad_norm": 0.20444843173027039, + "learning_rate": 4.6922295019278005e-05, + "loss": 1.0089, + "step": 2040 + }, + { + "epoch": 0.320855194639313, + "grad_norm": 0.2803153693675995, + "learning_rate": 4.691932446845246e-05, + "loss": 1.0605, + "step": 2041 + }, + { + "epoch": 0.32101239953624555, + "grad_norm": 0.22570960223674774, + "learning_rate": 4.691635257887798e-05, + "loss": 1.2089, + "step": 2042 + }, + { + "epoch": 0.3211696044331781, + "grad_norm": 0.2413252294063568, + "learning_rate": 4.691337935073606e-05, + "loss": 1.2844, + "step": 2043 + }, + { + "epoch": 0.32132680933011065, + "grad_norm": 0.2029626965522766, + "learning_rate": 4.691040478420832e-05, + "loss": 1.1383, + "step": 2044 + }, + { + "epoch": 0.32148401422704315, + "grad_norm": 0.22830082476139069, + "learning_rate": 4.6907428879476404e-05, + "loss": 1.2141, + "step": 2045 + }, + { + "epoch": 0.3216412191239757, + "grad_norm": 0.2177695333957672, + "learning_rate": 4.690445163672209e-05, + "loss": 1.0376, + "step": 2046 + }, + { + "epoch": 0.32179842402090825, + "grad_norm": 0.24039793014526367, + "learning_rate": 4.690147305612721e-05, + "loss": 1.1292, + "step": 2047 + }, + { + "epoch": 0.3219556289178408, + "grad_norm": 0.20877334475517273, + "learning_rate": 4.6898493137873687e-05, + "loss": 1.1784, + "step": 2048 + }, + { + "epoch": 0.32211283381477335, + "grad_norm": 0.238002210855484, + "learning_rate": 4.689551188214352e-05, + "loss": 1.1585, + "step": 2049 + }, + { + "epoch": 0.3222700387117059, + "grad_norm": 0.24130697548389435, + "learning_rate": 4.68925292891188e-05, + "loss": 1.0301, + "step": 2050 + }, + { + "epoch": 0.3224272436086384, + "grad_norm": 0.2053060382604599, + "learning_rate": 4.688954535898168e-05, + "loss": 1.2153, + "step": 2051 + }, + { + "epoch": 0.32258444850557094, + "grad_norm": 0.2303832322359085, + "learning_rate": 4.6886560091914415e-05, + "loss": 1.1762, + "step": 2052 + }, + { + "epoch": 0.3227416534025035, + "grad_norm": 0.25025346875190735, + "learning_rate": 4.688357348809933e-05, + "loss": 1.111, + "step": 2053 + }, + { + "epoch": 0.32289885829943604, + "grad_norm": 0.2501404583454132, + "learning_rate": 4.6880585547718845e-05, + "loss": 1.0784, + "step": 2054 + }, + { + "epoch": 0.3230560631963686, + "grad_norm": 0.2034914195537567, + "learning_rate": 4.687759627095544e-05, + "loss": 1.17, + "step": 2055 + }, + { + "epoch": 0.3232132680933011, + "grad_norm": 0.17908981442451477, + "learning_rate": 4.68746056579917e-05, + "loss": 1.1601, + "step": 2056 + }, + { + "epoch": 0.32337047299023364, + "grad_norm": 0.22271564602851868, + "learning_rate": 4.6871613709010266e-05, + "loss": 1.2194, + "step": 2057 + }, + { + "epoch": 0.3235276778871662, + "grad_norm": 0.23161068558692932, + "learning_rate": 4.6868620424193885e-05, + "loss": 1.1704, + "step": 2058 + }, + { + "epoch": 0.32368488278409874, + "grad_norm": 0.24310912191867828, + "learning_rate": 4.6865625803725375e-05, + "loss": 1.0526, + "step": 2059 + }, + { + "epoch": 0.3238420876810313, + "grad_norm": 0.2134714424610138, + "learning_rate": 4.6862629847787633e-05, + "loss": 1.1974, + "step": 2060 + }, + { + "epoch": 0.3239992925779638, + "grad_norm": 0.17639285326004028, + "learning_rate": 4.685963255656364e-05, + "loss": 1.2024, + "step": 2061 + }, + { + "epoch": 0.32415649747489633, + "grad_norm": 0.22746217250823975, + "learning_rate": 4.6856633930236457e-05, + "loss": 1.1888, + "step": 2062 + }, + { + "epoch": 0.3243137023718289, + "grad_norm": 0.17931844294071198, + "learning_rate": 4.6853633968989244e-05, + "loss": 1.2835, + "step": 2063 + }, + { + "epoch": 0.32447090726876143, + "grad_norm": 0.18917080760002136, + "learning_rate": 4.68506326730052e-05, + "loss": 1.2193, + "step": 2064 + }, + { + "epoch": 0.324628112165694, + "grad_norm": 0.2527455687522888, + "learning_rate": 4.684763004246766e-05, + "loss": 1.2187, + "step": 2065 + }, + { + "epoch": 0.3247853170626265, + "grad_norm": 0.1864977329969406, + "learning_rate": 4.684462607756001e-05, + "loss": 1.2025, + "step": 2066 + }, + { + "epoch": 0.324942521959559, + "grad_norm": 0.3215543031692505, + "learning_rate": 4.6841620778465695e-05, + "loss": 1.16, + "step": 2067 + }, + { + "epoch": 0.3250997268564916, + "grad_norm": 0.3193890452384949, + "learning_rate": 4.683861414536829e-05, + "loss": 1.0823, + "step": 2068 + }, + { + "epoch": 0.32525693175342413, + "grad_norm": 0.2766779959201813, + "learning_rate": 4.683560617845143e-05, + "loss": 1.0615, + "step": 2069 + }, + { + "epoch": 0.3254141366503567, + "grad_norm": 0.2471621334552765, + "learning_rate": 4.683259687789881e-05, + "loss": 1.1033, + "step": 2070 + }, + { + "epoch": 0.3255713415472892, + "grad_norm": 0.22947926819324493, + "learning_rate": 4.682958624389426e-05, + "loss": 1.3086, + "step": 2071 + }, + { + "epoch": 0.3257285464442217, + "grad_norm": 0.19345812499523163, + "learning_rate": 4.682657427662163e-05, + "loss": 1.1013, + "step": 2072 + }, + { + "epoch": 0.3258857513411543, + "grad_norm": 0.21470381319522858, + "learning_rate": 4.682356097626488e-05, + "loss": 1.1439, + "step": 2073 + }, + { + "epoch": 0.3260429562380868, + "grad_norm": 0.19939084351062775, + "learning_rate": 4.682054634300807e-05, + "loss": 1.0446, + "step": 2074 + }, + { + "epoch": 0.3262001611350194, + "grad_norm": 0.23096968233585358, + "learning_rate": 4.68175303770353e-05, + "loss": 1.1722, + "step": 2075 + }, + { + "epoch": 0.32635736603195187, + "grad_norm": 0.19705285131931305, + "learning_rate": 4.6814513078530796e-05, + "loss": 1.2121, + "step": 2076 + }, + { + "epoch": 0.3265145709288844, + "grad_norm": 0.14740808308124542, + "learning_rate": 4.681149444767883e-05, + "loss": 1.0735, + "step": 2077 + }, + { + "epoch": 0.32667177582581697, + "grad_norm": 0.1801644265651703, + "learning_rate": 4.680847448466376e-05, + "loss": 1.14, + "step": 2078 + }, + { + "epoch": 0.3268289807227495, + "grad_norm": 0.221450537443161, + "learning_rate": 4.680545318967006e-05, + "loss": 1.1093, + "step": 2079 + }, + { + "epoch": 0.32698618561968207, + "grad_norm": 0.25395452976226807, + "learning_rate": 4.6802430562882226e-05, + "loss": 1.1791, + "step": 2080 + }, + { + "epoch": 0.32698618561968207, + "eval_loss": 1.1437312364578247, + "eval_runtime": 2317.1332, + "eval_samples_per_second": 3.995, + "eval_steps_per_second": 1.998, + "step": 2080 + }, + { + "epoch": 0.3271433905166146, + "grad_norm": 0.15380804240703583, + "learning_rate": 4.6799406604484894e-05, + "loss": 1.1315, + "step": 2081 + }, + { + "epoch": 0.3273005954135471, + "grad_norm": 0.24307769536972046, + "learning_rate": 4.679638131466275e-05, + "loss": 1.0447, + "step": 2082 + }, + { + "epoch": 0.32745780031047966, + "grad_norm": 0.19176973402500153, + "learning_rate": 4.6793354693600565e-05, + "loss": 1.1879, + "step": 2083 + }, + { + "epoch": 0.3276150052074122, + "grad_norm": 0.2640921175479889, + "learning_rate": 4.679032674148319e-05, + "loss": 1.0972, + "step": 2084 + }, + { + "epoch": 0.32777221010434476, + "grad_norm": 0.2140382081270218, + "learning_rate": 4.678729745849557e-05, + "loss": 1.2197, + "step": 2085 + }, + { + "epoch": 0.3279294150012773, + "grad_norm": 0.2784416675567627, + "learning_rate": 4.678426684482272e-05, + "loss": 1.1183, + "step": 2086 + }, + { + "epoch": 0.3280866198982098, + "grad_norm": 0.2722271978855133, + "learning_rate": 4.678123490064973e-05, + "loss": 1.1631, + "step": 2087 + }, + { + "epoch": 0.32824382479514236, + "grad_norm": 0.18923896551132202, + "learning_rate": 4.6778201626161776e-05, + "loss": 1.1795, + "step": 2088 + }, + { + "epoch": 0.3284010296920749, + "grad_norm": 0.18081101775169373, + "learning_rate": 4.6775167021544136e-05, + "loss": 1.1308, + "step": 2089 + }, + { + "epoch": 0.32855823458900746, + "grad_norm": 0.24127820134162903, + "learning_rate": 4.677213108698214e-05, + "loss": 1.1461, + "step": 2090 + }, + { + "epoch": 0.32871543948594, + "grad_norm": 0.20159919559955597, + "learning_rate": 4.6769093822661214e-05, + "loss": 1.2414, + "step": 2091 + }, + { + "epoch": 0.3288726443828725, + "grad_norm": 0.2225925475358963, + "learning_rate": 4.676605522876687e-05, + "loss": 1.3019, + "step": 2092 + }, + { + "epoch": 0.32902984927980505, + "grad_norm": 0.20402312278747559, + "learning_rate": 4.676301530548468e-05, + "loss": 0.9615, + "step": 2093 + }, + { + "epoch": 0.3291870541767376, + "grad_norm": 0.2000385820865631, + "learning_rate": 4.6759974053000324e-05, + "loss": 1.1958, + "step": 2094 + }, + { + "epoch": 0.32934425907367015, + "grad_norm": 0.27835071086883545, + "learning_rate": 4.6756931471499546e-05, + "loss": 1.1249, + "step": 2095 + }, + { + "epoch": 0.3295014639706027, + "grad_norm": 0.20449519157409668, + "learning_rate": 4.675388756116816e-05, + "loss": 1.0496, + "step": 2096 + }, + { + "epoch": 0.3296586688675352, + "grad_norm": 0.26013025641441345, + "learning_rate": 4.67508423221921e-05, + "loss": 1.1194, + "step": 2097 + }, + { + "epoch": 0.32981587376446775, + "grad_norm": 0.19826790690422058, + "learning_rate": 4.6747795754757354e-05, + "loss": 1.1878, + "step": 2098 + }, + { + "epoch": 0.3299730786614003, + "grad_norm": 0.20292681455612183, + "learning_rate": 4.6744747859049975e-05, + "loss": 1.2216, + "step": 2099 + }, + { + "epoch": 0.33013028355833285, + "grad_norm": 0.2701912224292755, + "learning_rate": 4.674169863525614e-05, + "loss": 1.1728, + "step": 2100 + }, + { + "epoch": 0.3302874884552654, + "grad_norm": 0.2722685933113098, + "learning_rate": 4.673864808356206e-05, + "loss": 1.1967, + "step": 2101 + }, + { + "epoch": 0.3304446933521979, + "grad_norm": 0.19252356886863708, + "learning_rate": 4.673559620415408e-05, + "loss": 1.1614, + "step": 2102 + }, + { + "epoch": 0.33060189824913044, + "grad_norm": 0.2063681185245514, + "learning_rate": 4.673254299721858e-05, + "loss": 1.192, + "step": 2103 + }, + { + "epoch": 0.330759103146063, + "grad_norm": 0.22739455103874207, + "learning_rate": 4.6729488462942036e-05, + "loss": 1.207, + "step": 2104 + }, + { + "epoch": 0.33091630804299554, + "grad_norm": 0.220958411693573, + "learning_rate": 4.672643260151101e-05, + "loss": 1.1716, + "step": 2105 + }, + { + "epoch": 0.3310735129399281, + "grad_norm": 0.24066145718097687, + "learning_rate": 4.672337541311215e-05, + "loss": 1.1382, + "step": 2106 + }, + { + "epoch": 0.33123071783686064, + "grad_norm": 0.25801724195480347, + "learning_rate": 4.672031689793217e-05, + "loss": 1.1494, + "step": 2107 + }, + { + "epoch": 0.33138792273379314, + "grad_norm": 0.2185317426919937, + "learning_rate": 4.671725705615787e-05, + "loss": 1.1325, + "step": 2108 + }, + { + "epoch": 0.3315451276307257, + "grad_norm": 0.25223109126091003, + "learning_rate": 4.671419588797615e-05, + "loss": 1.1606, + "step": 2109 + }, + { + "epoch": 0.33170233252765824, + "grad_norm": 0.320625364780426, + "learning_rate": 4.6711133393573945e-05, + "loss": 1.0385, + "step": 2110 + }, + { + "epoch": 0.3318595374245908, + "grad_norm": 0.2799528241157532, + "learning_rate": 4.6708069573138335e-05, + "loss": 1.1079, + "step": 2111 + }, + { + "epoch": 0.33201674232152334, + "grad_norm": 0.2274870127439499, + "learning_rate": 4.670500442685642e-05, + "loss": 1.0838, + "step": 2112 + }, + { + "epoch": 0.33217394721845583, + "grad_norm": 0.27315694093704224, + "learning_rate": 4.670193795491542e-05, + "loss": 1.095, + "step": 2113 + }, + { + "epoch": 0.3323311521153884, + "grad_norm": 0.21600811183452606, + "learning_rate": 4.669887015750262e-05, + "loss": 1.0079, + "step": 2114 + }, + { + "epoch": 0.33248835701232093, + "grad_norm": 0.2445525974035263, + "learning_rate": 4.669580103480539e-05, + "loss": 1.1513, + "step": 2115 + }, + { + "epoch": 0.3326455619092535, + "grad_norm": 0.2136092483997345, + "learning_rate": 4.669273058701117e-05, + "loss": 1.203, + "step": 2116 + }, + { + "epoch": 0.33280276680618603, + "grad_norm": 0.2629857361316681, + "learning_rate": 4.668965881430751e-05, + "loss": 1.0757, + "step": 2117 + }, + { + "epoch": 0.33295997170311853, + "grad_norm": 0.2461937963962555, + "learning_rate": 4.668658571688201e-05, + "loss": 1.0329, + "step": 2118 + }, + { + "epoch": 0.3331171766000511, + "grad_norm": 0.21537308394908905, + "learning_rate": 4.668351129492237e-05, + "loss": 1.1246, + "step": 2119 + }, + { + "epoch": 0.33327438149698363, + "grad_norm": 0.21597877144813538, + "learning_rate": 4.6680435548616366e-05, + "loss": 1.1728, + "step": 2120 + }, + { + "epoch": 0.3334315863939162, + "grad_norm": 0.2764485776424408, + "learning_rate": 4.667735847815183e-05, + "loss": 1.1108, + "step": 2121 + }, + { + "epoch": 0.33358879129084873, + "grad_norm": 0.24224579334259033, + "learning_rate": 4.667428008371674e-05, + "loss": 1.1129, + "step": 2122 + }, + { + "epoch": 0.3337459961877812, + "grad_norm": 0.2688109874725342, + "learning_rate": 4.667120036549907e-05, + "loss": 1.0214, + "step": 2123 + }, + { + "epoch": 0.3339032010847138, + "grad_norm": 0.30181390047073364, + "learning_rate": 4.666811932368693e-05, + "loss": 1.1261, + "step": 2124 + }, + { + "epoch": 0.3340604059816463, + "grad_norm": 0.20725978910923004, + "learning_rate": 4.666503695846852e-05, + "loss": 1.193, + "step": 2125 + }, + { + "epoch": 0.3342176108785789, + "grad_norm": 0.1896386742591858, + "learning_rate": 4.666195327003208e-05, + "loss": 1.1811, + "step": 2126 + }, + { + "epoch": 0.3343748157755114, + "grad_norm": 0.1871979534626007, + "learning_rate": 4.665886825856594e-05, + "loss": 1.2184, + "step": 2127 + }, + { + "epoch": 0.3345320206724439, + "grad_norm": 0.2065206915140152, + "learning_rate": 4.665578192425854e-05, + "loss": 1.1762, + "step": 2128 + }, + { + "epoch": 0.33468922556937647, + "grad_norm": 0.293587863445282, + "learning_rate": 4.665269426729838e-05, + "loss": 1.1041, + "step": 2129 + }, + { + "epoch": 0.334846430466309, + "grad_norm": 0.23634661734104156, + "learning_rate": 4.664960528787403e-05, + "loss": 1.0625, + "step": 2130 + }, + { + "epoch": 0.33500363536324157, + "grad_norm": 0.21932774782180786, + "learning_rate": 4.664651498617417e-05, + "loss": 1.0748, + "step": 2131 + }, + { + "epoch": 0.3351608402601741, + "grad_norm": 0.23783034086227417, + "learning_rate": 4.6643423362387526e-05, + "loss": 1.1149, + "step": 2132 + }, + { + "epoch": 0.33531804515710667, + "grad_norm": 0.24081239104270935, + "learning_rate": 4.664033041670293e-05, + "loss": 1.2532, + "step": 2133 + }, + { + "epoch": 0.33547525005403916, + "grad_norm": 0.23142661154270172, + "learning_rate": 4.6637236149309296e-05, + "loss": 1.178, + "step": 2134 + }, + { + "epoch": 0.3356324549509717, + "grad_norm": 0.27728673815727234, + "learning_rate": 4.663414056039559e-05, + "loss": 1.0755, + "step": 2135 + }, + { + "epoch": 0.33578965984790426, + "grad_norm": 0.22718535363674164, + "learning_rate": 4.6631043650150905e-05, + "loss": 1.1332, + "step": 2136 + }, + { + "epoch": 0.3359468647448368, + "grad_norm": 0.28607967495918274, + "learning_rate": 4.6627945418764366e-05, + "loss": 1.2812, + "step": 2137 + }, + { + "epoch": 0.33610406964176937, + "grad_norm": 0.19152137637138367, + "learning_rate": 4.6624845866425215e-05, + "loss": 1.0983, + "step": 2138 + }, + { + "epoch": 0.33626127453870186, + "grad_norm": 0.230951726436615, + "learning_rate": 4.662174499332275e-05, + "loss": 1.1511, + "step": 2139 + }, + { + "epoch": 0.3364184794356344, + "grad_norm": 0.23929493129253387, + "learning_rate": 4.661864279964637e-05, + "loss": 1.1524, + "step": 2140 + }, + { + "epoch": 0.33657568433256696, + "grad_norm": 0.2901476323604584, + "learning_rate": 4.661553928558554e-05, + "loss": 1.1344, + "step": 2141 + }, + { + "epoch": 0.3367328892294995, + "grad_norm": 0.26585260033607483, + "learning_rate": 4.661243445132981e-05, + "loss": 1.1513, + "step": 2142 + }, + { + "epoch": 0.33689009412643206, + "grad_norm": 0.23678666353225708, + "learning_rate": 4.660932829706882e-05, + "loss": 1.0852, + "step": 2143 + }, + { + "epoch": 0.33704729902336455, + "grad_norm": 0.4830389618873596, + "learning_rate": 4.660622082299227e-05, + "loss": 1.1329, + "step": 2144 + }, + { + "epoch": 0.3372045039202971, + "grad_norm": 0.21181213855743408, + "learning_rate": 4.660311202928996e-05, + "loss": 1.1513, + "step": 2145 + }, + { + "epoch": 0.33736170881722966, + "grad_norm": 0.22538620233535767, + "learning_rate": 4.660000191615176e-05, + "loss": 1.1165, + "step": 2146 + }, + { + "epoch": 0.3375189137141622, + "grad_norm": 0.19452321529388428, + "learning_rate": 4.659689048376763e-05, + "loss": 1.1231, + "step": 2147 + }, + { + "epoch": 0.33767611861109476, + "grad_norm": 0.2382746934890747, + "learning_rate": 4.6593777732327595e-05, + "loss": 1.1805, + "step": 2148 + }, + { + "epoch": 0.33783332350802725, + "grad_norm": 0.26696088910102844, + "learning_rate": 4.659066366202178e-05, + "loss": 1.1743, + "step": 2149 + }, + { + "epoch": 0.3379905284049598, + "grad_norm": 0.19131755828857422, + "learning_rate": 4.658754827304037e-05, + "loss": 1.1538, + "step": 2150 + }, + { + "epoch": 0.33814773330189235, + "grad_norm": 0.20308135449886322, + "learning_rate": 4.658443156557365e-05, + "loss": 1.1274, + "step": 2151 + }, + { + "epoch": 0.3383049381988249, + "grad_norm": 0.25910693407058716, + "learning_rate": 4.658131353981198e-05, + "loss": 1.2042, + "step": 2152 + }, + { + "epoch": 0.33846214309575745, + "grad_norm": 0.2608768939971924, + "learning_rate": 4.6578194195945776e-05, + "loss": 1.0935, + "step": 2153 + }, + { + "epoch": 0.33861934799268995, + "grad_norm": 0.22527897357940674, + "learning_rate": 4.657507353416558e-05, + "loss": 1.1007, + "step": 2154 + }, + { + "epoch": 0.3387765528896225, + "grad_norm": 0.2805950343608856, + "learning_rate": 4.657195155466198e-05, + "loss": 1.0509, + "step": 2155 + }, + { + "epoch": 0.33893375778655505, + "grad_norm": 0.244426891207695, + "learning_rate": 4.656882825762565e-05, + "loss": 1.0811, + "step": 2156 + }, + { + "epoch": 0.3390909626834876, + "grad_norm": 0.20174293220043182, + "learning_rate": 4.656570364324736e-05, + "loss": 1.197, + "step": 2157 + }, + { + "epoch": 0.33924816758042015, + "grad_norm": 0.2714351713657379, + "learning_rate": 4.656257771171795e-05, + "loss": 1.1018, + "step": 2158 + }, + { + "epoch": 0.3394053724773527, + "grad_norm": 0.266418993473053, + "learning_rate": 4.6559450463228316e-05, + "loss": 1.1652, + "step": 2159 + }, + { + "epoch": 0.3395625773742852, + "grad_norm": 0.29422682523727417, + "learning_rate": 4.655632189796949e-05, + "loss": 1.0787, + "step": 2160 + }, + { + "epoch": 0.33971978227121774, + "grad_norm": 0.26280736923217773, + "learning_rate": 4.655319201613253e-05, + "loss": 1.2125, + "step": 2161 + }, + { + "epoch": 0.3398769871681503, + "grad_norm": 0.20241224765777588, + "learning_rate": 4.655006081790861e-05, + "loss": 1.227, + "step": 2162 + }, + { + "epoch": 0.34003419206508284, + "grad_norm": 0.23165303468704224, + "learning_rate": 4.654692830348897e-05, + "loss": 1.0675, + "step": 2163 + }, + { + "epoch": 0.3401913969620154, + "grad_norm": 0.3363533020019531, + "learning_rate": 4.654379447306493e-05, + "loss": 1.1536, + "step": 2164 + }, + { + "epoch": 0.3403486018589479, + "grad_norm": 0.21518482267856598, + "learning_rate": 4.654065932682789e-05, + "loss": 1.1749, + "step": 2165 + }, + { + "epoch": 0.34050580675588044, + "grad_norm": 0.16388140618801117, + "learning_rate": 4.653752286496933e-05, + "loss": 1.2337, + "step": 2166 + }, + { + "epoch": 0.340663011652813, + "grad_norm": 0.20146115124225616, + "learning_rate": 4.6534385087680824e-05, + "loss": 1.1984, + "step": 2167 + }, + { + "epoch": 0.34082021654974554, + "grad_norm": 0.22612817585468292, + "learning_rate": 4.653124599515401e-05, + "loss": 1.1037, + "step": 2168 + }, + { + "epoch": 0.3409774214466781, + "grad_norm": 0.23036076128482819, + "learning_rate": 4.652810558758061e-05, + "loss": 1.2341, + "step": 2169 + }, + { + "epoch": 0.3411346263436106, + "grad_norm": 0.23168572783470154, + "learning_rate": 4.652496386515243e-05, + "loss": 1.167, + "step": 2170 + }, + { + "epoch": 0.34129183124054313, + "grad_norm": 0.24674174189567566, + "learning_rate": 4.6521820828061354e-05, + "loss": 1.1577, + "step": 2171 + }, + { + "epoch": 0.3414490361374757, + "grad_norm": 0.1900520622730255, + "learning_rate": 4.6518676476499353e-05, + "loss": 1.1402, + "step": 2172 + }, + { + "epoch": 0.34160624103440823, + "grad_norm": 0.18541108071804047, + "learning_rate": 4.651553081065846e-05, + "loss": 1.0964, + "step": 2173 + }, + { + "epoch": 0.3417634459313408, + "grad_norm": 0.2279513031244278, + "learning_rate": 4.651238383073081e-05, + "loss": 1.0895, + "step": 2174 + }, + { + "epoch": 0.3419206508282733, + "grad_norm": 0.2681223750114441, + "learning_rate": 4.65092355369086e-05, + "loss": 1.0725, + "step": 2175 + }, + { + "epoch": 0.3420778557252058, + "grad_norm": 0.2828345000743866, + "learning_rate": 4.6506085929384124e-05, + "loss": 1.0066, + "step": 2176 + }, + { + "epoch": 0.3422350606221384, + "grad_norm": 0.20930872857570648, + "learning_rate": 4.6502935008349747e-05, + "loss": 1.2589, + "step": 2177 + }, + { + "epoch": 0.3423922655190709, + "grad_norm": 0.17460274696350098, + "learning_rate": 4.6499782773997906e-05, + "loss": 1.1705, + "step": 2178 + }, + { + "epoch": 0.3425494704160035, + "grad_norm": 0.24282746016979218, + "learning_rate": 4.649662922652114e-05, + "loss": 1.152, + "step": 2179 + }, + { + "epoch": 0.34270667531293597, + "grad_norm": 0.21630579233169556, + "learning_rate": 4.649347436611205e-05, + "loss": 1.1686, + "step": 2180 + }, + { + "epoch": 0.3428638802098685, + "grad_norm": 0.2531259059906006, + "learning_rate": 4.649031819296332e-05, + "loss": 1.1321, + "step": 2181 + }, + { + "epoch": 0.34302108510680107, + "grad_norm": 0.26737165451049805, + "learning_rate": 4.648716070726772e-05, + "loss": 1.1569, + "step": 2182 + }, + { + "epoch": 0.3431782900037336, + "grad_norm": 0.27461764216423035, + "learning_rate": 4.64840019092181e-05, + "loss": 1.0203, + "step": 2183 + }, + { + "epoch": 0.34333549490066617, + "grad_norm": 0.2627179026603699, + "learning_rate": 4.648084179900739e-05, + "loss": 1.2092, + "step": 2184 + }, + { + "epoch": 0.3434926997975987, + "grad_norm": 0.23868447542190552, + "learning_rate": 4.647768037682858e-05, + "loss": 1.0356, + "step": 2185 + }, + { + "epoch": 0.3436499046945312, + "grad_norm": 0.24952903389930725, + "learning_rate": 4.647451764287478e-05, + "loss": 1.0801, + "step": 2186 + }, + { + "epoch": 0.34380710959146377, + "grad_norm": 0.23518601059913635, + "learning_rate": 4.647135359733914e-05, + "loss": 1.0065, + "step": 2187 + }, + { + "epoch": 0.3439643144883963, + "grad_norm": 0.20000571012496948, + "learning_rate": 4.6468188240414924e-05, + "loss": 1.1048, + "step": 2188 + }, + { + "epoch": 0.34412151938532887, + "grad_norm": 0.2511467933654785, + "learning_rate": 4.646502157229544e-05, + "loss": 1.1423, + "step": 2189 + }, + { + "epoch": 0.3442787242822614, + "grad_norm": 0.18525715172290802, + "learning_rate": 4.646185359317412e-05, + "loss": 1.1245, + "step": 2190 + }, + { + "epoch": 0.3444359291791939, + "grad_norm": 0.18307547271251678, + "learning_rate": 4.6458684303244435e-05, + "loss": 1.0499, + "step": 2191 + }, + { + "epoch": 0.34459313407612646, + "grad_norm": 0.263036847114563, + "learning_rate": 4.645551370269995e-05, + "loss": 1.1585, + "step": 2192 + }, + { + "epoch": 0.344750338973059, + "grad_norm": 0.276479572057724, + "learning_rate": 4.6452341791734335e-05, + "loss": 1.1801, + "step": 2193 + }, + { + "epoch": 0.34490754386999156, + "grad_norm": 0.19550184905529022, + "learning_rate": 4.644916857054129e-05, + "loss": 1.09, + "step": 2194 + }, + { + "epoch": 0.3450647487669241, + "grad_norm": 0.22002463042736053, + "learning_rate": 4.644599403931465e-05, + "loss": 1.0928, + "step": 2195 + }, + { + "epoch": 0.3452219536638566, + "grad_norm": 0.26032400131225586, + "learning_rate": 4.6442818198248276e-05, + "loss": 1.1699, + "step": 2196 + }, + { + "epoch": 0.34537915856078916, + "grad_norm": 0.19974485039710999, + "learning_rate": 4.643964104753617e-05, + "loss": 1.1668, + "step": 2197 + }, + { + "epoch": 0.3455363634577217, + "grad_norm": 0.2452566772699356, + "learning_rate": 4.6436462587372345e-05, + "loss": 1.1597, + "step": 2198 + }, + { + "epoch": 0.34569356835465426, + "grad_norm": 0.2143898904323578, + "learning_rate": 4.643328281795095e-05, + "loss": 1.1134, + "step": 2199 + }, + { + "epoch": 0.3458507732515868, + "grad_norm": 0.21687857806682587, + "learning_rate": 4.643010173946619e-05, + "loss": 1.1802, + "step": 2200 + }, + { + "epoch": 0.3460079781485193, + "grad_norm": 0.21475355327129364, + "learning_rate": 4.6426919352112355e-05, + "loss": 1.2065, + "step": 2201 + }, + { + "epoch": 0.34616518304545185, + "grad_norm": 0.25029754638671875, + "learning_rate": 4.64237356560838e-05, + "loss": 1.1723, + "step": 2202 + }, + { + "epoch": 0.3463223879423844, + "grad_norm": 0.23117133975028992, + "learning_rate": 4.642055065157499e-05, + "loss": 1.1915, + "step": 2203 + }, + { + "epoch": 0.34647959283931695, + "grad_norm": 0.28021204471588135, + "learning_rate": 4.641736433878045e-05, + "loss": 1.0219, + "step": 2204 + }, + { + "epoch": 0.3466367977362495, + "grad_norm": 0.24753634631633759, + "learning_rate": 4.641417671789478e-05, + "loss": 1.0664, + "step": 2205 + }, + { + "epoch": 0.346794002633182, + "grad_norm": 0.2812795341014862, + "learning_rate": 4.6410987789112676e-05, + "loss": 1.1794, + "step": 2206 + }, + { + "epoch": 0.34695120753011455, + "grad_norm": 0.17878840863704681, + "learning_rate": 4.64077975526289e-05, + "loss": 1.1064, + "step": 2207 + }, + { + "epoch": 0.3471084124270471, + "grad_norm": 0.20422449707984924, + "learning_rate": 4.6404606008638295e-05, + "loss": 1.2858, + "step": 2208 + }, + { + "epoch": 0.34726561732397965, + "grad_norm": 0.2892017066478729, + "learning_rate": 4.6401413157335796e-05, + "loss": 1.1227, + "step": 2209 + }, + { + "epoch": 0.3474228222209122, + "grad_norm": 0.16432105004787445, + "learning_rate": 4.639821899891641e-05, + "loss": 1.1985, + "step": 2210 + }, + { + "epoch": 0.34758002711784475, + "grad_norm": 0.2849401533603668, + "learning_rate": 4.639502353357522e-05, + "loss": 1.056, + "step": 2211 + }, + { + "epoch": 0.34773723201477724, + "grad_norm": 0.17997637391090393, + "learning_rate": 4.6391826761507403e-05, + "loss": 1.1146, + "step": 2212 + }, + { + "epoch": 0.3478944369117098, + "grad_norm": 0.22177653014659882, + "learning_rate": 4.6388628682908186e-05, + "loss": 1.1871, + "step": 2213 + }, + { + "epoch": 0.34805164180864234, + "grad_norm": 0.18821461498737335, + "learning_rate": 4.6385429297972914e-05, + "loss": 1.1184, + "step": 2214 + }, + { + "epoch": 0.3482088467055749, + "grad_norm": 0.2185995727777481, + "learning_rate": 4.6382228606896994e-05, + "loss": 1.2098, + "step": 2215 + }, + { + "epoch": 0.34836605160250744, + "grad_norm": 0.2325439453125, + "learning_rate": 4.6379026609875894e-05, + "loss": 1.1827, + "step": 2216 + }, + { + "epoch": 0.34852325649943994, + "grad_norm": 0.23903654515743256, + "learning_rate": 4.637582330710519e-05, + "loss": 1.1347, + "step": 2217 + }, + { + "epoch": 0.3486804613963725, + "grad_norm": 0.2033540904521942, + "learning_rate": 4.637261869878054e-05, + "loss": 1.0412, + "step": 2218 + }, + { + "epoch": 0.34883766629330504, + "grad_norm": 0.21630467474460602, + "learning_rate": 4.6369412785097644e-05, + "loss": 1.2504, + "step": 2219 + }, + { + "epoch": 0.3489948711902376, + "grad_norm": 0.2330722063779831, + "learning_rate": 4.636620556625233e-05, + "loss": 1.2137, + "step": 2220 + }, + { + "epoch": 0.34915207608717014, + "grad_norm": 0.22527045011520386, + "learning_rate": 4.636299704244047e-05, + "loss": 1.1868, + "step": 2221 + }, + { + "epoch": 0.34930928098410263, + "grad_norm": 0.26466044783592224, + "learning_rate": 4.635978721385803e-05, + "loss": 1.0843, + "step": 2222 + }, + { + "epoch": 0.3494664858810352, + "grad_norm": 0.1883208006620407, + "learning_rate": 4.6356576080701054e-05, + "loss": 1.1547, + "step": 2223 + }, + { + "epoch": 0.34962369077796773, + "grad_norm": 0.23476985096931458, + "learning_rate": 4.635336364316567e-05, + "loss": 1.1051, + "step": 2224 + }, + { + "epoch": 0.3497808956749003, + "grad_norm": 0.224160835146904, + "learning_rate": 4.635014990144808e-05, + "loss": 1.1945, + "step": 2225 + }, + { + "epoch": 0.34993810057183283, + "grad_norm": 0.19404496252536774, + "learning_rate": 4.634693485574457e-05, + "loss": 1.2492, + "step": 2226 + }, + { + "epoch": 0.3500953054687653, + "grad_norm": 0.253548800945282, + "learning_rate": 4.6343718506251485e-05, + "loss": 1.2643, + "step": 2227 + }, + { + "epoch": 0.3502525103656979, + "grad_norm": 0.3645852208137512, + "learning_rate": 4.634050085316529e-05, + "loss": 1.147, + "step": 2228 + }, + { + "epoch": 0.35040971526263043, + "grad_norm": 0.25876176357269287, + "learning_rate": 4.6337281896682504e-05, + "loss": 0.9978, + "step": 2229 + }, + { + "epoch": 0.350566920159563, + "grad_norm": 0.20177343487739563, + "learning_rate": 4.633406163699972e-05, + "loss": 1.1169, + "step": 2230 + }, + { + "epoch": 0.35072412505649553, + "grad_norm": 0.23162272572517395, + "learning_rate": 4.633084007431361e-05, + "loss": 1.0649, + "step": 2231 + }, + { + "epoch": 0.350881329953428, + "grad_norm": 0.18436746299266815, + "learning_rate": 4.6327617208820964e-05, + "loss": 1.2476, + "step": 2232 + }, + { + "epoch": 0.3510385348503606, + "grad_norm": 0.17864318192005157, + "learning_rate": 4.6324393040718596e-05, + "loss": 1.1171, + "step": 2233 + }, + { + "epoch": 0.3511957397472931, + "grad_norm": 0.21066221594810486, + "learning_rate": 4.632116757020343e-05, + "loss": 1.1708, + "step": 2234 + }, + { + "epoch": 0.3513529446442257, + "grad_norm": 0.19731271266937256, + "learning_rate": 4.631794079747248e-05, + "loss": 1.0509, + "step": 2235 + }, + { + "epoch": 0.3515101495411582, + "grad_norm": 0.24790634214878082, + "learning_rate": 4.631471272272281e-05, + "loss": 1.0759, + "step": 2236 + }, + { + "epoch": 0.3516673544380908, + "grad_norm": 0.23736214637756348, + "learning_rate": 4.6311483346151587e-05, + "loss": 1.1302, + "step": 2237 + }, + { + "epoch": 0.35182455933502327, + "grad_norm": 0.20299890637397766, + "learning_rate": 4.630825266795605e-05, + "loss": 1.171, + "step": 2238 + }, + { + "epoch": 0.3519817642319558, + "grad_norm": 0.17163951694965363, + "learning_rate": 4.63050206883335e-05, + "loss": 1.1312, + "step": 2239 + }, + { + "epoch": 0.35213896912888837, + "grad_norm": 0.22283077239990234, + "learning_rate": 4.6301787407481356e-05, + "loss": 1.2501, + "step": 2240 + }, + { + "epoch": 0.35213896912888837, + "eval_loss": 1.137406349182129, + "eval_runtime": 2319.6638, + "eval_samples_per_second": 3.991, + "eval_steps_per_second": 1.996, + "step": 2240 + }, + { + "epoch": 0.3522961740258209, + "grad_norm": 0.21536953747272491, + "learning_rate": 4.6298552825597084e-05, + "loss": 1.1445, + "step": 2241 + }, + { + "epoch": 0.35245337892275347, + "grad_norm": 0.18569806218147278, + "learning_rate": 4.629531694287824e-05, + "loss": 1.1741, + "step": 2242 + }, + { + "epoch": 0.35261058381968596, + "grad_norm": 0.21473194658756256, + "learning_rate": 4.629207975952247e-05, + "loss": 1.1673, + "step": 2243 + }, + { + "epoch": 0.3527677887166185, + "grad_norm": 0.2836077809333801, + "learning_rate": 4.628884127572747e-05, + "loss": 1.1015, + "step": 2244 + }, + { + "epoch": 0.35292499361355106, + "grad_norm": 0.25129783153533936, + "learning_rate": 4.6285601491691044e-05, + "loss": 1.0962, + "step": 2245 + }, + { + "epoch": 0.3530821985104836, + "grad_norm": 0.1728084534406662, + "learning_rate": 4.628236040761106e-05, + "loss": 1.2618, + "step": 2246 + }, + { + "epoch": 0.35323940340741616, + "grad_norm": 0.24836137890815735, + "learning_rate": 4.6279118023685485e-05, + "loss": 1.1205, + "step": 2247 + }, + { + "epoch": 0.35339660830434866, + "grad_norm": 0.18902529776096344, + "learning_rate": 4.627587434011234e-05, + "loss": 1.2781, + "step": 2248 + }, + { + "epoch": 0.3535538132012812, + "grad_norm": 0.16344639658927917, + "learning_rate": 4.6272629357089745e-05, + "loss": 1.1919, + "step": 2249 + }, + { + "epoch": 0.35371101809821376, + "grad_norm": 0.2835703194141388, + "learning_rate": 4.6269383074815874e-05, + "loss": 1.0918, + "step": 2250 + }, + { + "epoch": 0.3538682229951463, + "grad_norm": 0.21615809202194214, + "learning_rate": 4.6266135493489015e-05, + "loss": 1.1403, + "step": 2251 + }, + { + "epoch": 0.35402542789207886, + "grad_norm": 0.2002420276403427, + "learning_rate": 4.6262886613307516e-05, + "loss": 1.2075, + "step": 2252 + }, + { + "epoch": 0.35418263278901135, + "grad_norm": 0.21766100823879242, + "learning_rate": 4.62596364344698e-05, + "loss": 1.0956, + "step": 2253 + }, + { + "epoch": 0.3543398376859439, + "grad_norm": 0.24325627088546753, + "learning_rate": 4.625638495717438e-05, + "loss": 1.1861, + "step": 2254 + }, + { + "epoch": 0.35449704258287645, + "grad_norm": 0.2508534789085388, + "learning_rate": 4.625313218161984e-05, + "loss": 1.1318, + "step": 2255 + }, + { + "epoch": 0.354654247479809, + "grad_norm": 0.20808853209018707, + "learning_rate": 4.624987810800485e-05, + "loss": 1.1406, + "step": 2256 + }, + { + "epoch": 0.35481145237674155, + "grad_norm": 0.18308702111244202, + "learning_rate": 4.6246622736528154e-05, + "loss": 1.099, + "step": 2257 + }, + { + "epoch": 0.35496865727367405, + "grad_norm": 0.2754043936729431, + "learning_rate": 4.6243366067388585e-05, + "loss": 1.1066, + "step": 2258 + }, + { + "epoch": 0.3551258621706066, + "grad_norm": 0.22903695702552795, + "learning_rate": 4.624010810078504e-05, + "loss": 1.1673, + "step": 2259 + }, + { + "epoch": 0.35528306706753915, + "grad_norm": 0.23207323253154755, + "learning_rate": 4.62368488369165e-05, + "loss": 1.0022, + "step": 2260 + }, + { + "epoch": 0.3554402719644717, + "grad_norm": 0.17466799914836884, + "learning_rate": 4.623358827598204e-05, + "loss": 1.2006, + "step": 2261 + }, + { + "epoch": 0.35559747686140425, + "grad_norm": 0.17655743658542633, + "learning_rate": 4.623032641818079e-05, + "loss": 1.2022, + "step": 2262 + }, + { + "epoch": 0.3557546817583368, + "grad_norm": 0.21429038047790527, + "learning_rate": 4.622706326371199e-05, + "loss": 1.1119, + "step": 2263 + }, + { + "epoch": 0.3559118866552693, + "grad_norm": 0.2043597400188446, + "learning_rate": 4.622379881277492e-05, + "loss": 1.0538, + "step": 2264 + }, + { + "epoch": 0.35606909155220184, + "grad_norm": 0.2455209642648697, + "learning_rate": 4.622053306556897e-05, + "loss": 1.1122, + "step": 2265 + }, + { + "epoch": 0.3562262964491344, + "grad_norm": 0.22231566905975342, + "learning_rate": 4.6217266022293605e-05, + "loss": 1.0947, + "step": 2266 + }, + { + "epoch": 0.35638350134606694, + "grad_norm": 0.26289692521095276, + "learning_rate": 4.6213997683148355e-05, + "loss": 1.2072, + "step": 2267 + }, + { + "epoch": 0.3565407062429995, + "grad_norm": 0.208029642701149, + "learning_rate": 4.621072804833284e-05, + "loss": 1.2221, + "step": 2268 + }, + { + "epoch": 0.356697911139932, + "grad_norm": 0.23250968754291534, + "learning_rate": 4.620745711804676e-05, + "loss": 1.1863, + "step": 2269 + }, + { + "epoch": 0.35685511603686454, + "grad_norm": 0.26194247603416443, + "learning_rate": 4.6204184892489875e-05, + "loss": 1.0326, + "step": 2270 + }, + { + "epoch": 0.3570123209337971, + "grad_norm": 0.3551010191440582, + "learning_rate": 4.6200911371862063e-05, + "loss": 1.1552, + "step": 2271 + }, + { + "epoch": 0.35716952583072964, + "grad_norm": 0.24021030962467194, + "learning_rate": 4.6197636556363256e-05, + "loss": 1.1127, + "step": 2272 + }, + { + "epoch": 0.3573267307276622, + "grad_norm": 0.22826530039310455, + "learning_rate": 4.619436044619345e-05, + "loss": 1.134, + "step": 2273 + }, + { + "epoch": 0.3574839356245947, + "grad_norm": 0.2583426237106323, + "learning_rate": 4.619108304155275e-05, + "loss": 1.0296, + "step": 2274 + }, + { + "epoch": 0.35764114052152723, + "grad_norm": 0.2455807328224182, + "learning_rate": 4.618780434264133e-05, + "loss": 1.087, + "step": 2275 + }, + { + "epoch": 0.3577983454184598, + "grad_norm": 0.21768860518932343, + "learning_rate": 4.618452434965943e-05, + "loss": 1.1911, + "step": 2276 + }, + { + "epoch": 0.35795555031539233, + "grad_norm": 0.19263923168182373, + "learning_rate": 4.6181243062807387e-05, + "loss": 1.1437, + "step": 2277 + }, + { + "epoch": 0.3581127552123249, + "grad_norm": 0.23038311302661896, + "learning_rate": 4.61779604822856e-05, + "loss": 1.1394, + "step": 2278 + }, + { + "epoch": 0.3582699601092574, + "grad_norm": 0.38476505875587463, + "learning_rate": 4.6174676608294574e-05, + "loss": 1.1242, + "step": 2279 + }, + { + "epoch": 0.35842716500618993, + "grad_norm": 0.349468469619751, + "learning_rate": 4.617139144103486e-05, + "loss": 1.2413, + "step": 2280 + }, + { + "epoch": 0.3585843699031225, + "grad_norm": 0.18502017855644226, + "learning_rate": 4.6168104980707107e-05, + "loss": 1.202, + "step": 2281 + }, + { + "epoch": 0.35874157480005503, + "grad_norm": 0.18562203645706177, + "learning_rate": 4.616481722751205e-05, + "loss": 1.0617, + "step": 2282 + }, + { + "epoch": 0.3588987796969876, + "grad_norm": 0.24319809675216675, + "learning_rate": 4.616152818165047e-05, + "loss": 1.1994, + "step": 2283 + }, + { + "epoch": 0.3590559845939201, + "grad_norm": 0.21097324788570404, + "learning_rate": 4.615823784332327e-05, + "loss": 1.1771, + "step": 2284 + }, + { + "epoch": 0.3592131894908526, + "grad_norm": 0.20706066489219666, + "learning_rate": 4.61549462127314e-05, + "loss": 1.0416, + "step": 2285 + }, + { + "epoch": 0.3593703943877852, + "grad_norm": 0.17828263342380524, + "learning_rate": 4.615165329007591e-05, + "loss": 1.0867, + "step": 2286 + }, + { + "epoch": 0.3595275992847177, + "grad_norm": 0.20547035336494446, + "learning_rate": 4.6148359075557915e-05, + "loss": 1.1115, + "step": 2287 + }, + { + "epoch": 0.3596848041816503, + "grad_norm": 0.30150124430656433, + "learning_rate": 4.614506356937861e-05, + "loss": 1.0364, + "step": 2288 + }, + { + "epoch": 0.3598420090785828, + "grad_norm": 0.2805284261703491, + "learning_rate": 4.6141766771739267e-05, + "loss": 1.1075, + "step": 2289 + }, + { + "epoch": 0.3599992139755153, + "grad_norm": 0.2630583345890045, + "learning_rate": 4.613846868284126e-05, + "loss": 1.1411, + "step": 2290 + }, + { + "epoch": 0.36015641887244787, + "grad_norm": 0.20439459383487701, + "learning_rate": 4.6135169302886006e-05, + "loss": 1.0507, + "step": 2291 + }, + { + "epoch": 0.3603136237693804, + "grad_norm": 0.18172521889209747, + "learning_rate": 4.6131868632075024e-05, + "loss": 1.1667, + "step": 2292 + }, + { + "epoch": 0.36047082866631297, + "grad_norm": 0.14271654188632965, + "learning_rate": 4.612856667060991e-05, + "loss": 1.1963, + "step": 2293 + }, + { + "epoch": 0.3606280335632455, + "grad_norm": 0.22915297746658325, + "learning_rate": 4.612526341869233e-05, + "loss": 1.1523, + "step": 2294 + }, + { + "epoch": 0.360785238460178, + "grad_norm": 0.19867637753486633, + "learning_rate": 4.612195887652404e-05, + "loss": 1.1402, + "step": 2295 + }, + { + "epoch": 0.36094244335711057, + "grad_norm": 0.2168378382921219, + "learning_rate": 4.611865304430687e-05, + "loss": 1.1781, + "step": 2296 + }, + { + "epoch": 0.3610996482540431, + "grad_norm": 0.2496039867401123, + "learning_rate": 4.6115345922242716e-05, + "loss": 1.1168, + "step": 2297 + }, + { + "epoch": 0.36125685315097567, + "grad_norm": 0.15307332575321198, + "learning_rate": 4.6112037510533574e-05, + "loss": 1.1752, + "step": 2298 + }, + { + "epoch": 0.3614140580479082, + "grad_norm": 0.22675082087516785, + "learning_rate": 4.610872780938151e-05, + "loss": 1.2404, + "step": 2299 + }, + { + "epoch": 0.3615712629448407, + "grad_norm": 0.216874897480011, + "learning_rate": 4.610541681898865e-05, + "loss": 1.1117, + "step": 2300 + }, + { + "epoch": 0.36172846784177326, + "grad_norm": 0.2463904470205307, + "learning_rate": 4.6102104539557254e-05, + "loss": 1.0656, + "step": 2301 + }, + { + "epoch": 0.3618856727387058, + "grad_norm": 0.21900813281536102, + "learning_rate": 4.609879097128959e-05, + "loss": 1.149, + "step": 2302 + }, + { + "epoch": 0.36204287763563836, + "grad_norm": 0.23547019064426422, + "learning_rate": 4.6095476114388046e-05, + "loss": 1.0561, + "step": 2303 + }, + { + "epoch": 0.3622000825325709, + "grad_norm": 0.23986342549324036, + "learning_rate": 4.609215996905509e-05, + "loss": 1.1362, + "step": 2304 + }, + { + "epoch": 0.3623572874295034, + "grad_norm": 0.1994561403989792, + "learning_rate": 4.6088842535493247e-05, + "loss": 1.256, + "step": 2305 + }, + { + "epoch": 0.36251449232643596, + "grad_norm": 0.2824605405330658, + "learning_rate": 4.608552381390515e-05, + "loss": 1.0065, + "step": 2306 + }, + { + "epoch": 0.3626716972233685, + "grad_norm": 0.24372591078281403, + "learning_rate": 4.6082203804493474e-05, + "loss": 1.1457, + "step": 2307 + }, + { + "epoch": 0.36282890212030106, + "grad_norm": 0.19230058789253235, + "learning_rate": 4.607888250746101e-05, + "loss": 1.2162, + "step": 2308 + }, + { + "epoch": 0.3629861070172336, + "grad_norm": 0.22559943795204163, + "learning_rate": 4.6075559923010594e-05, + "loss": 1.0851, + "step": 2309 + }, + { + "epoch": 0.3631433119141661, + "grad_norm": 0.30149683356285095, + "learning_rate": 4.607223605134517e-05, + "loss": 1.1287, + "step": 2310 + }, + { + "epoch": 0.36330051681109865, + "grad_norm": 0.21107155084609985, + "learning_rate": 4.6068910892667744e-05, + "loss": 1.0637, + "step": 2311 + }, + { + "epoch": 0.3634577217080312, + "grad_norm": 0.23039104044437408, + "learning_rate": 4.60655844471814e-05, + "loss": 1.1839, + "step": 2312 + }, + { + "epoch": 0.36361492660496375, + "grad_norm": 0.19493019580841064, + "learning_rate": 4.6062256715089304e-05, + "loss": 1.0697, + "step": 2313 + }, + { + "epoch": 0.3637721315018963, + "grad_norm": 0.24054986238479614, + "learning_rate": 4.605892769659471e-05, + "loss": 1.1201, + "step": 2314 + }, + { + "epoch": 0.36392933639882885, + "grad_norm": 0.2367154210805893, + "learning_rate": 4.6055597391900933e-05, + "loss": 1.1292, + "step": 2315 + }, + { + "epoch": 0.36408654129576135, + "grad_norm": 0.18168602883815765, + "learning_rate": 4.605226580121138e-05, + "loss": 1.116, + "step": 2316 + }, + { + "epoch": 0.3642437461926939, + "grad_norm": 0.22764810919761658, + "learning_rate": 4.604893292472954e-05, + "loss": 1.0941, + "step": 2317 + }, + { + "epoch": 0.36440095108962645, + "grad_norm": 0.1890590786933899, + "learning_rate": 4.6045598762658945e-05, + "loss": 1.1693, + "step": 2318 + }, + { + "epoch": 0.364558155986559, + "grad_norm": 0.30014708638191223, + "learning_rate": 4.604226331520326e-05, + "loss": 1.1798, + "step": 2319 + }, + { + "epoch": 0.36471536088349155, + "grad_norm": 0.29448553919792175, + "learning_rate": 4.603892658256619e-05, + "loss": 1.1618, + "step": 2320 + }, + { + "epoch": 0.36487256578042404, + "grad_norm": 0.220899298787117, + "learning_rate": 4.603558856495154e-05, + "loss": 1.1511, + "step": 2321 + }, + { + "epoch": 0.3650297706773566, + "grad_norm": 0.21098098158836365, + "learning_rate": 4.603224926256317e-05, + "loss": 1.0377, + "step": 2322 + }, + { + "epoch": 0.36518697557428914, + "grad_norm": 0.24701356887817383, + "learning_rate": 4.602890867560503e-05, + "loss": 1.1466, + "step": 2323 + }, + { + "epoch": 0.3653441804712217, + "grad_norm": 0.24385958909988403, + "learning_rate": 4.602556680428117e-05, + "loss": 1.0753, + "step": 2324 + }, + { + "epoch": 0.36550138536815424, + "grad_norm": 0.28453299403190613, + "learning_rate": 4.6022223648795685e-05, + "loss": 1.2767, + "step": 2325 + }, + { + "epoch": 0.36565859026508674, + "grad_norm": 0.16848404705524445, + "learning_rate": 4.6018879209352755e-05, + "loss": 1.1453, + "step": 2326 + }, + { + "epoch": 0.3658157951620193, + "grad_norm": 0.20252664387226105, + "learning_rate": 4.601553348615666e-05, + "loss": 1.1672, + "step": 2327 + }, + { + "epoch": 0.36597300005895184, + "grad_norm": 0.25076255202293396, + "learning_rate": 4.601218647941174e-05, + "loss": 1.1014, + "step": 2328 + }, + { + "epoch": 0.3661302049558844, + "grad_norm": 0.2385731041431427, + "learning_rate": 4.600883818932241e-05, + "loss": 1.2667, + "step": 2329 + }, + { + "epoch": 0.36628740985281694, + "grad_norm": 0.22775901854038239, + "learning_rate": 4.600548861609318e-05, + "loss": 1.1281, + "step": 2330 + }, + { + "epoch": 0.36644461474974943, + "grad_norm": 0.2670734226703644, + "learning_rate": 4.600213775992863e-05, + "loss": 1.2638, + "step": 2331 + }, + { + "epoch": 0.366601819646682, + "grad_norm": 0.19620922207832336, + "learning_rate": 4.599878562103341e-05, + "loss": 1.2204, + "step": 2332 + }, + { + "epoch": 0.36675902454361453, + "grad_norm": 0.20084300637245178, + "learning_rate": 4.599543219961226e-05, + "loss": 1.1472, + "step": 2333 + }, + { + "epoch": 0.3669162294405471, + "grad_norm": 0.18164968490600586, + "learning_rate": 4.599207749587e-05, + "loss": 1.2188, + "step": 2334 + }, + { + "epoch": 0.36707343433747963, + "grad_norm": 0.2300948202610016, + "learning_rate": 4.598872151001151e-05, + "loss": 1.1519, + "step": 2335 + }, + { + "epoch": 0.3672306392344121, + "grad_norm": 0.22130456566810608, + "learning_rate": 4.598536424224177e-05, + "loss": 1.1081, + "step": 2336 + }, + { + "epoch": 0.3673878441313447, + "grad_norm": 0.2190874069929123, + "learning_rate": 4.598200569276582e-05, + "loss": 0.9442, + "step": 2337 + }, + { + "epoch": 0.3675450490282772, + "grad_norm": 0.244587242603302, + "learning_rate": 4.5978645861788793e-05, + "loss": 0.9082, + "step": 2338 + }, + { + "epoch": 0.3677022539252098, + "grad_norm": 0.1806013435125351, + "learning_rate": 4.5975284749515904e-05, + "loss": 1.1944, + "step": 2339 + }, + { + "epoch": 0.3678594588221423, + "grad_norm": 0.21137726306915283, + "learning_rate": 4.597192235615242e-05, + "loss": 1.1479, + "step": 2340 + }, + { + "epoch": 0.3680166637190749, + "grad_norm": 0.18507908284664154, + "learning_rate": 4.5968558681903716e-05, + "loss": 1.1367, + "step": 2341 + }, + { + "epoch": 0.36817386861600737, + "grad_norm": 0.2921428680419922, + "learning_rate": 4.596519372697523e-05, + "loss": 1.0496, + "step": 2342 + }, + { + "epoch": 0.3683310735129399, + "grad_norm": 0.1963038295507431, + "learning_rate": 4.596182749157247e-05, + "loss": 1.2727, + "step": 2343 + }, + { + "epoch": 0.36848827840987247, + "grad_norm": 0.16632871329784393, + "learning_rate": 4.5958459975901046e-05, + "loss": 1.0428, + "step": 2344 + }, + { + "epoch": 0.368645483306805, + "grad_norm": 0.21543806791305542, + "learning_rate": 4.595509118016663e-05, + "loss": 1.1868, + "step": 2345 + }, + { + "epoch": 0.36880268820373757, + "grad_norm": 0.20484694838523865, + "learning_rate": 4.595172110457497e-05, + "loss": 1.1649, + "step": 2346 + }, + { + "epoch": 0.36895989310067007, + "grad_norm": 0.2756751775741577, + "learning_rate": 4.594834974933191e-05, + "loss": 1.0987, + "step": 2347 + }, + { + "epoch": 0.3691170979976026, + "grad_norm": 0.2065090388059616, + "learning_rate": 4.5944977114643346e-05, + "loss": 1.2065, + "step": 2348 + }, + { + "epoch": 0.36927430289453517, + "grad_norm": 0.2959758937358856, + "learning_rate": 4.594160320071527e-05, + "loss": 1.0922, + "step": 2349 + }, + { + "epoch": 0.3694315077914677, + "grad_norm": 0.19524092972278595, + "learning_rate": 4.593822800775375e-05, + "loss": 1.1071, + "step": 2350 + }, + { + "epoch": 0.36958871268840027, + "grad_norm": 0.17790867388248444, + "learning_rate": 4.593485153596492e-05, + "loss": 1.171, + "step": 2351 + }, + { + "epoch": 0.36974591758533276, + "grad_norm": 0.18420405685901642, + "learning_rate": 4.593147378555501e-05, + "loss": 1.1355, + "step": 2352 + }, + { + "epoch": 0.3699031224822653, + "grad_norm": 0.2588077485561371, + "learning_rate": 4.5928094756730326e-05, + "loss": 1.1258, + "step": 2353 + }, + { + "epoch": 0.37006032737919786, + "grad_norm": 0.2283763289451599, + "learning_rate": 4.592471444969724e-05, + "loss": 1.1717, + "step": 2354 + }, + { + "epoch": 0.3702175322761304, + "grad_norm": 0.2567402422428131, + "learning_rate": 4.5921332864662215e-05, + "loss": 1.1495, + "step": 2355 + }, + { + "epoch": 0.37037473717306296, + "grad_norm": 0.18225303292274475, + "learning_rate": 4.5917950001831766e-05, + "loss": 1.2088, + "step": 2356 + }, + { + "epoch": 0.37053194206999546, + "grad_norm": 0.2010773867368698, + "learning_rate": 4.591456586141253e-05, + "loss": 1.2081, + "step": 2357 + }, + { + "epoch": 0.370689146966928, + "grad_norm": 0.19127878546714783, + "learning_rate": 4.591118044361118e-05, + "loss": 1.0942, + "step": 2358 + }, + { + "epoch": 0.37084635186386056, + "grad_norm": 0.18000200390815735, + "learning_rate": 4.590779374863449e-05, + "loss": 0.9802, + "step": 2359 + }, + { + "epoch": 0.3710035567607931, + "grad_norm": 0.16793574392795563, + "learning_rate": 4.590440577668931e-05, + "loss": 1.2521, + "step": 2360 + }, + { + "epoch": 0.37116076165772566, + "grad_norm": 0.16233059763908386, + "learning_rate": 4.5901016527982555e-05, + "loss": 1.1423, + "step": 2361 + }, + { + "epoch": 0.37131796655465815, + "grad_norm": 0.2126810997724533, + "learning_rate": 4.5897626002721236e-05, + "loss": 1.125, + "step": 2362 + }, + { + "epoch": 0.3714751714515907, + "grad_norm": 0.20584918558597565, + "learning_rate": 4.589423420111244e-05, + "loss": 1.1424, + "step": 2363 + }, + { + "epoch": 0.37163237634852325, + "grad_norm": 0.19902043044567108, + "learning_rate": 4.5890841123363305e-05, + "loss": 1.0977, + "step": 2364 + }, + { + "epoch": 0.3717895812454558, + "grad_norm": 0.20691698789596558, + "learning_rate": 4.588744676968109e-05, + "loss": 1.1506, + "step": 2365 + }, + { + "epoch": 0.37194678614238835, + "grad_norm": 0.2217256873846054, + "learning_rate": 4.588405114027309e-05, + "loss": 1.2978, + "step": 2366 + }, + { + "epoch": 0.37210399103932085, + "grad_norm": 0.24339617788791656, + "learning_rate": 4.5880654235346705e-05, + "loss": 1.1033, + "step": 2367 + }, + { + "epoch": 0.3722611959362534, + "grad_norm": 0.22742657363414764, + "learning_rate": 4.587725605510941e-05, + "loss": 1.1811, + "step": 2368 + }, + { + "epoch": 0.37241840083318595, + "grad_norm": 0.3227018117904663, + "learning_rate": 4.587385659976874e-05, + "loss": 1.1963, + "step": 2369 + }, + { + "epoch": 0.3725756057301185, + "grad_norm": 0.22187107801437378, + "learning_rate": 4.587045586953233e-05, + "loss": 1.1627, + "step": 2370 + }, + { + "epoch": 0.37273281062705105, + "grad_norm": 0.2444133311510086, + "learning_rate": 4.586705386460789e-05, + "loss": 1.2408, + "step": 2371 + }, + { + "epoch": 0.3728900155239836, + "grad_norm": 0.21586081385612488, + "learning_rate": 4.586365058520319e-05, + "loss": 1.1803, + "step": 2372 + }, + { + "epoch": 0.3730472204209161, + "grad_norm": 0.15065719187259674, + "learning_rate": 4.586024603152609e-05, + "loss": 1.1824, + "step": 2373 + }, + { + "epoch": 0.37320442531784864, + "grad_norm": 0.18252897262573242, + "learning_rate": 4.585684020378453e-05, + "loss": 1.0426, + "step": 2374 + }, + { + "epoch": 0.3733616302147812, + "grad_norm": 0.27016788721084595, + "learning_rate": 4.585343310218653e-05, + "loss": 1.0083, + "step": 2375 + }, + { + "epoch": 0.37351883511171374, + "grad_norm": 0.23528437316417694, + "learning_rate": 4.585002472694018e-05, + "loss": 1.1698, + "step": 2376 + }, + { + "epoch": 0.3736760400086463, + "grad_norm": 0.2290978729724884, + "learning_rate": 4.5846615078253644e-05, + "loss": 1.0959, + "step": 2377 + }, + { + "epoch": 0.3738332449055788, + "grad_norm": 0.25804629921913147, + "learning_rate": 4.5843204156335176e-05, + "loss": 1.1847, + "step": 2378 + }, + { + "epoch": 0.37399044980251134, + "grad_norm": 0.21353112161159515, + "learning_rate": 4.58397919613931e-05, + "loss": 1.1614, + "step": 2379 + }, + { + "epoch": 0.3741476546994439, + "grad_norm": 0.1873713582754135, + "learning_rate": 4.5836378493635826e-05, + "loss": 1.2798, + "step": 2380 + }, + { + "epoch": 0.37430485959637644, + "grad_norm": 0.22049100697040558, + "learning_rate": 4.583296375327182e-05, + "loss": 1.1098, + "step": 2381 + }, + { + "epoch": 0.374462064493309, + "grad_norm": 0.20885449647903442, + "learning_rate": 4.582954774050966e-05, + "loss": 1.144, + "step": 2382 + }, + { + "epoch": 0.3746192693902415, + "grad_norm": 0.23146776854991913, + "learning_rate": 4.582613045555798e-05, + "loss": 1.0994, + "step": 2383 + }, + { + "epoch": 0.37477647428717403, + "grad_norm": 0.19679352641105652, + "learning_rate": 4.582271189862548e-05, + "loss": 1.1002, + "step": 2384 + }, + { + "epoch": 0.3749336791841066, + "grad_norm": 0.20884890854358673, + "learning_rate": 4.581929206992097e-05, + "loss": 1.0454, + "step": 2385 + }, + { + "epoch": 0.37509088408103913, + "grad_norm": 0.20319582521915436, + "learning_rate": 4.581587096965331e-05, + "loss": 1.0725, + "step": 2386 + }, + { + "epoch": 0.3752480889779717, + "grad_norm": 0.2325729876756668, + "learning_rate": 4.581244859803146e-05, + "loss": 1.0087, + "step": 2387 + }, + { + "epoch": 0.3754052938749042, + "grad_norm": 0.19451522827148438, + "learning_rate": 4.580902495526442e-05, + "loss": 1.1993, + "step": 2388 + }, + { + "epoch": 0.37556249877183673, + "grad_norm": 0.28159642219543457, + "learning_rate": 4.580560004156131e-05, + "loss": 1.1678, + "step": 2389 + }, + { + "epoch": 0.3757197036687693, + "grad_norm": 0.30161935091018677, + "learning_rate": 4.580217385713132e-05, + "loss": 1.1238, + "step": 2390 + }, + { + "epoch": 0.37587690856570183, + "grad_norm": 0.34735164046287537, + "learning_rate": 4.579874640218369e-05, + "loss": 1.1889, + "step": 2391 + }, + { + "epoch": 0.3760341134626344, + "grad_norm": 0.27134138345718384, + "learning_rate": 4.579531767692777e-05, + "loss": 1.0985, + "step": 2392 + }, + { + "epoch": 0.3761913183595669, + "grad_norm": 0.15087197721004486, + "learning_rate": 4.5791887681572964e-05, + "loss": 1.1239, + "step": 2393 + }, + { + "epoch": 0.3763485232564994, + "grad_norm": 0.18321779370307922, + "learning_rate": 4.5788456416328766e-05, + "loss": 1.0854, + "step": 2394 + }, + { + "epoch": 0.376505728153432, + "grad_norm": 0.21554580330848694, + "learning_rate": 4.5785023881404744e-05, + "loss": 1.1672, + "step": 2395 + }, + { + "epoch": 0.3766629330503645, + "grad_norm": 0.21578359603881836, + "learning_rate": 4.578159007701055e-05, + "loss": 1.0585, + "step": 2396 + }, + { + "epoch": 0.3768201379472971, + "grad_norm": 0.21935155987739563, + "learning_rate": 4.57781550033559e-05, + "loss": 1.1878, + "step": 2397 + }, + { + "epoch": 0.3769773428442296, + "grad_norm": 0.22799645364284515, + "learning_rate": 4.5774718660650594e-05, + "loss": 1.1155, + "step": 2398 + }, + { + "epoch": 0.3771345477411621, + "grad_norm": 0.18265032768249512, + "learning_rate": 4.577128104910452e-05, + "loss": 1.2611, + "step": 2399 + }, + { + "epoch": 0.37729175263809467, + "grad_norm": 0.24530020356178284, + "learning_rate": 4.576784216892763e-05, + "loss": 1.0692, + "step": 2400 + }, + { + "epoch": 0.37729175263809467, + "eval_loss": 1.130866289138794, + "eval_runtime": 2318.0007, + "eval_samples_per_second": 3.994, + "eval_steps_per_second": 1.997, + "step": 2400 + }, + { + "epoch": 0.3774489575350272, + "grad_norm": 0.22097055613994598, + "learning_rate": 4.5764402020329953e-05, + "loss": 1.1712, + "step": 2401 + }, + { + "epoch": 0.37760616243195977, + "grad_norm": 0.24752137064933777, + "learning_rate": 4.576096060352161e-05, + "loss": 1.0011, + "step": 2402 + }, + { + "epoch": 0.3777633673288923, + "grad_norm": 0.20604589581489563, + "learning_rate": 4.5757517918712775e-05, + "loss": 1.1616, + "step": 2403 + }, + { + "epoch": 0.3779205722258248, + "grad_norm": 0.23652517795562744, + "learning_rate": 4.5754073966113734e-05, + "loss": 1.0465, + "step": 2404 + }, + { + "epoch": 0.37807777712275736, + "grad_norm": 0.2082519680261612, + "learning_rate": 4.575062874593481e-05, + "loss": 1.0695, + "step": 2405 + }, + { + "epoch": 0.3782349820196899, + "grad_norm": 0.3347269892692566, + "learning_rate": 4.574718225838644e-05, + "loss": 1.1395, + "step": 2406 + }, + { + "epoch": 0.37839218691662246, + "grad_norm": 0.19232496619224548, + "learning_rate": 4.57437345036791e-05, + "loss": 1.0818, + "step": 2407 + }, + { + "epoch": 0.378549391813555, + "grad_norm": 0.23415827751159668, + "learning_rate": 4.5740285482023396e-05, + "loss": 1.1472, + "step": 2408 + }, + { + "epoch": 0.3787065967104875, + "grad_norm": 0.24319365620613098, + "learning_rate": 4.5736835193629964e-05, + "loss": 1.1827, + "step": 2409 + }, + { + "epoch": 0.37886380160742006, + "grad_norm": 0.18908008933067322, + "learning_rate": 4.5733383638709536e-05, + "loss": 1.1589, + "step": 2410 + }, + { + "epoch": 0.3790210065043526, + "grad_norm": 0.23955419659614563, + "learning_rate": 4.572993081747291e-05, + "loss": 1.1292, + "step": 2411 + }, + { + "epoch": 0.37917821140128516, + "grad_norm": 0.27845147252082825, + "learning_rate": 4.5726476730130994e-05, + "loss": 1.0401, + "step": 2412 + }, + { + "epoch": 0.3793354162982177, + "grad_norm": 0.2088197022676468, + "learning_rate": 4.572302137689474e-05, + "loss": 1.1491, + "step": 2413 + }, + { + "epoch": 0.3794926211951502, + "grad_norm": 0.24244044721126556, + "learning_rate": 4.571956475797519e-05, + "loss": 1.1756, + "step": 2414 + }, + { + "epoch": 0.37964982609208275, + "grad_norm": 0.18544475734233856, + "learning_rate": 4.571610687358344e-05, + "loss": 1.1849, + "step": 2415 + }, + { + "epoch": 0.3798070309890153, + "grad_norm": 0.21180425584316254, + "learning_rate": 4.571264772393071e-05, + "loss": 1.1463, + "step": 2416 + }, + { + "epoch": 0.37996423588594785, + "grad_norm": 0.18626239895820618, + "learning_rate": 4.5709187309228273e-05, + "loss": 1.0571, + "step": 2417 + }, + { + "epoch": 0.3801214407828804, + "grad_norm": 0.21916179358959198, + "learning_rate": 4.570572562968746e-05, + "loss": 1.0754, + "step": 2418 + }, + { + "epoch": 0.3802786456798129, + "grad_norm": 0.22719772160053253, + "learning_rate": 4.570226268551971e-05, + "loss": 1.1745, + "step": 2419 + }, + { + "epoch": 0.38043585057674545, + "grad_norm": 0.1658662110567093, + "learning_rate": 4.5698798476936515e-05, + "loss": 1.1501, + "step": 2420 + }, + { + "epoch": 0.380593055473678, + "grad_norm": 0.20030854642391205, + "learning_rate": 4.5695333004149465e-05, + "loss": 1.1977, + "step": 2421 + }, + { + "epoch": 0.38075026037061055, + "grad_norm": 0.22993536293506622, + "learning_rate": 4.569186626737022e-05, + "loss": 1.1129, + "step": 2422 + }, + { + "epoch": 0.3809074652675431, + "grad_norm": 0.24522928893566132, + "learning_rate": 4.568839826681051e-05, + "loss": 1.133, + "step": 2423 + }, + { + "epoch": 0.38106467016447565, + "grad_norm": 0.1903398483991623, + "learning_rate": 4.568492900268214e-05, + "loss": 1.0693, + "step": 2424 + }, + { + "epoch": 0.38122187506140814, + "grad_norm": 0.3089503049850464, + "learning_rate": 4.568145847519702e-05, + "loss": 1.1383, + "step": 2425 + }, + { + "epoch": 0.3813790799583407, + "grad_norm": 0.1913723647594452, + "learning_rate": 4.5677986684567095e-05, + "loss": 1.1326, + "step": 2426 + }, + { + "epoch": 0.38153628485527324, + "grad_norm": 0.2117031067609787, + "learning_rate": 4.5674513631004424e-05, + "loss": 1.0813, + "step": 2427 + }, + { + "epoch": 0.3816934897522058, + "grad_norm": 0.21630004048347473, + "learning_rate": 4.567103931472112e-05, + "loss": 1.0788, + "step": 2428 + }, + { + "epoch": 0.38185069464913834, + "grad_norm": 0.22654882073402405, + "learning_rate": 4.566756373592938e-05, + "loss": 1.135, + "step": 2429 + }, + { + "epoch": 0.38200789954607084, + "grad_norm": 0.20162400603294373, + "learning_rate": 4.566408689484148e-05, + "loss": 1.1594, + "step": 2430 + }, + { + "epoch": 0.3821651044430034, + "grad_norm": 0.1734067052602768, + "learning_rate": 4.566060879166978e-05, + "loss": 1.1672, + "step": 2431 + }, + { + "epoch": 0.38232230933993594, + "grad_norm": 0.21130593121051788, + "learning_rate": 4.56571294266267e-05, + "loss": 1.1983, + "step": 2432 + }, + { + "epoch": 0.3824795142368685, + "grad_norm": 0.2882966697216034, + "learning_rate": 4.565364879992475e-05, + "loss": 1.0704, + "step": 2433 + }, + { + "epoch": 0.38263671913380104, + "grad_norm": 0.20934030413627625, + "learning_rate": 4.565016691177651e-05, + "loss": 1.0241, + "step": 2434 + }, + { + "epoch": 0.38279392403073353, + "grad_norm": 0.2850327789783478, + "learning_rate": 4.564668376239466e-05, + "loss": 1.0301, + "step": 2435 + }, + { + "epoch": 0.3829511289276661, + "grad_norm": 0.19671715795993805, + "learning_rate": 4.564319935199191e-05, + "loss": 1.1333, + "step": 2436 + }, + { + "epoch": 0.38310833382459863, + "grad_norm": 0.18312984704971313, + "learning_rate": 4.563971368078108e-05, + "loss": 1.203, + "step": 2437 + }, + { + "epoch": 0.3832655387215312, + "grad_norm": 0.1931515783071518, + "learning_rate": 4.563622674897507e-05, + "loss": 1.0835, + "step": 2438 + }, + { + "epoch": 0.38342274361846373, + "grad_norm": 0.24310877919197083, + "learning_rate": 4.563273855678685e-05, + "loss": 1.0888, + "step": 2439 + }, + { + "epoch": 0.38357994851539623, + "grad_norm": 0.20095118880271912, + "learning_rate": 4.562924910442946e-05, + "loss": 1.0833, + "step": 2440 + }, + { + "epoch": 0.3837371534123288, + "grad_norm": 0.25016844272613525, + "learning_rate": 4.5625758392116025e-05, + "loss": 1.0446, + "step": 2441 + }, + { + "epoch": 0.38389435830926133, + "grad_norm": 0.21593958139419556, + "learning_rate": 4.5622266420059745e-05, + "loss": 1.137, + "step": 2442 + }, + { + "epoch": 0.3840515632061939, + "grad_norm": 0.25021597743034363, + "learning_rate": 4.561877318847389e-05, + "loss": 1.2507, + "step": 2443 + }, + { + "epoch": 0.38420876810312643, + "grad_norm": 0.2984952926635742, + "learning_rate": 4.561527869757182e-05, + "loss": 1.0938, + "step": 2444 + }, + { + "epoch": 0.3843659730000589, + "grad_norm": 0.20226380228996277, + "learning_rate": 4.561178294756696e-05, + "loss": 1.1269, + "step": 2445 + }, + { + "epoch": 0.3845231778969915, + "grad_norm": 0.23726654052734375, + "learning_rate": 4.5608285938672826e-05, + "loss": 1.1787, + "step": 2446 + }, + { + "epoch": 0.384680382793924, + "grad_norm": 0.22613126039505005, + "learning_rate": 4.560478767110299e-05, + "loss": 1.1021, + "step": 2447 + }, + { + "epoch": 0.3848375876908566, + "grad_norm": 0.31989023089408875, + "learning_rate": 4.560128814507112e-05, + "loss": 1.0293, + "step": 2448 + }, + { + "epoch": 0.3849947925877891, + "grad_norm": 0.2094157338142395, + "learning_rate": 4.559778736079096e-05, + "loss": 1.1724, + "step": 2449 + }, + { + "epoch": 0.3851519974847217, + "grad_norm": 0.26532888412475586, + "learning_rate": 4.5594285318476315e-05, + "loss": 1.197, + "step": 2450 + }, + { + "epoch": 0.38530920238165417, + "grad_norm": 0.17318859696388245, + "learning_rate": 4.559078201834107e-05, + "loss": 1.1503, + "step": 2451 + }, + { + "epoch": 0.3854664072785867, + "grad_norm": 0.2698046863079071, + "learning_rate": 4.558727746059922e-05, + "loss": 1.0119, + "step": 2452 + }, + { + "epoch": 0.38562361217551927, + "grad_norm": 0.18497617542743683, + "learning_rate": 4.558377164546478e-05, + "loss": 1.0471, + "step": 2453 + }, + { + "epoch": 0.3857808170724518, + "grad_norm": 0.257929265499115, + "learning_rate": 4.558026457315188e-05, + "loss": 1.1298, + "step": 2454 + }, + { + "epoch": 0.38593802196938437, + "grad_norm": 0.17087925970554352, + "learning_rate": 4.557675624387473e-05, + "loss": 1.174, + "step": 2455 + }, + { + "epoch": 0.38609522686631687, + "grad_norm": 0.19346606731414795, + "learning_rate": 4.5573246657847595e-05, + "loss": 1.152, + "step": 2456 + }, + { + "epoch": 0.3862524317632494, + "grad_norm": 0.22339493036270142, + "learning_rate": 4.556973581528483e-05, + "loss": 1.0753, + "step": 2457 + }, + { + "epoch": 0.38640963666018197, + "grad_norm": 0.19931474328041077, + "learning_rate": 4.556622371640087e-05, + "loss": 1.1562, + "step": 2458 + }, + { + "epoch": 0.3865668415571145, + "grad_norm": 0.20702451467514038, + "learning_rate": 4.556271036141021e-05, + "loss": 1.1617, + "step": 2459 + }, + { + "epoch": 0.38672404645404707, + "grad_norm": 0.16566786170005798, + "learning_rate": 4.5559195750527436e-05, + "loss": 1.2268, + "step": 2460 + }, + { + "epoch": 0.38688125135097956, + "grad_norm": 0.18909968435764313, + "learning_rate": 4.5555679883967206e-05, + "loss": 1.2718, + "step": 2461 + }, + { + "epoch": 0.3870384562479121, + "grad_norm": 0.16908888518810272, + "learning_rate": 4.555216276194426e-05, + "loss": 1.2435, + "step": 2462 + }, + { + "epoch": 0.38719566114484466, + "grad_norm": 0.18960361182689667, + "learning_rate": 4.554864438467341e-05, + "loss": 1.081, + "step": 2463 + }, + { + "epoch": 0.3873528660417772, + "grad_norm": 0.18155843019485474, + "learning_rate": 4.5545124752369546e-05, + "loss": 1.2362, + "step": 2464 + }, + { + "epoch": 0.38751007093870976, + "grad_norm": 0.195807084441185, + "learning_rate": 4.554160386524763e-05, + "loss": 1.1104, + "step": 2465 + }, + { + "epoch": 0.38766727583564226, + "grad_norm": 0.36397311091423035, + "learning_rate": 4.55380817235227e-05, + "loss": 1.1308, + "step": 2466 + }, + { + "epoch": 0.3878244807325748, + "grad_norm": 0.28057196736335754, + "learning_rate": 4.553455832740989e-05, + "loss": 1.0053, + "step": 2467 + }, + { + "epoch": 0.38798168562950736, + "grad_norm": 0.24437671899795532, + "learning_rate": 4.553103367712438e-05, + "loss": 1.2225, + "step": 2468 + }, + { + "epoch": 0.3881388905264399, + "grad_norm": 0.25742071866989136, + "learning_rate": 4.5527507772881454e-05, + "loss": 1.1577, + "step": 2469 + }, + { + "epoch": 0.38829609542337246, + "grad_norm": 0.2540157735347748, + "learning_rate": 4.5523980614896445e-05, + "loss": 0.9798, + "step": 2470 + }, + { + "epoch": 0.38845330032030495, + "grad_norm": 0.21823832392692566, + "learning_rate": 4.5520452203384795e-05, + "loss": 1.0552, + "step": 2471 + }, + { + "epoch": 0.3886105052172375, + "grad_norm": 0.22293493151664734, + "learning_rate": 4.5516922538562e-05, + "loss": 1.2099, + "step": 2472 + }, + { + "epoch": 0.38876771011417005, + "grad_norm": 0.1513427495956421, + "learning_rate": 4.551339162064364e-05, + "loss": 1.2112, + "step": 2473 + }, + { + "epoch": 0.3889249150111026, + "grad_norm": 0.18423447012901306, + "learning_rate": 4.550985944984536e-05, + "loss": 1.2173, + "step": 2474 + }, + { + "epoch": 0.38908211990803515, + "grad_norm": 0.17693789303302765, + "learning_rate": 4.550632602638291e-05, + "loss": 1.2094, + "step": 2475 + }, + { + "epoch": 0.3892393248049677, + "grad_norm": 0.19538095593452454, + "learning_rate": 4.550279135047208e-05, + "loss": 1.1339, + "step": 2476 + }, + { + "epoch": 0.3893965297019002, + "grad_norm": 0.17086082696914673, + "learning_rate": 4.549925542232877e-05, + "loss": 1.0723, + "step": 2477 + }, + { + "epoch": 0.38955373459883275, + "grad_norm": 0.21500667929649353, + "learning_rate": 4.549571824216892e-05, + "loss": 1.0772, + "step": 2478 + }, + { + "epoch": 0.3897109394957653, + "grad_norm": 0.21435636281967163, + "learning_rate": 4.549217981020859e-05, + "loss": 1.0753, + "step": 2479 + }, + { + "epoch": 0.38986814439269785, + "grad_norm": 0.21024413406848907, + "learning_rate": 4.5488640126663883e-05, + "loss": 1.0733, + "step": 2480 + }, + { + "epoch": 0.3900253492896304, + "grad_norm": 0.18347331881523132, + "learning_rate": 4.5485099191751e-05, + "loss": 1.1209, + "step": 2481 + }, + { + "epoch": 0.3901825541865629, + "grad_norm": 0.2669816315174103, + "learning_rate": 4.548155700568619e-05, + "loss": 1.2356, + "step": 2482 + }, + { + "epoch": 0.39033975908349544, + "grad_norm": 0.24279122054576874, + "learning_rate": 4.547801356868581e-05, + "loss": 1.1473, + "step": 2483 + }, + { + "epoch": 0.390496963980428, + "grad_norm": 0.24355027079582214, + "learning_rate": 4.547446888096627e-05, + "loss": 1.1443, + "step": 2484 + }, + { + "epoch": 0.39065416887736054, + "grad_norm": 0.17095209658145905, + "learning_rate": 4.547092294274407e-05, + "loss": 1.1495, + "step": 2485 + }, + { + "epoch": 0.3908113737742931, + "grad_norm": 0.2063921093940735, + "learning_rate": 4.546737575423579e-05, + "loss": 1.0875, + "step": 2486 + }, + { + "epoch": 0.3909685786712256, + "grad_norm": 0.22583013772964478, + "learning_rate": 4.546382731565807e-05, + "loss": 1.1615, + "step": 2487 + }, + { + "epoch": 0.39112578356815814, + "grad_norm": 0.1564696729183197, + "learning_rate": 4.546027762722763e-05, + "loss": 1.1193, + "step": 2488 + }, + { + "epoch": 0.3912829884650907, + "grad_norm": 0.23818811774253845, + "learning_rate": 4.5456726689161285e-05, + "loss": 1.2302, + "step": 2489 + }, + { + "epoch": 0.39144019336202324, + "grad_norm": 0.21855293214321136, + "learning_rate": 4.545317450167591e-05, + "loss": 1.1537, + "step": 2490 + }, + { + "epoch": 0.3915973982589558, + "grad_norm": 0.18843285739421844, + "learning_rate": 4.544962106498846e-05, + "loss": 1.1447, + "step": 2491 + }, + { + "epoch": 0.3917546031558883, + "grad_norm": 0.27255892753601074, + "learning_rate": 4.544606637931594e-05, + "loss": 1.1954, + "step": 2492 + }, + { + "epoch": 0.39191180805282083, + "grad_norm": 0.22189125418663025, + "learning_rate": 4.54425104448755e-05, + "loss": 1.1074, + "step": 2493 + }, + { + "epoch": 0.3920690129497534, + "grad_norm": 0.20159167051315308, + "learning_rate": 4.5438953261884286e-05, + "loss": 1.1248, + "step": 2494 + }, + { + "epoch": 0.39222621784668593, + "grad_norm": 0.22371767461299896, + "learning_rate": 4.543539483055958e-05, + "loss": 1.0238, + "step": 2495 + }, + { + "epoch": 0.3923834227436185, + "grad_norm": 0.21290668845176697, + "learning_rate": 4.5431835151118704e-05, + "loss": 1.1483, + "step": 2496 + }, + { + "epoch": 0.392540627640551, + "grad_norm": 0.21544845402240753, + "learning_rate": 4.542827422377908e-05, + "loss": 1.1274, + "step": 2497 + }, + { + "epoch": 0.3926978325374835, + "grad_norm": 0.2112491875886917, + "learning_rate": 4.542471204875819e-05, + "loss": 1.1891, + "step": 2498 + }, + { + "epoch": 0.3928550374344161, + "grad_norm": 0.25035732984542847, + "learning_rate": 4.54211486262736e-05, + "loss": 1.0903, + "step": 2499 + }, + { + "epoch": 0.3930122423313486, + "grad_norm": 0.27401211857795715, + "learning_rate": 4.541758395654294e-05, + "loss": 1.1587, + "step": 2500 + }, + { + "epoch": 0.3931694472282812, + "grad_norm": 0.1791762411594391, + "learning_rate": 4.541401803978394e-05, + "loss": 1.1841, + "step": 2501 + }, + { + "epoch": 0.3933266521252137, + "grad_norm": 0.2872371971607208, + "learning_rate": 4.54104508762144e-05, + "loss": 1.1494, + "step": 2502 + }, + { + "epoch": 0.3934838570221462, + "grad_norm": 0.2522941529750824, + "learning_rate": 4.540688246605217e-05, + "loss": 1.0157, + "step": 2503 + }, + { + "epoch": 0.39364106191907877, + "grad_norm": 0.19489054381847382, + "learning_rate": 4.5403312809515194e-05, + "loss": 1.1834, + "step": 2504 + }, + { + "epoch": 0.3937982668160113, + "grad_norm": 0.22451898455619812, + "learning_rate": 4.539974190682151e-05, + "loss": 1.057, + "step": 2505 + }, + { + "epoch": 0.39395547171294387, + "grad_norm": 0.22620606422424316, + "learning_rate": 4.539616975818921e-05, + "loss": 1.1665, + "step": 2506 + }, + { + "epoch": 0.3941126766098764, + "grad_norm": 0.1968408226966858, + "learning_rate": 4.539259636383646e-05, + "loss": 1.1033, + "step": 2507 + }, + { + "epoch": 0.3942698815068089, + "grad_norm": 0.207768514752388, + "learning_rate": 4.538902172398151e-05, + "loss": 1.1113, + "step": 2508 + }, + { + "epoch": 0.39442708640374147, + "grad_norm": 0.20376287400722504, + "learning_rate": 4.538544583884269e-05, + "loss": 1.1207, + "step": 2509 + }, + { + "epoch": 0.394584291300674, + "grad_norm": 0.22011759877204895, + "learning_rate": 4.5381868708638395e-05, + "loss": 1.1046, + "step": 2510 + }, + { + "epoch": 0.39474149619760657, + "grad_norm": 0.1497177630662918, + "learning_rate": 4.537829033358711e-05, + "loss": 1.2215, + "step": 2511 + }, + { + "epoch": 0.3948987010945391, + "grad_norm": 0.17269910871982574, + "learning_rate": 4.5374710713907386e-05, + "loss": 1.2025, + "step": 2512 + }, + { + "epoch": 0.3950559059914716, + "grad_norm": 0.24141822755336761, + "learning_rate": 4.537112984981785e-05, + "loss": 1.2037, + "step": 2513 + }, + { + "epoch": 0.39521311088840416, + "grad_norm": 0.2416863590478897, + "learning_rate": 4.536754774153722e-05, + "loss": 1.2009, + "step": 2514 + }, + { + "epoch": 0.3953703157853367, + "grad_norm": 0.19514837861061096, + "learning_rate": 4.536396438928426e-05, + "loss": 1.0463, + "step": 2515 + }, + { + "epoch": 0.39552752068226926, + "grad_norm": 0.19753624498844147, + "learning_rate": 4.536037979327783e-05, + "loss": 1.1548, + "step": 2516 + }, + { + "epoch": 0.3956847255792018, + "grad_norm": 0.23067662119865417, + "learning_rate": 4.535679395373687e-05, + "loss": 1.1382, + "step": 2517 + }, + { + "epoch": 0.3958419304761343, + "grad_norm": 0.28596460819244385, + "learning_rate": 4.53532068708804e-05, + "loss": 1.1092, + "step": 2518 + }, + { + "epoch": 0.39599913537306686, + "grad_norm": 0.2753187417984009, + "learning_rate": 4.5349618544927486e-05, + "loss": 1.0776, + "step": 2519 + }, + { + "epoch": 0.3961563402699994, + "grad_norm": 0.20237961411476135, + "learning_rate": 4.534602897609729e-05, + "loss": 1.1795, + "step": 2520 + }, + { + "epoch": 0.39631354516693196, + "grad_norm": 0.21416132152080536, + "learning_rate": 4.534243816460906e-05, + "loss": 1.042, + "step": 2521 + }, + { + "epoch": 0.3964707500638645, + "grad_norm": 0.15869580209255219, + "learning_rate": 4.5338846110682106e-05, + "loss": 1.1952, + "step": 2522 + }, + { + "epoch": 0.396627954960797, + "grad_norm": 0.19368652999401093, + "learning_rate": 4.533525281453582e-05, + "loss": 1.0974, + "step": 2523 + }, + { + "epoch": 0.39678515985772955, + "grad_norm": 0.19327567517757416, + "learning_rate": 4.533165827638965e-05, + "loss": 1.2733, + "step": 2524 + }, + { + "epoch": 0.3969423647546621, + "grad_norm": 0.25186845660209656, + "learning_rate": 4.532806249646316e-05, + "loss": 1.2386, + "step": 2525 + }, + { + "epoch": 0.39709956965159465, + "grad_norm": 0.22584927082061768, + "learning_rate": 4.5324465474975955e-05, + "loss": 1.0643, + "step": 2526 + }, + { + "epoch": 0.3972567745485272, + "grad_norm": 0.19394904375076294, + "learning_rate": 4.532086721214773e-05, + "loss": 1.2075, + "step": 2527 + }, + { + "epoch": 0.39741397944545975, + "grad_norm": 0.20700658857822418, + "learning_rate": 4.531726770819825e-05, + "loss": 1.1718, + "step": 2528 + }, + { + "epoch": 0.39757118434239225, + "grad_norm": 0.19494110345840454, + "learning_rate": 4.5313666963347356e-05, + "loss": 1.0573, + "step": 2529 + }, + { + "epoch": 0.3977283892393248, + "grad_norm": 0.2095107138156891, + "learning_rate": 4.5310064977814977e-05, + "loss": 1.1195, + "step": 2530 + }, + { + "epoch": 0.39788559413625735, + "grad_norm": 0.22246003150939941, + "learning_rate": 4.530646175182111e-05, + "loss": 1.152, + "step": 2531 + }, + { + "epoch": 0.3980427990331899, + "grad_norm": 0.16814185678958893, + "learning_rate": 4.53028572855858e-05, + "loss": 1.1613, + "step": 2532 + }, + { + "epoch": 0.39820000393012245, + "grad_norm": 0.2411145269870758, + "learning_rate": 4.529925157932923e-05, + "loss": 1.0908, + "step": 2533 + }, + { + "epoch": 0.39835720882705494, + "grad_norm": 0.28689107298851013, + "learning_rate": 4.529564463327161e-05, + "loss": 1.0446, + "step": 2534 + }, + { + "epoch": 0.3985144137239875, + "grad_norm": 0.1856895238161087, + "learning_rate": 4.529203644763322e-05, + "loss": 1.2297, + "step": 2535 + }, + { + "epoch": 0.39867161862092004, + "grad_norm": 0.2174742966890335, + "learning_rate": 4.528842702263446e-05, + "loss": 1.1054, + "step": 2536 + }, + { + "epoch": 0.3988288235178526, + "grad_norm": 0.2260787934064865, + "learning_rate": 4.528481635849577e-05, + "loss": 1.0361, + "step": 2537 + }, + { + "epoch": 0.39898602841478514, + "grad_norm": 0.2079022079706192, + "learning_rate": 4.5281204455437676e-05, + "loss": 1.096, + "step": 2538 + }, + { + "epoch": 0.39914323331171764, + "grad_norm": 0.17258255183696747, + "learning_rate": 4.527759131368078e-05, + "loss": 1.1946, + "step": 2539 + }, + { + "epoch": 0.3993004382086502, + "grad_norm": 0.19373294711112976, + "learning_rate": 4.527397693344575e-05, + "loss": 1.089, + "step": 2540 + }, + { + "epoch": 0.39945764310558274, + "grad_norm": 0.17514479160308838, + "learning_rate": 4.527036131495336e-05, + "loss": 1.089, + "step": 2541 + }, + { + "epoch": 0.3996148480025153, + "grad_norm": 0.2525259852409363, + "learning_rate": 4.5266744458424414e-05, + "loss": 1.0712, + "step": 2542 + }, + { + "epoch": 0.39977205289944784, + "grad_norm": 0.19268137216567993, + "learning_rate": 4.5263126364079834e-05, + "loss": 1.1875, + "step": 2543 + }, + { + "epoch": 0.39992925779638033, + "grad_norm": 0.20745940506458282, + "learning_rate": 4.525950703214058e-05, + "loss": 1.0354, + "step": 2544 + }, + { + "epoch": 0.4000864626933129, + "grad_norm": 0.193342387676239, + "learning_rate": 4.525588646282773e-05, + "loss": 1.1441, + "step": 2545 + }, + { + "epoch": 0.40024366759024543, + "grad_norm": 0.1741735190153122, + "learning_rate": 4.52522646563624e-05, + "loss": 1.1172, + "step": 2546 + }, + { + "epoch": 0.400400872487178, + "grad_norm": 0.15238472819328308, + "learning_rate": 4.52486416129658e-05, + "loss": 1.1061, + "step": 2547 + }, + { + "epoch": 0.40055807738411053, + "grad_norm": 0.2244487702846527, + "learning_rate": 4.5245017332859206e-05, + "loss": 1.0468, + "step": 2548 + }, + { + "epoch": 0.40071528228104303, + "grad_norm": 0.19981707632541656, + "learning_rate": 4.5241391816263986e-05, + "loss": 1.0995, + "step": 2549 + }, + { + "epoch": 0.4008724871779756, + "grad_norm": 0.20777346193790436, + "learning_rate": 4.523776506340157e-05, + "loss": 1.0742, + "step": 2550 + }, + { + "epoch": 0.40102969207490813, + "grad_norm": 0.24414701759815216, + "learning_rate": 4.523413707449345e-05, + "loss": 1.1748, + "step": 2551 + }, + { + "epoch": 0.4011868969718407, + "grad_norm": 0.20141269266605377, + "learning_rate": 4.523050784976124e-05, + "loss": 1.0779, + "step": 2552 + }, + { + "epoch": 0.40134410186877323, + "grad_norm": 0.21372921764850616, + "learning_rate": 4.522687738942658e-05, + "loss": 1.0947, + "step": 2553 + }, + { + "epoch": 0.4015013067657058, + "grad_norm": 0.3348469138145447, + "learning_rate": 4.5223245693711196e-05, + "loss": 0.9531, + "step": 2554 + }, + { + "epoch": 0.4016585116626383, + "grad_norm": 0.2114667445421219, + "learning_rate": 4.521961276283691e-05, + "loss": 1.2247, + "step": 2555 + }, + { + "epoch": 0.4018157165595708, + "grad_norm": 0.20499029755592346, + "learning_rate": 4.521597859702562e-05, + "loss": 1.1545, + "step": 2556 + }, + { + "epoch": 0.4019729214565034, + "grad_norm": 0.21829423308372498, + "learning_rate": 4.521234319649927e-05, + "loss": 1.1829, + "step": 2557 + }, + { + "epoch": 0.4021301263534359, + "grad_norm": 0.1649905890226364, + "learning_rate": 4.5208706561479895e-05, + "loss": 1.0656, + "step": 2558 + }, + { + "epoch": 0.4022873312503685, + "grad_norm": 0.21117526292800903, + "learning_rate": 4.5205068692189617e-05, + "loss": 1.0623, + "step": 2559 + }, + { + "epoch": 0.40244453614730097, + "grad_norm": 0.18778620660305023, + "learning_rate": 4.520142958885062e-05, + "loss": 1.1203, + "step": 2560 + }, + { + "epoch": 0.40244453614730097, + "eval_loss": 1.1262409687042236, + "eval_runtime": 2319.1512, + "eval_samples_per_second": 3.992, + "eval_steps_per_second": 1.996, + "step": 2560 + }, + { + "epoch": 0.4026017410442335, + "grad_norm": 0.17170308530330658, + "learning_rate": 4.519778925168516e-05, + "loss": 1.0933, + "step": 2561 + }, + { + "epoch": 0.40275894594116607, + "grad_norm": 0.21042533218860626, + "learning_rate": 4.519414768091558e-05, + "loss": 1.1722, + "step": 2562 + }, + { + "epoch": 0.4029161508380986, + "grad_norm": 0.17878204584121704, + "learning_rate": 4.5190504876764296e-05, + "loss": 1.1351, + "step": 2563 + }, + { + "epoch": 0.40307335573503117, + "grad_norm": 0.15718936920166016, + "learning_rate": 4.5186860839453795e-05, + "loss": 0.9882, + "step": 2564 + }, + { + "epoch": 0.40323056063196366, + "grad_norm": 0.17324978113174438, + "learning_rate": 4.518321556920664e-05, + "loss": 1.1786, + "step": 2565 + }, + { + "epoch": 0.4033877655288962, + "grad_norm": 0.2370029240846634, + "learning_rate": 4.517956906624546e-05, + "loss": 1.1443, + "step": 2566 + }, + { + "epoch": 0.40354497042582876, + "grad_norm": 0.209358811378479, + "learning_rate": 4.517592133079299e-05, + "loss": 1.1474, + "step": 2567 + }, + { + "epoch": 0.4037021753227613, + "grad_norm": 0.23134967684745789, + "learning_rate": 4.517227236307201e-05, + "loss": 1.1006, + "step": 2568 + }, + { + "epoch": 0.40385938021969386, + "grad_norm": 0.2563744783401489, + "learning_rate": 4.5168622163305384e-05, + "loss": 1.095, + "step": 2569 + }, + { + "epoch": 0.40401658511662636, + "grad_norm": 0.19634851813316345, + "learning_rate": 4.516497073171605e-05, + "loss": 1.049, + "step": 2570 + }, + { + "epoch": 0.4041737900135589, + "grad_norm": 0.15794876217842102, + "learning_rate": 4.5161318068527025e-05, + "loss": 1.2387, + "step": 2571 + }, + { + "epoch": 0.40433099491049146, + "grad_norm": 0.1933317631483078, + "learning_rate": 4.515766417396141e-05, + "loss": 1.1564, + "step": 2572 + }, + { + "epoch": 0.404488199807424, + "grad_norm": 0.22028346359729767, + "learning_rate": 4.5154009048242355e-05, + "loss": 1.2238, + "step": 2573 + }, + { + "epoch": 0.40464540470435656, + "grad_norm": 0.19432009756565094, + "learning_rate": 4.515035269159311e-05, + "loss": 1.1126, + "step": 2574 + }, + { + "epoch": 0.40480260960128905, + "grad_norm": 0.16745704412460327, + "learning_rate": 4.5146695104236985e-05, + "loss": 1.1979, + "step": 2575 + }, + { + "epoch": 0.4049598144982216, + "grad_norm": 0.23030900955200195, + "learning_rate": 4.514303628639738e-05, + "loss": 1.0651, + "step": 2576 + }, + { + "epoch": 0.40511701939515415, + "grad_norm": 0.1960119605064392, + "learning_rate": 4.513937623829776e-05, + "loss": 1.1086, + "step": 2577 + }, + { + "epoch": 0.4052742242920867, + "grad_norm": 0.17087821662425995, + "learning_rate": 4.513571496016166e-05, + "loss": 1.1907, + "step": 2578 + }, + { + "epoch": 0.40543142918901925, + "grad_norm": 0.24511630833148956, + "learning_rate": 4.5132052452212706e-05, + "loss": 1.0378, + "step": 2579 + }, + { + "epoch": 0.4055886340859518, + "grad_norm": 0.2701500356197357, + "learning_rate": 4.512838871467458e-05, + "loss": 1.0078, + "step": 2580 + }, + { + "epoch": 0.4057458389828843, + "grad_norm": 0.18003912270069122, + "learning_rate": 4.512472374777106e-05, + "loss": 1.1813, + "step": 2581 + }, + { + "epoch": 0.40590304387981685, + "grad_norm": 0.19022230803966522, + "learning_rate": 4.512105755172599e-05, + "loss": 1.105, + "step": 2582 + }, + { + "epoch": 0.4060602487767494, + "grad_norm": 0.19312334060668945, + "learning_rate": 4.5117390126763273e-05, + "loss": 1.1694, + "step": 2583 + }, + { + "epoch": 0.40621745367368195, + "grad_norm": 0.15560056269168854, + "learning_rate": 4.5113721473106904e-05, + "loss": 1.2581, + "step": 2584 + }, + { + "epoch": 0.4063746585706145, + "grad_norm": 0.27965423464775085, + "learning_rate": 4.511005159098096e-05, + "loss": 1.079, + "step": 2585 + }, + { + "epoch": 0.406531863467547, + "grad_norm": 0.1963738203048706, + "learning_rate": 4.5106380480609575e-05, + "loss": 1.1422, + "step": 2586 + }, + { + "epoch": 0.40668906836447954, + "grad_norm": 0.2169744223356247, + "learning_rate": 4.5102708142216974e-05, + "loss": 1.1722, + "step": 2587 + }, + { + "epoch": 0.4068462732614121, + "grad_norm": 0.21945542097091675, + "learning_rate": 4.509903457602744e-05, + "loss": 1.0711, + "step": 2588 + }, + { + "epoch": 0.40700347815834464, + "grad_norm": 0.16455252468585968, + "learning_rate": 4.5095359782265355e-05, + "loss": 1.2068, + "step": 2589 + }, + { + "epoch": 0.4071606830552772, + "grad_norm": 0.18729642033576965, + "learning_rate": 4.5091683761155144e-05, + "loss": 1.1389, + "step": 2590 + }, + { + "epoch": 0.4073178879522097, + "grad_norm": 0.15742127597332, + "learning_rate": 4.508800651292134e-05, + "loss": 1.1224, + "step": 2591 + }, + { + "epoch": 0.40747509284914224, + "grad_norm": 0.2024814486503601, + "learning_rate": 4.5084328037788526e-05, + "loss": 1.1028, + "step": 2592 + }, + { + "epoch": 0.4076322977460748, + "grad_norm": 0.25306496024131775, + "learning_rate": 4.5080648335981366e-05, + "loss": 1.1402, + "step": 2593 + }, + { + "epoch": 0.40778950264300734, + "grad_norm": 0.22500720620155334, + "learning_rate": 4.5076967407724614e-05, + "loss": 1.0391, + "step": 2594 + }, + { + "epoch": 0.4079467075399399, + "grad_norm": 0.22207467257976532, + "learning_rate": 4.507328525324308e-05, + "loss": 1.1483, + "step": 2595 + }, + { + "epoch": 0.4081039124368724, + "grad_norm": 0.16076889634132385, + "learning_rate": 4.506960187276164e-05, + "loss": 1.1517, + "step": 2596 + }, + { + "epoch": 0.40826111733380493, + "grad_norm": 0.19102972745895386, + "learning_rate": 4.50659172665053e-05, + "loss": 1.0793, + "step": 2597 + }, + { + "epoch": 0.4084183222307375, + "grad_norm": 0.19929270446300507, + "learning_rate": 4.506223143469908e-05, + "loss": 1.1269, + "step": 2598 + }, + { + "epoch": 0.40857552712767004, + "grad_norm": 0.2082059532403946, + "learning_rate": 4.5058544377568076e-05, + "loss": 1.0286, + "step": 2599 + }, + { + "epoch": 0.4087327320246026, + "grad_norm": 0.21476319432258606, + "learning_rate": 4.5054856095337516e-05, + "loss": 1.1064, + "step": 2600 + }, + { + "epoch": 0.4088899369215351, + "grad_norm": 0.20130178332328796, + "learning_rate": 4.505116658823264e-05, + "loss": 1.0551, + "step": 2601 + }, + { + "epoch": 0.40904714181846763, + "grad_norm": 0.16284184157848358, + "learning_rate": 4.5047475856478805e-05, + "loss": 1.1272, + "step": 2602 + }, + { + "epoch": 0.4092043467154002, + "grad_norm": 0.1783224493265152, + "learning_rate": 4.504378390030142e-05, + "loss": 1.0616, + "step": 2603 + }, + { + "epoch": 0.40936155161233273, + "grad_norm": 0.18486501276493073, + "learning_rate": 4.504009071992597e-05, + "loss": 1.2219, + "step": 2604 + }, + { + "epoch": 0.4095187565092653, + "grad_norm": 0.17707344889640808, + "learning_rate": 4.503639631557803e-05, + "loss": 0.9766, + "step": 2605 + }, + { + "epoch": 0.40967596140619783, + "grad_norm": 0.23095828294754028, + "learning_rate": 4.503270068748324e-05, + "loss": 1.1355, + "step": 2606 + }, + { + "epoch": 0.4098331663031303, + "grad_norm": 0.21981950104236603, + "learning_rate": 4.5029003835867305e-05, + "loss": 1.1692, + "step": 2607 + }, + { + "epoch": 0.4099903712000629, + "grad_norm": 0.28627362847328186, + "learning_rate": 4.502530576095603e-05, + "loss": 1.1312, + "step": 2608 + }, + { + "epoch": 0.4101475760969954, + "grad_norm": 0.22414131462574005, + "learning_rate": 4.502160646297526e-05, + "loss": 1.1141, + "step": 2609 + }, + { + "epoch": 0.410304780993928, + "grad_norm": 0.2514089047908783, + "learning_rate": 4.501790594215095e-05, + "loss": 1.1178, + "step": 2610 + }, + { + "epoch": 0.4104619858908605, + "grad_norm": 0.21636080741882324, + "learning_rate": 4.50142041987091e-05, + "loss": 1.1528, + "step": 2611 + }, + { + "epoch": 0.410619190787793, + "grad_norm": 0.14287950098514557, + "learning_rate": 4.501050123287582e-05, + "loss": 1.0704, + "step": 2612 + }, + { + "epoch": 0.41077639568472557, + "grad_norm": 0.2378457635641098, + "learning_rate": 4.5006797044877245e-05, + "loss": 1.295, + "step": 2613 + }, + { + "epoch": 0.4109336005816581, + "grad_norm": 0.23281626403331757, + "learning_rate": 4.500309163493964e-05, + "loss": 1.111, + "step": 2614 + }, + { + "epoch": 0.41109080547859067, + "grad_norm": 0.20313280820846558, + "learning_rate": 4.49993850032893e-05, + "loss": 1.1184, + "step": 2615 + }, + { + "epoch": 0.4112480103755232, + "grad_norm": 0.18402956426143646, + "learning_rate": 4.499567715015262e-05, + "loss": 1.127, + "step": 2616 + }, + { + "epoch": 0.4114052152724557, + "grad_norm": 0.15802954137325287, + "learning_rate": 4.499196807575605e-05, + "loss": 1.108, + "step": 2617 + }, + { + "epoch": 0.41156242016938827, + "grad_norm": 0.2320290356874466, + "learning_rate": 4.498825778032615e-05, + "loss": 1.1369, + "step": 2618 + }, + { + "epoch": 0.4117196250663208, + "grad_norm": 0.19282428920269012, + "learning_rate": 4.49845462640895e-05, + "loss": 1.1828, + "step": 2619 + }, + { + "epoch": 0.41187682996325337, + "grad_norm": 0.18736015260219574, + "learning_rate": 4.4980833527272804e-05, + "loss": 1.1012, + "step": 2620 + }, + { + "epoch": 0.4120340348601859, + "grad_norm": 0.13990436494350433, + "learning_rate": 4.497711957010282e-05, + "loss": 1.0997, + "step": 2621 + }, + { + "epoch": 0.4121912397571184, + "grad_norm": 0.1439363956451416, + "learning_rate": 4.497340439280638e-05, + "loss": 1.1568, + "step": 2622 + }, + { + "epoch": 0.41234844465405096, + "grad_norm": 0.23712140321731567, + "learning_rate": 4.49696879956104e-05, + "loss": 1.2776, + "step": 2623 + }, + { + "epoch": 0.4125056495509835, + "grad_norm": 0.20338992774486542, + "learning_rate": 4.4965970378741845e-05, + "loss": 1.2083, + "step": 2624 + }, + { + "epoch": 0.41266285444791606, + "grad_norm": 0.1320551484823227, + "learning_rate": 4.496225154242779e-05, + "loss": 1.1028, + "step": 2625 + }, + { + "epoch": 0.4128200593448486, + "grad_norm": 0.20532973110675812, + "learning_rate": 4.495853148689536e-05, + "loss": 1.133, + "step": 2626 + }, + { + "epoch": 0.4129772642417811, + "grad_norm": 0.20016834139823914, + "learning_rate": 4.4954810212371766e-05, + "loss": 1.1123, + "step": 2627 + }, + { + "epoch": 0.41313446913871366, + "grad_norm": 0.22745396196842194, + "learning_rate": 4.495108771908429e-05, + "loss": 1.0813, + "step": 2628 + }, + { + "epoch": 0.4132916740356462, + "grad_norm": 0.1960359364748001, + "learning_rate": 4.494736400726029e-05, + "loss": 1.1536, + "step": 2629 + }, + { + "epoch": 0.41344887893257876, + "grad_norm": 0.22357088327407837, + "learning_rate": 4.494363907712719e-05, + "loss": 1.0777, + "step": 2630 + }, + { + "epoch": 0.4136060838295113, + "grad_norm": 0.19476816058158875, + "learning_rate": 4.4939912928912484e-05, + "loss": 1.1019, + "step": 2631 + }, + { + "epoch": 0.41376328872644386, + "grad_norm": 0.16308310627937317, + "learning_rate": 4.493618556284377e-05, + "loss": 1.1504, + "step": 2632 + }, + { + "epoch": 0.41392049362337635, + "grad_norm": 0.23933067917823792, + "learning_rate": 4.493245697914869e-05, + "loss": 1.1173, + "step": 2633 + }, + { + "epoch": 0.4140776985203089, + "grad_norm": 0.19607731699943542, + "learning_rate": 4.492872717805498e-05, + "loss": 1.1738, + "step": 2634 + }, + { + "epoch": 0.41423490341724145, + "grad_norm": 0.1769096553325653, + "learning_rate": 4.492499615979043e-05, + "loss": 1.1922, + "step": 2635 + }, + { + "epoch": 0.414392108314174, + "grad_norm": 0.2839674949645996, + "learning_rate": 4.492126392458293e-05, + "loss": 1.1784, + "step": 2636 + }, + { + "epoch": 0.41454931321110655, + "grad_norm": 0.22033485770225525, + "learning_rate": 4.491753047266043e-05, + "loss": 1.1593, + "step": 2637 + }, + { + "epoch": 0.41470651810803905, + "grad_norm": 0.23080600798130035, + "learning_rate": 4.491379580425095e-05, + "loss": 1.0708, + "step": 2638 + }, + { + "epoch": 0.4148637230049716, + "grad_norm": 0.19914209842681885, + "learning_rate": 4.491005991958258e-05, + "loss": 1.1696, + "step": 2639 + }, + { + "epoch": 0.41502092790190415, + "grad_norm": 0.1926482617855072, + "learning_rate": 4.490632281888351e-05, + "loss": 1.1159, + "step": 2640 + }, + { + "epoch": 0.4151781327988367, + "grad_norm": 0.20513269305229187, + "learning_rate": 4.4902584502381986e-05, + "loss": 1.0759, + "step": 2641 + }, + { + "epoch": 0.41533533769576925, + "grad_norm": 0.19525010883808136, + "learning_rate": 4.4898844970306306e-05, + "loss": 1.1967, + "step": 2642 + }, + { + "epoch": 0.41549254259270174, + "grad_norm": 0.2566875219345093, + "learning_rate": 4.48951042228849e-05, + "loss": 1.148, + "step": 2643 + }, + { + "epoch": 0.4156497474896343, + "grad_norm": 0.23048824071884155, + "learning_rate": 4.4891362260346226e-05, + "loss": 1.2564, + "step": 2644 + }, + { + "epoch": 0.41580695238656684, + "grad_norm": 0.22120051085948944, + "learning_rate": 4.488761908291882e-05, + "loss": 1.2008, + "step": 2645 + }, + { + "epoch": 0.4159641572834994, + "grad_norm": 0.2097010314464569, + "learning_rate": 4.488387469083131e-05, + "loss": 1.1148, + "step": 2646 + }, + { + "epoch": 0.41612136218043194, + "grad_norm": 0.18486234545707703, + "learning_rate": 4.488012908431239e-05, + "loss": 1.0886, + "step": 2647 + }, + { + "epoch": 0.41627856707736444, + "grad_norm": 0.1906927525997162, + "learning_rate": 4.487638226359082e-05, + "loss": 1.1823, + "step": 2648 + }, + { + "epoch": 0.416435771974297, + "grad_norm": 0.2732648253440857, + "learning_rate": 4.487263422889545e-05, + "loss": 1.0555, + "step": 2649 + }, + { + "epoch": 0.41659297687122954, + "grad_norm": 0.1381736546754837, + "learning_rate": 4.486888498045519e-05, + "loss": 1.1222, + "step": 2650 + }, + { + "epoch": 0.4167501817681621, + "grad_norm": 0.13629379868507385, + "learning_rate": 4.486513451849903e-05, + "loss": 1.0836, + "step": 2651 + }, + { + "epoch": 0.41690738666509464, + "grad_norm": 0.18362641334533691, + "learning_rate": 4.4861382843256035e-05, + "loss": 1.2531, + "step": 2652 + }, + { + "epoch": 0.41706459156202713, + "grad_norm": 0.24584434926509857, + "learning_rate": 4.485762995495534e-05, + "loss": 1.1816, + "step": 2653 + }, + { + "epoch": 0.4172217964589597, + "grad_norm": 0.21411539614200592, + "learning_rate": 4.485387585382617e-05, + "loss": 1.1371, + "step": 2654 + }, + { + "epoch": 0.41737900135589223, + "grad_norm": 0.18241286277770996, + "learning_rate": 4.485012054009779e-05, + "loss": 1.1405, + "step": 2655 + }, + { + "epoch": 0.4175362062528248, + "grad_norm": 0.18648092448711395, + "learning_rate": 4.4846364013999584e-05, + "loss": 1.085, + "step": 2656 + }, + { + "epoch": 0.41769341114975733, + "grad_norm": 0.19255079329013824, + "learning_rate": 4.484260627576098e-05, + "loss": 1.0671, + "step": 2657 + }, + { + "epoch": 0.4178506160466898, + "grad_norm": 0.17891213297843933, + "learning_rate": 4.483884732561147e-05, + "loss": 1.1641, + "step": 2658 + }, + { + "epoch": 0.4180078209436224, + "grad_norm": 0.26097598671913147, + "learning_rate": 4.483508716378064e-05, + "loss": 1.0732, + "step": 2659 + }, + { + "epoch": 0.4181650258405549, + "grad_norm": 0.16144752502441406, + "learning_rate": 4.483132579049817e-05, + "loss": 1.1039, + "step": 2660 + }, + { + "epoch": 0.4183222307374875, + "grad_norm": 0.3161677420139313, + "learning_rate": 4.482756320599376e-05, + "loss": 1.1283, + "step": 2661 + }, + { + "epoch": 0.41847943563442, + "grad_norm": 0.2448277771472931, + "learning_rate": 4.482379941049724e-05, + "loss": 1.1227, + "step": 2662 + }, + { + "epoch": 0.4186366405313526, + "grad_norm": 0.23270602524280548, + "learning_rate": 4.482003440423847e-05, + "loss": 1.0057, + "step": 2663 + }, + { + "epoch": 0.41879384542828507, + "grad_norm": 0.2527409791946411, + "learning_rate": 4.481626818744741e-05, + "loss": 1.0883, + "step": 2664 + }, + { + "epoch": 0.4189510503252176, + "grad_norm": 0.2619737982749939, + "learning_rate": 4.481250076035408e-05, + "loss": 1.0826, + "step": 2665 + }, + { + "epoch": 0.4191082552221502, + "grad_norm": 0.16641849279403687, + "learning_rate": 4.4808732123188593e-05, + "loss": 1.0928, + "step": 2666 + }, + { + "epoch": 0.4192654601190827, + "grad_norm": 0.22881364822387695, + "learning_rate": 4.4804962276181115e-05, + "loss": 1.2395, + "step": 2667 + }, + { + "epoch": 0.4194226650160153, + "grad_norm": 0.1922225058078766, + "learning_rate": 4.4801191219561886e-05, + "loss": 1.125, + "step": 2668 + }, + { + "epoch": 0.41957986991294777, + "grad_norm": 0.16535848379135132, + "learning_rate": 4.479741895356124e-05, + "loss": 1.1668, + "step": 2669 + }, + { + "epoch": 0.4197370748098803, + "grad_norm": 0.1907588392496109, + "learning_rate": 4.4793645478409576e-05, + "loss": 1.1296, + "step": 2670 + }, + { + "epoch": 0.41989427970681287, + "grad_norm": 0.19568997621536255, + "learning_rate": 4.4789870794337355e-05, + "loss": 1.0744, + "step": 2671 + }, + { + "epoch": 0.4200514846037454, + "grad_norm": 0.2092171162366867, + "learning_rate": 4.478609490157512e-05, + "loss": 1.1545, + "step": 2672 + }, + { + "epoch": 0.42020868950067797, + "grad_norm": 0.18425819277763367, + "learning_rate": 4.4782317800353476e-05, + "loss": 1.1108, + "step": 2673 + }, + { + "epoch": 0.42036589439761046, + "grad_norm": 0.16294459998607635, + "learning_rate": 4.477853949090314e-05, + "loss": 1.1444, + "step": 2674 + }, + { + "epoch": 0.420523099294543, + "grad_norm": 0.1816370189189911, + "learning_rate": 4.477475997345486e-05, + "loss": 1.1692, + "step": 2675 + }, + { + "epoch": 0.42068030419147556, + "grad_norm": 0.1853678822517395, + "learning_rate": 4.477097924823948e-05, + "loss": 1.1799, + "step": 2676 + }, + { + "epoch": 0.4208375090884081, + "grad_norm": 0.19207173585891724, + "learning_rate": 4.476719731548792e-05, + "loss": 1.1308, + "step": 2677 + }, + { + "epoch": 0.42099471398534066, + "grad_norm": 0.18002179265022278, + "learning_rate": 4.4763414175431146e-05, + "loss": 1.1212, + "step": 2678 + }, + { + "epoch": 0.42115191888227316, + "grad_norm": 0.2948683500289917, + "learning_rate": 4.4759629828300234e-05, + "loss": 1.1712, + "step": 2679 + }, + { + "epoch": 0.4213091237792057, + "grad_norm": 0.1672886162996292, + "learning_rate": 4.475584427432631e-05, + "loss": 1.098, + "step": 2680 + }, + { + "epoch": 0.42146632867613826, + "grad_norm": 0.1577957570552826, + "learning_rate": 4.4752057513740584e-05, + "loss": 1.0854, + "step": 2681 + }, + { + "epoch": 0.4216235335730708, + "grad_norm": 0.20781965553760529, + "learning_rate": 4.474826954677434e-05, + "loss": 1.1503, + "step": 2682 + }, + { + "epoch": 0.42178073847000336, + "grad_norm": 0.20414681732654572, + "learning_rate": 4.4744480373658925e-05, + "loss": 1.1162, + "step": 2683 + }, + { + "epoch": 0.42193794336693585, + "grad_norm": 0.1716751754283905, + "learning_rate": 4.4740689994625775e-05, + "loss": 1.0506, + "step": 2684 + }, + { + "epoch": 0.4220951482638684, + "grad_norm": 0.20430327951908112, + "learning_rate": 4.4736898409906385e-05, + "loss": 1.1359, + "step": 2685 + }, + { + "epoch": 0.42225235316080095, + "grad_norm": 0.21866196393966675, + "learning_rate": 4.4733105619732334e-05, + "loss": 1.138, + "step": 2686 + }, + { + "epoch": 0.4224095580577335, + "grad_norm": 0.19445818662643433, + "learning_rate": 4.4729311624335275e-05, + "loss": 1.1227, + "step": 2687 + }, + { + "epoch": 0.42256676295466605, + "grad_norm": 0.16323702037334442, + "learning_rate": 4.472551642394693e-05, + "loss": 1.1723, + "step": 2688 + }, + { + "epoch": 0.4227239678515986, + "grad_norm": 0.18188408017158508, + "learning_rate": 4.472172001879909e-05, + "loss": 1.1264, + "step": 2689 + }, + { + "epoch": 0.4228811727485311, + "grad_norm": 0.25359630584716797, + "learning_rate": 4.471792240912362e-05, + "loss": 1.164, + "step": 2690 + }, + { + "epoch": 0.42303837764546365, + "grad_norm": 0.14942757785320282, + "learning_rate": 4.4714123595152476e-05, + "loss": 1.2065, + "step": 2691 + }, + { + "epoch": 0.4231955825423962, + "grad_norm": 0.16352878510951996, + "learning_rate": 4.471032357711767e-05, + "loss": 1.1007, + "step": 2692 + }, + { + "epoch": 0.42335278743932875, + "grad_norm": 0.25480419397354126, + "learning_rate": 4.470652235525129e-05, + "loss": 1.1349, + "step": 2693 + }, + { + "epoch": 0.4235099923362613, + "grad_norm": 0.201410710811615, + "learning_rate": 4.4702719929785505e-05, + "loss": 1.1439, + "step": 2694 + }, + { + "epoch": 0.4236671972331938, + "grad_norm": 0.20557720959186554, + "learning_rate": 4.469891630095256e-05, + "loss": 1.192, + "step": 2695 + }, + { + "epoch": 0.42382440213012634, + "grad_norm": 0.1845296025276184, + "learning_rate": 4.469511146898475e-05, + "loss": 1.1645, + "step": 2696 + }, + { + "epoch": 0.4239816070270589, + "grad_norm": 0.1634630411863327, + "learning_rate": 4.4691305434114466e-05, + "loss": 1.0994, + "step": 2697 + }, + { + "epoch": 0.42413881192399144, + "grad_norm": 0.25110775232315063, + "learning_rate": 4.4687498196574165e-05, + "loss": 1.2912, + "step": 2698 + }, + { + "epoch": 0.424296016820924, + "grad_norm": 0.1885124295949936, + "learning_rate": 4.468368975659638e-05, + "loss": 1.1205, + "step": 2699 + }, + { + "epoch": 0.4244532217178565, + "grad_norm": 0.17171499133110046, + "learning_rate": 4.467988011441372e-05, + "loss": 1.1772, + "step": 2700 + }, + { + "epoch": 0.42461042661478904, + "grad_norm": 0.19311368465423584, + "learning_rate": 4.467606927025886e-05, + "loss": 1.1761, + "step": 2701 + }, + { + "epoch": 0.4247676315117216, + "grad_norm": 0.18169812858104706, + "learning_rate": 4.4672257224364545e-05, + "loss": 1.0528, + "step": 2702 + }, + { + "epoch": 0.42492483640865414, + "grad_norm": 0.2034253031015396, + "learning_rate": 4.466844397696361e-05, + "loss": 1.0786, + "step": 2703 + }, + { + "epoch": 0.4250820413055867, + "grad_norm": 0.1843811571598053, + "learning_rate": 4.466462952828895e-05, + "loss": 1.052, + "step": 2704 + }, + { + "epoch": 0.4252392462025192, + "grad_norm": 0.19026444852352142, + "learning_rate": 4.466081387857354e-05, + "loss": 1.1456, + "step": 2705 + }, + { + "epoch": 0.42539645109945173, + "grad_norm": 0.16506510972976685, + "learning_rate": 4.4656997028050426e-05, + "loss": 1.2883, + "step": 2706 + }, + { + "epoch": 0.4255536559963843, + "grad_norm": 0.25424107909202576, + "learning_rate": 4.465317897695271e-05, + "loss": 0.9969, + "step": 2707 + }, + { + "epoch": 0.42571086089331683, + "grad_norm": 0.18849819898605347, + "learning_rate": 4.464935972551361e-05, + "loss": 1.1043, + "step": 2708 + }, + { + "epoch": 0.4258680657902494, + "grad_norm": 0.22406542301177979, + "learning_rate": 4.4645539273966374e-05, + "loss": 1.1311, + "step": 2709 + }, + { + "epoch": 0.4260252706871819, + "grad_norm": 0.1915282905101776, + "learning_rate": 4.464171762254436e-05, + "loss": 1.1565, + "step": 2710 + }, + { + "epoch": 0.42618247558411443, + "grad_norm": 0.21550290286540985, + "learning_rate": 4.463789477148094e-05, + "loss": 1.0745, + "step": 2711 + }, + { + "epoch": 0.426339680481047, + "grad_norm": 0.18291574716567993, + "learning_rate": 4.463407072100964e-05, + "loss": 1.1443, + "step": 2712 + }, + { + "epoch": 0.42649688537797953, + "grad_norm": 0.174668088555336, + "learning_rate": 4.4630245471364004e-05, + "loss": 1.0771, + "step": 2713 + }, + { + "epoch": 0.4266540902749121, + "grad_norm": 0.21048250794410706, + "learning_rate": 4.462641902277765e-05, + "loss": 1.1574, + "step": 2714 + }, + { + "epoch": 0.42681129517184463, + "grad_norm": 0.22583508491516113, + "learning_rate": 4.4622591375484316e-05, + "loss": 1.1584, + "step": 2715 + }, + { + "epoch": 0.4269685000687771, + "grad_norm": 0.17098526656627655, + "learning_rate": 4.461876252971774e-05, + "loss": 1.0009, + "step": 2716 + }, + { + "epoch": 0.4271257049657097, + "grad_norm": 0.16546213626861572, + "learning_rate": 4.4614932485711805e-05, + "loss": 1.1723, + "step": 2717 + }, + { + "epoch": 0.4272829098626422, + "grad_norm": 0.1419089436531067, + "learning_rate": 4.461110124370042e-05, + "loss": 1.2826, + "step": 2718 + }, + { + "epoch": 0.4274401147595748, + "grad_norm": 0.2434864342212677, + "learning_rate": 4.460726880391759e-05, + "loss": 1.0681, + "step": 2719 + }, + { + "epoch": 0.4275973196565073, + "grad_norm": 0.23129241168498993, + "learning_rate": 4.460343516659738e-05, + "loss": 1.2189, + "step": 2720 + }, + { + "epoch": 0.4275973196565073, + "eval_loss": 1.121034026145935, + "eval_runtime": 2296.2998, + "eval_samples_per_second": 4.032, + "eval_steps_per_second": 2.016, + "step": 2720 + }, + { + "epoch": 0.4277545245534398, + "grad_norm": 0.16803450882434845, + "learning_rate": 4.4599600331973936e-05, + "loss": 1.3243, + "step": 2721 + }, + { + "epoch": 0.42791172945037237, + "grad_norm": 0.20555785298347473, + "learning_rate": 4.459576430028147e-05, + "loss": 1.2458, + "step": 2722 + }, + { + "epoch": 0.4280689343473049, + "grad_norm": 0.17189499735832214, + "learning_rate": 4.459192707175428e-05, + "loss": 1.0436, + "step": 2723 + }, + { + "epoch": 0.42822613924423747, + "grad_norm": 0.21618017554283142, + "learning_rate": 4.4588088646626736e-05, + "loss": 1.1963, + "step": 2724 + }, + { + "epoch": 0.42838334414117, + "grad_norm": 0.1805650144815445, + "learning_rate": 4.4584249025133256e-05, + "loss": 1.1143, + "step": 2725 + }, + { + "epoch": 0.4285405490381025, + "grad_norm": 0.1871338039636612, + "learning_rate": 4.458040820750836e-05, + "loss": 1.0584, + "step": 2726 + }, + { + "epoch": 0.42869775393503506, + "grad_norm": 0.197019562125206, + "learning_rate": 4.4576566193986635e-05, + "loss": 1.0886, + "step": 2727 + }, + { + "epoch": 0.4288549588319676, + "grad_norm": 0.1631096750497818, + "learning_rate": 4.457272298480273e-05, + "loss": 1.1358, + "step": 2728 + }, + { + "epoch": 0.42901216372890016, + "grad_norm": 0.16281025111675262, + "learning_rate": 4.4568878580191364e-05, + "loss": 1.1463, + "step": 2729 + }, + { + "epoch": 0.4291693686258327, + "grad_norm": 0.1618165671825409, + "learning_rate": 4.456503298038735e-05, + "loss": 1.1639, + "step": 2730 + }, + { + "epoch": 0.4293265735227652, + "grad_norm": 0.20703785121440887, + "learning_rate": 4.4561186185625574e-05, + "loss": 1.1258, + "step": 2731 + }, + { + "epoch": 0.42948377841969776, + "grad_norm": 0.2330520749092102, + "learning_rate": 4.455733819614096e-05, + "loss": 1.0725, + "step": 2732 + }, + { + "epoch": 0.4296409833166303, + "grad_norm": 0.18974269926548004, + "learning_rate": 4.4553489012168546e-05, + "loss": 1.1385, + "step": 2733 + }, + { + "epoch": 0.42979818821356286, + "grad_norm": 0.17599286139011383, + "learning_rate": 4.454963863394343e-05, + "loss": 1.1215, + "step": 2734 + }, + { + "epoch": 0.4299553931104954, + "grad_norm": 0.15310634672641754, + "learning_rate": 4.454578706170075e-05, + "loss": 1.137, + "step": 2735 + }, + { + "epoch": 0.4301125980074279, + "grad_norm": 0.15119053423404694, + "learning_rate": 4.454193429567577e-05, + "loss": 1.152, + "step": 2736 + }, + { + "epoch": 0.43026980290436045, + "grad_norm": 0.18684156239032745, + "learning_rate": 4.45380803361038e-05, + "loss": 1.1315, + "step": 2737 + }, + { + "epoch": 0.430427007801293, + "grad_norm": 0.1770356446504593, + "learning_rate": 4.45342251832202e-05, + "loss": 1.0439, + "step": 2738 + }, + { + "epoch": 0.43058421269822555, + "grad_norm": 0.151130810379982, + "learning_rate": 4.453036883726047e-05, + "loss": 1.1593, + "step": 2739 + }, + { + "epoch": 0.4307414175951581, + "grad_norm": 0.20127306878566742, + "learning_rate": 4.4526511298460114e-05, + "loss": 1.1414, + "step": 2740 + }, + { + "epoch": 0.43089862249209065, + "grad_norm": 0.1737474501132965, + "learning_rate": 4.452265256705474e-05, + "loss": 1.2059, + "step": 2741 + }, + { + "epoch": 0.43105582738902315, + "grad_norm": 0.15642091631889343, + "learning_rate": 4.451879264328003e-05, + "loss": 1.2293, + "step": 2742 + }, + { + "epoch": 0.4312130322859557, + "grad_norm": 0.1680547147989273, + "learning_rate": 4.451493152737172e-05, + "loss": 1.0843, + "step": 2743 + }, + { + "epoch": 0.43137023718288825, + "grad_norm": 0.3224833607673645, + "learning_rate": 4.451106921956565e-05, + "loss": 1.1196, + "step": 2744 + }, + { + "epoch": 0.4315274420798208, + "grad_norm": 0.20196394622325897, + "learning_rate": 4.450720572009771e-05, + "loss": 1.0971, + "step": 2745 + }, + { + "epoch": 0.43168464697675335, + "grad_norm": 0.1925266832113266, + "learning_rate": 4.4503341029203856e-05, + "loss": 1.2121, + "step": 2746 + }, + { + "epoch": 0.43184185187368584, + "grad_norm": 0.1832456737756729, + "learning_rate": 4.449947514712014e-05, + "loss": 1.0814, + "step": 2747 + }, + { + "epoch": 0.4319990567706184, + "grad_norm": 0.17716901004314423, + "learning_rate": 4.449560807408267e-05, + "loss": 1.153, + "step": 2748 + }, + { + "epoch": 0.43215626166755094, + "grad_norm": 0.20120856165885925, + "learning_rate": 4.4491739810327635e-05, + "loss": 1.1343, + "step": 2749 + }, + { + "epoch": 0.4323134665644835, + "grad_norm": 0.15247990190982819, + "learning_rate": 4.44878703560913e-05, + "loss": 1.1549, + "step": 2750 + }, + { + "epoch": 0.43247067146141605, + "grad_norm": 0.230882465839386, + "learning_rate": 4.448399971160999e-05, + "loss": 1.0305, + "step": 2751 + }, + { + "epoch": 0.43262787635834854, + "grad_norm": 0.23438026010990143, + "learning_rate": 4.44801278771201e-05, + "loss": 1.1409, + "step": 2752 + }, + { + "epoch": 0.4327850812552811, + "grad_norm": 0.18865425884723663, + "learning_rate": 4.447625485285813e-05, + "loss": 1.1122, + "step": 2753 + }, + { + "epoch": 0.43294228615221364, + "grad_norm": 0.18455225229263306, + "learning_rate": 4.4472380639060605e-05, + "loss": 1.1954, + "step": 2754 + }, + { + "epoch": 0.4330994910491462, + "grad_norm": 0.17371979355812073, + "learning_rate": 4.4468505235964165e-05, + "loss": 1.0564, + "step": 2755 + }, + { + "epoch": 0.43325669594607874, + "grad_norm": 0.17381125688552856, + "learning_rate": 4.4464628643805495e-05, + "loss": 1.1591, + "step": 2756 + }, + { + "epoch": 0.43341390084301123, + "grad_norm": 0.20750965178012848, + "learning_rate": 4.4460750862821366e-05, + "loss": 1.0045, + "step": 2757 + }, + { + "epoch": 0.4335711057399438, + "grad_norm": 0.2176593393087387, + "learning_rate": 4.445687189324862e-05, + "loss": 1.1218, + "step": 2758 + }, + { + "epoch": 0.43372831063687634, + "grad_norm": 0.20053786039352417, + "learning_rate": 4.445299173532416e-05, + "loss": 1.2038, + "step": 2759 + }, + { + "epoch": 0.4338855155338089, + "grad_norm": 0.159897580742836, + "learning_rate": 4.444911038928499e-05, + "loss": 1.1344, + "step": 2760 + }, + { + "epoch": 0.43404272043074144, + "grad_norm": 0.19078446924686432, + "learning_rate": 4.4445227855368144e-05, + "loss": 1.1395, + "step": 2761 + }, + { + "epoch": 0.43419992532767393, + "grad_norm": 0.15986692905426025, + "learning_rate": 4.4441344133810766e-05, + "loss": 1.2075, + "step": 2762 + }, + { + "epoch": 0.4343571302246065, + "grad_norm": 0.1700146645307541, + "learning_rate": 4.443745922485006e-05, + "loss": 1.1439, + "step": 2763 + }, + { + "epoch": 0.43451433512153903, + "grad_norm": 0.16691164672374725, + "learning_rate": 4.4433573128723306e-05, + "loss": 0.9497, + "step": 2764 + }, + { + "epoch": 0.4346715400184716, + "grad_norm": 0.31677502393722534, + "learning_rate": 4.442968584566784e-05, + "loss": 1.0701, + "step": 2765 + }, + { + "epoch": 0.43482874491540413, + "grad_norm": 0.17169591784477234, + "learning_rate": 4.442579737592109e-05, + "loss": 1.1079, + "step": 2766 + }, + { + "epoch": 0.4349859498123367, + "grad_norm": 0.33990463614463806, + "learning_rate": 4.442190771972054e-05, + "loss": 1.0178, + "step": 2767 + }, + { + "epoch": 0.4351431547092692, + "grad_norm": 0.1680772304534912, + "learning_rate": 4.441801687730377e-05, + "loss": 1.2274, + "step": 2768 + }, + { + "epoch": 0.4353003596062017, + "grad_norm": 0.21385186910629272, + "learning_rate": 4.441412484890841e-05, + "loss": 1.1274, + "step": 2769 + }, + { + "epoch": 0.4354575645031343, + "grad_norm": 0.19230084121227264, + "learning_rate": 4.4410231634772164e-05, + "loss": 0.9932, + "step": 2770 + }, + { + "epoch": 0.4356147694000668, + "grad_norm": 0.21352258324623108, + "learning_rate": 4.440633723513282e-05, + "loss": 1.0798, + "step": 2771 + }, + { + "epoch": 0.4357719742969994, + "grad_norm": 0.17857983708381653, + "learning_rate": 4.440244165022824e-05, + "loss": 1.1705, + "step": 2772 + }, + { + "epoch": 0.43592917919393187, + "grad_norm": 0.1947200447320938, + "learning_rate": 4.439854488029634e-05, + "loss": 1.0449, + "step": 2773 + }, + { + "epoch": 0.4360863840908644, + "grad_norm": 0.22335095703601837, + "learning_rate": 4.439464692557514e-05, + "loss": 1.0209, + "step": 2774 + }, + { + "epoch": 0.43624358898779697, + "grad_norm": 0.16146698594093323, + "learning_rate": 4.439074778630268e-05, + "loss": 1.0747, + "step": 2775 + }, + { + "epoch": 0.4364007938847295, + "grad_norm": 0.20121018588542938, + "learning_rate": 4.4386847462717126e-05, + "loss": 1.1736, + "step": 2776 + }, + { + "epoch": 0.43655799878166207, + "grad_norm": 0.20480762422084808, + "learning_rate": 4.43829459550567e-05, + "loss": 1.0921, + "step": 2777 + }, + { + "epoch": 0.43671520367859457, + "grad_norm": 0.1490727663040161, + "learning_rate": 4.437904326355967e-05, + "loss": 1.1989, + "step": 2778 + }, + { + "epoch": 0.4368724085755271, + "grad_norm": 0.2289019227027893, + "learning_rate": 4.4375139388464415e-05, + "loss": 1.0705, + "step": 2779 + }, + { + "epoch": 0.43702961347245967, + "grad_norm": 0.21382930874824524, + "learning_rate": 4.437123433000937e-05, + "loss": 1.1479, + "step": 2780 + }, + { + "epoch": 0.4371868183693922, + "grad_norm": 0.1747499406337738, + "learning_rate": 4.4367328088433026e-05, + "loss": 1.0534, + "step": 2781 + }, + { + "epoch": 0.43734402326632477, + "grad_norm": 0.1528068631887436, + "learning_rate": 4.436342066397397e-05, + "loss": 1.0571, + "step": 2782 + }, + { + "epoch": 0.43750122816325726, + "grad_norm": 0.2127470076084137, + "learning_rate": 4.435951205687086e-05, + "loss": 1.0721, + "step": 2783 + }, + { + "epoch": 0.4376584330601898, + "grad_norm": 0.20668397843837738, + "learning_rate": 4.4355602267362404e-05, + "loss": 1.0828, + "step": 2784 + }, + { + "epoch": 0.43781563795712236, + "grad_norm": 0.17368297278881073, + "learning_rate": 4.435169129568742e-05, + "loss": 1.0282, + "step": 2785 + }, + { + "epoch": 0.4379728428540549, + "grad_norm": 0.20142029225826263, + "learning_rate": 4.434777914208475e-05, + "loss": 1.042, + "step": 2786 + }, + { + "epoch": 0.43813004775098746, + "grad_norm": 0.15896591544151306, + "learning_rate": 4.434386580679334e-05, + "loss": 1.1075, + "step": 2787 + }, + { + "epoch": 0.43828725264791996, + "grad_norm": 0.24827557802200317, + "learning_rate": 4.433995129005221e-05, + "loss": 1.1095, + "step": 2788 + }, + { + "epoch": 0.4384444575448525, + "grad_norm": 0.20462195575237274, + "learning_rate": 4.433603559210043e-05, + "loss": 1.0703, + "step": 2789 + }, + { + "epoch": 0.43860166244178506, + "grad_norm": 0.20382918417453766, + "learning_rate": 4.4332118713177175e-05, + "loss": 1.1476, + "step": 2790 + }, + { + "epoch": 0.4387588673387176, + "grad_norm": 0.8631371855735779, + "learning_rate": 4.432820065352166e-05, + "loss": 1.158, + "step": 2791 + }, + { + "epoch": 0.43891607223565016, + "grad_norm": 0.18501952290534973, + "learning_rate": 4.432428141337318e-05, + "loss": 1.1529, + "step": 2792 + }, + { + "epoch": 0.4390732771325827, + "grad_norm": 0.21414092183113098, + "learning_rate": 4.432036099297113e-05, + "loss": 1.0746, + "step": 2793 + }, + { + "epoch": 0.4392304820295152, + "grad_norm": 0.15636850893497467, + "learning_rate": 4.4316439392554934e-05, + "loss": 1.1779, + "step": 2794 + }, + { + "epoch": 0.43938768692644775, + "grad_norm": 0.17315103113651276, + "learning_rate": 4.4312516612364106e-05, + "loss": 1.0921, + "step": 2795 + }, + { + "epoch": 0.4395448918233803, + "grad_norm": 0.15908555686473846, + "learning_rate": 4.4308592652638245e-05, + "loss": 1.1729, + "step": 2796 + }, + { + "epoch": 0.43970209672031285, + "grad_norm": 0.14251650869846344, + "learning_rate": 4.4304667513617014e-05, + "loss": 1.1684, + "step": 2797 + }, + { + "epoch": 0.4398593016172454, + "grad_norm": 0.29002174735069275, + "learning_rate": 4.4300741195540144e-05, + "loss": 1.1376, + "step": 2798 + }, + { + "epoch": 0.4400165065141779, + "grad_norm": 0.1559830904006958, + "learning_rate": 4.429681369864743e-05, + "loss": 1.1366, + "step": 2799 + }, + { + "epoch": 0.44017371141111045, + "grad_norm": 0.2127392441034317, + "learning_rate": 4.429288502317876e-05, + "loss": 1.144, + "step": 2800 + }, + { + "epoch": 0.440330916308043, + "grad_norm": 0.1910010576248169, + "learning_rate": 4.428895516937408e-05, + "loss": 1.0268, + "step": 2801 + }, + { + "epoch": 0.44048812120497555, + "grad_norm": 0.14085833728313446, + "learning_rate": 4.42850241374734e-05, + "loss": 1.2426, + "step": 2802 + }, + { + "epoch": 0.4406453261019081, + "grad_norm": 0.16497376561164856, + "learning_rate": 4.428109192771682e-05, + "loss": 0.9953, + "step": 2803 + }, + { + "epoch": 0.4408025309988406, + "grad_norm": 0.24869468808174133, + "learning_rate": 4.427715854034451e-05, + "loss": 1.1183, + "step": 2804 + }, + { + "epoch": 0.44095973589577314, + "grad_norm": 0.2633094787597656, + "learning_rate": 4.4273223975596704e-05, + "loss": 1.1522, + "step": 2805 + }, + { + "epoch": 0.4411169407927057, + "grad_norm": 0.17634916305541992, + "learning_rate": 4.4269288233713704e-05, + "loss": 1.2075, + "step": 2806 + }, + { + "epoch": 0.44127414568963824, + "grad_norm": 0.2116928994655609, + "learning_rate": 4.426535131493589e-05, + "loss": 1.0832, + "step": 2807 + }, + { + "epoch": 0.4414313505865708, + "grad_norm": 0.17454542219638824, + "learning_rate": 4.4261413219503714e-05, + "loss": 1.0844, + "step": 2808 + }, + { + "epoch": 0.4415885554835033, + "grad_norm": 0.21070100367069244, + "learning_rate": 4.425747394765771e-05, + "loss": 1.105, + "step": 2809 + }, + { + "epoch": 0.44174576038043584, + "grad_norm": 0.17848363518714905, + "learning_rate": 4.425353349963847e-05, + "loss": 1.1904, + "step": 2810 + }, + { + "epoch": 0.4419029652773684, + "grad_norm": 0.15250985324382782, + "learning_rate": 4.4249591875686655e-05, + "loss": 1.1707, + "step": 2811 + }, + { + "epoch": 0.44206017017430094, + "grad_norm": 0.1811569631099701, + "learning_rate": 4.4245649076043e-05, + "loss": 1.1703, + "step": 2812 + }, + { + "epoch": 0.4422173750712335, + "grad_norm": 0.2516029477119446, + "learning_rate": 4.424170510094834e-05, + "loss": 1.1403, + "step": 2813 + }, + { + "epoch": 0.442374579968166, + "grad_norm": 0.19467385113239288, + "learning_rate": 4.423775995064353e-05, + "loss": 1.079, + "step": 2814 + }, + { + "epoch": 0.44253178486509853, + "grad_norm": 0.17406821250915527, + "learning_rate": 4.4233813625369547e-05, + "loss": 1.0794, + "step": 2815 + }, + { + "epoch": 0.4426889897620311, + "grad_norm": 0.18775492906570435, + "learning_rate": 4.4229866125367404e-05, + "loss": 1.134, + "step": 2816 + }, + { + "epoch": 0.44284619465896363, + "grad_norm": 0.18786782026290894, + "learning_rate": 4.42259174508782e-05, + "loss": 1.0186, + "step": 2817 + }, + { + "epoch": 0.4430033995558962, + "grad_norm": 0.23192930221557617, + "learning_rate": 4.422196760214311e-05, + "loss": 1.1521, + "step": 2818 + }, + { + "epoch": 0.44316060445282873, + "grad_norm": 0.21265646815299988, + "learning_rate": 4.421801657940337e-05, + "loss": 1.0756, + "step": 2819 + }, + { + "epoch": 0.4433178093497612, + "grad_norm": 0.1617029756307602, + "learning_rate": 4.42140643829003e-05, + "loss": 0.9996, + "step": 2820 + }, + { + "epoch": 0.4434750142466938, + "grad_norm": 0.15906690061092377, + "learning_rate": 4.421011101287529e-05, + "loss": 1.0924, + "step": 2821 + }, + { + "epoch": 0.4436322191436263, + "grad_norm": 0.17704284191131592, + "learning_rate": 4.4206156469569774e-05, + "loss": 1.1341, + "step": 2822 + }, + { + "epoch": 0.4437894240405589, + "grad_norm": 0.1560877412557602, + "learning_rate": 4.420220075322531e-05, + "loss": 1.1215, + "step": 2823 + }, + { + "epoch": 0.4439466289374914, + "grad_norm": 0.16209112107753754, + "learning_rate": 4.4198243864083474e-05, + "loss": 1.2121, + "step": 2824 + }, + { + "epoch": 0.4441038338344239, + "grad_norm": 0.21708954870700836, + "learning_rate": 4.4194285802385946e-05, + "loss": 1.1038, + "step": 2825 + }, + { + "epoch": 0.4442610387313565, + "grad_norm": 0.18496249616146088, + "learning_rate": 4.419032656837448e-05, + "loss": 1.1421, + "step": 2826 + }, + { + "epoch": 0.444418243628289, + "grad_norm": 0.1930168867111206, + "learning_rate": 4.418636616229087e-05, + "loss": 1.0371, + "step": 2827 + }, + { + "epoch": 0.4445754485252216, + "grad_norm": 0.287266343832016, + "learning_rate": 4.4182404584377026e-05, + "loss": 1.0948, + "step": 2828 + }, + { + "epoch": 0.4447326534221541, + "grad_norm": 0.16841591894626617, + "learning_rate": 4.417844183487488e-05, + "loss": 1.0902, + "step": 2829 + }, + { + "epoch": 0.4448898583190866, + "grad_norm": 0.1748381108045578, + "learning_rate": 4.417447791402649e-05, + "loss": 1.124, + "step": 2830 + }, + { + "epoch": 0.44504706321601917, + "grad_norm": 0.23243847489356995, + "learning_rate": 4.417051282207394e-05, + "loss": 1.0163, + "step": 2831 + }, + { + "epoch": 0.4452042681129517, + "grad_norm": 0.15848687291145325, + "learning_rate": 4.41665465592594e-05, + "loss": 1.0835, + "step": 2832 + }, + { + "epoch": 0.44536147300988427, + "grad_norm": 0.18608181178569794, + "learning_rate": 4.4162579125825124e-05, + "loss": 1.1433, + "step": 2833 + }, + { + "epoch": 0.4455186779068168, + "grad_norm": 0.2262060046195984, + "learning_rate": 4.4158610522013424e-05, + "loss": 1.0953, + "step": 2834 + }, + { + "epoch": 0.4456758828037493, + "grad_norm": 0.2521316409111023, + "learning_rate": 4.415464074806669e-05, + "loss": 0.9369, + "step": 2835 + }, + { + "epoch": 0.44583308770068186, + "grad_norm": 0.16947337985038757, + "learning_rate": 4.415066980422737e-05, + "loss": 1.119, + "step": 2836 + }, + { + "epoch": 0.4459902925976144, + "grad_norm": 0.17369608581066132, + "learning_rate": 4.4146697690738015e-05, + "loss": 1.1021, + "step": 2837 + }, + { + "epoch": 0.44614749749454696, + "grad_norm": 0.13055743277072906, + "learning_rate": 4.41427244078412e-05, + "loss": 1.1948, + "step": 2838 + }, + { + "epoch": 0.4463047023914795, + "grad_norm": 0.1885204315185547, + "learning_rate": 4.4138749955779617e-05, + "loss": 1.1319, + "step": 2839 + }, + { + "epoch": 0.446461907288412, + "grad_norm": 0.24677696824073792, + "learning_rate": 4.4134774334796005e-05, + "loss": 1.1474, + "step": 2840 + }, + { + "epoch": 0.44661911218534456, + "grad_norm": 0.1810072809457779, + "learning_rate": 4.413079754513318e-05, + "loss": 1.1885, + "step": 2841 + }, + { + "epoch": 0.4467763170822771, + "grad_norm": 0.14184872806072235, + "learning_rate": 4.412681958703403e-05, + "loss": 1.2749, + "step": 2842 + }, + { + "epoch": 0.44693352197920966, + "grad_norm": 0.1864720582962036, + "learning_rate": 4.412284046074151e-05, + "loss": 1.0701, + "step": 2843 + }, + { + "epoch": 0.4470907268761422, + "grad_norm": 0.16632923483848572, + "learning_rate": 4.411886016649865e-05, + "loss": 1.0741, + "step": 2844 + }, + { + "epoch": 0.44724793177307476, + "grad_norm": 0.1669205278158188, + "learning_rate": 4.4114878704548555e-05, + "loss": 1.1825, + "step": 2845 + }, + { + "epoch": 0.44740513667000725, + "grad_norm": 0.16396629810333252, + "learning_rate": 4.41108960751344e-05, + "loss": 1.1323, + "step": 2846 + }, + { + "epoch": 0.4475623415669398, + "grad_norm": 0.2604614794254303, + "learning_rate": 4.410691227849942e-05, + "loss": 1.1358, + "step": 2847 + }, + { + "epoch": 0.44771954646387235, + "grad_norm": 0.20713360607624054, + "learning_rate": 4.410292731488694e-05, + "loss": 1.054, + "step": 2848 + }, + { + "epoch": 0.4478767513608049, + "grad_norm": 0.20656947791576385, + "learning_rate": 4.4098941184540335e-05, + "loss": 1.0937, + "step": 2849 + }, + { + "epoch": 0.44803395625773745, + "grad_norm": 0.19168046116828918, + "learning_rate": 4.4094953887703074e-05, + "loss": 1.0854, + "step": 2850 + }, + { + "epoch": 0.44819116115466995, + "grad_norm": 0.2075878083705902, + "learning_rate": 4.409096542461868e-05, + "loss": 1.0695, + "step": 2851 + }, + { + "epoch": 0.4483483660516025, + "grad_norm": 0.226437047123909, + "learning_rate": 4.408697579553076e-05, + "loss": 1.1658, + "step": 2852 + }, + { + "epoch": 0.44850557094853505, + "grad_norm": 0.14772802591323853, + "learning_rate": 4.408298500068297e-05, + "loss": 1.2319, + "step": 2853 + }, + { + "epoch": 0.4486627758454676, + "grad_norm": 0.17494453489780426, + "learning_rate": 4.407899304031906e-05, + "loss": 1.0833, + "step": 2854 + }, + { + "epoch": 0.44881998074240015, + "grad_norm": 0.21048682928085327, + "learning_rate": 4.407499991468286e-05, + "loss": 1.0763, + "step": 2855 + }, + { + "epoch": 0.44897718563933264, + "grad_norm": 0.17920631170272827, + "learning_rate": 4.407100562401823e-05, + "loss": 1.0633, + "step": 2856 + }, + { + "epoch": 0.4491343905362652, + "grad_norm": 0.17860840260982513, + "learning_rate": 4.406701016856914e-05, + "loss": 1.1584, + "step": 2857 + }, + { + "epoch": 0.44929159543319774, + "grad_norm": 0.15417364239692688, + "learning_rate": 4.406301354857962e-05, + "loss": 1.1879, + "step": 2858 + }, + { + "epoch": 0.4494488003301303, + "grad_norm": 0.15544764697551727, + "learning_rate": 4.405901576429375e-05, + "loss": 1.1779, + "step": 2859 + }, + { + "epoch": 0.44960600522706284, + "grad_norm": 0.21157599985599518, + "learning_rate": 4.4055016815955716e-05, + "loss": 1.1368, + "step": 2860 + }, + { + "epoch": 0.44976321012399534, + "grad_norm": 0.1670396625995636, + "learning_rate": 4.405101670380976e-05, + "loss": 1.1953, + "step": 2861 + }, + { + "epoch": 0.4499204150209279, + "grad_norm": 0.17523956298828125, + "learning_rate": 4.4047015428100184e-05, + "loss": 1.1532, + "step": 2862 + }, + { + "epoch": 0.45007761991786044, + "grad_norm": 0.1506781280040741, + "learning_rate": 4.404301298907138e-05, + "loss": 1.1732, + "step": 2863 + }, + { + "epoch": 0.450234824814793, + "grad_norm": 0.2258424013853073, + "learning_rate": 4.403900938696779e-05, + "loss": 1.0762, + "step": 2864 + }, + { + "epoch": 0.45039202971172554, + "grad_norm": 0.15648561716079712, + "learning_rate": 4.403500462203395e-05, + "loss": 1.1401, + "step": 2865 + }, + { + "epoch": 0.45054923460865803, + "grad_norm": 0.16834858059883118, + "learning_rate": 4.403099869451445e-05, + "loss": 1.1393, + "step": 2866 + }, + { + "epoch": 0.4507064395055906, + "grad_norm": 0.24699564278125763, + "learning_rate": 4.4026991604653954e-05, + "loss": 1.1564, + "step": 2867 + }, + { + "epoch": 0.45086364440252313, + "grad_norm": 0.21116980910301208, + "learning_rate": 4.402298335269721e-05, + "loss": 1.1344, + "step": 2868 + }, + { + "epoch": 0.4510208492994557, + "grad_norm": 0.1752268224954605, + "learning_rate": 4.401897393888902e-05, + "loss": 1.0779, + "step": 2869 + }, + { + "epoch": 0.45117805419638823, + "grad_norm": 0.22002138197422028, + "learning_rate": 4.401496336347426e-05, + "loss": 0.9862, + "step": 2870 + }, + { + "epoch": 0.4513352590933208, + "grad_norm": 1.107559323310852, + "learning_rate": 4.401095162669788e-05, + "loss": 1.0143, + "step": 2871 + }, + { + "epoch": 0.4514924639902533, + "grad_norm": 0.23041501641273499, + "learning_rate": 4.400693872880491e-05, + "loss": 1.1625, + "step": 2872 + }, + { + "epoch": 0.45164966888718583, + "grad_norm": 0.18464645743370056, + "learning_rate": 4.400292467004044e-05, + "loss": 1.15, + "step": 2873 + }, + { + "epoch": 0.4518068737841184, + "grad_norm": 0.19527535140514374, + "learning_rate": 4.3998909450649644e-05, + "loss": 1.1722, + "step": 2874 + }, + { + "epoch": 0.45196407868105093, + "grad_norm": 0.2334446758031845, + "learning_rate": 4.3994893070877734e-05, + "loss": 1.1482, + "step": 2875 + }, + { + "epoch": 0.4521212835779835, + "grad_norm": 0.18088001012802124, + "learning_rate": 4.3990875530970034e-05, + "loss": 1.1087, + "step": 2876 + }, + { + "epoch": 0.452278488474916, + "grad_norm": 0.21173341572284698, + "learning_rate": 4.39868568311719e-05, + "loss": 1.1277, + "step": 2877 + }, + { + "epoch": 0.4524356933718485, + "grad_norm": 0.33623263239860535, + "learning_rate": 4.39828369717288e-05, + "loss": 1.1106, + "step": 2878 + }, + { + "epoch": 0.4525928982687811, + "grad_norm": 0.1798340082168579, + "learning_rate": 4.397881595288624e-05, + "loss": 1.1459, + "step": 2879 + }, + { + "epoch": 0.4527501031657136, + "grad_norm": 0.17362205684185028, + "learning_rate": 4.397479377488981e-05, + "loss": 1.2209, + "step": 2880 + }, + { + "epoch": 0.4527501031657136, + "eval_loss": 1.1185150146484375, + "eval_runtime": 2300.8084, + "eval_samples_per_second": 4.024, + "eval_steps_per_second": 2.012, + "step": 2880 + }, + { + "epoch": 0.4529073080626462, + "grad_norm": 0.22614096105098724, + "learning_rate": 4.397077043798517e-05, + "loss": 1.1362, + "step": 2881 + }, + { + "epoch": 0.45306451295957867, + "grad_norm": 0.23455561697483063, + "learning_rate": 4.3966745942418056e-05, + "loss": 1.0824, + "step": 2882 + }, + { + "epoch": 0.4532217178565112, + "grad_norm": 0.2692466974258423, + "learning_rate": 4.3962720288434254e-05, + "loss": 1.125, + "step": 2883 + }, + { + "epoch": 0.45337892275344377, + "grad_norm": 0.23672759532928467, + "learning_rate": 4.395869347627966e-05, + "loss": 1.1478, + "step": 2884 + }, + { + "epoch": 0.4535361276503763, + "grad_norm": 0.25377583503723145, + "learning_rate": 4.395466550620019e-05, + "loss": 1.177, + "step": 2885 + }, + { + "epoch": 0.45369333254730887, + "grad_norm": 0.14862841367721558, + "learning_rate": 4.395063637844187e-05, + "loss": 1.156, + "step": 2886 + }, + { + "epoch": 0.45385053744424136, + "grad_norm": 0.2521793842315674, + "learning_rate": 4.3946606093250786e-05, + "loss": 1.0896, + "step": 2887 + }, + { + "epoch": 0.4540077423411739, + "grad_norm": 0.2162342667579651, + "learning_rate": 4.3942574650873084e-05, + "loss": 1.0524, + "step": 2888 + }, + { + "epoch": 0.45416494723810646, + "grad_norm": 0.1862705945968628, + "learning_rate": 4.3938542051555e-05, + "loss": 1.1185, + "step": 2889 + }, + { + "epoch": 0.454322152135039, + "grad_norm": 0.2547942101955414, + "learning_rate": 4.393450829554282e-05, + "loss": 1.0975, + "step": 2890 + }, + { + "epoch": 0.45447935703197156, + "grad_norm": 0.20143720507621765, + "learning_rate": 4.393047338308292e-05, + "loss": 1.1355, + "step": 2891 + }, + { + "epoch": 0.45463656192890406, + "grad_norm": 0.18321287631988525, + "learning_rate": 4.392643731442172e-05, + "loss": 1.146, + "step": 2892 + }, + { + "epoch": 0.4547937668258366, + "grad_norm": 0.19959893822669983, + "learning_rate": 4.392240008980575e-05, + "loss": 1.1397, + "step": 2893 + }, + { + "epoch": 0.45495097172276916, + "grad_norm": 0.17295321822166443, + "learning_rate": 4.391836170948157e-05, + "loss": 1.1841, + "step": 2894 + }, + { + "epoch": 0.4551081766197017, + "grad_norm": 0.6232892274856567, + "learning_rate": 4.391432217369584e-05, + "loss": 1.0642, + "step": 2895 + }, + { + "epoch": 0.45526538151663426, + "grad_norm": 0.18548434972763062, + "learning_rate": 4.391028148269528e-05, + "loss": 1.2487, + "step": 2896 + }, + { + "epoch": 0.4554225864135668, + "grad_norm": 0.7635773420333862, + "learning_rate": 4.390623963672667e-05, + "loss": 1.0099, + "step": 2897 + }, + { + "epoch": 0.4555797913104993, + "grad_norm": 0.28948745131492615, + "learning_rate": 4.3902196636036874e-05, + "loss": 1.1306, + "step": 2898 + }, + { + "epoch": 0.45573699620743185, + "grad_norm": 0.1994236707687378, + "learning_rate": 4.389815248087284e-05, + "loss": 1.0493, + "step": 2899 + }, + { + "epoch": 0.4558942011043644, + "grad_norm": 0.18875907361507416, + "learning_rate": 4.389410717148154e-05, + "loss": 1.2152, + "step": 2900 + }, + { + "epoch": 0.45605140600129696, + "grad_norm": 0.17050974071025848, + "learning_rate": 4.389006070811007e-05, + "loss": 1.2183, + "step": 2901 + }, + { + "epoch": 0.4562086108982295, + "grad_norm": 0.1716713309288025, + "learning_rate": 4.3886013091005554e-05, + "loss": 1.1924, + "step": 2902 + }, + { + "epoch": 0.456365815795162, + "grad_norm": 0.2430170327425003, + "learning_rate": 4.388196432041522e-05, + "loss": 1.2085, + "step": 2903 + }, + { + "epoch": 0.45652302069209455, + "grad_norm": 0.21377846598625183, + "learning_rate": 4.387791439658635e-05, + "loss": 1.194, + "step": 2904 + }, + { + "epoch": 0.4566802255890271, + "grad_norm": 0.2664497494697571, + "learning_rate": 4.3873863319766294e-05, + "loss": 1.096, + "step": 2905 + }, + { + "epoch": 0.45683743048595965, + "grad_norm": 0.3957104980945587, + "learning_rate": 4.386981109020248e-05, + "loss": 1.2151, + "step": 2906 + }, + { + "epoch": 0.4569946353828922, + "grad_norm": 0.2563398778438568, + "learning_rate": 4.38657577081424e-05, + "loss": 1.0582, + "step": 2907 + }, + { + "epoch": 0.4571518402798247, + "grad_norm": 0.19324427843093872, + "learning_rate": 4.3861703173833606e-05, + "loss": 1.1949, + "step": 2908 + }, + { + "epoch": 0.45730904517675725, + "grad_norm": 0.24101825058460236, + "learning_rate": 4.385764748752376e-05, + "loss": 1.1812, + "step": 2909 + }, + { + "epoch": 0.4574662500736898, + "grad_norm": 0.1889408826828003, + "learning_rate": 4.385359064946054e-05, + "loss": 1.0492, + "step": 2910 + }, + { + "epoch": 0.45762345497062235, + "grad_norm": 0.18517924845218658, + "learning_rate": 4.3849532659891746e-05, + "loss": 1.082, + "step": 2911 + }, + { + "epoch": 0.4577806598675549, + "grad_norm": 0.20901206135749817, + "learning_rate": 4.384547351906522e-05, + "loss": 1.1381, + "step": 2912 + }, + { + "epoch": 0.4579378647644874, + "grad_norm": 0.211951345205307, + "learning_rate": 4.384141322722886e-05, + "loss": 1.0802, + "step": 2913 + }, + { + "epoch": 0.45809506966141994, + "grad_norm": 0.2100234031677246, + "learning_rate": 4.3837351784630676e-05, + "loss": 1.1165, + "step": 2914 + }, + { + "epoch": 0.4582522745583525, + "grad_norm": 0.20928257703781128, + "learning_rate": 4.383328919151871e-05, + "loss": 1.0882, + "step": 2915 + }, + { + "epoch": 0.45840947945528504, + "grad_norm": 0.2149987667798996, + "learning_rate": 4.38292254481411e-05, + "loss": 1.2111, + "step": 2916 + }, + { + "epoch": 0.4585666843522176, + "grad_norm": 0.16672222316265106, + "learning_rate": 4.382516055474605e-05, + "loss": 1.2464, + "step": 2917 + }, + { + "epoch": 0.4587238892491501, + "grad_norm": 0.2167981117963791, + "learning_rate": 4.382109451158181e-05, + "loss": 1.1166, + "step": 2918 + }, + { + "epoch": 0.45888109414608264, + "grad_norm": 0.17324289679527283, + "learning_rate": 4.381702731889672e-05, + "loss": 1.0655, + "step": 2919 + }, + { + "epoch": 0.4590382990430152, + "grad_norm": 0.1950710564851761, + "learning_rate": 4.38129589769392e-05, + "loss": 1.1824, + "step": 2920 + }, + { + "epoch": 0.45919550393994774, + "grad_norm": 0.31070321798324585, + "learning_rate": 4.3808889485957726e-05, + "loss": 1.1666, + "step": 2921 + }, + { + "epoch": 0.4593527088368803, + "grad_norm": 0.24002663791179657, + "learning_rate": 4.380481884620084e-05, + "loss": 1.1344, + "step": 2922 + }, + { + "epoch": 0.45950991373381284, + "grad_norm": 0.2190941721200943, + "learning_rate": 4.380074705791718e-05, + "loss": 1.0767, + "step": 2923 + }, + { + "epoch": 0.45966711863074533, + "grad_norm": 0.24419142305850983, + "learning_rate": 4.3796674121355416e-05, + "loss": 1.1538, + "step": 2924 + }, + { + "epoch": 0.4598243235276779, + "grad_norm": 0.24275580048561096, + "learning_rate": 4.379260003676431e-05, + "loss": 1.2278, + "step": 2925 + }, + { + "epoch": 0.45998152842461043, + "grad_norm": 0.20563790202140808, + "learning_rate": 4.3788524804392694e-05, + "loss": 1.1544, + "step": 2926 + }, + { + "epoch": 0.460138733321543, + "grad_norm": 0.2236025035381317, + "learning_rate": 4.3784448424489476e-05, + "loss": 1.0229, + "step": 2927 + }, + { + "epoch": 0.46029593821847553, + "grad_norm": 0.22830867767333984, + "learning_rate": 4.378037089730361e-05, + "loss": 1.1557, + "step": 2928 + }, + { + "epoch": 0.460453143115408, + "grad_norm": 0.1820029616355896, + "learning_rate": 4.3776292223084146e-05, + "loss": 1.195, + "step": 2929 + }, + { + "epoch": 0.4606103480123406, + "grad_norm": 0.19844214618206024, + "learning_rate": 4.377221240208019e-05, + "loss": 1.1008, + "step": 2930 + }, + { + "epoch": 0.4607675529092731, + "grad_norm": 0.20659616589546204, + "learning_rate": 4.376813143454093e-05, + "loss": 1.1284, + "step": 2931 + }, + { + "epoch": 0.4609247578062057, + "grad_norm": 0.21897144615650177, + "learning_rate": 4.3764049320715606e-05, + "loss": 1.0937, + "step": 2932 + }, + { + "epoch": 0.4610819627031382, + "grad_norm": 0.20715178549289703, + "learning_rate": 4.3759966060853545e-05, + "loss": 0.9869, + "step": 2933 + }, + { + "epoch": 0.4612391676000707, + "grad_norm": 0.2190580666065216, + "learning_rate": 4.3755881655204136e-05, + "loss": 1.2428, + "step": 2934 + }, + { + "epoch": 0.46139637249700327, + "grad_norm": 0.17025412619113922, + "learning_rate": 4.375179610401683e-05, + "loss": 1.2025, + "step": 2935 + }, + { + "epoch": 0.4615535773939358, + "grad_norm": 0.17598217725753784, + "learning_rate": 4.3747709407541174e-05, + "loss": 1.1, + "step": 2936 + }, + { + "epoch": 0.46171078229086837, + "grad_norm": 0.20214369893074036, + "learning_rate": 4.374362156602675e-05, + "loss": 1.1479, + "step": 2937 + }, + { + "epoch": 0.4618679871878009, + "grad_norm": 0.1486193835735321, + "learning_rate": 4.373953257972323e-05, + "loss": 1.2626, + "step": 2938 + }, + { + "epoch": 0.4620251920847334, + "grad_norm": 0.1790590137243271, + "learning_rate": 4.373544244888037e-05, + "loss": 1.1798, + "step": 2939 + }, + { + "epoch": 0.46218239698166597, + "grad_norm": 0.18835453689098358, + "learning_rate": 4.373135117374797e-05, + "loss": 1.0728, + "step": 2940 + }, + { + "epoch": 0.4623396018785985, + "grad_norm": 0.21751640737056732, + "learning_rate": 4.37272587545759e-05, + "loss": 1.0957, + "step": 2941 + }, + { + "epoch": 0.46249680677553107, + "grad_norm": 0.17027625441551208, + "learning_rate": 4.3723165191614126e-05, + "loss": 1.1387, + "step": 2942 + }, + { + "epoch": 0.4626540116724636, + "grad_norm": 0.2957732081413269, + "learning_rate": 4.3719070485112646e-05, + "loss": 1.1947, + "step": 2943 + }, + { + "epoch": 0.4628112165693961, + "grad_norm": 0.22199559211730957, + "learning_rate": 4.371497463532157e-05, + "loss": 1.1139, + "step": 2944 + }, + { + "epoch": 0.46296842146632866, + "grad_norm": 0.14452876150608063, + "learning_rate": 4.371087764249106e-05, + "loss": 1.1742, + "step": 2945 + }, + { + "epoch": 0.4631256263632612, + "grad_norm": 0.2115442007780075, + "learning_rate": 4.370677950687132e-05, + "loss": 1.124, + "step": 2946 + }, + { + "epoch": 0.46328283126019376, + "grad_norm": 0.21930637955665588, + "learning_rate": 4.370268022871267e-05, + "loss": 1.1338, + "step": 2947 + }, + { + "epoch": 0.4634400361571263, + "grad_norm": 0.17294028401374817, + "learning_rate": 4.369857980826546e-05, + "loss": 1.1796, + "step": 2948 + }, + { + "epoch": 0.4635972410540588, + "grad_norm": 0.21426047384738922, + "learning_rate": 4.369447824578015e-05, + "loss": 1.153, + "step": 2949 + }, + { + "epoch": 0.46375444595099136, + "grad_norm": 0.19322267174720764, + "learning_rate": 4.369037554150723e-05, + "loss": 1.1341, + "step": 2950 + }, + { + "epoch": 0.4639116508479239, + "grad_norm": 0.2040453553199768, + "learning_rate": 4.368627169569729e-05, + "loss": 1.1638, + "step": 2951 + }, + { + "epoch": 0.46406885574485646, + "grad_norm": 0.18749818205833435, + "learning_rate": 4.368216670860096e-05, + "loss": 1.0691, + "step": 2952 + }, + { + "epoch": 0.464226060641789, + "grad_norm": 0.18389207124710083, + "learning_rate": 4.3678060580468984e-05, + "loss": 1.0091, + "step": 2953 + }, + { + "epoch": 0.46438326553872156, + "grad_norm": 0.23443341255187988, + "learning_rate": 4.3673953311552115e-05, + "loss": 1.1391, + "step": 2954 + }, + { + "epoch": 0.46454047043565405, + "grad_norm": 0.14893463253974915, + "learning_rate": 4.366984490210124e-05, + "loss": 1.1696, + "step": 2955 + }, + { + "epoch": 0.4646976753325866, + "grad_norm": 0.2501336336135864, + "learning_rate": 4.366573535236728e-05, + "loss": 1.0786, + "step": 2956 + }, + { + "epoch": 0.46485488022951915, + "grad_norm": 0.17904464900493622, + "learning_rate": 4.366162466260121e-05, + "loss": 1.2305, + "step": 2957 + }, + { + "epoch": 0.4650120851264517, + "grad_norm": 0.22989200055599213, + "learning_rate": 4.365751283305411e-05, + "loss": 1.1096, + "step": 2958 + }, + { + "epoch": 0.46516929002338425, + "grad_norm": 0.20203103125095367, + "learning_rate": 4.365339986397712e-05, + "loss": 1.1126, + "step": 2959 + }, + { + "epoch": 0.46532649492031675, + "grad_norm": 0.21098080277442932, + "learning_rate": 4.364928575562143e-05, + "loss": 1.1223, + "step": 2960 + }, + { + "epoch": 0.4654836998172493, + "grad_norm": 0.1513175219297409, + "learning_rate": 4.364517050823832e-05, + "loss": 1.0855, + "step": 2961 + }, + { + "epoch": 0.46564090471418185, + "grad_norm": 0.14790455996990204, + "learning_rate": 4.364105412207914e-05, + "loss": 1.1123, + "step": 2962 + }, + { + "epoch": 0.4657981096111144, + "grad_norm": 0.24311016499996185, + "learning_rate": 4.36369365973953e-05, + "loss": 1.1073, + "step": 2963 + }, + { + "epoch": 0.46595531450804695, + "grad_norm": 0.24297219514846802, + "learning_rate": 4.3632817934438284e-05, + "loss": 0.9989, + "step": 2964 + }, + { + "epoch": 0.46611251940497944, + "grad_norm": 0.21929419040679932, + "learning_rate": 4.362869813345964e-05, + "loss": 1.1306, + "step": 2965 + }, + { + "epoch": 0.466269724301912, + "grad_norm": 0.21477848291397095, + "learning_rate": 4.3624577194710993e-05, + "loss": 1.1154, + "step": 2966 + }, + { + "epoch": 0.46642692919884454, + "grad_norm": 0.22623370587825775, + "learning_rate": 4.362045511844402e-05, + "loss": 1.167, + "step": 2967 + }, + { + "epoch": 0.4665841340957771, + "grad_norm": 0.17645582556724548, + "learning_rate": 4.3616331904910515e-05, + "loss": 1.1196, + "step": 2968 + }, + { + "epoch": 0.46674133899270964, + "grad_norm": 0.1720646470785141, + "learning_rate": 4.361220755436227e-05, + "loss": 1.1096, + "step": 2969 + }, + { + "epoch": 0.46689854388964214, + "grad_norm": 0.2820630371570587, + "learning_rate": 4.3608082067051214e-05, + "loss": 1.1802, + "step": 2970 + }, + { + "epoch": 0.4670557487865747, + "grad_norm": 0.15399809181690216, + "learning_rate": 4.36039554432293e-05, + "loss": 1.0906, + "step": 2971 + }, + { + "epoch": 0.46721295368350724, + "grad_norm": 0.14993947744369507, + "learning_rate": 4.359982768314857e-05, + "loss": 1.1125, + "step": 2972 + }, + { + "epoch": 0.4673701585804398, + "grad_norm": 0.2066517025232315, + "learning_rate": 4.359569878706113e-05, + "loss": 1.1647, + "step": 2973 + }, + { + "epoch": 0.46752736347737234, + "grad_norm": 0.1963212788105011, + "learning_rate": 4.359156875521917e-05, + "loss": 1.127, + "step": 2974 + }, + { + "epoch": 0.46768456837430483, + "grad_norm": 0.17257174849510193, + "learning_rate": 4.3587437587874926e-05, + "loss": 1.0375, + "step": 2975 + }, + { + "epoch": 0.4678417732712374, + "grad_norm": 0.1978078931570053, + "learning_rate": 4.3583305285280704e-05, + "loss": 1.132, + "step": 2976 + }, + { + "epoch": 0.46799897816816993, + "grad_norm": 0.17845632135868073, + "learning_rate": 4.35791718476889e-05, + "loss": 1.0805, + "step": 2977 + }, + { + "epoch": 0.4681561830651025, + "grad_norm": 0.24879387021064758, + "learning_rate": 4.357503727535198e-05, + "loss": 1.0959, + "step": 2978 + }, + { + "epoch": 0.46831338796203503, + "grad_norm": 0.23771560192108154, + "learning_rate": 4.3570901568522445e-05, + "loss": 1.0856, + "step": 2979 + }, + { + "epoch": 0.4684705928589676, + "grad_norm": 0.1961091309785843, + "learning_rate": 4.3566764727452914e-05, + "loss": 1.1077, + "step": 2980 + }, + { + "epoch": 0.4686277977559001, + "grad_norm": 0.2253684103488922, + "learning_rate": 4.356262675239603e-05, + "loss": 1.1091, + "step": 2981 + }, + { + "epoch": 0.4687850026528326, + "grad_norm": 0.22095037996768951, + "learning_rate": 4.355848764360453e-05, + "loss": 1.0995, + "step": 2982 + }, + { + "epoch": 0.4689422075497652, + "grad_norm": 0.15505705773830414, + "learning_rate": 4.355434740133121e-05, + "loss": 1.0732, + "step": 2983 + }, + { + "epoch": 0.4690994124466977, + "grad_norm": 0.13467110693454742, + "learning_rate": 4.355020602582895e-05, + "loss": 1.0693, + "step": 2984 + }, + { + "epoch": 0.4692566173436303, + "grad_norm": 0.2177276611328125, + "learning_rate": 4.354606351735068e-05, + "loss": 1.0611, + "step": 2985 + }, + { + "epoch": 0.4694138222405628, + "grad_norm": 0.2077556699514389, + "learning_rate": 4.3541919876149416e-05, + "loss": 1.1161, + "step": 2986 + }, + { + "epoch": 0.4695710271374953, + "grad_norm": 0.17732571065425873, + "learning_rate": 4.3537775102478234e-05, + "loss": 1.1127, + "step": 2987 + }, + { + "epoch": 0.4697282320344279, + "grad_norm": 0.15690331161022186, + "learning_rate": 4.353362919659028e-05, + "loss": 1.0747, + "step": 2988 + }, + { + "epoch": 0.4698854369313604, + "grad_norm": 0.14969655871391296, + "learning_rate": 4.352948215873877e-05, + "loss": 1.1483, + "step": 2989 + }, + { + "epoch": 0.470042641828293, + "grad_norm": 0.15860646963119507, + "learning_rate": 4.3525333989177e-05, + "loss": 1.1119, + "step": 2990 + }, + { + "epoch": 0.47019984672522547, + "grad_norm": 0.25320160388946533, + "learning_rate": 4.35211846881583e-05, + "loss": 1.1342, + "step": 2991 + }, + { + "epoch": 0.470357051622158, + "grad_norm": 0.18844164907932281, + "learning_rate": 4.3517034255936104e-05, + "loss": 1.1295, + "step": 2992 + }, + { + "epoch": 0.47051425651909057, + "grad_norm": 0.17962507903575897, + "learning_rate": 4.3512882692763926e-05, + "loss": 1.1112, + "step": 2993 + }, + { + "epoch": 0.4706714614160231, + "grad_norm": 0.2303573489189148, + "learning_rate": 4.35087299988953e-05, + "loss": 1.1863, + "step": 2994 + }, + { + "epoch": 0.47082866631295567, + "grad_norm": 0.2209206074476242, + "learning_rate": 4.350457617458387e-05, + "loss": 1.0956, + "step": 2995 + }, + { + "epoch": 0.47098587120988816, + "grad_norm": 0.15327110886573792, + "learning_rate": 4.350042122008333e-05, + "loss": 1.2205, + "step": 2996 + }, + { + "epoch": 0.4711430761068207, + "grad_norm": 0.19386257231235504, + "learning_rate": 4.349626513564745e-05, + "loss": 1.1507, + "step": 2997 + }, + { + "epoch": 0.47130028100375326, + "grad_norm": 0.19012616574764252, + "learning_rate": 4.3492107921530067e-05, + "loss": 1.1616, + "step": 2998 + }, + { + "epoch": 0.4714574859006858, + "grad_norm": 0.3165668845176697, + "learning_rate": 4.3487949577985096e-05, + "loss": 1.1514, + "step": 2999 + }, + { + "epoch": 0.47161469079761836, + "grad_norm": 0.185637429356575, + "learning_rate": 4.348379010526651e-05, + "loss": 1.0717, + "step": 3000 + }, + { + "epoch": 0.47177189569455086, + "grad_norm": 0.1926163285970688, + "learning_rate": 4.347962950362834e-05, + "loss": 1.0976, + "step": 3001 + }, + { + "epoch": 0.4719291005914834, + "grad_norm": 0.21438013017177582, + "learning_rate": 4.347546777332472e-05, + "loss": 1.101, + "step": 3002 + }, + { + "epoch": 0.47208630548841596, + "grad_norm": 0.13781698048114777, + "learning_rate": 4.347130491460982e-05, + "loss": 1.1528, + "step": 3003 + }, + { + "epoch": 0.4722435103853485, + "grad_norm": 0.2153354436159134, + "learning_rate": 4.346714092773789e-05, + "loss": 1.0979, + "step": 3004 + }, + { + "epoch": 0.47240071528228106, + "grad_norm": 0.1563863605260849, + "learning_rate": 4.3462975812963255e-05, + "loss": 1.0688, + "step": 3005 + }, + { + "epoch": 0.4725579201792136, + "grad_norm": 0.14853888750076294, + "learning_rate": 4.3458809570540315e-05, + "loss": 1.3536, + "step": 3006 + }, + { + "epoch": 0.4727151250761461, + "grad_norm": 0.18929365277290344, + "learning_rate": 4.34546422007235e-05, + "loss": 1.1523, + "step": 3007 + }, + { + "epoch": 0.47287232997307865, + "grad_norm": 0.1504976898431778, + "learning_rate": 4.345047370376737e-05, + "loss": 1.1512, + "step": 3008 + }, + { + "epoch": 0.4730295348700112, + "grad_norm": 0.164264515042305, + "learning_rate": 4.3446304079926505e-05, + "loss": 1.1382, + "step": 3009 + }, + { + "epoch": 0.47318673976694375, + "grad_norm": 0.17672760784626007, + "learning_rate": 4.344213332945557e-05, + "loss": 1.1067, + "step": 3010 + }, + { + "epoch": 0.4733439446638763, + "grad_norm": 0.15469646453857422, + "learning_rate": 4.343796145260929e-05, + "loss": 1.1057, + "step": 3011 + }, + { + "epoch": 0.4735011495608088, + "grad_norm": 0.2399524748325348, + "learning_rate": 4.3433788449642485e-05, + "loss": 1.0207, + "step": 3012 + }, + { + "epoch": 0.47365835445774135, + "grad_norm": 0.21121852099895477, + "learning_rate": 4.342961432081001e-05, + "loss": 1.0743, + "step": 3013 + }, + { + "epoch": 0.4738155593546739, + "grad_norm": 0.18658918142318726, + "learning_rate": 4.342543906636682e-05, + "loss": 1.0925, + "step": 3014 + }, + { + "epoch": 0.47397276425160645, + "grad_norm": 0.16221532225608826, + "learning_rate": 4.342126268656791e-05, + "loss": 1.2256, + "step": 3015 + }, + { + "epoch": 0.474129969148539, + "grad_norm": 0.3126663863658905, + "learning_rate": 4.341708518166837e-05, + "loss": 1.0866, + "step": 3016 + }, + { + "epoch": 0.4742871740454715, + "grad_norm": 0.16433748602867126, + "learning_rate": 4.341290655192333e-05, + "loss": 1.1915, + "step": 3017 + }, + { + "epoch": 0.47444437894240404, + "grad_norm": 0.15654461085796356, + "learning_rate": 4.3408726797588023e-05, + "loss": 1.0935, + "step": 3018 + }, + { + "epoch": 0.4746015838393366, + "grad_norm": 0.17568379640579224, + "learning_rate": 4.3404545918917724e-05, + "loss": 1.2203, + "step": 3019 + }, + { + "epoch": 0.47475878873626914, + "grad_norm": 0.15613645315170288, + "learning_rate": 4.3400363916167774e-05, + "loss": 1.1754, + "step": 3020 + }, + { + "epoch": 0.4749159936332017, + "grad_norm": 0.1775684654712677, + "learning_rate": 4.339618078959362e-05, + "loss": 1.1522, + "step": 3021 + }, + { + "epoch": 0.4750731985301342, + "grad_norm": 0.25972139835357666, + "learning_rate": 4.339199653945072e-05, + "loss": 1.0544, + "step": 3022 + }, + { + "epoch": 0.47523040342706674, + "grad_norm": 0.2272106260061264, + "learning_rate": 4.338781116599466e-05, + "loss": 1.2017, + "step": 3023 + }, + { + "epoch": 0.4753876083239993, + "grad_norm": 0.17775173485279083, + "learning_rate": 4.338362466948105e-05, + "loss": 1.0349, + "step": 3024 + }, + { + "epoch": 0.47554481322093184, + "grad_norm": 0.17978572845458984, + "learning_rate": 4.3379437050165595e-05, + "loss": 1.2645, + "step": 3025 + }, + { + "epoch": 0.4757020181178644, + "grad_norm": 0.27326762676239014, + "learning_rate": 4.337524830830405e-05, + "loss": 1.1615, + "step": 3026 + }, + { + "epoch": 0.4758592230147969, + "grad_norm": 0.15005174279212952, + "learning_rate": 4.337105844415226e-05, + "loss": 1.0954, + "step": 3027 + }, + { + "epoch": 0.47601642791172943, + "grad_norm": 0.1960383504629135, + "learning_rate": 4.3366867457966106e-05, + "loss": 1.1727, + "step": 3028 + }, + { + "epoch": 0.476173632808662, + "grad_norm": 0.2822120785713196, + "learning_rate": 4.336267535000157e-05, + "loss": 1.0665, + "step": 3029 + }, + { + "epoch": 0.47633083770559453, + "grad_norm": 0.19774426519870758, + "learning_rate": 4.33584821205147e-05, + "loss": 1.0842, + "step": 3030 + }, + { + "epoch": 0.4764880426025271, + "grad_norm": 0.18241934478282928, + "learning_rate": 4.3354287769761584e-05, + "loss": 1.062, + "step": 3031 + }, + { + "epoch": 0.47664524749945963, + "grad_norm": 0.15259404480457306, + "learning_rate": 4.33500922979984e-05, + "loss": 1.1752, + "step": 3032 + }, + { + "epoch": 0.47680245239639213, + "grad_norm": 0.28016725182533264, + "learning_rate": 4.33458957054814e-05, + "loss": 1.1332, + "step": 3033 + }, + { + "epoch": 0.4769596572933247, + "grad_norm": 0.21748925745487213, + "learning_rate": 4.33416979924669e-05, + "loss": 1.1161, + "step": 3034 + }, + { + "epoch": 0.47711686219025723, + "grad_norm": 0.20696724951267242, + "learning_rate": 4.333749915921126e-05, + "loss": 1.176, + "step": 3035 + }, + { + "epoch": 0.4772740670871898, + "grad_norm": 0.17570556700229645, + "learning_rate": 4.3333299205970946e-05, + "loss": 1.1311, + "step": 3036 + }, + { + "epoch": 0.47743127198412233, + "grad_norm": 0.15662014484405518, + "learning_rate": 4.3329098133002475e-05, + "loss": 1.0178, + "step": 3037 + }, + { + "epoch": 0.4775884768810548, + "grad_norm": 0.16522455215454102, + "learning_rate": 4.332489594056242e-05, + "loss": 1.0892, + "step": 3038 + }, + { + "epoch": 0.4777456817779874, + "grad_norm": 0.2099331170320511, + "learning_rate": 4.332069262890745e-05, + "loss": 1.1686, + "step": 3039 + }, + { + "epoch": 0.4779028866749199, + "grad_norm": 0.2070057988166809, + "learning_rate": 4.331648819829427e-05, + "loss": 1.1487, + "step": 3040 + }, + { + "epoch": 0.4779028866749199, + "eval_loss": 1.1146655082702637, + "eval_runtime": 2325.1865, + "eval_samples_per_second": 3.982, + "eval_steps_per_second": 1.991, + "step": 3040 + }, + { + "epoch": 0.4780600915718525, + "grad_norm": 0.20269645750522614, + "learning_rate": 4.331228264897968e-05, + "loss": 1.1289, + "step": 3041 + }, + { + "epoch": 0.478217296468785, + "grad_norm": 0.22121499478816986, + "learning_rate": 4.3308075981220555e-05, + "loss": 1.0923, + "step": 3042 + }, + { + "epoch": 0.4783745013657175, + "grad_norm": 0.3529304265975952, + "learning_rate": 4.330386819527379e-05, + "loss": 1.1289, + "step": 3043 + }, + { + "epoch": 0.47853170626265007, + "grad_norm": 0.18771342933177948, + "learning_rate": 4.329965929139641e-05, + "loss": 1.0981, + "step": 3044 + }, + { + "epoch": 0.4786889111595826, + "grad_norm": 0.2015509307384491, + "learning_rate": 4.329544926984546e-05, + "loss": 1.0751, + "step": 3045 + }, + { + "epoch": 0.47884611605651517, + "grad_norm": 0.18288064002990723, + "learning_rate": 4.329123813087808e-05, + "loss": 1.2024, + "step": 3046 + }, + { + "epoch": 0.4790033209534477, + "grad_norm": 0.17577502131462097, + "learning_rate": 4.3287025874751466e-05, + "loss": 1.0602, + "step": 3047 + }, + { + "epoch": 0.4791605258503802, + "grad_norm": 0.14473573863506317, + "learning_rate": 4.3282812501722895e-05, + "loss": 1.1354, + "step": 3048 + }, + { + "epoch": 0.47931773074731276, + "grad_norm": 0.13793261349201202, + "learning_rate": 4.3278598012049685e-05, + "loss": 1.1413, + "step": 3049 + }, + { + "epoch": 0.4794749356442453, + "grad_norm": 0.19814302027225494, + "learning_rate": 4.3274382405989266e-05, + "loss": 1.1041, + "step": 3050 + }, + { + "epoch": 0.47963214054117786, + "grad_norm": 0.17834322154521942, + "learning_rate": 4.32701656837991e-05, + "loss": 1.0496, + "step": 3051 + }, + { + "epoch": 0.4797893454381104, + "grad_norm": 0.17347043752670288, + "learning_rate": 4.326594784573672e-05, + "loss": 1.1002, + "step": 3052 + }, + { + "epoch": 0.4799465503350429, + "grad_norm": 0.18119116127490997, + "learning_rate": 4.326172889205975e-05, + "loss": 1.1539, + "step": 3053 + }, + { + "epoch": 0.48010375523197546, + "grad_norm": 0.18658742308616638, + "learning_rate": 4.325750882302586e-05, + "loss": 1.0959, + "step": 3054 + }, + { + "epoch": 0.480260960128908, + "grad_norm": 0.19841469824314117, + "learning_rate": 4.325328763889279e-05, + "loss": 1.1105, + "step": 3055 + }, + { + "epoch": 0.48041816502584056, + "grad_norm": 0.21206001937389374, + "learning_rate": 4.324906533991836e-05, + "loss": 1.0634, + "step": 3056 + }, + { + "epoch": 0.4805753699227731, + "grad_norm": 0.18314093351364136, + "learning_rate": 4.324484192636046e-05, + "loss": 1.1106, + "step": 3057 + }, + { + "epoch": 0.48073257481970566, + "grad_norm": 0.18414467573165894, + "learning_rate": 4.324061739847702e-05, + "loss": 0.9478, + "step": 3058 + }, + { + "epoch": 0.48088977971663815, + "grad_norm": 0.19265106320381165, + "learning_rate": 4.323639175652608e-05, + "loss": 1.1306, + "step": 3059 + }, + { + "epoch": 0.4810469846135707, + "grad_norm": 0.17755688726902008, + "learning_rate": 4.323216500076572e-05, + "loss": 1.0935, + "step": 3060 + }, + { + "epoch": 0.48120418951050326, + "grad_norm": 0.1681739240884781, + "learning_rate": 4.322793713145408e-05, + "loss": 1.1834, + "step": 3061 + }, + { + "epoch": 0.4813613944074358, + "grad_norm": 0.17772947251796722, + "learning_rate": 4.3223708148849404e-05, + "loss": 1.2605, + "step": 3062 + }, + { + "epoch": 0.48151859930436836, + "grad_norm": 0.16665947437286377, + "learning_rate": 4.321947805320996e-05, + "loss": 1.173, + "step": 3063 + }, + { + "epoch": 0.48167580420130085, + "grad_norm": 0.1512874960899353, + "learning_rate": 4.321524684479412e-05, + "loss": 1.2195, + "step": 3064 + }, + { + "epoch": 0.4818330090982334, + "grad_norm": 0.14165686070919037, + "learning_rate": 4.3211014523860315e-05, + "loss": 1.1116, + "step": 3065 + }, + { + "epoch": 0.48199021399516595, + "grad_norm": 0.205230712890625, + "learning_rate": 4.3206781090667026e-05, + "loss": 1.0755, + "step": 3066 + }, + { + "epoch": 0.4821474188920985, + "grad_norm": 0.15291771292686462, + "learning_rate": 4.3202546545472824e-05, + "loss": 0.9513, + "step": 3067 + }, + { + "epoch": 0.48230462378903105, + "grad_norm": 0.2035183608531952, + "learning_rate": 4.3198310888536325e-05, + "loss": 1.089, + "step": 3068 + }, + { + "epoch": 0.48246182868596355, + "grad_norm": 0.18657858669757843, + "learning_rate": 4.319407412011625e-05, + "loss": 1.1266, + "step": 3069 + }, + { + "epoch": 0.4826190335828961, + "grad_norm": 0.19956351816654205, + "learning_rate": 4.3189836240471335e-05, + "loss": 1.1221, + "step": 3070 + }, + { + "epoch": 0.48277623847982865, + "grad_norm": 0.2228332757949829, + "learning_rate": 4.318559724986044e-05, + "loss": 1.1047, + "step": 3071 + }, + { + "epoch": 0.4829334433767612, + "grad_norm": 0.1445583552122116, + "learning_rate": 4.318135714854246e-05, + "loss": 1.1637, + "step": 3072 + }, + { + "epoch": 0.48309064827369375, + "grad_norm": 0.19175522029399872, + "learning_rate": 4.317711593677636e-05, + "loss": 1.1594, + "step": 3073 + }, + { + "epoch": 0.48324785317062624, + "grad_norm": 0.19144673645496368, + "learning_rate": 4.3172873614821176e-05, + "loss": 1.1604, + "step": 3074 + }, + { + "epoch": 0.4834050580675588, + "grad_norm": 0.20282785594463348, + "learning_rate": 4.316863018293601e-05, + "loss": 1.1843, + "step": 3075 + }, + { + "epoch": 0.48356226296449134, + "grad_norm": 0.1457735151052475, + "learning_rate": 4.3164385641380045e-05, + "loss": 1.0704, + "step": 3076 + }, + { + "epoch": 0.4837194678614239, + "grad_norm": 0.1576244980096817, + "learning_rate": 4.316013999041252e-05, + "loss": 1.204, + "step": 3077 + }, + { + "epoch": 0.48387667275835644, + "grad_norm": 0.15812274813652039, + "learning_rate": 4.315589323029273e-05, + "loss": 1.048, + "step": 3078 + }, + { + "epoch": 0.48403387765528894, + "grad_norm": 0.19766604900360107, + "learning_rate": 4.315164536128007e-05, + "loss": 1.0627, + "step": 3079 + }, + { + "epoch": 0.4841910825522215, + "grad_norm": 0.1403280347585678, + "learning_rate": 4.314739638363396e-05, + "loss": 1.2214, + "step": 3080 + }, + { + "epoch": 0.48434828744915404, + "grad_norm": 0.19247771799564362, + "learning_rate": 4.314314629761393e-05, + "loss": 1.1174, + "step": 3081 + }, + { + "epoch": 0.4845054923460866, + "grad_norm": 0.15992018580436707, + "learning_rate": 4.313889510347956e-05, + "loss": 1.0445, + "step": 3082 + }, + { + "epoch": 0.48466269724301914, + "grad_norm": 0.1454923152923584, + "learning_rate": 4.313464280149049e-05, + "loss": 1.0791, + "step": 3083 + }, + { + "epoch": 0.4848199021399517, + "grad_norm": 0.17863498628139496, + "learning_rate": 4.313038939190644e-05, + "loss": 1.1887, + "step": 3084 + }, + { + "epoch": 0.4849771070368842, + "grad_norm": 0.16536812484264374, + "learning_rate": 4.3126134874987176e-05, + "loss": 1.0926, + "step": 3085 + }, + { + "epoch": 0.48513431193381673, + "grad_norm": 0.21133893728256226, + "learning_rate": 4.3121879250992566e-05, + "loss": 1.1867, + "step": 3086 + }, + { + "epoch": 0.4852915168307493, + "grad_norm": 0.13754390180110931, + "learning_rate": 4.311762252018252e-05, + "loss": 1.1501, + "step": 3087 + }, + { + "epoch": 0.48544872172768183, + "grad_norm": 0.18948908150196075, + "learning_rate": 4.3113364682817024e-05, + "loss": 1.0505, + "step": 3088 + }, + { + "epoch": 0.4856059266246144, + "grad_norm": 0.18722207844257355, + "learning_rate": 4.310910573915613e-05, + "loss": 1.1081, + "step": 3089 + }, + { + "epoch": 0.4857631315215469, + "grad_norm": 0.16461189091205597, + "learning_rate": 4.310484568945996e-05, + "loss": 1.1096, + "step": 3090 + }, + { + "epoch": 0.4859203364184794, + "grad_norm": 0.16495181620121002, + "learning_rate": 4.3100584533988694e-05, + "loss": 1.2194, + "step": 3091 + }, + { + "epoch": 0.486077541315412, + "grad_norm": 0.1575373113155365, + "learning_rate": 4.30963222730026e-05, + "loss": 1.0526, + "step": 3092 + }, + { + "epoch": 0.4862347462123445, + "grad_norm": 0.16486801207065582, + "learning_rate": 4.309205890676199e-05, + "loss": 1.1296, + "step": 3093 + }, + { + "epoch": 0.4863919511092771, + "grad_norm": 0.16061507165431976, + "learning_rate": 4.308779443552726e-05, + "loss": 1.2397, + "step": 3094 + }, + { + "epoch": 0.48654915600620957, + "grad_norm": 0.20702579617500305, + "learning_rate": 4.3083528859558855e-05, + "loss": 1.1798, + "step": 3095 + }, + { + "epoch": 0.4867063609031421, + "grad_norm": 0.1880490630865097, + "learning_rate": 4.3079262179117316e-05, + "loss": 0.9903, + "step": 3096 + }, + { + "epoch": 0.48686356580007467, + "grad_norm": 0.1547381728887558, + "learning_rate": 4.307499439446324e-05, + "loss": 1.1069, + "step": 3097 + }, + { + "epoch": 0.4870207706970072, + "grad_norm": 0.1779354214668274, + "learning_rate": 4.307072550585727e-05, + "loss": 1.1234, + "step": 3098 + }, + { + "epoch": 0.48717797559393977, + "grad_norm": 0.1460437774658203, + "learning_rate": 4.306645551356014e-05, + "loss": 1.1703, + "step": 3099 + }, + { + "epoch": 0.48733518049087227, + "grad_norm": 0.19177353382110596, + "learning_rate": 4.3062184417832644e-05, + "loss": 1.0394, + "step": 3100 + }, + { + "epoch": 0.4874923853878048, + "grad_norm": 0.19918255507946014, + "learning_rate": 4.305791221893565e-05, + "loss": 1.1018, + "step": 3101 + }, + { + "epoch": 0.48764959028473737, + "grad_norm": 0.19703230261802673, + "learning_rate": 4.305363891713008e-05, + "loss": 1.1699, + "step": 3102 + }, + { + "epoch": 0.4878067951816699, + "grad_norm": 0.20058362185955048, + "learning_rate": 4.304936451267694e-05, + "loss": 1.1041, + "step": 3103 + }, + { + "epoch": 0.48796400007860247, + "grad_norm": 0.16634905338287354, + "learning_rate": 4.304508900583728e-05, + "loss": 0.9865, + "step": 3104 + }, + { + "epoch": 0.48812120497553496, + "grad_norm": 0.13455606997013092, + "learning_rate": 4.304081239687225e-05, + "loss": 1.1838, + "step": 3105 + }, + { + "epoch": 0.4882784098724675, + "grad_norm": 0.146565243601799, + "learning_rate": 4.303653468604303e-05, + "loss": 1.0494, + "step": 3106 + }, + { + "epoch": 0.48843561476940006, + "grad_norm": 0.21972596645355225, + "learning_rate": 4.3032255873610905e-05, + "loss": 1.0792, + "step": 3107 + }, + { + "epoch": 0.4885928196663326, + "grad_norm": 0.20464344322681427, + "learning_rate": 4.30279759598372e-05, + "loss": 0.9665, + "step": 3108 + }, + { + "epoch": 0.48875002456326516, + "grad_norm": 0.17815683782100677, + "learning_rate": 4.3023694944983305e-05, + "loss": 1.0264, + "step": 3109 + }, + { + "epoch": 0.4889072294601977, + "grad_norm": 0.16300149261951447, + "learning_rate": 4.3019412829310704e-05, + "loss": 1.0808, + "step": 3110 + }, + { + "epoch": 0.4890644343571302, + "grad_norm": 0.16352546215057373, + "learning_rate": 4.301512961308093e-05, + "loss": 1.0634, + "step": 3111 + }, + { + "epoch": 0.48922163925406276, + "grad_norm": 0.14276129007339478, + "learning_rate": 4.301084529655558e-05, + "loss": 1.1966, + "step": 3112 + }, + { + "epoch": 0.4893788441509953, + "grad_norm": 0.17422689497470856, + "learning_rate": 4.300655987999633e-05, + "loss": 1.082, + "step": 3113 + }, + { + "epoch": 0.48953604904792786, + "grad_norm": 0.23078230023384094, + "learning_rate": 4.30022733636649e-05, + "loss": 1.1523, + "step": 3114 + }, + { + "epoch": 0.4896932539448604, + "grad_norm": 0.16097494959831238, + "learning_rate": 4.299798574782312e-05, + "loss": 1.2632, + "step": 3115 + }, + { + "epoch": 0.4898504588417929, + "grad_norm": 0.1580386459827423, + "learning_rate": 4.299369703273285e-05, + "loss": 1.0741, + "step": 3116 + }, + { + "epoch": 0.49000766373872545, + "grad_norm": 0.18079139292240143, + "learning_rate": 4.298940721865602e-05, + "loss": 1.1011, + "step": 3117 + }, + { + "epoch": 0.490164868635658, + "grad_norm": 0.21938787400722504, + "learning_rate": 4.298511630585464e-05, + "loss": 1.1851, + "step": 3118 + }, + { + "epoch": 0.49032207353259055, + "grad_norm": 0.17028893530368805, + "learning_rate": 4.298082429459079e-05, + "loss": 1.1701, + "step": 3119 + }, + { + "epoch": 0.4904792784295231, + "grad_norm": 0.20295727252960205, + "learning_rate": 4.29765311851266e-05, + "loss": 1.1091, + "step": 3120 + }, + { + "epoch": 0.4906364833264556, + "grad_norm": 0.1343192160129547, + "learning_rate": 4.297223697772429e-05, + "loss": 1.1188, + "step": 3121 + }, + { + "epoch": 0.49079368822338815, + "grad_norm": 0.17286306619644165, + "learning_rate": 4.296794167264612e-05, + "loss": 1.1054, + "step": 3122 + }, + { + "epoch": 0.4909508931203207, + "grad_norm": 0.17326287925243378, + "learning_rate": 4.296364527015443e-05, + "loss": 0.98, + "step": 3123 + }, + { + "epoch": 0.49110809801725325, + "grad_norm": 0.15772922337055206, + "learning_rate": 4.295934777051164e-05, + "loss": 1.0586, + "step": 3124 + }, + { + "epoch": 0.4912653029141858, + "grad_norm": 0.20056964457035065, + "learning_rate": 4.295504917398022e-05, + "loss": 1.0914, + "step": 3125 + }, + { + "epoch": 0.4914225078111183, + "grad_norm": 0.16219423711299896, + "learning_rate": 4.2950749480822714e-05, + "loss": 1.0313, + "step": 3126 + }, + { + "epoch": 0.49157971270805084, + "grad_norm": 0.18571913242340088, + "learning_rate": 4.294644869130172e-05, + "loss": 1.0954, + "step": 3127 + }, + { + "epoch": 0.4917369176049834, + "grad_norm": 0.17019760608673096, + "learning_rate": 4.294214680567993e-05, + "loss": 1.1577, + "step": 3128 + }, + { + "epoch": 0.49189412250191594, + "grad_norm": 0.19843801856040955, + "learning_rate": 4.293784382422007e-05, + "loss": 1.083, + "step": 3129 + }, + { + "epoch": 0.4920513273988485, + "grad_norm": 0.1765783727169037, + "learning_rate": 4.2933539747184966e-05, + "loss": 1.081, + "step": 3130 + }, + { + "epoch": 0.492208532295781, + "grad_norm": 0.14573058485984802, + "learning_rate": 4.292923457483748e-05, + "loss": 0.9873, + "step": 3131 + }, + { + "epoch": 0.49236573719271354, + "grad_norm": 0.24786439538002014, + "learning_rate": 4.292492830744057e-05, + "loss": 1.1195, + "step": 3132 + }, + { + "epoch": 0.4925229420896461, + "grad_norm": 0.16276319324970245, + "learning_rate": 4.292062094525723e-05, + "loss": 1.1914, + "step": 3133 + }, + { + "epoch": 0.49268014698657864, + "grad_norm": 0.15106429159641266, + "learning_rate": 4.291631248855055e-05, + "loss": 1.2236, + "step": 3134 + }, + { + "epoch": 0.4928373518835112, + "grad_norm": 0.15074944496154785, + "learning_rate": 4.2912002937583674e-05, + "loss": 1.1511, + "step": 3135 + }, + { + "epoch": 0.49299455678044374, + "grad_norm": 0.14793723821640015, + "learning_rate": 4.2907692292619804e-05, + "loss": 0.9877, + "step": 3136 + }, + { + "epoch": 0.49315176167737623, + "grad_norm": 0.13366791605949402, + "learning_rate": 4.290338055392223e-05, + "loss": 1.1199, + "step": 3137 + }, + { + "epoch": 0.4933089665743088, + "grad_norm": 0.19258376955986023, + "learning_rate": 4.289906772175428e-05, + "loss": 1.1465, + "step": 3138 + }, + { + "epoch": 0.49346617147124133, + "grad_norm": 0.22038963437080383, + "learning_rate": 4.289475379637938e-05, + "loss": 1.1204, + "step": 3139 + }, + { + "epoch": 0.4936233763681739, + "grad_norm": 0.18796618282794952, + "learning_rate": 4.289043877806101e-05, + "loss": 1.0417, + "step": 3140 + }, + { + "epoch": 0.49378058126510643, + "grad_norm": 0.1992577463388443, + "learning_rate": 4.28861226670627e-05, + "loss": 1.1426, + "step": 3141 + }, + { + "epoch": 0.4939377861620389, + "grad_norm": 0.199345663189888, + "learning_rate": 4.2881805463648075e-05, + "loss": 1.1781, + "step": 3142 + }, + { + "epoch": 0.4940949910589715, + "grad_norm": 0.19986248016357422, + "learning_rate": 4.28774871680808e-05, + "loss": 1.1761, + "step": 3143 + }, + { + "epoch": 0.49425219595590403, + "grad_norm": 0.2036089152097702, + "learning_rate": 4.2873167780624634e-05, + "loss": 1.0096, + "step": 3144 + }, + { + "epoch": 0.4944094008528366, + "grad_norm": 0.16037362813949585, + "learning_rate": 4.286884730154338e-05, + "loss": 1.132, + "step": 3145 + }, + { + "epoch": 0.49456660574976913, + "grad_norm": 0.17637239396572113, + "learning_rate": 4.286452573110092e-05, + "loss": 1.1883, + "step": 3146 + }, + { + "epoch": 0.4947238106467016, + "grad_norm": 0.1752503663301468, + "learning_rate": 4.28602030695612e-05, + "loss": 1.1, + "step": 3147 + }, + { + "epoch": 0.4948810155436342, + "grad_norm": 0.1588784009218216, + "learning_rate": 4.285587931718823e-05, + "loss": 1.0101, + "step": 3148 + }, + { + "epoch": 0.4950382204405667, + "grad_norm": 0.18488885462284088, + "learning_rate": 4.285155447424609e-05, + "loss": 1.0712, + "step": 3149 + }, + { + "epoch": 0.4951954253374993, + "grad_norm": 0.2019781470298767, + "learning_rate": 4.284722854099892e-05, + "loss": 0.9967, + "step": 3150 + }, + { + "epoch": 0.4953526302344318, + "grad_norm": 0.12347476184368134, + "learning_rate": 4.284290151771094e-05, + "loss": 1.1921, + "step": 3151 + }, + { + "epoch": 0.4955098351313643, + "grad_norm": 0.16759006679058075, + "learning_rate": 4.283857340464642e-05, + "loss": 1.0069, + "step": 3152 + }, + { + "epoch": 0.49566704002829687, + "grad_norm": 0.2088867723941803, + "learning_rate": 4.283424420206971e-05, + "loss": 1.1376, + "step": 3153 + }, + { + "epoch": 0.4958242449252294, + "grad_norm": 0.13864010572433472, + "learning_rate": 4.282991391024521e-05, + "loss": 1.2096, + "step": 3154 + }, + { + "epoch": 0.49598144982216197, + "grad_norm": 0.1731109619140625, + "learning_rate": 4.282558252943741e-05, + "loss": 1.1171, + "step": 3155 + }, + { + "epoch": 0.4961386547190945, + "grad_norm": 0.15584808588027954, + "learning_rate": 4.2821250059910857e-05, + "loss": 1.2236, + "step": 3156 + }, + { + "epoch": 0.496295859616027, + "grad_norm": 0.14465118944644928, + "learning_rate": 4.281691650193016e-05, + "loss": 1.0953, + "step": 3157 + }, + { + "epoch": 0.49645306451295956, + "grad_norm": 0.16158504784107208, + "learning_rate": 4.281258185575998e-05, + "loss": 1.0887, + "step": 3158 + }, + { + "epoch": 0.4966102694098921, + "grad_norm": 0.15371543169021606, + "learning_rate": 4.2808246121665075e-05, + "loss": 1.1856, + "step": 3159 + }, + { + "epoch": 0.49676747430682466, + "grad_norm": 0.15584081411361694, + "learning_rate": 4.280390929991026e-05, + "loss": 1.1122, + "step": 3160 + }, + { + "epoch": 0.4969246792037572, + "grad_norm": 0.18433384597301483, + "learning_rate": 4.27995713907604e-05, + "loss": 1.0427, + "step": 3161 + }, + { + "epoch": 0.49708188410068976, + "grad_norm": 0.13477365672588348, + "learning_rate": 4.279523239448044e-05, + "loss": 1.1669, + "step": 3162 + }, + { + "epoch": 0.49723908899762226, + "grad_norm": 0.15489578247070312, + "learning_rate": 4.27908923113354e-05, + "loss": 1.1288, + "step": 3163 + }, + { + "epoch": 0.4973962938945548, + "grad_norm": 0.16293781995773315, + "learning_rate": 4.278655114159034e-05, + "loss": 1.0994, + "step": 3164 + }, + { + "epoch": 0.49755349879148736, + "grad_norm": 0.15055860579013824, + "learning_rate": 4.278220888551041e-05, + "loss": 1.0994, + "step": 3165 + }, + { + "epoch": 0.4977107036884199, + "grad_norm": 0.16553258895874023, + "learning_rate": 4.277786554336082e-05, + "loss": 1.1377, + "step": 3166 + }, + { + "epoch": 0.49786790858535246, + "grad_norm": 0.17897070944309235, + "learning_rate": 4.277352111540685e-05, + "loss": 1.0658, + "step": 3167 + }, + { + "epoch": 0.49802511348228495, + "grad_norm": 0.17738717794418335, + "learning_rate": 4.276917560191382e-05, + "loss": 1.1975, + "step": 3168 + }, + { + "epoch": 0.4981823183792175, + "grad_norm": 0.17369163036346436, + "learning_rate": 4.2764829003147155e-05, + "loss": 1.1395, + "step": 3169 + }, + { + "epoch": 0.49833952327615005, + "grad_norm": 0.14780254662036896, + "learning_rate": 4.276048131937233e-05, + "loss": 1.2145, + "step": 3170 + }, + { + "epoch": 0.4984967281730826, + "grad_norm": 0.15877056121826172, + "learning_rate": 4.275613255085488e-05, + "loss": 1.1127, + "step": 3171 + }, + { + "epoch": 0.49865393307001515, + "grad_norm": 0.1346055120229721, + "learning_rate": 4.2751782697860407e-05, + "loss": 1.1607, + "step": 3172 + }, + { + "epoch": 0.49881113796694765, + "grad_norm": 0.2160274088382721, + "learning_rate": 4.2747431760654596e-05, + "loss": 1.1515, + "step": 3173 + }, + { + "epoch": 0.4989683428638802, + "grad_norm": 0.21286068856716156, + "learning_rate": 4.274307973950317e-05, + "loss": 1.1661, + "step": 3174 + }, + { + "epoch": 0.49912554776081275, + "grad_norm": 0.2089259922504425, + "learning_rate": 4.2738726634671944e-05, + "loss": 1.1204, + "step": 3175 + }, + { + "epoch": 0.4992827526577453, + "grad_norm": 0.20848344266414642, + "learning_rate": 4.273437244642678e-05, + "loss": 1.1643, + "step": 3176 + }, + { + "epoch": 0.49943995755467785, + "grad_norm": 0.1899596005678177, + "learning_rate": 4.273001717503364e-05, + "loss": 1.1176, + "step": 3177 + }, + { + "epoch": 0.49959716245161034, + "grad_norm": 0.14908720552921295, + "learning_rate": 4.2725660820758494e-05, + "loss": 1.1538, + "step": 3178 + }, + { + "epoch": 0.4997543673485429, + "grad_norm": 0.2636309564113617, + "learning_rate": 4.2721303383867426e-05, + "loss": 1.0771, + "step": 3179 + }, + { + "epoch": 0.49991157224547544, + "grad_norm": 0.14900867640972137, + "learning_rate": 4.2716944864626585e-05, + "loss": 1.08, + "step": 3180 + }, + { + "epoch": 0.5000687771424079, + "grad_norm": 0.16542400419712067, + "learning_rate": 4.271258526330215e-05, + "loss": 1.1431, + "step": 3181 + }, + { + "epoch": 0.5002259820393405, + "grad_norm": 0.29188400506973267, + "learning_rate": 4.27082245801604e-05, + "loss": 0.9727, + "step": 3182 + }, + { + "epoch": 0.500383186936273, + "grad_norm": 0.13522717356681824, + "learning_rate": 4.2703862815467674e-05, + "loss": 1.1593, + "step": 3183 + }, + { + "epoch": 0.5005403918332056, + "grad_norm": 0.16434258222579956, + "learning_rate": 4.269949996949036e-05, + "loss": 1.1627, + "step": 3184 + }, + { + "epoch": 0.5006975967301381, + "grad_norm": 0.15872086584568024, + "learning_rate": 4.2695136042494934e-05, + "loss": 1.1622, + "step": 3185 + }, + { + "epoch": 0.5008548016270706, + "grad_norm": 0.16532592475414276, + "learning_rate": 4.2690771034747916e-05, + "loss": 1.1645, + "step": 3186 + }, + { + "epoch": 0.5010120065240032, + "grad_norm": 0.28270232677459717, + "learning_rate": 4.2686404946515926e-05, + "loss": 1.2106, + "step": 3187 + }, + { + "epoch": 0.5011692114209357, + "grad_norm": 0.17568837106227875, + "learning_rate": 4.26820377780656e-05, + "loss": 1.1278, + "step": 3188 + }, + { + "epoch": 0.5013264163178683, + "grad_norm": 0.21069854497909546, + "learning_rate": 4.267766952966369e-05, + "loss": 1.0745, + "step": 3189 + }, + { + "epoch": 0.5014836212148008, + "grad_norm": 0.18528971076011658, + "learning_rate": 4.267330020157698e-05, + "loss": 1.2046, + "step": 3190 + }, + { + "epoch": 0.5016408261117333, + "grad_norm": 0.18909358978271484, + "learning_rate": 4.266892979407234e-05, + "loss": 1.035, + "step": 3191 + }, + { + "epoch": 0.5017980310086659, + "grad_norm": 0.198701873421669, + "learning_rate": 4.26645583074167e-05, + "loss": 1.0765, + "step": 3192 + }, + { + "epoch": 0.5019552359055984, + "grad_norm": 0.14333534240722656, + "learning_rate": 4.266018574187703e-05, + "loss": 1.2583, + "step": 3193 + }, + { + "epoch": 0.502112440802531, + "grad_norm": 0.1988966464996338, + "learning_rate": 4.265581209772043e-05, + "loss": 1.1101, + "step": 3194 + }, + { + "epoch": 0.5022696456994635, + "grad_norm": 0.15289467573165894, + "learning_rate": 4.265143737521399e-05, + "loss": 1.042, + "step": 3195 + }, + { + "epoch": 0.502426850596396, + "grad_norm": 0.1811387538909912, + "learning_rate": 4.2647061574624916e-05, + "loss": 1.0892, + "step": 3196 + }, + { + "epoch": 0.5025840554933286, + "grad_norm": 0.1622971147298813, + "learning_rate": 4.264268469622046e-05, + "loss": 1.1167, + "step": 3197 + }, + { + "epoch": 0.5027412603902611, + "grad_norm": 0.13076646625995636, + "learning_rate": 4.263830674026795e-05, + "loss": 1.1922, + "step": 3198 + }, + { + "epoch": 0.5028984652871937, + "grad_norm": 0.16915321350097656, + "learning_rate": 4.2633927707034785e-05, + "loss": 1.0426, + "step": 3199 + }, + { + "epoch": 0.5030556701841262, + "grad_norm": 0.15100719034671783, + "learning_rate": 4.26295475967884e-05, + "loss": 1.0658, + "step": 3200 + }, + { + "epoch": 0.5030556701841262, + "eval_loss": 1.1112091541290283, + "eval_runtime": 2329.136, + "eval_samples_per_second": 3.975, + "eval_steps_per_second": 1.987, + "step": 3200 + }, + { + "epoch": 0.5032128750810588, + "grad_norm": 0.22507062554359436, + "learning_rate": 4.262516640979632e-05, + "loss": 1.0526, + "step": 3201 + }, + { + "epoch": 0.5033700799779913, + "grad_norm": 0.1575806438922882, + "learning_rate": 4.2620784146326134e-05, + "loss": 1.1379, + "step": 3202 + }, + { + "epoch": 0.5035272848749238, + "grad_norm": 0.23570889234542847, + "learning_rate": 4.26164008066455e-05, + "loss": 1.0043, + "step": 3203 + }, + { + "epoch": 0.5036844897718564, + "grad_norm": 0.15176087617874146, + "learning_rate": 4.261201639102214e-05, + "loss": 1.1273, + "step": 3204 + }, + { + "epoch": 0.5038416946687889, + "grad_norm": 0.2037598192691803, + "learning_rate": 4.2607630899723815e-05, + "loss": 1.1443, + "step": 3205 + }, + { + "epoch": 0.5039988995657215, + "grad_norm": 0.18929214775562286, + "learning_rate": 4.260324433301839e-05, + "loss": 1.1648, + "step": 3206 + }, + { + "epoch": 0.504156104462654, + "grad_norm": 0.1639530211687088, + "learning_rate": 4.259885669117377e-05, + "loss": 1.1273, + "step": 3207 + }, + { + "epoch": 0.5043133093595865, + "grad_norm": 0.16363751888275146, + "learning_rate": 4.259446797445795e-05, + "loss": 1.022, + "step": 3208 + }, + { + "epoch": 0.5044705142565191, + "grad_norm": 0.15995655953884125, + "learning_rate": 4.2590078183138976e-05, + "loss": 1.1725, + "step": 3209 + }, + { + "epoch": 0.5046277191534516, + "grad_norm": 0.14447644352912903, + "learning_rate": 4.258568731748494e-05, + "loss": 1.1501, + "step": 3210 + }, + { + "epoch": 0.5047849240503842, + "grad_norm": 0.183898463845253, + "learning_rate": 4.258129537776405e-05, + "loss": 1.1573, + "step": 3211 + }, + { + "epoch": 0.5049421289473167, + "grad_norm": 0.16828583180904388, + "learning_rate": 4.257690236424451e-05, + "loss": 1.2383, + "step": 3212 + }, + { + "epoch": 0.5050993338442492, + "grad_norm": 0.15795843303203583, + "learning_rate": 4.257250827719466e-05, + "loss": 1.0214, + "step": 3213 + }, + { + "epoch": 0.5052565387411818, + "grad_norm": 0.2043066918849945, + "learning_rate": 4.2568113116882854e-05, + "loss": 0.9322, + "step": 3214 + }, + { + "epoch": 0.5054137436381143, + "grad_norm": 0.15502794086933136, + "learning_rate": 4.256371688357755e-05, + "loss": 1.1537, + "step": 3215 + }, + { + "epoch": 0.5055709485350469, + "grad_norm": 0.19955813884735107, + "learning_rate": 4.255931957754725e-05, + "loss": 1.0808, + "step": 3216 + }, + { + "epoch": 0.5057281534319794, + "grad_norm": 0.13643762469291687, + "learning_rate": 4.255492119906051e-05, + "loss": 1.0076, + "step": 3217 + }, + { + "epoch": 0.5058853583289119, + "grad_norm": 0.18335899710655212, + "learning_rate": 4.2550521748385975e-05, + "loss": 1.0465, + "step": 3218 + }, + { + "epoch": 0.5060425632258445, + "grad_norm": 0.13511648774147034, + "learning_rate": 4.254612122579235e-05, + "loss": 1.0214, + "step": 3219 + }, + { + "epoch": 0.506199768122777, + "grad_norm": 0.1416211724281311, + "learning_rate": 4.25417196315484e-05, + "loss": 1.0799, + "step": 3220 + }, + { + "epoch": 0.5063569730197096, + "grad_norm": 0.1651175618171692, + "learning_rate": 4.253731696592295e-05, + "loss": 1.1739, + "step": 3221 + }, + { + "epoch": 0.5065141779166421, + "grad_norm": 0.1566598117351532, + "learning_rate": 4.253291322918491e-05, + "loss": 1.0812, + "step": 3222 + }, + { + "epoch": 0.5066713828135746, + "grad_norm": 0.1521669626235962, + "learning_rate": 4.252850842160324e-05, + "loss": 1.1608, + "step": 3223 + }, + { + "epoch": 0.5068285877105072, + "grad_norm": 0.1704331487417221, + "learning_rate": 4.252410254344696e-05, + "loss": 1.1423, + "step": 3224 + }, + { + "epoch": 0.5069857926074397, + "grad_norm": 0.1667756289243698, + "learning_rate": 4.251969559498519e-05, + "loss": 1.1358, + "step": 3225 + }, + { + "epoch": 0.5071429975043723, + "grad_norm": 0.1840224266052246, + "learning_rate": 4.251528757648705e-05, + "loss": 1.1496, + "step": 3226 + }, + { + "epoch": 0.5073002024013048, + "grad_norm": 0.2619423270225525, + "learning_rate": 4.2510878488221794e-05, + "loss": 1.058, + "step": 3227 + }, + { + "epoch": 0.5074574072982373, + "grad_norm": 0.16940724849700928, + "learning_rate": 4.2506468330458706e-05, + "loss": 1.0027, + "step": 3228 + }, + { + "epoch": 0.5076146121951699, + "grad_norm": 0.14660978317260742, + "learning_rate": 4.250205710346714e-05, + "loss": 1.064, + "step": 3229 + }, + { + "epoch": 0.5077718170921024, + "grad_norm": 0.1640421599149704, + "learning_rate": 4.249764480751652e-05, + "loss": 1.1039, + "step": 3230 + }, + { + "epoch": 0.507929021989035, + "grad_norm": 0.14718511700630188, + "learning_rate": 4.249323144287632e-05, + "loss": 1.1057, + "step": 3231 + }, + { + "epoch": 0.5080862268859675, + "grad_norm": 0.16258497536182404, + "learning_rate": 4.248881700981611e-05, + "loss": 1.1102, + "step": 3232 + }, + { + "epoch": 0.5082434317829, + "grad_norm": 0.18936386704444885, + "learning_rate": 4.248440150860549e-05, + "loss": 1.1816, + "step": 3233 + }, + { + "epoch": 0.5084006366798326, + "grad_norm": 0.1487811654806137, + "learning_rate": 4.2479984939514155e-05, + "loss": 1.1632, + "step": 3234 + }, + { + "epoch": 0.5085578415767651, + "grad_norm": 0.19328673183918, + "learning_rate": 4.2475567302811846e-05, + "loss": 1.085, + "step": 3235 + }, + { + "epoch": 0.5087150464736977, + "grad_norm": 0.16193972527980804, + "learning_rate": 4.2471148598768375e-05, + "loss": 1.0772, + "step": 3236 + }, + { + "epoch": 0.5088722513706302, + "grad_norm": 0.16668765246868134, + "learning_rate": 4.246672882765362e-05, + "loss": 1.1176, + "step": 3237 + }, + { + "epoch": 0.5090294562675627, + "grad_norm": 0.24110597372055054, + "learning_rate": 4.246230798973753e-05, + "loss": 1.0638, + "step": 3238 + }, + { + "epoch": 0.5091866611644953, + "grad_norm": 0.1574033796787262, + "learning_rate": 4.24578860852901e-05, + "loss": 1.2187, + "step": 3239 + }, + { + "epoch": 0.5093438660614278, + "grad_norm": 0.14084014296531677, + "learning_rate": 4.2453463114581414e-05, + "loss": 1.0872, + "step": 3240 + }, + { + "epoch": 0.5095010709583604, + "grad_norm": 0.16895703971385956, + "learning_rate": 4.2449039077881616e-05, + "loss": 1.0453, + "step": 3241 + }, + { + "epoch": 0.5096582758552929, + "grad_norm": 0.2261844277381897, + "learning_rate": 4.244461397546089e-05, + "loss": 1.1039, + "step": 3242 + }, + { + "epoch": 0.5098154807522254, + "grad_norm": 0.178606778383255, + "learning_rate": 4.2440187807589515e-05, + "loss": 1.043, + "step": 3243 + }, + { + "epoch": 0.509972685649158, + "grad_norm": 0.1622498333454132, + "learning_rate": 4.243576057453783e-05, + "loss": 1.0987, + "step": 3244 + }, + { + "epoch": 0.5101298905460905, + "grad_norm": 0.14160633087158203, + "learning_rate": 4.243133227657622e-05, + "loss": 1.1027, + "step": 3245 + }, + { + "epoch": 0.5102870954430231, + "grad_norm": 0.14689600467681885, + "learning_rate": 4.2426902913975165e-05, + "loss": 1.113, + "step": 3246 + }, + { + "epoch": 0.5104443003399556, + "grad_norm": 0.5163748860359192, + "learning_rate": 4.242247248700518e-05, + "loss": 0.9741, + "step": 3247 + }, + { + "epoch": 0.5106015052368881, + "grad_norm": 0.13907118141651154, + "learning_rate": 4.241804099593686e-05, + "loss": 1.0915, + "step": 3248 + }, + { + "epoch": 0.5107587101338207, + "grad_norm": 0.16090743243694305, + "learning_rate": 4.241360844104087e-05, + "loss": 1.1899, + "step": 3249 + }, + { + "epoch": 0.5109159150307532, + "grad_norm": 0.1669221967458725, + "learning_rate": 4.240917482258794e-05, + "loss": 1.0962, + "step": 3250 + }, + { + "epoch": 0.5110731199276858, + "grad_norm": 0.18096497654914856, + "learning_rate": 4.240474014084884e-05, + "loss": 1.2463, + "step": 3251 + }, + { + "epoch": 0.5112303248246183, + "grad_norm": 0.1275794804096222, + "learning_rate": 4.240030439609444e-05, + "loss": 1.1379, + "step": 3252 + }, + { + "epoch": 0.5113875297215508, + "grad_norm": 0.13416628539562225, + "learning_rate": 4.239586758859564e-05, + "loss": 1.1639, + "step": 3253 + }, + { + "epoch": 0.5115447346184834, + "grad_norm": 0.15411725640296936, + "learning_rate": 4.2391429718623445e-05, + "loss": 1.0484, + "step": 3254 + }, + { + "epoch": 0.5117019395154159, + "grad_norm": 0.15557514131069183, + "learning_rate": 4.238699078644889e-05, + "loss": 1.0661, + "step": 3255 + }, + { + "epoch": 0.5118591444123485, + "grad_norm": 0.21330618858337402, + "learning_rate": 4.238255079234309e-05, + "loss": 1.2001, + "step": 3256 + }, + { + "epoch": 0.512016349309281, + "grad_norm": 0.1585479974746704, + "learning_rate": 4.237810973657722e-05, + "loss": 1.1734, + "step": 3257 + }, + { + "epoch": 0.5121735542062136, + "grad_norm": 0.1720305234193802, + "learning_rate": 4.237366761942253e-05, + "loss": 1.0198, + "step": 3258 + }, + { + "epoch": 0.5123307591031461, + "grad_norm": 0.16566184163093567, + "learning_rate": 4.2369224441150324e-05, + "loss": 1.2449, + "step": 3259 + }, + { + "epoch": 0.5124879640000786, + "grad_norm": 0.17463403940200806, + "learning_rate": 4.236478020203198e-05, + "loss": 1.1379, + "step": 3260 + }, + { + "epoch": 0.5126451688970112, + "grad_norm": 0.16861651837825775, + "learning_rate": 4.236033490233892e-05, + "loss": 1.2033, + "step": 3261 + }, + { + "epoch": 0.5128023737939437, + "grad_norm": 0.17625701427459717, + "learning_rate": 4.2355888542342666e-05, + "loss": 1.1482, + "step": 3262 + }, + { + "epoch": 0.5129595786908763, + "grad_norm": 0.1449575424194336, + "learning_rate": 4.2351441122314764e-05, + "loss": 1.1017, + "step": 3263 + }, + { + "epoch": 0.5131167835878088, + "grad_norm": 0.12848585844039917, + "learning_rate": 4.234699264252687e-05, + "loss": 1.1988, + "step": 3264 + }, + { + "epoch": 0.5132739884847413, + "grad_norm": 0.16907472908496857, + "learning_rate": 4.2342543103250654e-05, + "loss": 0.9311, + "step": 3265 + }, + { + "epoch": 0.5134311933816739, + "grad_norm": 0.1578347235918045, + "learning_rate": 4.2338092504757896e-05, + "loss": 1.1661, + "step": 3266 + }, + { + "epoch": 0.5135883982786064, + "grad_norm": 0.15366345643997192, + "learning_rate": 4.233364084732041e-05, + "loss": 1.0702, + "step": 3267 + }, + { + "epoch": 0.513745603175539, + "grad_norm": 0.14660310745239258, + "learning_rate": 4.2329188131210094e-05, + "loss": 1.0974, + "step": 3268 + }, + { + "epoch": 0.5139028080724715, + "grad_norm": 0.1957070529460907, + "learning_rate": 4.23247343566989e-05, + "loss": 0.9215, + "step": 3269 + }, + { + "epoch": 0.514060012969404, + "grad_norm": 0.16161002218723297, + "learning_rate": 4.2320279524058855e-05, + "loss": 1.0315, + "step": 3270 + }, + { + "epoch": 0.5142172178663366, + "grad_norm": 0.15061277151107788, + "learning_rate": 4.2315823633562025e-05, + "loss": 1.2509, + "step": 3271 + }, + { + "epoch": 0.5143744227632691, + "grad_norm": 0.13233885169029236, + "learning_rate": 4.231136668548057e-05, + "loss": 1.0631, + "step": 3272 + }, + { + "epoch": 0.5145316276602017, + "grad_norm": 0.15317268669605255, + "learning_rate": 4.230690868008671e-05, + "loss": 1.151, + "step": 3273 + }, + { + "epoch": 0.5146888325571342, + "grad_norm": 0.18332509696483612, + "learning_rate": 4.2302449617652716e-05, + "loss": 1.1068, + "step": 3274 + }, + { + "epoch": 0.5148460374540667, + "grad_norm": 0.17250898480415344, + "learning_rate": 4.229798949845093e-05, + "loss": 1.1823, + "step": 3275 + }, + { + "epoch": 0.5150032423509993, + "grad_norm": 0.14509278535842896, + "learning_rate": 4.2293528322753754e-05, + "loss": 1.1604, + "step": 3276 + }, + { + "epoch": 0.5151604472479318, + "grad_norm": 0.20953206717967987, + "learning_rate": 4.2289066090833674e-05, + "loss": 1.0799, + "step": 3277 + }, + { + "epoch": 0.5153176521448644, + "grad_norm": 0.14778871834278107, + "learning_rate": 4.2284602802963216e-05, + "loss": 1.1736, + "step": 3278 + }, + { + "epoch": 0.5154748570417969, + "grad_norm": 0.20026050508022308, + "learning_rate": 4.2280138459414976e-05, + "loss": 1.1321, + "step": 3279 + }, + { + "epoch": 0.5156320619387293, + "grad_norm": 0.20624299347400665, + "learning_rate": 4.227567306046164e-05, + "loss": 1.1062, + "step": 3280 + }, + { + "epoch": 0.515789266835662, + "grad_norm": 0.12861621379852295, + "learning_rate": 4.2271206606375904e-05, + "loss": 1.1786, + "step": 3281 + }, + { + "epoch": 0.5159464717325944, + "grad_norm": 0.1462821513414383, + "learning_rate": 4.226673909743059e-05, + "loss": 1.1002, + "step": 3282 + }, + { + "epoch": 0.516103676629527, + "grad_norm": 0.1743149310350418, + "learning_rate": 4.226227053389855e-05, + "loss": 1.1643, + "step": 3283 + }, + { + "epoch": 0.5162608815264595, + "grad_norm": 0.17978879809379578, + "learning_rate": 4.22578009160527e-05, + "loss": 1.2249, + "step": 3284 + }, + { + "epoch": 0.516418086423392, + "grad_norm": 0.14567448198795319, + "learning_rate": 4.2253330244166035e-05, + "loss": 1.161, + "step": 3285 + }, + { + "epoch": 0.5165752913203246, + "grad_norm": 0.1618671864271164, + "learning_rate": 4.2248858518511605e-05, + "loss": 1.2158, + "step": 3286 + }, + { + "epoch": 0.5167324962172571, + "grad_norm": 0.16569803655147552, + "learning_rate": 4.224438573936252e-05, + "loss": 1.0314, + "step": 3287 + }, + { + "epoch": 0.5168897011141897, + "grad_norm": 0.1353185921907425, + "learning_rate": 4.223991190699197e-05, + "loss": 1.0542, + "step": 3288 + }, + { + "epoch": 0.5170469060111222, + "grad_norm": 0.16214190423488617, + "learning_rate": 4.223543702167319e-05, + "loss": 1.1062, + "step": 3289 + }, + { + "epoch": 0.5172041109080547, + "grad_norm": 0.20043228566646576, + "learning_rate": 4.2230961083679485e-05, + "loss": 1.0467, + "step": 3290 + }, + { + "epoch": 0.5173613158049873, + "grad_norm": 0.1516493409872055, + "learning_rate": 4.222648409328425e-05, + "loss": 1.2023, + "step": 3291 + }, + { + "epoch": 0.5175185207019198, + "grad_norm": 0.15186642110347748, + "learning_rate": 4.22220060507609e-05, + "loss": 1.1551, + "step": 3292 + }, + { + "epoch": 0.5176757255988524, + "grad_norm": 0.24731656908988953, + "learning_rate": 4.221752695638296e-05, + "loss": 1.1195, + "step": 3293 + }, + { + "epoch": 0.5178329304957849, + "grad_norm": 0.14959721267223358, + "learning_rate": 4.221304681042397e-05, + "loss": 1.0982, + "step": 3294 + }, + { + "epoch": 0.5179901353927174, + "grad_norm": 0.16160228848457336, + "learning_rate": 4.220856561315757e-05, + "loss": 1.0693, + "step": 3295 + }, + { + "epoch": 0.51814734028965, + "grad_norm": 0.17099249362945557, + "learning_rate": 4.220408336485746e-05, + "loss": 1.0608, + "step": 3296 + }, + { + "epoch": 0.5183045451865825, + "grad_norm": 0.1416684240102768, + "learning_rate": 4.219960006579739e-05, + "loss": 1.1713, + "step": 3297 + }, + { + "epoch": 0.5184617500835151, + "grad_norm": 0.2992304265499115, + "learning_rate": 4.21951157162512e-05, + "loss": 1.0846, + "step": 3298 + }, + { + "epoch": 0.5186189549804476, + "grad_norm": 0.14916300773620605, + "learning_rate": 4.219063031649276e-05, + "loss": 1.0304, + "step": 3299 + }, + { + "epoch": 0.5187761598773801, + "grad_norm": 0.12246151268482208, + "learning_rate": 4.2186143866796025e-05, + "loss": 1.2179, + "step": 3300 + }, + { + "epoch": 0.5189333647743127, + "grad_norm": 0.1627214252948761, + "learning_rate": 4.218165636743502e-05, + "loss": 1.1276, + "step": 3301 + }, + { + "epoch": 0.5190905696712452, + "grad_norm": 0.14838017523288727, + "learning_rate": 4.217716781868381e-05, + "loss": 1.2015, + "step": 3302 + }, + { + "epoch": 0.5192477745681778, + "grad_norm": 0.14416925609111786, + "learning_rate": 4.217267822081654e-05, + "loss": 1.296, + "step": 3303 + }, + { + "epoch": 0.5194049794651103, + "grad_norm": 0.15439528226852417, + "learning_rate": 4.2168187574107424e-05, + "loss": 1.2018, + "step": 3304 + }, + { + "epoch": 0.5195621843620428, + "grad_norm": 0.15193216502666473, + "learning_rate": 4.216369587883073e-05, + "loss": 1.0985, + "step": 3305 + }, + { + "epoch": 0.5197193892589754, + "grad_norm": 0.14423884451389313, + "learning_rate": 4.2159203135260804e-05, + "loss": 1.1073, + "step": 3306 + }, + { + "epoch": 0.5198765941559079, + "grad_norm": 0.15461623668670654, + "learning_rate": 4.215470934367203e-05, + "loss": 1.0362, + "step": 3307 + }, + { + "epoch": 0.5200337990528405, + "grad_norm": 0.14225488901138306, + "learning_rate": 4.215021450433888e-05, + "loss": 1.0318, + "step": 3308 + }, + { + "epoch": 0.520191003949773, + "grad_norm": 0.16612344980239868, + "learning_rate": 4.214571861753588e-05, + "loss": 1.1269, + "step": 3309 + }, + { + "epoch": 0.5203482088467056, + "grad_norm": 0.21814720332622528, + "learning_rate": 4.2141221683537624e-05, + "loss": 1.2525, + "step": 3310 + }, + { + "epoch": 0.5205054137436381, + "grad_norm": 0.15520301461219788, + "learning_rate": 4.2136723702618765e-05, + "loss": 1.1255, + "step": 3311 + }, + { + "epoch": 0.5206626186405706, + "grad_norm": 0.15896332263946533, + "learning_rate": 4.213222467505402e-05, + "loss": 1.1534, + "step": 3312 + }, + { + "epoch": 0.5208198235375032, + "grad_norm": 0.14684036374092102, + "learning_rate": 4.212772460111818e-05, + "loss": 1.1959, + "step": 3313 + }, + { + "epoch": 0.5209770284344357, + "grad_norm": 0.1430623084306717, + "learning_rate": 4.2123223481086084e-05, + "loss": 1.2392, + "step": 3314 + }, + { + "epoch": 0.5211342333313683, + "grad_norm": 0.19666410982608795, + "learning_rate": 4.211872131523265e-05, + "loss": 1.1901, + "step": 3315 + }, + { + "epoch": 0.5212914382283008, + "grad_norm": 0.1641821265220642, + "learning_rate": 4.211421810383285e-05, + "loss": 1.0118, + "step": 3316 + }, + { + "epoch": 0.5214486431252333, + "grad_norm": 0.16409622132778168, + "learning_rate": 4.210971384716173e-05, + "loss": 1.0415, + "step": 3317 + }, + { + "epoch": 0.5216058480221659, + "grad_norm": 0.25037920475006104, + "learning_rate": 4.2105208545494375e-05, + "loss": 1.075, + "step": 3318 + }, + { + "epoch": 0.5217630529190984, + "grad_norm": 0.1819911003112793, + "learning_rate": 4.210070219910597e-05, + "loss": 1.0585, + "step": 3319 + }, + { + "epoch": 0.521920257816031, + "grad_norm": 0.19422422349452972, + "learning_rate": 4.209619480827173e-05, + "loss": 1.153, + "step": 3320 + }, + { + "epoch": 0.5220774627129635, + "grad_norm": 0.22342312335968018, + "learning_rate": 4.209168637326697e-05, + "loss": 0.981, + "step": 3321 + }, + { + "epoch": 0.522234667609896, + "grad_norm": 0.18515369296073914, + "learning_rate": 4.208717689436703e-05, + "loss": 1.089, + "step": 3322 + }, + { + "epoch": 0.5223918725068286, + "grad_norm": 0.14646635949611664, + "learning_rate": 4.208266637184734e-05, + "loss": 1.061, + "step": 3323 + }, + { + "epoch": 0.5225490774037611, + "grad_norm": 0.15196828544139862, + "learning_rate": 4.207815480598338e-05, + "loss": 1.1832, + "step": 3324 + }, + { + "epoch": 0.5227062823006937, + "grad_norm": 0.1385664939880371, + "learning_rate": 4.207364219705071e-05, + "loss": 1.1758, + "step": 3325 + }, + { + "epoch": 0.5228634871976262, + "grad_norm": 0.18854454159736633, + "learning_rate": 4.206912854532492e-05, + "loss": 1.0566, + "step": 3326 + }, + { + "epoch": 0.5230206920945587, + "grad_norm": 0.16105569899082184, + "learning_rate": 4.2064613851081717e-05, + "loss": 1.0328, + "step": 3327 + }, + { + "epoch": 0.5231778969914913, + "grad_norm": 0.15872891247272491, + "learning_rate": 4.2060098114596824e-05, + "loss": 1.0453, + "step": 3328 + }, + { + "epoch": 0.5233351018884238, + "grad_norm": 0.1624576896429062, + "learning_rate": 4.205558133614604e-05, + "loss": 1.1419, + "step": 3329 + }, + { + "epoch": 0.5234923067853564, + "grad_norm": 0.14295019209384918, + "learning_rate": 4.205106351600525e-05, + "loss": 1.1008, + "step": 3330 + }, + { + "epoch": 0.5236495116822889, + "grad_norm": 0.19790048897266388, + "learning_rate": 4.204654465445037e-05, + "loss": 1.181, + "step": 3331 + }, + { + "epoch": 0.5238067165792214, + "grad_norm": 0.1526133269071579, + "learning_rate": 4.20420247517574e-05, + "loss": 1.1758, + "step": 3332 + }, + { + "epoch": 0.523963921476154, + "grad_norm": 0.18274520337581635, + "learning_rate": 4.20375038082024e-05, + "loss": 1.1191, + "step": 3333 + }, + { + "epoch": 0.5241211263730865, + "grad_norm": 0.14900778234004974, + "learning_rate": 4.20329818240615e-05, + "loss": 1.01, + "step": 3334 + }, + { + "epoch": 0.5242783312700191, + "grad_norm": 0.17267143726348877, + "learning_rate": 4.202845879961086e-05, + "loss": 1.006, + "step": 3335 + }, + { + "epoch": 0.5244355361669516, + "grad_norm": 0.18299135565757751, + "learning_rate": 4.202393473512676e-05, + "loss": 1.0996, + "step": 3336 + }, + { + "epoch": 0.5245927410638841, + "grad_norm": 0.1378333568572998, + "learning_rate": 4.2019409630885485e-05, + "loss": 1.1321, + "step": 3337 + }, + { + "epoch": 0.5247499459608167, + "grad_norm": 0.184214249253273, + "learning_rate": 4.2014883487163434e-05, + "loss": 1.0813, + "step": 3338 + }, + { + "epoch": 0.5249071508577492, + "grad_norm": 0.14198952913284302, + "learning_rate": 4.201035630423703e-05, + "loss": 1.2111, + "step": 3339 + }, + { + "epoch": 0.5250643557546818, + "grad_norm": 0.18620987236499786, + "learning_rate": 4.2005828082382784e-05, + "loss": 1.0249, + "step": 3340 + }, + { + "epoch": 0.5252215606516143, + "grad_norm": 0.17725297808647156, + "learning_rate": 4.200129882187726e-05, + "loss": 1.0831, + "step": 3341 + }, + { + "epoch": 0.5253787655485468, + "grad_norm": 0.1756705790758133, + "learning_rate": 4.19967685229971e-05, + "loss": 1.1297, + "step": 3342 + }, + { + "epoch": 0.5255359704454794, + "grad_norm": 0.18759998679161072, + "learning_rate": 4.199223718601899e-05, + "loss": 1.0577, + "step": 3343 + }, + { + "epoch": 0.5256931753424119, + "grad_norm": 0.1379726082086563, + "learning_rate": 4.198770481121967e-05, + "loss": 1.053, + "step": 3344 + }, + { + "epoch": 0.5258503802393445, + "grad_norm": 0.223505437374115, + "learning_rate": 4.198317139887598e-05, + "loss": 1.0955, + "step": 3345 + }, + { + "epoch": 0.526007585136277, + "grad_norm": 0.1413012593984604, + "learning_rate": 4.197863694926479e-05, + "loss": 1.1897, + "step": 3346 + }, + { + "epoch": 0.5261647900332095, + "grad_norm": 0.20731431245803833, + "learning_rate": 4.1974101462663075e-05, + "loss": 1.1718, + "step": 3347 + }, + { + "epoch": 0.5263219949301421, + "grad_norm": 0.13680782914161682, + "learning_rate": 4.19695649393478e-05, + "loss": 1.1427, + "step": 3348 + }, + { + "epoch": 0.5264791998270746, + "grad_norm": 0.1617634892463684, + "learning_rate": 4.1965027379596077e-05, + "loss": 0.9997, + "step": 3349 + }, + { + "epoch": 0.5266364047240072, + "grad_norm": 0.12036359310150146, + "learning_rate": 4.196048878368503e-05, + "loss": 1.2218, + "step": 3350 + }, + { + "epoch": 0.5267936096209397, + "grad_norm": 0.23155498504638672, + "learning_rate": 4.195594915189186e-05, + "loss": 1.1274, + "step": 3351 + }, + { + "epoch": 0.5269508145178722, + "grad_norm": 0.1686663031578064, + "learning_rate": 4.195140848449383e-05, + "loss": 1.0256, + "step": 3352 + }, + { + "epoch": 0.5271080194148048, + "grad_norm": 0.19032715260982513, + "learning_rate": 4.1946866781768256e-05, + "loss": 1.1329, + "step": 3353 + }, + { + "epoch": 0.5272652243117373, + "grad_norm": 0.18130408227443695, + "learning_rate": 4.1942324043992546e-05, + "loss": 1.0983, + "step": 3354 + }, + { + "epoch": 0.5274224292086699, + "grad_norm": 0.16532330214977264, + "learning_rate": 4.193778027144414e-05, + "loss": 1.153, + "step": 3355 + }, + { + "epoch": 0.5275796341056024, + "grad_norm": 0.14856065809726715, + "learning_rate": 4.1933235464400554e-05, + "loss": 1.1062, + "step": 3356 + }, + { + "epoch": 0.5277368390025349, + "grad_norm": 0.16016262769699097, + "learning_rate": 4.1928689623139385e-05, + "loss": 1.1015, + "step": 3357 + }, + { + "epoch": 0.5278940438994675, + "grad_norm": 0.16563931107521057, + "learning_rate": 4.192414274793825e-05, + "loss": 1.074, + "step": 3358 + }, + { + "epoch": 0.5280512487964, + "grad_norm": 0.18133817613124847, + "learning_rate": 4.1919594839074884e-05, + "loss": 1.0512, + "step": 3359 + }, + { + "epoch": 0.5282084536933326, + "grad_norm": 0.14429429173469543, + "learning_rate": 4.191504589682702e-05, + "loss": 1.0633, + "step": 3360 + }, + { + "epoch": 0.5282084536933326, + "eval_loss": 1.108935832977295, + "eval_runtime": 2328.9809, + "eval_samples_per_second": 3.975, + "eval_steps_per_second": 1.988, + "step": 3360 + }, + { + "epoch": 0.5283656585902651, + "grad_norm": 0.15838821232318878, + "learning_rate": 4.1910495921472525e-05, + "loss": 1.0943, + "step": 3361 + }, + { + "epoch": 0.5285228634871977, + "grad_norm": 0.16242218017578125, + "learning_rate": 4.190594491328928e-05, + "loss": 1.1898, + "step": 3362 + }, + { + "epoch": 0.5286800683841302, + "grad_norm": 0.15097495913505554, + "learning_rate": 4.190139287255524e-05, + "loss": 1.1512, + "step": 3363 + }, + { + "epoch": 0.5288372732810627, + "grad_norm": 0.16428695619106293, + "learning_rate": 4.1896839799548424e-05, + "loss": 0.9446, + "step": 3364 + }, + { + "epoch": 0.5289944781779953, + "grad_norm": 0.1545208841562271, + "learning_rate": 4.189228569454693e-05, + "loss": 1.1657, + "step": 3365 + }, + { + "epoch": 0.5291516830749278, + "grad_norm": 0.14587946236133575, + "learning_rate": 4.1887730557828886e-05, + "loss": 1.216, + "step": 3366 + }, + { + "epoch": 0.5293088879718604, + "grad_norm": 0.17108961939811707, + "learning_rate": 4.188317438967252e-05, + "loss": 1.1622, + "step": 3367 + }, + { + "epoch": 0.5294660928687929, + "grad_norm": 0.1690489500761032, + "learning_rate": 4.1878617190356095e-05, + "loss": 1.1143, + "step": 3368 + }, + { + "epoch": 0.5296232977657254, + "grad_norm": 0.1476382315158844, + "learning_rate": 4.187405896015795e-05, + "loss": 1.2572, + "step": 3369 + }, + { + "epoch": 0.529780502662658, + "grad_norm": 0.12622134387493134, + "learning_rate": 4.1869499699356494e-05, + "loss": 1.0056, + "step": 3370 + }, + { + "epoch": 0.5299377075595905, + "grad_norm": 0.1347476989030838, + "learning_rate": 4.186493940823018e-05, + "loss": 1.1663, + "step": 3371 + }, + { + "epoch": 0.5300949124565231, + "grad_norm": 0.1345609575510025, + "learning_rate": 4.186037808705753e-05, + "loss": 1.0417, + "step": 3372 + }, + { + "epoch": 0.5302521173534556, + "grad_norm": 0.18443985283374786, + "learning_rate": 4.1855815736117135e-05, + "loss": 0.9926, + "step": 3373 + }, + { + "epoch": 0.5304093222503881, + "grad_norm": 0.15589120984077454, + "learning_rate": 4.185125235568764e-05, + "loss": 1.0637, + "step": 3374 + }, + { + "epoch": 0.5305665271473207, + "grad_norm": 0.15485121309757233, + "learning_rate": 4.184668794604777e-05, + "loss": 1.1427, + "step": 3375 + }, + { + "epoch": 0.5307237320442532, + "grad_norm": 0.14066243171691895, + "learning_rate": 4.184212250747631e-05, + "loss": 1.0034, + "step": 3376 + }, + { + "epoch": 0.5308809369411858, + "grad_norm": 0.13589130342006683, + "learning_rate": 4.183755604025208e-05, + "loss": 1.1053, + "step": 3377 + }, + { + "epoch": 0.5310381418381183, + "grad_norm": 0.13392047584056854, + "learning_rate": 4.183298854465398e-05, + "loss": 1.0974, + "step": 3378 + }, + { + "epoch": 0.5311953467350508, + "grad_norm": 0.13384903967380524, + "learning_rate": 4.182842002096099e-05, + "loss": 1.0901, + "step": 3379 + }, + { + "epoch": 0.5313525516319834, + "grad_norm": 0.1484198421239853, + "learning_rate": 4.182385046945214e-05, + "loss": 1.1706, + "step": 3380 + }, + { + "epoch": 0.5315097565289159, + "grad_norm": 0.13713057339191437, + "learning_rate": 4.1819279890406506e-05, + "loss": 1.1243, + "step": 3381 + }, + { + "epoch": 0.5316669614258485, + "grad_norm": 0.2148253470659256, + "learning_rate": 4.181470828410325e-05, + "loss": 1.1012, + "step": 3382 + }, + { + "epoch": 0.531824166322781, + "grad_norm": 0.1586213856935501, + "learning_rate": 4.1810135650821604e-05, + "loss": 1.1232, + "step": 3383 + }, + { + "epoch": 0.5319813712197135, + "grad_norm": 0.15845105051994324, + "learning_rate": 4.180556199084082e-05, + "loss": 1.1245, + "step": 3384 + }, + { + "epoch": 0.5321385761166461, + "grad_norm": 0.14359626173973083, + "learning_rate": 4.180098730444024e-05, + "loss": 1.0408, + "step": 3385 + }, + { + "epoch": 0.5322957810135786, + "grad_norm": 0.15530677139759064, + "learning_rate": 4.179641159189929e-05, + "loss": 0.9341, + "step": 3386 + }, + { + "epoch": 0.5324529859105112, + "grad_norm": 0.16154368221759796, + "learning_rate": 4.179183485349742e-05, + "loss": 1.1421, + "step": 3387 + }, + { + "epoch": 0.5326101908074437, + "grad_norm": 0.16615764796733856, + "learning_rate": 4.178725708951418e-05, + "loss": 1.0734, + "step": 3388 + }, + { + "epoch": 0.5327673957043761, + "grad_norm": 0.2428814321756363, + "learning_rate": 4.178267830022913e-05, + "loss": 1.0714, + "step": 3389 + }, + { + "epoch": 0.5329246006013088, + "grad_norm": 0.14412826299667358, + "learning_rate": 4.177809848592195e-05, + "loss": 1.2244, + "step": 3390 + }, + { + "epoch": 0.5330818054982412, + "grad_norm": 0.18421795964241028, + "learning_rate": 4.177351764687235e-05, + "loss": 1.1815, + "step": 3391 + }, + { + "epoch": 0.5332390103951739, + "grad_norm": 0.14347229897975922, + "learning_rate": 4.176893578336012e-05, + "loss": 1.0274, + "step": 3392 + }, + { + "epoch": 0.5333962152921063, + "grad_norm": 0.12033331394195557, + "learning_rate": 4.1764352895665085e-05, + "loss": 1.1543, + "step": 3393 + }, + { + "epoch": 0.5335534201890388, + "grad_norm": 0.15218985080718994, + "learning_rate": 4.175976898406716e-05, + "loss": 1.1674, + "step": 3394 + }, + { + "epoch": 0.5337106250859714, + "grad_norm": 0.13008874654769897, + "learning_rate": 4.1755184048846316e-05, + "loss": 0.9576, + "step": 3395 + }, + { + "epoch": 0.5338678299829039, + "grad_norm": 0.14899078011512756, + "learning_rate": 4.175059809028258e-05, + "loss": 1.0168, + "step": 3396 + }, + { + "epoch": 0.5340250348798365, + "grad_norm": 0.1490127146244049, + "learning_rate": 4.1746011108656045e-05, + "loss": 1.1041, + "step": 3397 + }, + { + "epoch": 0.534182239776769, + "grad_norm": 0.15424524247646332, + "learning_rate": 4.1741423104246855e-05, + "loss": 1.1484, + "step": 3398 + }, + { + "epoch": 0.5343394446737015, + "grad_norm": 0.13760043680667877, + "learning_rate": 4.173683407733525e-05, + "loss": 1.1936, + "step": 3399 + }, + { + "epoch": 0.5344966495706341, + "grad_norm": 0.1429462432861328, + "learning_rate": 4.1732244028201495e-05, + "loss": 1.1503, + "step": 3400 + }, + { + "epoch": 0.5346538544675666, + "grad_norm": 0.1386295109987259, + "learning_rate": 4.172765295712594e-05, + "loss": 1.0223, + "step": 3401 + }, + { + "epoch": 0.5348110593644992, + "grad_norm": 0.1693735420703888, + "learning_rate": 4.172306086438898e-05, + "loss": 1.1685, + "step": 3402 + }, + { + "epoch": 0.5349682642614317, + "grad_norm": 0.19189637899398804, + "learning_rate": 4.1718467750271095e-05, + "loss": 1.1067, + "step": 3403 + }, + { + "epoch": 0.5351254691583642, + "grad_norm": 0.14684249460697174, + "learning_rate": 4.1713873615052815e-05, + "loss": 1.0856, + "step": 3404 + }, + { + "epoch": 0.5352826740552968, + "grad_norm": 0.15605910122394562, + "learning_rate": 4.1709278459014713e-05, + "loss": 1.045, + "step": 3405 + }, + { + "epoch": 0.5354398789522293, + "grad_norm": 0.14440178871154785, + "learning_rate": 4.170468228243747e-05, + "loss": 1.0972, + "step": 3406 + }, + { + "epoch": 0.5355970838491619, + "grad_norm": 0.13100458681583405, + "learning_rate": 4.170008508560178e-05, + "loss": 1.2334, + "step": 3407 + }, + { + "epoch": 0.5357542887460944, + "grad_norm": 0.19869981706142426, + "learning_rate": 4.1695486868788435e-05, + "loss": 1.1948, + "step": 3408 + }, + { + "epoch": 0.5359114936430269, + "grad_norm": 0.1492564082145691, + "learning_rate": 4.169088763227828e-05, + "loss": 1.0754, + "step": 3409 + }, + { + "epoch": 0.5360686985399595, + "grad_norm": 0.1578557938337326, + "learning_rate": 4.168628737635221e-05, + "loss": 1.1661, + "step": 3410 + }, + { + "epoch": 0.536225903436892, + "grad_norm": 0.12749029695987701, + "learning_rate": 4.1681686101291194e-05, + "loss": 1.1115, + "step": 3411 + }, + { + "epoch": 0.5363831083338246, + "grad_norm": 0.12497510015964508, + "learning_rate": 4.167708380737626e-05, + "loss": 1.1506, + "step": 3412 + }, + { + "epoch": 0.5365403132307571, + "grad_norm": 0.18353544175624847, + "learning_rate": 4.1672480494888496e-05, + "loss": 1.0797, + "step": 3413 + }, + { + "epoch": 0.5366975181276897, + "grad_norm": 0.19030232727527618, + "learning_rate": 4.1667876164109065e-05, + "loss": 1.0587, + "step": 3414 + }, + { + "epoch": 0.5368547230246222, + "grad_norm": 0.1877097338438034, + "learning_rate": 4.1663270815319176e-05, + "loss": 1.1747, + "step": 3415 + }, + { + "epoch": 0.5370119279215547, + "grad_norm": 0.1300366222858429, + "learning_rate": 4.16586644488001e-05, + "loss": 1.0576, + "step": 3416 + }, + { + "epoch": 0.5371691328184873, + "grad_norm": 0.15688586235046387, + "learning_rate": 4.165405706483318e-05, + "loss": 1.0936, + "step": 3417 + }, + { + "epoch": 0.5373263377154198, + "grad_norm": 0.17134688794612885, + "learning_rate": 4.164944866369983e-05, + "loss": 1.1725, + "step": 3418 + }, + { + "epoch": 0.5374835426123524, + "grad_norm": 0.19654420018196106, + "learning_rate": 4.164483924568149e-05, + "loss": 1.1048, + "step": 3419 + }, + { + "epoch": 0.5376407475092849, + "grad_norm": 0.1443120241165161, + "learning_rate": 4.16402288110597e-05, + "loss": 1.1554, + "step": 3420 + }, + { + "epoch": 0.5377979524062174, + "grad_norm": 0.1997511386871338, + "learning_rate": 4.1635617360116056e-05, + "loss": 1.1937, + "step": 3421 + }, + { + "epoch": 0.53795515730315, + "grad_norm": 0.13346394896507263, + "learning_rate": 4.1631004893132186e-05, + "loss": 1.1473, + "step": 3422 + }, + { + "epoch": 0.5381123622000825, + "grad_norm": 0.15485936403274536, + "learning_rate": 4.162639141038982e-05, + "loss": 1.0169, + "step": 3423 + }, + { + "epoch": 0.5382695670970151, + "grad_norm": 0.14727017283439636, + "learning_rate": 4.1621776912170726e-05, + "loss": 1.1214, + "step": 3424 + }, + { + "epoch": 0.5384267719939476, + "grad_norm": 0.12584324181079865, + "learning_rate": 4.161716139875674e-05, + "loss": 1.1039, + "step": 3425 + }, + { + "epoch": 0.5385839768908801, + "grad_norm": 0.15168297290802002, + "learning_rate": 4.161254487042976e-05, + "loss": 1.1374, + "step": 3426 + }, + { + "epoch": 0.5387411817878127, + "grad_norm": 0.16526435315608978, + "learning_rate": 4.1607927327471746e-05, + "loss": 1.1036, + "step": 3427 + }, + { + "epoch": 0.5388983866847452, + "grad_norm": 0.13458026945590973, + "learning_rate": 4.160330877016472e-05, + "loss": 1.2455, + "step": 3428 + }, + { + "epoch": 0.5390555915816778, + "grad_norm": 0.1569700390100479, + "learning_rate": 4.159868919879076e-05, + "loss": 1.0301, + "step": 3429 + }, + { + "epoch": 0.5392127964786103, + "grad_norm": 0.35492467880249023, + "learning_rate": 4.159406861363202e-05, + "loss": 1.1051, + "step": 3430 + }, + { + "epoch": 0.5393700013755428, + "grad_norm": 0.13499435782432556, + "learning_rate": 4.158944701497071e-05, + "loss": 1.0865, + "step": 3431 + }, + { + "epoch": 0.5395272062724754, + "grad_norm": 0.1455809772014618, + "learning_rate": 4.1584824403089096e-05, + "loss": 1.194, + "step": 3432 + }, + { + "epoch": 0.5396844111694079, + "grad_norm": 0.13883116841316223, + "learning_rate": 4.1580200778269504e-05, + "loss": 1.0438, + "step": 3433 + }, + { + "epoch": 0.5398416160663405, + "grad_norm": 0.32132571935653687, + "learning_rate": 4.157557614079433e-05, + "loss": 1.1206, + "step": 3434 + }, + { + "epoch": 0.539998820963273, + "grad_norm": 0.17668181657791138, + "learning_rate": 4.157095049094604e-05, + "loss": 1.0786, + "step": 3435 + }, + { + "epoch": 0.5401560258602055, + "grad_norm": 0.16002698242664337, + "learning_rate": 4.156632382900713e-05, + "loss": 1.0721, + "step": 3436 + }, + { + "epoch": 0.5403132307571381, + "grad_norm": 0.14776268601417542, + "learning_rate": 4.15616961552602e-05, + "loss": 1.1081, + "step": 3437 + }, + { + "epoch": 0.5404704356540706, + "grad_norm": 0.186686173081398, + "learning_rate": 4.155706746998788e-05, + "loss": 1.2066, + "step": 3438 + }, + { + "epoch": 0.5406276405510032, + "grad_norm": 0.1298559308052063, + "learning_rate": 4.155243777347287e-05, + "loss": 1.1499, + "step": 3439 + }, + { + "epoch": 0.5407848454479357, + "grad_norm": 0.14942237734794617, + "learning_rate": 4.154780706599795e-05, + "loss": 1.1508, + "step": 3440 + }, + { + "epoch": 0.5409420503448682, + "grad_norm": 0.13668204843997955, + "learning_rate": 4.154317534784593e-05, + "loss": 1.1353, + "step": 3441 + }, + { + "epoch": 0.5410992552418008, + "grad_norm": 0.1622249037027359, + "learning_rate": 4.15385426192997e-05, + "loss": 1.1168, + "step": 3442 + }, + { + "epoch": 0.5412564601387333, + "grad_norm": 0.1504237949848175, + "learning_rate": 4.1533908880642206e-05, + "loss": 1.0531, + "step": 3443 + }, + { + "epoch": 0.5414136650356659, + "grad_norm": 0.19511030614376068, + "learning_rate": 4.152927413215647e-05, + "loss": 1.017, + "step": 3444 + }, + { + "epoch": 0.5415708699325984, + "grad_norm": 0.15601330995559692, + "learning_rate": 4.1524638374125565e-05, + "loss": 1.0748, + "step": 3445 + }, + { + "epoch": 0.5417280748295309, + "grad_norm": 0.21786673367023468, + "learning_rate": 4.1520001606832616e-05, + "loss": 1.1413, + "step": 3446 + }, + { + "epoch": 0.5418852797264635, + "grad_norm": 0.1456231325864792, + "learning_rate": 4.1515363830560824e-05, + "loss": 1.0997, + "step": 3447 + }, + { + "epoch": 0.542042484623396, + "grad_norm": 0.17294257879257202, + "learning_rate": 4.151072504559344e-05, + "loss": 1.1522, + "step": 3448 + }, + { + "epoch": 0.5421996895203286, + "grad_norm": 0.17040883004665375, + "learning_rate": 4.15060852522138e-05, + "loss": 1.1087, + "step": 3449 + }, + { + "epoch": 0.5423568944172611, + "grad_norm": 0.18714837729930878, + "learning_rate": 4.150144445070527e-05, + "loss": 1.1267, + "step": 3450 + }, + { + "epoch": 0.5425140993141936, + "grad_norm": 0.1632552444934845, + "learning_rate": 4.1496802641351295e-05, + "loss": 1.1252, + "step": 3451 + }, + { + "epoch": 0.5426713042111262, + "grad_norm": 0.15470536053180695, + "learning_rate": 4.1492159824435386e-05, + "loss": 1.1384, + "step": 3452 + }, + { + "epoch": 0.5428285091080587, + "grad_norm": 0.1544426828622818, + "learning_rate": 4.14875160002411e-05, + "loss": 1.1067, + "step": 3453 + }, + { + "epoch": 0.5429857140049913, + "grad_norm": 0.1383865624666214, + "learning_rate": 4.1482871169052065e-05, + "loss": 1.2324, + "step": 3454 + }, + { + "epoch": 0.5431429189019238, + "grad_norm": 0.22764433920383453, + "learning_rate": 4.1478225331151976e-05, + "loss": 1.0468, + "step": 3455 + }, + { + "epoch": 0.5433001237988563, + "grad_norm": 0.12846636772155762, + "learning_rate": 4.1473578486824585e-05, + "loss": 1.1248, + "step": 3456 + }, + { + "epoch": 0.5434573286957889, + "grad_norm": 0.17757047712802887, + "learning_rate": 4.146893063635369e-05, + "loss": 1.125, + "step": 3457 + }, + { + "epoch": 0.5436145335927214, + "grad_norm": 0.16342827677726746, + "learning_rate": 4.1464281780023165e-05, + "loss": 1.1254, + "step": 3458 + }, + { + "epoch": 0.543771738489654, + "grad_norm": 0.15827199816703796, + "learning_rate": 4.145963191811696e-05, + "loss": 1.0685, + "step": 3459 + }, + { + "epoch": 0.5439289433865865, + "grad_norm": 0.12205367535352707, + "learning_rate": 4.1454981050919064e-05, + "loss": 1.1013, + "step": 3460 + }, + { + "epoch": 0.544086148283519, + "grad_norm": 0.1325395554304123, + "learning_rate": 4.1450329178713535e-05, + "loss": 1.1054, + "step": 3461 + }, + { + "epoch": 0.5442433531804516, + "grad_norm": 0.13637445867061615, + "learning_rate": 4.144567630178447e-05, + "loss": 1.0977, + "step": 3462 + }, + { + "epoch": 0.5444005580773841, + "grad_norm": 0.17633673548698425, + "learning_rate": 4.144102242041609e-05, + "loss": 1.1647, + "step": 3463 + }, + { + "epoch": 0.5445577629743167, + "grad_norm": 0.2620643675327301, + "learning_rate": 4.1436367534892604e-05, + "loss": 1.1291, + "step": 3464 + }, + { + "epoch": 0.5447149678712492, + "grad_norm": 0.18059346079826355, + "learning_rate": 4.1431711645498325e-05, + "loss": 0.998, + "step": 3465 + }, + { + "epoch": 0.5448721727681818, + "grad_norm": 0.13098302483558655, + "learning_rate": 4.1427054752517626e-05, + "loss": 1.1569, + "step": 3466 + }, + { + "epoch": 0.5450293776651143, + "grad_norm": 0.14102932810783386, + "learning_rate": 4.142239685623492e-05, + "loss": 1.1395, + "step": 3467 + }, + { + "epoch": 0.5451865825620468, + "grad_norm": 0.15154092013835907, + "learning_rate": 4.1417737956934696e-05, + "loss": 1.1194, + "step": 3468 + }, + { + "epoch": 0.5453437874589794, + "grad_norm": 0.17950110137462616, + "learning_rate": 4.14130780549015e-05, + "loss": 1.1436, + "step": 3469 + }, + { + "epoch": 0.5455009923559119, + "grad_norm": 0.15299329161643982, + "learning_rate": 4.140841715041995e-05, + "loss": 1.0516, + "step": 3470 + }, + { + "epoch": 0.5456581972528445, + "grad_norm": 0.15179339051246643, + "learning_rate": 4.140375524377471e-05, + "loss": 1.0996, + "step": 3471 + }, + { + "epoch": 0.545815402149777, + "grad_norm": 0.1765875220298767, + "learning_rate": 4.13990923352505e-05, + "loss": 1.1138, + "step": 3472 + }, + { + "epoch": 0.5459726070467095, + "grad_norm": 0.15171504020690918, + "learning_rate": 4.139442842513214e-05, + "loss": 1.0216, + "step": 3473 + }, + { + "epoch": 0.5461298119436421, + "grad_norm": 0.13052770495414734, + "learning_rate": 4.138976351370446e-05, + "loss": 1.0444, + "step": 3474 + }, + { + "epoch": 0.5462870168405746, + "grad_norm": 0.17383016645908356, + "learning_rate": 4.138509760125239e-05, + "loss": 1.1402, + "step": 3475 + }, + { + "epoch": 0.5464442217375072, + "grad_norm": 0.17007580399513245, + "learning_rate": 4.138043068806089e-05, + "loss": 1.0068, + "step": 3476 + }, + { + "epoch": 0.5466014266344397, + "grad_norm": 0.14631864428520203, + "learning_rate": 4.137576277441501e-05, + "loss": 1.1191, + "step": 3477 + }, + { + "epoch": 0.5467586315313722, + "grad_norm": 0.16629987955093384, + "learning_rate": 4.137109386059985e-05, + "loss": 1.062, + "step": 3478 + }, + { + "epoch": 0.5469158364283048, + "grad_norm": 0.17852547764778137, + "learning_rate": 4.1366423946900565e-05, + "loss": 1.0064, + "step": 3479 + }, + { + "epoch": 0.5470730413252373, + "grad_norm": 0.14198686182498932, + "learning_rate": 4.1361753033602365e-05, + "loss": 1.159, + "step": 3480 + }, + { + "epoch": 0.5472302462221699, + "grad_norm": 0.13683508336544037, + "learning_rate": 4.135708112099056e-05, + "loss": 1.1204, + "step": 3481 + }, + { + "epoch": 0.5473874511191024, + "grad_norm": 0.14965085685253143, + "learning_rate": 4.135240820935046e-05, + "loss": 1.1551, + "step": 3482 + }, + { + "epoch": 0.5475446560160349, + "grad_norm": 0.14537665247917175, + "learning_rate": 4.1347734298967486e-05, + "loss": 1.0633, + "step": 3483 + }, + { + "epoch": 0.5477018609129675, + "grad_norm": 0.14488355815410614, + "learning_rate": 4.13430593901271e-05, + "loss": 1.1625, + "step": 3484 + }, + { + "epoch": 0.5478590658099, + "grad_norm": 0.12533707916736603, + "learning_rate": 4.1338383483114834e-05, + "loss": 1.1789, + "step": 3485 + }, + { + "epoch": 0.5480162707068326, + "grad_norm": 0.1712876558303833, + "learning_rate": 4.133370657821627e-05, + "loss": 1.0927, + "step": 3486 + }, + { + "epoch": 0.5481734756037651, + "grad_norm": 0.13493269681930542, + "learning_rate": 4.1329028675717044e-05, + "loss": 1.1271, + "step": 3487 + }, + { + "epoch": 0.5483306805006976, + "grad_norm": 0.1372552216053009, + "learning_rate": 4.132434977590288e-05, + "loss": 1.1048, + "step": 3488 + }, + { + "epoch": 0.5484878853976302, + "grad_norm": 0.16924694180488586, + "learning_rate": 4.131966987905954e-05, + "loss": 0.9947, + "step": 3489 + }, + { + "epoch": 0.5486450902945627, + "grad_norm": 0.1597478687763214, + "learning_rate": 4.131498898547286e-05, + "loss": 1.0079, + "step": 3490 + }, + { + "epoch": 0.5488022951914953, + "grad_norm": 0.14878109097480774, + "learning_rate": 4.1310307095428726e-05, + "loss": 0.9512, + "step": 3491 + }, + { + "epoch": 0.5489595000884278, + "grad_norm": 0.1394796073436737, + "learning_rate": 4.1305624209213084e-05, + "loss": 1.0678, + "step": 3492 + }, + { + "epoch": 0.5491167049853602, + "grad_norm": 0.1500493586063385, + "learning_rate": 4.130094032711196e-05, + "loss": 1.1213, + "step": 3493 + }, + { + "epoch": 0.5492739098822929, + "grad_norm": 0.15943877398967743, + "learning_rate": 4.129625544941142e-05, + "loss": 1.1808, + "step": 3494 + }, + { + "epoch": 0.5494311147792253, + "grad_norm": 0.13630403578281403, + "learning_rate": 4.1291569576397604e-05, + "loss": 1.1033, + "step": 3495 + }, + { + "epoch": 0.549588319676158, + "grad_norm": 0.13915424048900604, + "learning_rate": 4.12868827083567e-05, + "loss": 1.1821, + "step": 3496 + }, + { + "epoch": 0.5497455245730904, + "grad_norm": 0.13302691280841827, + "learning_rate": 4.1282194845574966e-05, + "loss": 1.1093, + "step": 3497 + }, + { + "epoch": 0.5499027294700229, + "grad_norm": 0.14590945839881897, + "learning_rate": 4.127750598833873e-05, + "loss": 1.0629, + "step": 3498 + }, + { + "epoch": 0.5500599343669555, + "grad_norm": 0.20246762037277222, + "learning_rate": 4.127281613693435e-05, + "loss": 1.25, + "step": 3499 + }, + { + "epoch": 0.550217139263888, + "grad_norm": 0.17783774435520172, + "learning_rate": 4.126812529164828e-05, + "loss": 1.0606, + "step": 3500 + }, + { + "epoch": 0.5503743441608206, + "grad_norm": 0.19174616038799286, + "learning_rate": 4.126343345276701e-05, + "loss": 1.0292, + "step": 3501 + }, + { + "epoch": 0.5505315490577531, + "grad_norm": 0.2838459014892578, + "learning_rate": 4.1258740620577104e-05, + "loss": 1.0346, + "step": 3502 + }, + { + "epoch": 0.5506887539546856, + "grad_norm": 0.15128780901432037, + "learning_rate": 4.125404679536518e-05, + "loss": 1.0671, + "step": 3503 + }, + { + "epoch": 0.5508459588516182, + "grad_norm": 0.14986170828342438, + "learning_rate": 4.1249351977417926e-05, + "loss": 1.1325, + "step": 3504 + }, + { + "epoch": 0.5510031637485507, + "grad_norm": 0.13601899147033691, + "learning_rate": 4.124465616702207e-05, + "loss": 1.1468, + "step": 3505 + }, + { + "epoch": 0.5511603686454833, + "grad_norm": 0.1459992378950119, + "learning_rate": 4.123995936446443e-05, + "loss": 1.0934, + "step": 3506 + }, + { + "epoch": 0.5513175735424158, + "grad_norm": 0.20352213084697723, + "learning_rate": 4.123526157003186e-05, + "loss": 1.0711, + "step": 3507 + }, + { + "epoch": 0.5514747784393483, + "grad_norm": 0.1530255526304245, + "learning_rate": 4.123056278401128e-05, + "loss": 1.192, + "step": 3508 + }, + { + "epoch": 0.5516319833362809, + "grad_norm": 0.1644168347120285, + "learning_rate": 4.122586300668968e-05, + "loss": 1.1092, + "step": 3509 + }, + { + "epoch": 0.5517891882332134, + "grad_norm": 0.14001496136188507, + "learning_rate": 4.12211622383541e-05, + "loss": 1.1029, + "step": 3510 + }, + { + "epoch": 0.551946393130146, + "grad_norm": 0.15016533434391022, + "learning_rate": 4.121646047929165e-05, + "loss": 1.0815, + "step": 3511 + }, + { + "epoch": 0.5521035980270785, + "grad_norm": 0.23084966838359833, + "learning_rate": 4.12117577297895e-05, + "loss": 1.0295, + "step": 3512 + }, + { + "epoch": 0.552260802924011, + "grad_norm": 0.1836826652288437, + "learning_rate": 4.120705399013486e-05, + "loss": 1.137, + "step": 3513 + }, + { + "epoch": 0.5524180078209436, + "grad_norm": 0.15557770431041718, + "learning_rate": 4.1202349260615034e-05, + "loss": 1.1611, + "step": 3514 + }, + { + "epoch": 0.5525752127178761, + "grad_norm": 0.1517626792192459, + "learning_rate": 4.119764354151736e-05, + "loss": 1.0761, + "step": 3515 + }, + { + "epoch": 0.5527324176148087, + "grad_norm": 0.14277459681034088, + "learning_rate": 4.119293683312924e-05, + "loss": 1.2162, + "step": 3516 + }, + { + "epoch": 0.5528896225117412, + "grad_norm": 0.14309588074684143, + "learning_rate": 4.1188229135738154e-05, + "loss": 1.0045, + "step": 3517 + }, + { + "epoch": 0.5530468274086738, + "grad_norm": 0.18487317860126495, + "learning_rate": 4.1183520449631615e-05, + "loss": 1.0144, + "step": 3518 + }, + { + "epoch": 0.5532040323056063, + "grad_norm": 0.12368316948413849, + "learning_rate": 4.117881077509723e-05, + "loss": 1.0489, + "step": 3519 + }, + { + "epoch": 0.5533612372025388, + "grad_norm": 0.12940697371959686, + "learning_rate": 4.117410011242264e-05, + "loss": 1.1486, + "step": 3520 + }, + { + "epoch": 0.5533612372025388, + "eval_loss": 1.107121229171753, + "eval_runtime": 2329.4939, + "eval_samples_per_second": 3.974, + "eval_steps_per_second": 1.987, + "step": 3520 + }, + { + "epoch": 0.5535184420994714, + "grad_norm": 0.14380988478660583, + "learning_rate": 4.1169388461895557e-05, + "loss": 1.1459, + "step": 3521 + }, + { + "epoch": 0.5536756469964039, + "grad_norm": 0.2936558723449707, + "learning_rate": 4.116467582380374e-05, + "loss": 1.0386, + "step": 3522 + }, + { + "epoch": 0.5538328518933365, + "grad_norm": 0.1526966094970703, + "learning_rate": 4.1159962198435034e-05, + "loss": 1.0906, + "step": 3523 + }, + { + "epoch": 0.553990056790269, + "grad_norm": 0.14400209486484528, + "learning_rate": 4.115524758607731e-05, + "loss": 1.0553, + "step": 3524 + }, + { + "epoch": 0.5541472616872015, + "grad_norm": 0.1706121563911438, + "learning_rate": 4.1150531987018535e-05, + "loss": 1.1487, + "step": 3525 + }, + { + "epoch": 0.5543044665841341, + "grad_norm": 0.19591955840587616, + "learning_rate": 4.114581540154672e-05, + "loss": 1.0226, + "step": 3526 + }, + { + "epoch": 0.5544616714810666, + "grad_norm": 0.18198204040527344, + "learning_rate": 4.114109782994993e-05, + "loss": 1.05, + "step": 3527 + }, + { + "epoch": 0.5546188763779992, + "grad_norm": 0.13738413155078888, + "learning_rate": 4.113637927251629e-05, + "loss": 1.1371, + "step": 3528 + }, + { + "epoch": 0.5547760812749317, + "grad_norm": 0.15277525782585144, + "learning_rate": 4.1131659729534006e-05, + "loss": 1.2617, + "step": 3529 + }, + { + "epoch": 0.5549332861718642, + "grad_norm": 0.12683087587356567, + "learning_rate": 4.112693920129132e-05, + "loss": 1.1409, + "step": 3530 + }, + { + "epoch": 0.5550904910687968, + "grad_norm": 0.13529619574546814, + "learning_rate": 4.1122217688076546e-05, + "loss": 1.109, + "step": 3531 + }, + { + "epoch": 0.5552476959657293, + "grad_norm": 0.152116596698761, + "learning_rate": 4.111749519017806e-05, + "loss": 1.203, + "step": 3532 + }, + { + "epoch": 0.5554049008626619, + "grad_norm": 0.15577782690525055, + "learning_rate": 4.111277170788428e-05, + "loss": 1.0422, + "step": 3533 + }, + { + "epoch": 0.5555621057595944, + "grad_norm": 0.1772596538066864, + "learning_rate": 4.110804724148373e-05, + "loss": 1.2143, + "step": 3534 + }, + { + "epoch": 0.5557193106565269, + "grad_norm": 0.1525980532169342, + "learning_rate": 4.110332179126493e-05, + "loss": 1.0592, + "step": 3535 + }, + { + "epoch": 0.5558765155534595, + "grad_norm": 0.15575167536735535, + "learning_rate": 4.109859535751649e-05, + "loss": 1.1221, + "step": 3536 + }, + { + "epoch": 0.556033720450392, + "grad_norm": 0.15791207551956177, + "learning_rate": 4.1093867940527115e-05, + "loss": 1.0413, + "step": 3537 + }, + { + "epoch": 0.5561909253473246, + "grad_norm": 0.22039160132408142, + "learning_rate": 4.108913954058552e-05, + "loss": 1.0689, + "step": 3538 + }, + { + "epoch": 0.5563481302442571, + "grad_norm": 0.1370696723461151, + "learning_rate": 4.1084410157980484e-05, + "loss": 1.1589, + "step": 3539 + }, + { + "epoch": 0.5565053351411896, + "grad_norm": 0.14719007909297943, + "learning_rate": 4.107967979300088e-05, + "loss": 1.1352, + "step": 3540 + }, + { + "epoch": 0.5566625400381222, + "grad_norm": 0.1324034333229065, + "learning_rate": 4.107494844593561e-05, + "loss": 1.2209, + "step": 3541 + }, + { + "epoch": 0.5568197449350547, + "grad_norm": 0.13277071714401245, + "learning_rate": 4.107021611707366e-05, + "loss": 1.1789, + "step": 3542 + }, + { + "epoch": 0.5569769498319873, + "grad_norm": 0.14232930541038513, + "learning_rate": 4.106548280670405e-05, + "loss": 1.0172, + "step": 3543 + }, + { + "epoch": 0.5571341547289198, + "grad_norm": 0.1549500972032547, + "learning_rate": 4.106074851511587e-05, + "loss": 0.9018, + "step": 3544 + }, + { + "epoch": 0.5572913596258523, + "grad_norm": 0.1389908343553543, + "learning_rate": 4.1056013242598276e-05, + "loss": 1.1046, + "step": 3545 + }, + { + "epoch": 0.5574485645227849, + "grad_norm": 0.13126444816589355, + "learning_rate": 4.105127698944049e-05, + "loss": 1.2158, + "step": 3546 + }, + { + "epoch": 0.5576057694197174, + "grad_norm": 0.1347741037607193, + "learning_rate": 4.104653975593177e-05, + "loss": 1.1548, + "step": 3547 + }, + { + "epoch": 0.55776297431665, + "grad_norm": 0.15513011813163757, + "learning_rate": 4.104180154236146e-05, + "loss": 1.1751, + "step": 3548 + }, + { + "epoch": 0.5579201792135825, + "grad_norm": 0.14042234420776367, + "learning_rate": 4.103706234901894e-05, + "loss": 1.019, + "step": 3549 + }, + { + "epoch": 0.558077384110515, + "grad_norm": 0.12702056765556335, + "learning_rate": 4.103232217619368e-05, + "loss": 1.1166, + "step": 3550 + }, + { + "epoch": 0.5582345890074476, + "grad_norm": 0.14291422069072723, + "learning_rate": 4.102758102417517e-05, + "loss": 1.189, + "step": 3551 + }, + { + "epoch": 0.5583917939043801, + "grad_norm": 0.1567365825176239, + "learning_rate": 4.102283889325299e-05, + "loss": 1.0894, + "step": 3552 + }, + { + "epoch": 0.5585489988013127, + "grad_norm": 0.16741162538528442, + "learning_rate": 4.101809578371678e-05, + "loss": 1.0622, + "step": 3553 + }, + { + "epoch": 0.5587062036982452, + "grad_norm": 0.2240571677684784, + "learning_rate": 4.101335169585623e-05, + "loss": 1.1743, + "step": 3554 + }, + { + "epoch": 0.5588634085951777, + "grad_norm": 0.14278970658779144, + "learning_rate": 4.100860662996108e-05, + "loss": 1.1325, + "step": 3555 + }, + { + "epoch": 0.5590206134921103, + "grad_norm": 0.15045011043548584, + "learning_rate": 4.100386058632114e-05, + "loss": 1.1009, + "step": 3556 + }, + { + "epoch": 0.5591778183890428, + "grad_norm": 0.18575306236743927, + "learning_rate": 4.0999113565226286e-05, + "loss": 1.0188, + "step": 3557 + }, + { + "epoch": 0.5593350232859754, + "grad_norm": 0.1384294033050537, + "learning_rate": 4.0994365566966456e-05, + "loss": 1.1446, + "step": 3558 + }, + { + "epoch": 0.5594922281829079, + "grad_norm": 0.14572866261005402, + "learning_rate": 4.098961659183163e-05, + "loss": 1.0271, + "step": 3559 + }, + { + "epoch": 0.5596494330798404, + "grad_norm": 0.17086221277713776, + "learning_rate": 4.098486664011186e-05, + "loss": 1.1636, + "step": 3560 + }, + { + "epoch": 0.559806637976773, + "grad_norm": 0.15883690118789673, + "learning_rate": 4.098011571209724e-05, + "loss": 1.1531, + "step": 3561 + }, + { + "epoch": 0.5599638428737055, + "grad_norm": 0.13134004175662994, + "learning_rate": 4.097536380807797e-05, + "loss": 1.1382, + "step": 3562 + }, + { + "epoch": 0.5601210477706381, + "grad_norm": 0.18722890317440033, + "learning_rate": 4.097061092834425e-05, + "loss": 1.0367, + "step": 3563 + }, + { + "epoch": 0.5602782526675706, + "grad_norm": 0.15676817297935486, + "learning_rate": 4.0965857073186394e-05, + "loss": 1.1242, + "step": 3564 + }, + { + "epoch": 0.5604354575645031, + "grad_norm": 0.14853468537330627, + "learning_rate": 4.096110224289472e-05, + "loss": 1.1161, + "step": 3565 + }, + { + "epoch": 0.5605926624614357, + "grad_norm": 0.15379522740840912, + "learning_rate": 4.095634643775965e-05, + "loss": 1.1923, + "step": 3566 + }, + { + "epoch": 0.5607498673583682, + "grad_norm": 0.19688835740089417, + "learning_rate": 4.095158965807165e-05, + "loss": 1.1384, + "step": 3567 + }, + { + "epoch": 0.5609070722553008, + "grad_norm": 0.13620081543922424, + "learning_rate": 4.094683190412125e-05, + "loss": 1.1516, + "step": 3568 + }, + { + "epoch": 0.5610642771522333, + "grad_norm": 0.1553749442100525, + "learning_rate": 4.0942073176199036e-05, + "loss": 1.114, + "step": 3569 + }, + { + "epoch": 0.5612214820491658, + "grad_norm": 0.1777694970369339, + "learning_rate": 4.093731347459564e-05, + "loss": 1.0629, + "step": 3570 + }, + { + "epoch": 0.5613786869460984, + "grad_norm": 0.1379484385251999, + "learning_rate": 4.0932552799601776e-05, + "loss": 1.1787, + "step": 3571 + }, + { + "epoch": 0.5615358918430309, + "grad_norm": 0.1495600789785385, + "learning_rate": 4.092779115150821e-05, + "loss": 1.2294, + "step": 3572 + }, + { + "epoch": 0.5616930967399635, + "grad_norm": 0.13353364169597626, + "learning_rate": 4.0923028530605756e-05, + "loss": 1.1443, + "step": 3573 + }, + { + "epoch": 0.561850301636896, + "grad_norm": 0.13893137872219086, + "learning_rate": 4.09182649371853e-05, + "loss": 1.2105, + "step": 3574 + }, + { + "epoch": 0.5620075065338286, + "grad_norm": 0.12231533229351044, + "learning_rate": 4.0913500371537796e-05, + "loss": 1.0414, + "step": 3575 + }, + { + "epoch": 0.5621647114307611, + "grad_norm": 0.1520678997039795, + "learning_rate": 4.090873483395422e-05, + "loss": 1.0846, + "step": 3576 + }, + { + "epoch": 0.5623219163276936, + "grad_norm": 0.13837867975234985, + "learning_rate": 4.090396832472566e-05, + "loss": 0.9917, + "step": 3577 + }, + { + "epoch": 0.5624791212246262, + "grad_norm": 0.13468261063098907, + "learning_rate": 4.089920084414323e-05, + "loss": 1.173, + "step": 3578 + }, + { + "epoch": 0.5626363261215587, + "grad_norm": 0.15696372091770172, + "learning_rate": 4.089443239249811e-05, + "loss": 1.0865, + "step": 3579 + }, + { + "epoch": 0.5627935310184913, + "grad_norm": 0.14664031565189362, + "learning_rate": 4.088966297008152e-05, + "loss": 1.1385, + "step": 3580 + }, + { + "epoch": 0.5629507359154238, + "grad_norm": 0.18675899505615234, + "learning_rate": 4.0884892577184774e-05, + "loss": 1.0914, + "step": 3581 + }, + { + "epoch": 0.5631079408123563, + "grad_norm": 0.14525212347507477, + "learning_rate": 4.0880121214099225e-05, + "loss": 1.0615, + "step": 3582 + }, + { + "epoch": 0.5632651457092889, + "grad_norm": 0.12778644263744354, + "learning_rate": 4.08753488811163e-05, + "loss": 1.0975, + "step": 3583 + }, + { + "epoch": 0.5634223506062214, + "grad_norm": 0.13698804378509521, + "learning_rate": 4.087057557852747e-05, + "loss": 1.1787, + "step": 3584 + }, + { + "epoch": 0.563579555503154, + "grad_norm": 0.1349797397851944, + "learning_rate": 4.086580130662426e-05, + "loss": 1.2011, + "step": 3585 + }, + { + "epoch": 0.5637367604000865, + "grad_norm": 0.12913671135902405, + "learning_rate": 4.086102606569827e-05, + "loss": 1.1346, + "step": 3586 + }, + { + "epoch": 0.563893965297019, + "grad_norm": 0.15104079246520996, + "learning_rate": 4.0856249856041154e-05, + "loss": 1.1491, + "step": 3587 + }, + { + "epoch": 0.5640511701939516, + "grad_norm": 0.13582037389278412, + "learning_rate": 4.0851472677944636e-05, + "loss": 1.0484, + "step": 3588 + }, + { + "epoch": 0.5642083750908841, + "grad_norm": 0.13232627511024475, + "learning_rate": 4.084669453170047e-05, + "loss": 1.0965, + "step": 3589 + }, + { + "epoch": 0.5643655799878167, + "grad_norm": 0.14906802773475647, + "learning_rate": 4.08419154176005e-05, + "loss": 1.1288, + "step": 3590 + }, + { + "epoch": 0.5645227848847492, + "grad_norm": 0.13499197363853455, + "learning_rate": 4.0837135335936606e-05, + "loss": 1.1255, + "step": 3591 + }, + { + "epoch": 0.5646799897816817, + "grad_norm": 0.15802523493766785, + "learning_rate": 4.083235428700074e-05, + "loss": 1.1828, + "step": 3592 + }, + { + "epoch": 0.5648371946786143, + "grad_norm": 0.14267662167549133, + "learning_rate": 4.082757227108492e-05, + "loss": 1.1359, + "step": 3593 + }, + { + "epoch": 0.5649943995755468, + "grad_norm": 0.12380488216876984, + "learning_rate": 4.08227892884812e-05, + "loss": 1.1136, + "step": 3594 + }, + { + "epoch": 0.5651516044724794, + "grad_norm": 0.1933763176202774, + "learning_rate": 4.081800533948171e-05, + "loss": 1.068, + "step": 3595 + }, + { + "epoch": 0.5653088093694119, + "grad_norm": 0.14156892895698547, + "learning_rate": 4.081322042437864e-05, + "loss": 1.2321, + "step": 3596 + }, + { + "epoch": 0.5654660142663444, + "grad_norm": 0.14347857236862183, + "learning_rate": 4.0808434543464233e-05, + "loss": 1.1036, + "step": 3597 + }, + { + "epoch": 0.565623219163277, + "grad_norm": 0.15195155143737793, + "learning_rate": 4.08036476970308e-05, + "loss": 1.1351, + "step": 3598 + }, + { + "epoch": 0.5657804240602095, + "grad_norm": 0.1291377991437912, + "learning_rate": 4.0798859885370676e-05, + "loss": 1.1474, + "step": 3599 + }, + { + "epoch": 0.5659376289571421, + "grad_norm": 0.21259215474128723, + "learning_rate": 4.0794071108776314e-05, + "loss": 1.1855, + "step": 3600 + }, + { + "epoch": 0.5660948338540746, + "grad_norm": 0.1222395971417427, + "learning_rate": 4.078928136754018e-05, + "loss": 1.2164, + "step": 3601 + }, + { + "epoch": 0.566252038751007, + "grad_norm": 0.18162107467651367, + "learning_rate": 4.078449066195481e-05, + "loss": 1.0613, + "step": 3602 + }, + { + "epoch": 0.5664092436479397, + "grad_norm": 0.13709622621536255, + "learning_rate": 4.0779698992312806e-05, + "loss": 1.1023, + "step": 3603 + }, + { + "epoch": 0.5665664485448721, + "grad_norm": 0.16631649434566498, + "learning_rate": 4.077490635890683e-05, + "loss": 1.1133, + "step": 3604 + }, + { + "epoch": 0.5667236534418048, + "grad_norm": 0.14726059138774872, + "learning_rate": 4.077011276202959e-05, + "loss": 1.2028, + "step": 3605 + }, + { + "epoch": 0.5668808583387372, + "grad_norm": 0.1343258172273636, + "learning_rate": 4.0765318201973865e-05, + "loss": 1.0611, + "step": 3606 + }, + { + "epoch": 0.5670380632356697, + "grad_norm": 0.20124560594558716, + "learning_rate": 4.0760522679032484e-05, + "loss": 0.957, + "step": 3607 + }, + { + "epoch": 0.5671952681326023, + "grad_norm": 0.12561757862567902, + "learning_rate": 4.075572619349836e-05, + "loss": 1.2235, + "step": 3608 + }, + { + "epoch": 0.5673524730295348, + "grad_norm": 0.1428644210100174, + "learning_rate": 4.075092874566441e-05, + "loss": 1.1726, + "step": 3609 + }, + { + "epoch": 0.5675096779264674, + "grad_norm": 0.12811197340488434, + "learning_rate": 4.0746130335823656e-05, + "loss": 1.0739, + "step": 3610 + }, + { + "epoch": 0.5676668828233999, + "grad_norm": 0.14488913118839264, + "learning_rate": 4.0741330964269176e-05, + "loss": 1.1257, + "step": 3611 + }, + { + "epoch": 0.5678240877203324, + "grad_norm": 0.15192878246307373, + "learning_rate": 4.0736530631294104e-05, + "loss": 1.1235, + "step": 3612 + }, + { + "epoch": 0.567981292617265, + "grad_norm": 0.2228131741285324, + "learning_rate": 4.0731729337191606e-05, + "loss": 0.9947, + "step": 3613 + }, + { + "epoch": 0.5681384975141975, + "grad_norm": 0.14630167186260223, + "learning_rate": 4.0726927082254934e-05, + "loss": 1.1134, + "step": 3614 + }, + { + "epoch": 0.5682957024111301, + "grad_norm": 0.14998601377010345, + "learning_rate": 4.072212386677739e-05, + "loss": 1.2102, + "step": 3615 + }, + { + "epoch": 0.5684529073080626, + "grad_norm": 0.12869413197040558, + "learning_rate": 4.071731969105235e-05, + "loss": 1.0684, + "step": 3616 + }, + { + "epoch": 0.5686101122049951, + "grad_norm": 0.13855576515197754, + "learning_rate": 4.071251455537321e-05, + "loss": 1.1482, + "step": 3617 + }, + { + "epoch": 0.5687673171019277, + "grad_norm": 0.11902645975351334, + "learning_rate": 4.070770846003347e-05, + "loss": 1.1163, + "step": 3618 + }, + { + "epoch": 0.5689245219988602, + "grad_norm": 0.15628483891487122, + "learning_rate": 4.070290140532667e-05, + "loss": 1.1314, + "step": 3619 + }, + { + "epoch": 0.5690817268957928, + "grad_norm": 0.1387438029050827, + "learning_rate": 4.069809339154638e-05, + "loss": 1.0639, + "step": 3620 + }, + { + "epoch": 0.5692389317927253, + "grad_norm": 0.17734022438526154, + "learning_rate": 4.0693284418986286e-05, + "loss": 1.0748, + "step": 3621 + }, + { + "epoch": 0.5693961366896578, + "grad_norm": 0.18251527845859528, + "learning_rate": 4.068847448794009e-05, + "loss": 1.1747, + "step": 3622 + }, + { + "epoch": 0.5695533415865904, + "grad_norm": 0.18893775343894958, + "learning_rate": 4.0683663598701546e-05, + "loss": 1.0855, + "step": 3623 + }, + { + "epoch": 0.5697105464835229, + "grad_norm": 0.14489051699638367, + "learning_rate": 4.0678851751564515e-05, + "loss": 1.112, + "step": 3624 + }, + { + "epoch": 0.5698677513804555, + "grad_norm": 0.18162967264652252, + "learning_rate": 4.0674038946822876e-05, + "loss": 1.1593, + "step": 3625 + }, + { + "epoch": 0.570024956277388, + "grad_norm": 0.16984649002552032, + "learning_rate": 4.066922518477056e-05, + "loss": 1.0952, + "step": 3626 + }, + { + "epoch": 0.5701821611743206, + "grad_norm": 0.17276768386363983, + "learning_rate": 4.0664410465701605e-05, + "loss": 1.2318, + "step": 3627 + }, + { + "epoch": 0.5703393660712531, + "grad_norm": 0.16322898864746094, + "learning_rate": 4.065959478991005e-05, + "loss": 1.0514, + "step": 3628 + }, + { + "epoch": 0.5704965709681856, + "grad_norm": 0.11980457603931427, + "learning_rate": 4.0654778157690025e-05, + "loss": 1.1183, + "step": 3629 + }, + { + "epoch": 0.5706537758651182, + "grad_norm": 0.14290717244148254, + "learning_rate": 4.064996056933571e-05, + "loss": 1.1669, + "step": 3630 + }, + { + "epoch": 0.5708109807620507, + "grad_norm": 0.13066339492797852, + "learning_rate": 4.064514202514136e-05, + "loss": 1.2166, + "step": 3631 + }, + { + "epoch": 0.5709681856589833, + "grad_norm": 0.14656370878219604, + "learning_rate": 4.0640322525401254e-05, + "loss": 1.0351, + "step": 3632 + }, + { + "epoch": 0.5711253905559158, + "grad_norm": 0.1635439097881317, + "learning_rate": 4.063550207040975e-05, + "loss": 1.1568, + "step": 3633 + }, + { + "epoch": 0.5712825954528483, + "grad_norm": 0.15011559426784515, + "learning_rate": 4.063068066046127e-05, + "loss": 1.1624, + "step": 3634 + }, + { + "epoch": 0.5714398003497809, + "grad_norm": 0.13049575686454773, + "learning_rate": 4.06258582958503e-05, + "loss": 1.0406, + "step": 3635 + }, + { + "epoch": 0.5715970052467134, + "grad_norm": 0.1825868785381317, + "learning_rate": 4.0621034976871344e-05, + "loss": 1.0516, + "step": 3636 + }, + { + "epoch": 0.571754210143646, + "grad_norm": 0.14342835545539856, + "learning_rate": 4.0616210703819e-05, + "loss": 1.1982, + "step": 3637 + }, + { + "epoch": 0.5719114150405785, + "grad_norm": 0.16550028324127197, + "learning_rate": 4.061138547698794e-05, + "loss": 1.0964, + "step": 3638 + }, + { + "epoch": 0.572068619937511, + "grad_norm": 0.13078954815864563, + "learning_rate": 4.060655929667284e-05, + "loss": 1.1712, + "step": 3639 + }, + { + "epoch": 0.5722258248344436, + "grad_norm": 0.17670851945877075, + "learning_rate": 4.060173216316847e-05, + "loss": 1.0302, + "step": 3640 + }, + { + "epoch": 0.5723830297313761, + "grad_norm": 0.1388305425643921, + "learning_rate": 4.0596904076769674e-05, + "loss": 1.1351, + "step": 3641 + }, + { + "epoch": 0.5725402346283087, + "grad_norm": 0.13523413240909576, + "learning_rate": 4.059207503777131e-05, + "loss": 1.1302, + "step": 3642 + }, + { + "epoch": 0.5726974395252412, + "grad_norm": 0.14954648911952972, + "learning_rate": 4.058724504646834e-05, + "loss": 1.2166, + "step": 3643 + }, + { + "epoch": 0.5728546444221737, + "grad_norm": 0.1510806679725647, + "learning_rate": 4.058241410315574e-05, + "loss": 1.0743, + "step": 3644 + }, + { + "epoch": 0.5730118493191063, + "grad_norm": 0.13376949727535248, + "learning_rate": 4.057758220812857e-05, + "loss": 1.062, + "step": 3645 + }, + { + "epoch": 0.5731690542160388, + "grad_norm": 0.16394519805908203, + "learning_rate": 4.057274936168196e-05, + "loss": 1.1083, + "step": 3646 + }, + { + "epoch": 0.5733262591129714, + "grad_norm": 0.14098455011844635, + "learning_rate": 4.056791556411106e-05, + "loss": 1.1301, + "step": 3647 + }, + { + "epoch": 0.5734834640099039, + "grad_norm": 0.13498766720294952, + "learning_rate": 4.0563080815711116e-05, + "loss": 1.1505, + "step": 3648 + }, + { + "epoch": 0.5736406689068364, + "grad_norm": 0.1271194964647293, + "learning_rate": 4.0558245116777394e-05, + "loss": 1.1138, + "step": 3649 + }, + { + "epoch": 0.573797873803769, + "grad_norm": 0.166845440864563, + "learning_rate": 4.055340846760527e-05, + "loss": 0.9864, + "step": 3650 + }, + { + "epoch": 0.5739550787007015, + "grad_norm": 0.13443748652935028, + "learning_rate": 4.054857086849013e-05, + "loss": 1.1908, + "step": 3651 + }, + { + "epoch": 0.5741122835976341, + "grad_norm": 0.12793311476707458, + "learning_rate": 4.054373231972744e-05, + "loss": 1.1881, + "step": 3652 + }, + { + "epoch": 0.5742694884945666, + "grad_norm": 0.12760072946548462, + "learning_rate": 4.053889282161272e-05, + "loss": 1.0298, + "step": 3653 + }, + { + "epoch": 0.5744266933914991, + "grad_norm": 0.14803969860076904, + "learning_rate": 4.0534052374441544e-05, + "loss": 1.1912, + "step": 3654 + }, + { + "epoch": 0.5745838982884317, + "grad_norm": 0.15170976519584656, + "learning_rate": 4.0529210978509556e-05, + "loss": 1.1831, + "step": 3655 + }, + { + "epoch": 0.5747411031853642, + "grad_norm": 0.13536196947097778, + "learning_rate": 4.0524368634112446e-05, + "loss": 1.0846, + "step": 3656 + }, + { + "epoch": 0.5748983080822968, + "grad_norm": 0.14190278947353363, + "learning_rate": 4.0519525341545964e-05, + "loss": 1.0797, + "step": 3657 + }, + { + "epoch": 0.5750555129792293, + "grad_norm": 0.14169275760650635, + "learning_rate": 4.051468110110593e-05, + "loss": 1.1638, + "step": 3658 + }, + { + "epoch": 0.5752127178761618, + "grad_norm": 0.16677023470401764, + "learning_rate": 4.050983591308819e-05, + "loss": 1.1189, + "step": 3659 + }, + { + "epoch": 0.5753699227730944, + "grad_norm": 0.14935442805290222, + "learning_rate": 4.050498977778869e-05, + "loss": 1.2741, + "step": 3660 + }, + { + "epoch": 0.5755271276700269, + "grad_norm": 0.2275468111038208, + "learning_rate": 4.050014269550342e-05, + "loss": 1.1942, + "step": 3661 + }, + { + "epoch": 0.5756843325669595, + "grad_norm": 0.14003866910934448, + "learning_rate": 4.049529466652839e-05, + "loss": 1.1577, + "step": 3662 + }, + { + "epoch": 0.575841537463892, + "grad_norm": 0.14179524779319763, + "learning_rate": 4.0490445691159726e-05, + "loss": 1.1885, + "step": 3663 + }, + { + "epoch": 0.5759987423608245, + "grad_norm": 0.14022792875766754, + "learning_rate": 4.048559576969357e-05, + "loss": 1.1538, + "step": 3664 + }, + { + "epoch": 0.5761559472577571, + "grad_norm": 0.14557988941669464, + "learning_rate": 4.048074490242615e-05, + "loss": 1.1012, + "step": 3665 + }, + { + "epoch": 0.5763131521546896, + "grad_norm": 0.14714285731315613, + "learning_rate": 4.047589308965373e-05, + "loss": 1.1016, + "step": 3666 + }, + { + "epoch": 0.5764703570516222, + "grad_norm": 0.17905539274215698, + "learning_rate": 4.0471040331672646e-05, + "loss": 1.1554, + "step": 3667 + }, + { + "epoch": 0.5766275619485547, + "grad_norm": 0.13437265157699585, + "learning_rate": 4.046618662877928e-05, + "loss": 1.1321, + "step": 3668 + }, + { + "epoch": 0.5767847668454872, + "grad_norm": 0.1353270262479782, + "learning_rate": 4.046133198127007e-05, + "loss": 1.0082, + "step": 3669 + }, + { + "epoch": 0.5769419717424198, + "grad_norm": 0.12693148851394653, + "learning_rate": 4.045647638944154e-05, + "loss": 1.1042, + "step": 3670 + }, + { + "epoch": 0.5770991766393523, + "grad_norm": 0.14182643592357635, + "learning_rate": 4.045161985359024e-05, + "loss": 1.1912, + "step": 3671 + }, + { + "epoch": 0.5772563815362849, + "grad_norm": 0.13259053230285645, + "learning_rate": 4.044676237401278e-05, + "loss": 1.128, + "step": 3672 + }, + { + "epoch": 0.5774135864332174, + "grad_norm": 0.13726124167442322, + "learning_rate": 4.044190395100585e-05, + "loss": 1.0668, + "step": 3673 + }, + { + "epoch": 0.5775707913301499, + "grad_norm": 0.13621871173381805, + "learning_rate": 4.043704458486618e-05, + "loss": 1.0255, + "step": 3674 + }, + { + "epoch": 0.5777279962270825, + "grad_norm": 0.1848558783531189, + "learning_rate": 4.043218427589056e-05, + "loss": 1.0299, + "step": 3675 + }, + { + "epoch": 0.577885201124015, + "grad_norm": 0.13466404378414154, + "learning_rate": 4.042732302437585e-05, + "loss": 1.0444, + "step": 3676 + }, + { + "epoch": 0.5780424060209476, + "grad_norm": 0.1385255753993988, + "learning_rate": 4.042246083061894e-05, + "loss": 0.9893, + "step": 3677 + }, + { + "epoch": 0.5781996109178801, + "grad_norm": 0.14059042930603027, + "learning_rate": 4.041759769491679e-05, + "loss": 1.1886, + "step": 3678 + }, + { + "epoch": 0.5783568158148127, + "grad_norm": 0.1421997994184494, + "learning_rate": 4.041273361756645e-05, + "loss": 1.0924, + "step": 3679 + }, + { + "epoch": 0.5785140207117452, + "grad_norm": 0.16908836364746094, + "learning_rate": 4.040786859886497e-05, + "loss": 1.0998, + "step": 3680 + }, + { + "epoch": 0.5785140207117452, + "eval_loss": 1.1054325103759766, + "eval_runtime": 2330.938, + "eval_samples_per_second": 3.972, + "eval_steps_per_second": 1.986, + "step": 3680 + }, + { + "epoch": 0.5786712256086777, + "grad_norm": 0.21620066463947296, + "learning_rate": 4.04030026391095e-05, + "loss": 1.051, + "step": 3681 + }, + { + "epoch": 0.5788284305056103, + "grad_norm": 0.12915241718292236, + "learning_rate": 4.0398135738597244e-05, + "loss": 1.0906, + "step": 3682 + }, + { + "epoch": 0.5789856354025428, + "grad_norm": 0.1334918737411499, + "learning_rate": 4.0393267897625434e-05, + "loss": 1.066, + "step": 3683 + }, + { + "epoch": 0.5791428402994754, + "grad_norm": 0.1355300396680832, + "learning_rate": 4.038839911649139e-05, + "loss": 1.1698, + "step": 3684 + }, + { + "epoch": 0.5793000451964079, + "grad_norm": 0.15459658205509186, + "learning_rate": 4.038352939549247e-05, + "loss": 1.0792, + "step": 3685 + }, + { + "epoch": 0.5794572500933404, + "grad_norm": 0.1558839976787567, + "learning_rate": 4.0378658734926116e-05, + "loss": 1.1722, + "step": 3686 + }, + { + "epoch": 0.579614454990273, + "grad_norm": 0.1443861573934555, + "learning_rate": 4.0373787135089796e-05, + "loss": 1.1336, + "step": 3687 + }, + { + "epoch": 0.5797716598872055, + "grad_norm": 0.12290564924478531, + "learning_rate": 4.036891459628105e-05, + "loss": 1.0873, + "step": 3688 + }, + { + "epoch": 0.5799288647841381, + "grad_norm": 0.1709529608488083, + "learning_rate": 4.0364041118797476e-05, + "loss": 1.0867, + "step": 3689 + }, + { + "epoch": 0.5800860696810706, + "grad_norm": 0.15766242146492004, + "learning_rate": 4.0359166702936724e-05, + "loss": 1.1389, + "step": 3690 + }, + { + "epoch": 0.5802432745780031, + "grad_norm": 0.1576821208000183, + "learning_rate": 4.035429134899652e-05, + "loss": 1.176, + "step": 3691 + }, + { + "epoch": 0.5804004794749357, + "grad_norm": 0.2048172652721405, + "learning_rate": 4.0349415057274604e-05, + "loss": 1.1504, + "step": 3692 + }, + { + "epoch": 0.5805576843718682, + "grad_norm": 0.13622994720935822, + "learning_rate": 4.0344537828068816e-05, + "loss": 1.094, + "step": 3693 + }, + { + "epoch": 0.5807148892688008, + "grad_norm": 0.14516420662403107, + "learning_rate": 4.033965966167705e-05, + "loss": 1.0591, + "step": 3694 + }, + { + "epoch": 0.5808720941657333, + "grad_norm": 0.1271536499261856, + "learning_rate": 4.033478055839723e-05, + "loss": 1.1008, + "step": 3695 + }, + { + "epoch": 0.5810292990626658, + "grad_norm": 0.13999730348587036, + "learning_rate": 4.032990051852736e-05, + "loss": 1.1955, + "step": 3696 + }, + { + "epoch": 0.5811865039595984, + "grad_norm": 0.1689818650484085, + "learning_rate": 4.032501954236549e-05, + "loss": 1.0263, + "step": 3697 + }, + { + "epoch": 0.5813437088565309, + "grad_norm": 0.1439339816570282, + "learning_rate": 4.032013763020974e-05, + "loss": 0.9829, + "step": 3698 + }, + { + "epoch": 0.5815009137534635, + "grad_norm": 0.15156951546669006, + "learning_rate": 4.031525478235827e-05, + "loss": 1.0514, + "step": 3699 + }, + { + "epoch": 0.581658118650396, + "grad_norm": 0.1496332734823227, + "learning_rate": 4.031037099910931e-05, + "loss": 1.1401, + "step": 3700 + }, + { + "epoch": 0.5818153235473285, + "grad_norm": 0.1426810920238495, + "learning_rate": 4.030548628076114e-05, + "loss": 1.0832, + "step": 3701 + }, + { + "epoch": 0.5819725284442611, + "grad_norm": 0.14869458973407745, + "learning_rate": 4.03006006276121e-05, + "loss": 1.0517, + "step": 3702 + }, + { + "epoch": 0.5821297333411936, + "grad_norm": 0.15788580477237701, + "learning_rate": 4.0295714039960595e-05, + "loss": 0.912, + "step": 3703 + }, + { + "epoch": 0.5822869382381262, + "grad_norm": 0.13516800105571747, + "learning_rate": 4.029082651810507e-05, + "loss": 1.1437, + "step": 3704 + }, + { + "epoch": 0.5824441431350587, + "grad_norm": 0.15520767867565155, + "learning_rate": 4.0285938062344034e-05, + "loss": 1.2535, + "step": 3705 + }, + { + "epoch": 0.5826013480319912, + "grad_norm": 0.13410113751888275, + "learning_rate": 4.028104867297606e-05, + "loss": 1.1021, + "step": 3706 + }, + { + "epoch": 0.5827585529289238, + "grad_norm": 0.1681327521800995, + "learning_rate": 4.027615835029978e-05, + "loss": 1.1215, + "step": 3707 + }, + { + "epoch": 0.5829157578258563, + "grad_norm": 0.15960273146629333, + "learning_rate": 4.0271267094613877e-05, + "loss": 1.165, + "step": 3708 + }, + { + "epoch": 0.5830729627227889, + "grad_norm": 0.12318623811006546, + "learning_rate": 4.0266374906217064e-05, + "loss": 1.1991, + "step": 3709 + }, + { + "epoch": 0.5832301676197214, + "grad_norm": 0.1433396190404892, + "learning_rate": 4.0261481785408165e-05, + "loss": 0.9933, + "step": 3710 + }, + { + "epoch": 0.5833873725166538, + "grad_norm": 0.14328768849372864, + "learning_rate": 4.025658773248603e-05, + "loss": 1.0885, + "step": 3711 + }, + { + "epoch": 0.5835445774135865, + "grad_norm": 0.14786562323570251, + "learning_rate": 4.025169274774956e-05, + "loss": 1.0613, + "step": 3712 + }, + { + "epoch": 0.583701782310519, + "grad_norm": 0.1389857828617096, + "learning_rate": 4.0246796831497724e-05, + "loss": 1.1572, + "step": 3713 + }, + { + "epoch": 0.5838589872074516, + "grad_norm": 0.14071199297904968, + "learning_rate": 4.024189998402955e-05, + "loss": 1.1341, + "step": 3714 + }, + { + "epoch": 0.584016192104384, + "grad_norm": 0.1397501528263092, + "learning_rate": 4.0237002205644116e-05, + "loss": 1.1668, + "step": 3715 + }, + { + "epoch": 0.5841733970013165, + "grad_norm": 0.13448472321033478, + "learning_rate": 4.023210349664056e-05, + "loss": 1.1673, + "step": 3716 + }, + { + "epoch": 0.5843306018982491, + "grad_norm": 0.1473264843225479, + "learning_rate": 4.022720385731808e-05, + "loss": 1.0642, + "step": 3717 + }, + { + "epoch": 0.5844878067951816, + "grad_norm": 0.13606154918670654, + "learning_rate": 4.022230328797591e-05, + "loss": 1.1701, + "step": 3718 + }, + { + "epoch": 0.5846450116921142, + "grad_norm": 0.12004934996366501, + "learning_rate": 4.021740178891339e-05, + "loss": 1.137, + "step": 3719 + }, + { + "epoch": 0.5848022165890467, + "grad_norm": 0.15330874919891357, + "learning_rate": 4.021249936042986e-05, + "loss": 1.1412, + "step": 3720 + }, + { + "epoch": 0.5849594214859792, + "grad_norm": 0.13745856285095215, + "learning_rate": 4.020759600282475e-05, + "loss": 1.0378, + "step": 3721 + }, + { + "epoch": 0.5851166263829118, + "grad_norm": 0.13707605004310608, + "learning_rate": 4.020269171639754e-05, + "loss": 1.0882, + "step": 3722 + }, + { + "epoch": 0.5852738312798443, + "grad_norm": 0.1425696462392807, + "learning_rate": 4.019778650144775e-05, + "loss": 1.1595, + "step": 3723 + }, + { + "epoch": 0.5854310361767769, + "grad_norm": 0.1498594433069229, + "learning_rate": 4.0192880358275e-05, + "loss": 1.0956, + "step": 3724 + }, + { + "epoch": 0.5855882410737094, + "grad_norm": 0.1398852914571762, + "learning_rate": 4.018797328717891e-05, + "loss": 1.0655, + "step": 3725 + }, + { + "epoch": 0.5857454459706419, + "grad_norm": 0.1315324306488037, + "learning_rate": 4.018306528845921e-05, + "loss": 1.1341, + "step": 3726 + }, + { + "epoch": 0.5859026508675745, + "grad_norm": 0.136732816696167, + "learning_rate": 4.017815636241565e-05, + "loss": 1.126, + "step": 3727 + }, + { + "epoch": 0.586059855764507, + "grad_norm": 0.12884072959423065, + "learning_rate": 4.017324650934804e-05, + "loss": 1.203, + "step": 3728 + }, + { + "epoch": 0.5862170606614396, + "grad_norm": 0.1397540420293808, + "learning_rate": 4.016833572955626e-05, + "loss": 1.1054, + "step": 3729 + }, + { + "epoch": 0.5863742655583721, + "grad_norm": 0.14420150220394135, + "learning_rate": 4.016342402334026e-05, + "loss": 1.1141, + "step": 3730 + }, + { + "epoch": 0.5865314704553047, + "grad_norm": 0.13877861201763153, + "learning_rate": 4.0158511391000006e-05, + "loss": 1.0809, + "step": 3731 + }, + { + "epoch": 0.5866886753522372, + "grad_norm": 0.1263275295495987, + "learning_rate": 4.015359783283555e-05, + "loss": 1.0784, + "step": 3732 + }, + { + "epoch": 0.5868458802491697, + "grad_norm": 0.1705816388130188, + "learning_rate": 4.0148683349146985e-05, + "loss": 1.193, + "step": 3733 + }, + { + "epoch": 0.5870030851461023, + "grad_norm": 0.1465051770210266, + "learning_rate": 4.014376794023449e-05, + "loss": 1.0628, + "step": 3734 + }, + { + "epoch": 0.5871602900430348, + "grad_norm": 0.13974584639072418, + "learning_rate": 4.013885160639826e-05, + "loss": 1.1602, + "step": 3735 + }, + { + "epoch": 0.5873174949399674, + "grad_norm": 0.13794563710689545, + "learning_rate": 4.013393434793858e-05, + "loss": 1.1384, + "step": 3736 + }, + { + "epoch": 0.5874746998368999, + "grad_norm": 0.12318416684865952, + "learning_rate": 4.012901616515578e-05, + "loss": 1.1166, + "step": 3737 + }, + { + "epoch": 0.5876319047338324, + "grad_norm": 0.14967970550060272, + "learning_rate": 4.012409705835022e-05, + "loss": 1.2414, + "step": 3738 + }, + { + "epoch": 0.587789109630765, + "grad_norm": 0.13011139631271362, + "learning_rate": 4.011917702782236e-05, + "loss": 1.1052, + "step": 3739 + }, + { + "epoch": 0.5879463145276975, + "grad_norm": 0.14660027623176575, + "learning_rate": 4.0114256073872694e-05, + "loss": 1.1247, + "step": 3740 + }, + { + "epoch": 0.5881035194246301, + "grad_norm": 0.14377620816230774, + "learning_rate": 4.010933419680176e-05, + "loss": 1.2025, + "step": 3741 + }, + { + "epoch": 0.5882607243215626, + "grad_norm": 0.13705839216709137, + "learning_rate": 4.01044113969102e-05, + "loss": 1.2845, + "step": 3742 + }, + { + "epoch": 0.5884179292184951, + "grad_norm": 0.2073041945695877, + "learning_rate": 4.009948767449865e-05, + "loss": 1.0314, + "step": 3743 + }, + { + "epoch": 0.5885751341154277, + "grad_norm": 0.1414439082145691, + "learning_rate": 4.009456302986784e-05, + "loss": 1.13, + "step": 3744 + }, + { + "epoch": 0.5887323390123602, + "grad_norm": 0.1278563141822815, + "learning_rate": 4.008963746331855e-05, + "loss": 1.0588, + "step": 3745 + }, + { + "epoch": 0.5888895439092928, + "grad_norm": 0.1332569420337677, + "learning_rate": 4.008471097515163e-05, + "loss": 1.1562, + "step": 3746 + }, + { + "epoch": 0.5890467488062253, + "grad_norm": 0.15122583508491516, + "learning_rate": 4.0079783565667944e-05, + "loss": 1.067, + "step": 3747 + }, + { + "epoch": 0.5892039537031578, + "grad_norm": 0.1541803926229477, + "learning_rate": 4.0074855235168454e-05, + "loss": 1.0791, + "step": 3748 + }, + { + "epoch": 0.5893611586000904, + "grad_norm": 0.165819451212883, + "learning_rate": 4.0069925983954165e-05, + "loss": 1.1362, + "step": 3749 + }, + { + "epoch": 0.5895183634970229, + "grad_norm": 0.1396874040365219, + "learning_rate": 4.0064995812326135e-05, + "loss": 1.0843, + "step": 3750 + }, + { + "epoch": 0.5896755683939555, + "grad_norm": 0.14084576070308685, + "learning_rate": 4.006006472058548e-05, + "loss": 1.0624, + "step": 3751 + }, + { + "epoch": 0.589832773290888, + "grad_norm": 0.17363415658473969, + "learning_rate": 4.0055132709033373e-05, + "loss": 1.1095, + "step": 3752 + }, + { + "epoch": 0.5899899781878205, + "grad_norm": 0.1511961817741394, + "learning_rate": 4.005019977797103e-05, + "loss": 1.0511, + "step": 3753 + }, + { + "epoch": 0.5901471830847531, + "grad_norm": 0.17082269489765167, + "learning_rate": 4.004526592769976e-05, + "loss": 1.1251, + "step": 3754 + }, + { + "epoch": 0.5903043879816856, + "grad_norm": 0.14516089856624603, + "learning_rate": 4.004033115852088e-05, + "loss": 1.0771, + "step": 3755 + }, + { + "epoch": 0.5904615928786182, + "grad_norm": 0.18782827258110046, + "learning_rate": 4.0035395470735815e-05, + "loss": 1.0621, + "step": 3756 + }, + { + "epoch": 0.5906187977755507, + "grad_norm": 0.14073415100574493, + "learning_rate": 4.003045886464599e-05, + "loss": 1.1879, + "step": 3757 + }, + { + "epoch": 0.5907760026724832, + "grad_norm": 0.13535237312316895, + "learning_rate": 4.002552134055292e-05, + "loss": 1.1784, + "step": 3758 + }, + { + "epoch": 0.5909332075694158, + "grad_norm": 0.13585159182548523, + "learning_rate": 4.002058289875817e-05, + "loss": 1.0525, + "step": 3759 + }, + { + "epoch": 0.5910904124663483, + "grad_norm": 0.13575370609760284, + "learning_rate": 4.0015643539563383e-05, + "loss": 1.1482, + "step": 3760 + }, + { + "epoch": 0.5912476173632809, + "grad_norm": 0.16383205354213715, + "learning_rate": 4.001070326327021e-05, + "loss": 1.0601, + "step": 3761 + }, + { + "epoch": 0.5914048222602134, + "grad_norm": 0.13209950923919678, + "learning_rate": 4.00057620701804e-05, + "loss": 1.1027, + "step": 3762 + }, + { + "epoch": 0.5915620271571459, + "grad_norm": 0.16311690211296082, + "learning_rate": 4.000081996059573e-05, + "loss": 1.0555, + "step": 3763 + }, + { + "epoch": 0.5917192320540785, + "grad_norm": 0.1423267275094986, + "learning_rate": 3.999587693481804e-05, + "loss": 1.1087, + "step": 3764 + }, + { + "epoch": 0.591876436951011, + "grad_norm": 0.16528518497943878, + "learning_rate": 3.9990932993149266e-05, + "loss": 1.1218, + "step": 3765 + }, + { + "epoch": 0.5920336418479436, + "grad_norm": 0.13041681051254272, + "learning_rate": 3.9985988135891326e-05, + "loss": 1.1765, + "step": 3766 + }, + { + "epoch": 0.5921908467448761, + "grad_norm": 0.15191826224327087, + "learning_rate": 3.998104236334625e-05, + "loss": 1.043, + "step": 3767 + }, + { + "epoch": 0.5923480516418086, + "grad_norm": 0.18016381561756134, + "learning_rate": 3.997609567581611e-05, + "loss": 1.0738, + "step": 3768 + }, + { + "epoch": 0.5925052565387412, + "grad_norm": 0.14468325674533844, + "learning_rate": 3.997114807360303e-05, + "loss": 1.0948, + "step": 3769 + }, + { + "epoch": 0.5926624614356737, + "grad_norm": 0.14610640704631805, + "learning_rate": 3.996619955700918e-05, + "loss": 1.0561, + "step": 3770 + }, + { + "epoch": 0.5928196663326063, + "grad_norm": 0.1404576301574707, + "learning_rate": 3.9961250126336803e-05, + "loss": 1.1185, + "step": 3771 + }, + { + "epoch": 0.5929768712295388, + "grad_norm": 0.13908030092716217, + "learning_rate": 3.99562997818882e-05, + "loss": 1.1209, + "step": 3772 + }, + { + "epoch": 0.5931340761264713, + "grad_norm": 0.1259194165468216, + "learning_rate": 3.99513485239657e-05, + "loss": 1.1947, + "step": 3773 + }, + { + "epoch": 0.5932912810234039, + "grad_norm": 0.1661093682050705, + "learning_rate": 3.994639635287172e-05, + "loss": 0.96, + "step": 3774 + }, + { + "epoch": 0.5934484859203364, + "grad_norm": 0.13863816857337952, + "learning_rate": 3.994144326890873e-05, + "loss": 1.0553, + "step": 3775 + }, + { + "epoch": 0.593605690817269, + "grad_norm": 0.17332686483860016, + "learning_rate": 3.993648927237922e-05, + "loss": 1.0102, + "step": 3776 + }, + { + "epoch": 0.5937628957142015, + "grad_norm": 0.14273682236671448, + "learning_rate": 3.9931534363585784e-05, + "loss": 1.13, + "step": 3777 + }, + { + "epoch": 0.593920100611134, + "grad_norm": 0.17309348285198212, + "learning_rate": 3.992657854283104e-05, + "loss": 1.0525, + "step": 3778 + }, + { + "epoch": 0.5940773055080666, + "grad_norm": 0.1335354894399643, + "learning_rate": 3.992162181041766e-05, + "loss": 1.1013, + "step": 3779 + }, + { + "epoch": 0.5942345104049991, + "grad_norm": 0.12234276533126831, + "learning_rate": 3.9916664166648405e-05, + "loss": 1.097, + "step": 3780 + }, + { + "epoch": 0.5943917153019317, + "grad_norm": 0.14885415136814117, + "learning_rate": 3.991170561182605e-05, + "loss": 1.1586, + "step": 3781 + }, + { + "epoch": 0.5945489201988642, + "grad_norm": 0.15419688820838928, + "learning_rate": 3.990674614625345e-05, + "loss": 1.0608, + "step": 3782 + }, + { + "epoch": 0.5947061250957968, + "grad_norm": 0.1355455368757248, + "learning_rate": 3.9901785770233524e-05, + "loss": 1.0948, + "step": 3783 + }, + { + "epoch": 0.5948633299927293, + "grad_norm": 0.13568316400051117, + "learning_rate": 3.9896824484069205e-05, + "loss": 1.1747, + "step": 3784 + }, + { + "epoch": 0.5950205348896618, + "grad_norm": 0.13468366861343384, + "learning_rate": 3.989186228806354e-05, + "loss": 1.1428, + "step": 3785 + }, + { + "epoch": 0.5951777397865944, + "grad_norm": 0.13422173261642456, + "learning_rate": 3.988689918251958e-05, + "loss": 1.076, + "step": 3786 + }, + { + "epoch": 0.5953349446835269, + "grad_norm": 0.1521790474653244, + "learning_rate": 3.9881935167740446e-05, + "loss": 1.0135, + "step": 3787 + }, + { + "epoch": 0.5954921495804595, + "grad_norm": 0.14957469701766968, + "learning_rate": 3.9876970244029354e-05, + "loss": 1.1601, + "step": 3788 + }, + { + "epoch": 0.595649354477392, + "grad_norm": 0.14396795630455017, + "learning_rate": 3.987200441168951e-05, + "loss": 1.1368, + "step": 3789 + }, + { + "epoch": 0.5958065593743245, + "grad_norm": 0.12735560536384583, + "learning_rate": 3.986703767102423e-05, + "loss": 1.268, + "step": 3790 + }, + { + "epoch": 0.5959637642712571, + "grad_norm": 0.14245949685573578, + "learning_rate": 3.986207002233685e-05, + "loss": 1.1013, + "step": 3791 + }, + { + "epoch": 0.5961209691681896, + "grad_norm": 0.16585348546504974, + "learning_rate": 3.985710146593077e-05, + "loss": 1.1026, + "step": 3792 + }, + { + "epoch": 0.5962781740651222, + "grad_norm": 0.14120368659496307, + "learning_rate": 3.9852132002109476e-05, + "loss": 1.0614, + "step": 3793 + }, + { + "epoch": 0.5964353789620547, + "grad_norm": 0.14978669583797455, + "learning_rate": 3.984716163117646e-05, + "loss": 1.0808, + "step": 3794 + }, + { + "epoch": 0.5965925838589872, + "grad_norm": 0.1450655162334442, + "learning_rate": 3.984219035343531e-05, + "loss": 1.0358, + "step": 3795 + }, + { + "epoch": 0.5967497887559198, + "grad_norm": 0.14680932462215424, + "learning_rate": 3.983721816918963e-05, + "loss": 1.2001, + "step": 3796 + }, + { + "epoch": 0.5969069936528523, + "grad_norm": 0.14158986508846283, + "learning_rate": 3.983224507874312e-05, + "loss": 1.0586, + "step": 3797 + }, + { + "epoch": 0.5970641985497849, + "grad_norm": 0.12906032800674438, + "learning_rate": 3.982727108239952e-05, + "loss": 1.1182, + "step": 3798 + }, + { + "epoch": 0.5972214034467174, + "grad_norm": 0.12561413645744324, + "learning_rate": 3.9822296180462615e-05, + "loss": 1.1288, + "step": 3799 + }, + { + "epoch": 0.5973786083436499, + "grad_norm": 0.17582055926322937, + "learning_rate": 3.981732037323625e-05, + "loss": 1.0403, + "step": 3800 + }, + { + "epoch": 0.5975358132405825, + "grad_norm": 0.1650211364030838, + "learning_rate": 3.981234366102434e-05, + "loss": 1.1018, + "step": 3801 + }, + { + "epoch": 0.597693018137515, + "grad_norm": 0.14423269033432007, + "learning_rate": 3.9807366044130825e-05, + "loss": 1.0561, + "step": 3802 + }, + { + "epoch": 0.5978502230344476, + "grad_norm": 0.1274254322052002, + "learning_rate": 3.980238752285974e-05, + "loss": 1.1414, + "step": 3803 + }, + { + "epoch": 0.5980074279313801, + "grad_norm": 0.1247900202870369, + "learning_rate": 3.979740809751514e-05, + "loss": 1.0948, + "step": 3804 + }, + { + "epoch": 0.5981646328283126, + "grad_norm": 0.13164469599723816, + "learning_rate": 3.9792427768401153e-05, + "loss": 1.0135, + "step": 3805 + }, + { + "epoch": 0.5983218377252452, + "grad_norm": 0.12268634140491486, + "learning_rate": 3.978744653582197e-05, + "loss": 1.1238, + "step": 3806 + }, + { + "epoch": 0.5984790426221777, + "grad_norm": 0.13479411602020264, + "learning_rate": 3.97824644000818e-05, + "loss": 1.1349, + "step": 3807 + }, + { + "epoch": 0.5986362475191103, + "grad_norm": 0.15460748970508575, + "learning_rate": 3.9777481361484956e-05, + "loss": 1.0716, + "step": 3808 + }, + { + "epoch": 0.5987934524160428, + "grad_norm": 0.13242261111736298, + "learning_rate": 3.9772497420335774e-05, + "loss": 1.1981, + "step": 3809 + }, + { + "epoch": 0.5989506573129753, + "grad_norm": 0.14284269511699677, + "learning_rate": 3.976751257693865e-05, + "loss": 1.1462, + "step": 3810 + }, + { + "epoch": 0.5991078622099079, + "grad_norm": 0.14825420081615448, + "learning_rate": 3.976252683159806e-05, + "loss": 1.0104, + "step": 3811 + }, + { + "epoch": 0.5992650671068404, + "grad_norm": 0.13737694919109344, + "learning_rate": 3.975754018461848e-05, + "loss": 1.0234, + "step": 3812 + }, + { + "epoch": 0.599422272003773, + "grad_norm": 0.19531135261058807, + "learning_rate": 3.9752552636304504e-05, + "loss": 1.0967, + "step": 3813 + }, + { + "epoch": 0.5995794769007055, + "grad_norm": 0.17012959718704224, + "learning_rate": 3.9747564186960744e-05, + "loss": 1.1456, + "step": 3814 + }, + { + "epoch": 0.599736681797638, + "grad_norm": 0.1392662674188614, + "learning_rate": 3.974257483689188e-05, + "loss": 1.2154, + "step": 3815 + }, + { + "epoch": 0.5998938866945706, + "grad_norm": 0.15269821882247925, + "learning_rate": 3.9737584586402624e-05, + "loss": 1.1929, + "step": 3816 + }, + { + "epoch": 0.600051091591503, + "grad_norm": 0.13864651322364807, + "learning_rate": 3.9732593435797774e-05, + "loss": 1.1531, + "step": 3817 + }, + { + "epoch": 0.6002082964884357, + "grad_norm": 0.12776337563991547, + "learning_rate": 3.9727601385382174e-05, + "loss": 1.1405, + "step": 3818 + }, + { + "epoch": 0.6003655013853681, + "grad_norm": 0.14198248088359833, + "learning_rate": 3.9722608435460716e-05, + "loss": 1.152, + "step": 3819 + }, + { + "epoch": 0.6005227062823006, + "grad_norm": 0.12717603147029877, + "learning_rate": 3.971761458633836e-05, + "loss": 1.0789, + "step": 3820 + }, + { + "epoch": 0.6006799111792333, + "grad_norm": 0.13504749536514282, + "learning_rate": 3.97126198383201e-05, + "loss": 1.1775, + "step": 3821 + }, + { + "epoch": 0.6008371160761657, + "grad_norm": 0.14354047179222107, + "learning_rate": 3.9707624191710984e-05, + "loss": 1.2396, + "step": 3822 + }, + { + "epoch": 0.6009943209730984, + "grad_norm": 0.12898609042167664, + "learning_rate": 3.9702627646816146e-05, + "loss": 1.1291, + "step": 3823 + }, + { + "epoch": 0.6011515258700308, + "grad_norm": 0.14688414335250854, + "learning_rate": 3.969763020394076e-05, + "loss": 1.1159, + "step": 3824 + }, + { + "epoch": 0.6013087307669633, + "grad_norm": 0.1664854884147644, + "learning_rate": 3.969263186339004e-05, + "loss": 1.0544, + "step": 3825 + }, + { + "epoch": 0.601465935663896, + "grad_norm": 0.14804497361183167, + "learning_rate": 3.9687632625469264e-05, + "loss": 1.122, + "step": 3826 + }, + { + "epoch": 0.6016231405608284, + "grad_norm": 0.1647578924894333, + "learning_rate": 3.9682632490483765e-05, + "loss": 1.0659, + "step": 3827 + }, + { + "epoch": 0.601780345457761, + "grad_norm": 0.1529950052499771, + "learning_rate": 3.967763145873895e-05, + "loss": 1.1689, + "step": 3828 + }, + { + "epoch": 0.6019375503546935, + "grad_norm": 0.1398983597755432, + "learning_rate": 3.967262953054024e-05, + "loss": 1.0378, + "step": 3829 + }, + { + "epoch": 0.602094755251626, + "grad_norm": 0.1268242448568344, + "learning_rate": 3.966762670619315e-05, + "loss": 1.1719, + "step": 3830 + }, + { + "epoch": 0.6022519601485586, + "grad_norm": 0.143156036734581, + "learning_rate": 3.9662622986003226e-05, + "loss": 1.1132, + "step": 3831 + }, + { + "epoch": 0.6024091650454911, + "grad_norm": 0.13624730706214905, + "learning_rate": 3.9657618370276076e-05, + "loss": 1.2035, + "step": 3832 + }, + { + "epoch": 0.6025663699424237, + "grad_norm": 0.13455921411514282, + "learning_rate": 3.9652612859317364e-05, + "loss": 1.134, + "step": 3833 + }, + { + "epoch": 0.6027235748393562, + "grad_norm": 0.13268525898456573, + "learning_rate": 3.964760645343281e-05, + "loss": 0.9777, + "step": 3834 + }, + { + "epoch": 0.6028807797362887, + "grad_norm": 0.1300792098045349, + "learning_rate": 3.964259915292818e-05, + "loss": 1.0246, + "step": 3835 + }, + { + "epoch": 0.6030379846332213, + "grad_norm": 0.11833825707435608, + "learning_rate": 3.963759095810931e-05, + "loss": 1.1349, + "step": 3836 + }, + { + "epoch": 0.6031951895301538, + "grad_norm": 0.15196771919727325, + "learning_rate": 3.9632581869282076e-05, + "loss": 1.1064, + "step": 3837 + }, + { + "epoch": 0.6033523944270864, + "grad_norm": 0.13854347169399261, + "learning_rate": 3.962757188675241e-05, + "loss": 1.0556, + "step": 3838 + }, + { + "epoch": 0.6035095993240189, + "grad_norm": 0.12719032168388367, + "learning_rate": 3.962256101082632e-05, + "loss": 1.148, + "step": 3839 + }, + { + "epoch": 0.6036668042209515, + "grad_norm": 0.14315147697925568, + "learning_rate": 3.9617549241809826e-05, + "loss": 1.109, + "step": 3840 + }, + { + "epoch": 0.6036668042209515, + "eval_loss": 1.1036489009857178, + "eval_runtime": 2338.1192, + "eval_samples_per_second": 3.96, + "eval_steps_per_second": 1.98, + "step": 3840 + }, + { + "epoch": 0.603824009117884, + "grad_norm": 0.13646598160266876, + "learning_rate": 3.961253658000904e-05, + "loss": 1.115, + "step": 3841 + }, + { + "epoch": 0.6039812140148165, + "grad_norm": 0.12857642769813538, + "learning_rate": 3.960752302573012e-05, + "loss": 1.1206, + "step": 3842 + }, + { + "epoch": 0.6041384189117491, + "grad_norm": 0.15237180888652802, + "learning_rate": 3.960250857927928e-05, + "loss": 1.2445, + "step": 3843 + }, + { + "epoch": 0.6042956238086816, + "grad_norm": 0.1883225291967392, + "learning_rate": 3.959749324096277e-05, + "loss": 1.1447, + "step": 3844 + }, + { + "epoch": 0.6044528287056142, + "grad_norm": 0.13263137638568878, + "learning_rate": 3.959247701108691e-05, + "loss": 1.2384, + "step": 3845 + }, + { + "epoch": 0.6046100336025467, + "grad_norm": 0.12633422017097473, + "learning_rate": 3.958745988995807e-05, + "loss": 1.177, + "step": 3846 + }, + { + "epoch": 0.6047672384994792, + "grad_norm": 0.14455291628837585, + "learning_rate": 3.9582441877882695e-05, + "loss": 0.958, + "step": 3847 + }, + { + "epoch": 0.6049244433964118, + "grad_norm": 0.15000420808792114, + "learning_rate": 3.9577422975167245e-05, + "loss": 1.2108, + "step": 3848 + }, + { + "epoch": 0.6050816482933443, + "grad_norm": 0.13494277000427246, + "learning_rate": 3.957240318211826e-05, + "loss": 1.109, + "step": 3849 + }, + { + "epoch": 0.6052388531902769, + "grad_norm": 0.12015588581562042, + "learning_rate": 3.9567382499042337e-05, + "loss": 1.1693, + "step": 3850 + }, + { + "epoch": 0.6053960580872094, + "grad_norm": 0.13013339042663574, + "learning_rate": 3.956236092624611e-05, + "loss": 1.2344, + "step": 3851 + }, + { + "epoch": 0.6055532629841419, + "grad_norm": 0.1262013465166092, + "learning_rate": 3.955733846403629e-05, + "loss": 1.069, + "step": 3852 + }, + { + "epoch": 0.6057104678810745, + "grad_norm": 0.12954431772232056, + "learning_rate": 3.9552315112719626e-05, + "loss": 1.1141, + "step": 3853 + }, + { + "epoch": 0.605867672778007, + "grad_norm": 0.12079882621765137, + "learning_rate": 3.954729087260291e-05, + "loss": 1.0836, + "step": 3854 + }, + { + "epoch": 0.6060248776749396, + "grad_norm": 0.13711245357990265, + "learning_rate": 3.9542265743993036e-05, + "loss": 1.2398, + "step": 3855 + }, + { + "epoch": 0.6061820825718721, + "grad_norm": 0.12721870839595795, + "learning_rate": 3.9537239727196886e-05, + "loss": 1.2091, + "step": 3856 + }, + { + "epoch": 0.6063392874688046, + "grad_norm": 0.12898989021778107, + "learning_rate": 3.9532212822521454e-05, + "loss": 1.0693, + "step": 3857 + }, + { + "epoch": 0.6064964923657372, + "grad_norm": 0.12627077102661133, + "learning_rate": 3.952718503027375e-05, + "loss": 1.1112, + "step": 3858 + }, + { + "epoch": 0.6066536972626697, + "grad_norm": 0.15694749355316162, + "learning_rate": 3.9522156350760855e-05, + "loss": 1.0982, + "step": 3859 + }, + { + "epoch": 0.6068109021596023, + "grad_norm": 0.137521430850029, + "learning_rate": 3.9517126784289896e-05, + "loss": 1.1493, + "step": 3860 + }, + { + "epoch": 0.6069681070565348, + "grad_norm": 0.13872124254703522, + "learning_rate": 3.9512096331168076e-05, + "loss": 1.1129, + "step": 3861 + }, + { + "epoch": 0.6071253119534673, + "grad_norm": 0.18209943175315857, + "learning_rate": 3.9507064991702625e-05, + "loss": 1.1326, + "step": 3862 + }, + { + "epoch": 0.6072825168503999, + "grad_norm": 0.1378040462732315, + "learning_rate": 3.950203276620084e-05, + "loss": 1.1655, + "step": 3863 + }, + { + "epoch": 0.6074397217473324, + "grad_norm": 0.13413548469543457, + "learning_rate": 3.949699965497007e-05, + "loss": 1.1299, + "step": 3864 + }, + { + "epoch": 0.607596926644265, + "grad_norm": 0.12779201567173004, + "learning_rate": 3.949196565831772e-05, + "loss": 1.2032, + "step": 3865 + }, + { + "epoch": 0.6077541315411975, + "grad_norm": 0.1494266390800476, + "learning_rate": 3.9486930776551246e-05, + "loss": 1.1007, + "step": 3866 + }, + { + "epoch": 0.60791133643813, + "grad_norm": 0.12856054306030273, + "learning_rate": 3.948189500997816e-05, + "loss": 1.0411, + "step": 3867 + }, + { + "epoch": 0.6080685413350626, + "grad_norm": 0.13846808671951294, + "learning_rate": 3.947685835890602e-05, + "loss": 1.1598, + "step": 3868 + }, + { + "epoch": 0.6082257462319951, + "grad_norm": 0.13360343873500824, + "learning_rate": 3.947182082364246e-05, + "loss": 1.1168, + "step": 3869 + }, + { + "epoch": 0.6083829511289277, + "grad_norm": 0.12072020769119263, + "learning_rate": 3.946678240449515e-05, + "loss": 1.1159, + "step": 3870 + }, + { + "epoch": 0.6085401560258602, + "grad_norm": 0.12124241888523102, + "learning_rate": 3.94617431017718e-05, + "loss": 1.0516, + "step": 3871 + }, + { + "epoch": 0.6086973609227927, + "grad_norm": 0.14225207269191742, + "learning_rate": 3.945670291578021e-05, + "loss": 1.1411, + "step": 3872 + }, + { + "epoch": 0.6088545658197253, + "grad_norm": 0.1412978172302246, + "learning_rate": 3.9451661846828216e-05, + "loss": 1.139, + "step": 3873 + }, + { + "epoch": 0.6090117707166578, + "grad_norm": 0.12855301797389984, + "learning_rate": 3.9446619895223696e-05, + "loss": 1.1952, + "step": 3874 + }, + { + "epoch": 0.6091689756135904, + "grad_norm": 0.1284981071949005, + "learning_rate": 3.94415770612746e-05, + "loss": 1.0926, + "step": 3875 + }, + { + "epoch": 0.6093261805105229, + "grad_norm": 0.14576299488544464, + "learning_rate": 3.943653334528892e-05, + "loss": 0.9897, + "step": 3876 + }, + { + "epoch": 0.6094833854074554, + "grad_norm": 0.12897971272468567, + "learning_rate": 3.9431488747574715e-05, + "loss": 1.1691, + "step": 3877 + }, + { + "epoch": 0.609640590304388, + "grad_norm": 0.13588838279247284, + "learning_rate": 3.9426443268440086e-05, + "loss": 1.0245, + "step": 3878 + }, + { + "epoch": 0.6097977952013205, + "grad_norm": 0.12872371077537537, + "learning_rate": 3.942139690819319e-05, + "loss": 1.1282, + "step": 3879 + }, + { + "epoch": 0.6099550000982531, + "grad_norm": 0.12690573930740356, + "learning_rate": 3.9416349667142236e-05, + "loss": 1.0472, + "step": 3880 + }, + { + "epoch": 0.6101122049951856, + "grad_norm": 0.14904765784740448, + "learning_rate": 3.94113015455955e-05, + "loss": 1.1463, + "step": 3881 + }, + { + "epoch": 0.6102694098921181, + "grad_norm": 0.1351795345544815, + "learning_rate": 3.94062525438613e-05, + "loss": 1.2001, + "step": 3882 + }, + { + "epoch": 0.6104266147890507, + "grad_norm": 0.15208245813846588, + "learning_rate": 3.9401202662248004e-05, + "loss": 1.1609, + "step": 3883 + }, + { + "epoch": 0.6105838196859832, + "grad_norm": 0.15503764152526855, + "learning_rate": 3.939615190106404e-05, + "loss": 1.1118, + "step": 3884 + }, + { + "epoch": 0.6107410245829158, + "grad_norm": 0.1458703875541687, + "learning_rate": 3.93911002606179e-05, + "loss": 1.1229, + "step": 3885 + }, + { + "epoch": 0.6108982294798483, + "grad_norm": 0.18219946324825287, + "learning_rate": 3.9386047741218096e-05, + "loss": 1.0334, + "step": 3886 + }, + { + "epoch": 0.6110554343767808, + "grad_norm": 0.1569867730140686, + "learning_rate": 3.938099434317324e-05, + "loss": 1.0899, + "step": 3887 + }, + { + "epoch": 0.6112126392737134, + "grad_norm": 0.11686835438013077, + "learning_rate": 3.937594006679197e-05, + "loss": 1.1626, + "step": 3888 + }, + { + "epoch": 0.6113698441706459, + "grad_norm": 0.11868549883365631, + "learning_rate": 3.9370884912382965e-05, + "loss": 1.1415, + "step": 3889 + }, + { + "epoch": 0.6115270490675785, + "grad_norm": 0.14847975969314575, + "learning_rate": 3.9365828880254994e-05, + "loss": 1.1623, + "step": 3890 + }, + { + "epoch": 0.611684253964511, + "grad_norm": 0.11808676272630692, + "learning_rate": 3.936077197071686e-05, + "loss": 1.0998, + "step": 3891 + }, + { + "epoch": 0.6118414588614436, + "grad_norm": 0.1390334814786911, + "learning_rate": 3.935571418407741e-05, + "loss": 1.0401, + "step": 3892 + }, + { + "epoch": 0.6119986637583761, + "grad_norm": 0.13492408394813538, + "learning_rate": 3.935065552064555e-05, + "loss": 1.0843, + "step": 3893 + }, + { + "epoch": 0.6121558686553086, + "grad_norm": 0.152793750166893, + "learning_rate": 3.934559598073025e-05, + "loss": 1.086, + "step": 3894 + }, + { + "epoch": 0.6123130735522412, + "grad_norm": 0.13042302429676056, + "learning_rate": 3.9340535564640534e-05, + "loss": 1.1583, + "step": 3895 + }, + { + "epoch": 0.6124702784491737, + "grad_norm": 0.14149799942970276, + "learning_rate": 3.933547427268547e-05, + "loss": 1.0554, + "step": 3896 + }, + { + "epoch": 0.6126274833461063, + "grad_norm": 0.1303536593914032, + "learning_rate": 3.933041210517419e-05, + "loss": 1.0976, + "step": 3897 + }, + { + "epoch": 0.6127846882430388, + "grad_norm": 0.15663042664527893, + "learning_rate": 3.932534906241585e-05, + "loss": 1.1031, + "step": 3898 + }, + { + "epoch": 0.6129418931399713, + "grad_norm": 0.13524563610553741, + "learning_rate": 3.9320285144719695e-05, + "loss": 1.064, + "step": 3899 + }, + { + "epoch": 0.6130990980369039, + "grad_norm": 0.1357249617576599, + "learning_rate": 3.931522035239501e-05, + "loss": 1.1611, + "step": 3900 + }, + { + "epoch": 0.6132563029338364, + "grad_norm": 0.13968004286289215, + "learning_rate": 3.931015468575113e-05, + "loss": 1.1555, + "step": 3901 + }, + { + "epoch": 0.613413507830769, + "grad_norm": 0.13817796111106873, + "learning_rate": 3.9305088145097447e-05, + "loss": 1.036, + "step": 3902 + }, + { + "epoch": 0.6135707127277015, + "grad_norm": 0.13282924890518188, + "learning_rate": 3.930002073074341e-05, + "loss": 1.1606, + "step": 3903 + }, + { + "epoch": 0.613727917624634, + "grad_norm": 0.15224741399288177, + "learning_rate": 3.9294952442998514e-05, + "loss": 1.2341, + "step": 3904 + }, + { + "epoch": 0.6138851225215666, + "grad_norm": 0.1290319263935089, + "learning_rate": 3.928988328217231e-05, + "loss": 1.2607, + "step": 3905 + }, + { + "epoch": 0.6140423274184991, + "grad_norm": 0.17039501667022705, + "learning_rate": 3.9284813248574405e-05, + "loss": 0.9922, + "step": 3906 + }, + { + "epoch": 0.6141995323154317, + "grad_norm": 0.14699743688106537, + "learning_rate": 3.927974234251446e-05, + "loss": 1.0983, + "step": 3907 + }, + { + "epoch": 0.6143567372123642, + "grad_norm": 0.15270787477493286, + "learning_rate": 3.927467056430218e-05, + "loss": 1.1628, + "step": 3908 + }, + { + "epoch": 0.6145139421092967, + "grad_norm": 0.12988269329071045, + "learning_rate": 3.9269597914247335e-05, + "loss": 1.1579, + "step": 3909 + }, + { + "epoch": 0.6146711470062293, + "grad_norm": 0.1486128568649292, + "learning_rate": 3.926452439265974e-05, + "loss": 1.2035, + "step": 3910 + }, + { + "epoch": 0.6148283519031618, + "grad_norm": 0.1319809854030609, + "learning_rate": 3.925944999984927e-05, + "loss": 1.1176, + "step": 3911 + }, + { + "epoch": 0.6149855568000944, + "grad_norm": 0.1273583173751831, + "learning_rate": 3.925437473612585e-05, + "loss": 1.1432, + "step": 3912 + }, + { + "epoch": 0.6151427616970269, + "grad_norm": 0.13283202052116394, + "learning_rate": 3.924929860179946e-05, + "loss": 1.0856, + "step": 3913 + }, + { + "epoch": 0.6152999665939594, + "grad_norm": 0.12940795719623566, + "learning_rate": 3.924422159718011e-05, + "loss": 1.0855, + "step": 3914 + }, + { + "epoch": 0.615457171490892, + "grad_norm": 0.1384333074092865, + "learning_rate": 3.9239143722577915e-05, + "loss": 1.0048, + "step": 3915 + }, + { + "epoch": 0.6156143763878245, + "grad_norm": 0.1223512664437294, + "learning_rate": 3.923406497830299e-05, + "loss": 1.1227, + "step": 3916 + }, + { + "epoch": 0.6157715812847571, + "grad_norm": 0.1405000239610672, + "learning_rate": 3.922898536466554e-05, + "loss": 1.0386, + "step": 3917 + }, + { + "epoch": 0.6159287861816896, + "grad_norm": 0.1473666876554489, + "learning_rate": 3.92239048819758e-05, + "loss": 1.0323, + "step": 3918 + }, + { + "epoch": 0.616085991078622, + "grad_norm": 0.1441134363412857, + "learning_rate": 3.921882353054407e-05, + "loss": 1.0195, + "step": 3919 + }, + { + "epoch": 0.6162431959755547, + "grad_norm": 0.13485924899578094, + "learning_rate": 3.9213741310680686e-05, + "loss": 1.0685, + "step": 3920 + }, + { + "epoch": 0.6164004008724872, + "grad_norm": 0.15113161504268646, + "learning_rate": 3.920865822269607e-05, + "loss": 0.9724, + "step": 3921 + }, + { + "epoch": 0.6165576057694198, + "grad_norm": 0.153519868850708, + "learning_rate": 3.920357426690067e-05, + "loss": 1.1156, + "step": 3922 + }, + { + "epoch": 0.6167148106663523, + "grad_norm": 0.13160768151283264, + "learning_rate": 3.9198489443605e-05, + "loss": 1.1649, + "step": 3923 + }, + { + "epoch": 0.6168720155632847, + "grad_norm": 0.14783591032028198, + "learning_rate": 3.919340375311961e-05, + "loss": 1.0885, + "step": 3924 + }, + { + "epoch": 0.6170292204602174, + "grad_norm": 0.12354061752557755, + "learning_rate": 3.918831719575512e-05, + "loss": 1.1117, + "step": 3925 + }, + { + "epoch": 0.6171864253571498, + "grad_norm": 0.15359634160995483, + "learning_rate": 3.91832297718222e-05, + "loss": 1.1198, + "step": 3926 + }, + { + "epoch": 0.6173436302540825, + "grad_norm": 0.13819736242294312, + "learning_rate": 3.917814148163158e-05, + "loss": 1.1171, + "step": 3927 + }, + { + "epoch": 0.617500835151015, + "grad_norm": 0.13880357146263123, + "learning_rate": 3.917305232549401e-05, + "loss": 1.1716, + "step": 3928 + }, + { + "epoch": 0.6176580400479474, + "grad_norm": 0.13249677419662476, + "learning_rate": 3.916796230372034e-05, + "loss": 1.0433, + "step": 3929 + }, + { + "epoch": 0.61781524494488, + "grad_norm": 0.1238926574587822, + "learning_rate": 3.916287141662142e-05, + "loss": 1.1879, + "step": 3930 + }, + { + "epoch": 0.6179724498418125, + "grad_norm": 0.15673570334911346, + "learning_rate": 3.915777966450821e-05, + "loss": 1.1016, + "step": 3931 + }, + { + "epoch": 0.6181296547387451, + "grad_norm": 0.1377524733543396, + "learning_rate": 3.9152687047691695e-05, + "loss": 1.1478, + "step": 3932 + }, + { + "epoch": 0.6182868596356776, + "grad_norm": 0.13096009194850922, + "learning_rate": 3.914759356648289e-05, + "loss": 1.1211, + "step": 3933 + }, + { + "epoch": 0.6184440645326101, + "grad_norm": 0.14554426074028015, + "learning_rate": 3.9142499221192894e-05, + "loss": 1.1488, + "step": 3934 + }, + { + "epoch": 0.6186012694295427, + "grad_norm": 0.20322880148887634, + "learning_rate": 3.9137404012132866e-05, + "loss": 0.9873, + "step": 3935 + }, + { + "epoch": 0.6187584743264752, + "grad_norm": 0.1772369146347046, + "learning_rate": 3.913230793961399e-05, + "loss": 1.1642, + "step": 3936 + }, + { + "epoch": 0.6189156792234078, + "grad_norm": 0.1685638576745987, + "learning_rate": 3.91272110039475e-05, + "loss": 1.1079, + "step": 3937 + }, + { + "epoch": 0.6190728841203403, + "grad_norm": 0.14887399971485138, + "learning_rate": 3.912211320544473e-05, + "loss": 1.1486, + "step": 3938 + }, + { + "epoch": 0.6192300890172728, + "grad_norm": 0.13565029203891754, + "learning_rate": 3.911701454441701e-05, + "loss": 1.0728, + "step": 3939 + }, + { + "epoch": 0.6193872939142054, + "grad_norm": 0.13880778849124908, + "learning_rate": 3.911191502117576e-05, + "loss": 1.1221, + "step": 3940 + }, + { + "epoch": 0.6195444988111379, + "grad_norm": 0.13235561549663544, + "learning_rate": 3.910681463603242e-05, + "loss": 1.0952, + "step": 3941 + }, + { + "epoch": 0.6197017037080705, + "grad_norm": 0.16491159796714783, + "learning_rate": 3.9101713389298525e-05, + "loss": 1.0765, + "step": 3942 + }, + { + "epoch": 0.619858908605003, + "grad_norm": 0.14132866263389587, + "learning_rate": 3.909661128128562e-05, + "loss": 1.2179, + "step": 3943 + }, + { + "epoch": 0.6200161135019356, + "grad_norm": 0.13405713438987732, + "learning_rate": 3.909150831230534e-05, + "loss": 1.1696, + "step": 3944 + }, + { + "epoch": 0.6201733183988681, + "grad_norm": 0.1398204267024994, + "learning_rate": 3.908640448266934e-05, + "loss": 0.9907, + "step": 3945 + }, + { + "epoch": 0.6203305232958006, + "grad_norm": 0.16295889019966125, + "learning_rate": 3.908129979268936e-05, + "loss": 1.0666, + "step": 3946 + }, + { + "epoch": 0.6204877281927332, + "grad_norm": 0.14149579405784607, + "learning_rate": 3.9076194242677156e-05, + "loss": 1.1172, + "step": 3947 + }, + { + "epoch": 0.6206449330896657, + "grad_norm": 0.16177891194820404, + "learning_rate": 3.907108783294457e-05, + "loss": 1.0195, + "step": 3948 + }, + { + "epoch": 0.6208021379865983, + "grad_norm": 0.12914012372493744, + "learning_rate": 3.906598056380347e-05, + "loss": 1.2259, + "step": 3949 + }, + { + "epoch": 0.6209593428835308, + "grad_norm": 0.14361891150474548, + "learning_rate": 3.9060872435565796e-05, + "loss": 0.9796, + "step": 3950 + }, + { + "epoch": 0.6211165477804633, + "grad_norm": 0.12880343198776245, + "learning_rate": 3.905576344854354e-05, + "loss": 1.1487, + "step": 3951 + }, + { + "epoch": 0.6212737526773959, + "grad_norm": 0.13725869357585907, + "learning_rate": 3.9050653603048725e-05, + "loss": 1.1024, + "step": 3952 + }, + { + "epoch": 0.6214309575743284, + "grad_norm": 0.13440468907356262, + "learning_rate": 3.904554289939345e-05, + "loss": 1.171, + "step": 3953 + }, + { + "epoch": 0.621588162471261, + "grad_norm": 0.12612858414649963, + "learning_rate": 3.904043133788984e-05, + "loss": 1.0411, + "step": 3954 + }, + { + "epoch": 0.6217453673681935, + "grad_norm": 0.12622110545635223, + "learning_rate": 3.903531891885012e-05, + "loss": 1.1089, + "step": 3955 + }, + { + "epoch": 0.621902572265126, + "grad_norm": 0.1516236960887909, + "learning_rate": 3.9030205642586514e-05, + "loss": 1.242, + "step": 3956 + }, + { + "epoch": 0.6220597771620586, + "grad_norm": 0.1355689913034439, + "learning_rate": 3.9025091509411336e-05, + "loss": 1.0582, + "step": 3957 + }, + { + "epoch": 0.6222169820589911, + "grad_norm": 0.15789024531841278, + "learning_rate": 3.901997651963692e-05, + "loss": 1.1701, + "step": 3958 + }, + { + "epoch": 0.6223741869559237, + "grad_norm": 0.1915074735879898, + "learning_rate": 3.901486067357569e-05, + "loss": 1.1742, + "step": 3959 + }, + { + "epoch": 0.6225313918528562, + "grad_norm": 0.1438111662864685, + "learning_rate": 3.900974397154009e-05, + "loss": 1.1453, + "step": 3960 + }, + { + "epoch": 0.6226885967497887, + "grad_norm": 0.12249389290809631, + "learning_rate": 3.900462641384264e-05, + "loss": 1.0477, + "step": 3961 + }, + { + "epoch": 0.6228458016467213, + "grad_norm": 0.15517558157444, + "learning_rate": 3.899950800079588e-05, + "loss": 1.0476, + "step": 3962 + }, + { + "epoch": 0.6230030065436538, + "grad_norm": 0.14869697391986847, + "learning_rate": 3.899438873271244e-05, + "loss": 1.1311, + "step": 3963 + }, + { + "epoch": 0.6231602114405864, + "grad_norm": 0.1483510434627533, + "learning_rate": 3.8989268609904985e-05, + "loss": 1.0019, + "step": 3964 + }, + { + "epoch": 0.6233174163375189, + "grad_norm": 0.13164103031158447, + "learning_rate": 3.898414763268622e-05, + "loss": 1.0782, + "step": 3965 + }, + { + "epoch": 0.6234746212344514, + "grad_norm": 0.14316119253635406, + "learning_rate": 3.8979025801368936e-05, + "loss": 1.0564, + "step": 3966 + }, + { + "epoch": 0.623631826131384, + "grad_norm": 0.18632963299751282, + "learning_rate": 3.8973903116265936e-05, + "loss": 0.9668, + "step": 3967 + }, + { + "epoch": 0.6237890310283165, + "grad_norm": 0.1400153636932373, + "learning_rate": 3.8968779577690105e-05, + "loss": 1.2211, + "step": 3968 + }, + { + "epoch": 0.6239462359252491, + "grad_norm": 0.1399480402469635, + "learning_rate": 3.896365518595436e-05, + "loss": 1.1038, + "step": 3969 + }, + { + "epoch": 0.6241034408221816, + "grad_norm": 0.14281897246837616, + "learning_rate": 3.895852994137168e-05, + "loss": 1.0784, + "step": 3970 + }, + { + "epoch": 0.6242606457191141, + "grad_norm": 0.1383058726787567, + "learning_rate": 3.8953403844255106e-05, + "loss": 1.0747, + "step": 3971 + }, + { + "epoch": 0.6244178506160467, + "grad_norm": 0.13327376544475555, + "learning_rate": 3.894827689491772e-05, + "loss": 1.16, + "step": 3972 + }, + { + "epoch": 0.6245750555129792, + "grad_norm": 0.12594720721244812, + "learning_rate": 3.8943149093672646e-05, + "loss": 1.1264, + "step": 3973 + }, + { + "epoch": 0.6247322604099118, + "grad_norm": 0.12865445017814636, + "learning_rate": 3.8938020440833066e-05, + "loss": 1.0779, + "step": 3974 + }, + { + "epoch": 0.6248894653068443, + "grad_norm": 0.13021129369735718, + "learning_rate": 3.893289093671224e-05, + "loss": 1.2198, + "step": 3975 + }, + { + "epoch": 0.6250466702037768, + "grad_norm": 0.15246430039405823, + "learning_rate": 3.8927760581623455e-05, + "loss": 1.229, + "step": 3976 + }, + { + "epoch": 0.6252038751007094, + "grad_norm": 0.1328427493572235, + "learning_rate": 3.892262937588002e-05, + "loss": 1.0821, + "step": 3977 + }, + { + "epoch": 0.6253610799976419, + "grad_norm": 0.14552165567874908, + "learning_rate": 3.8917497319795385e-05, + "loss": 1.0284, + "step": 3978 + }, + { + "epoch": 0.6255182848945745, + "grad_norm": 0.13598152995109558, + "learning_rate": 3.891236441368294e-05, + "loss": 1.204, + "step": 3979 + }, + { + "epoch": 0.625675489791507, + "grad_norm": 0.12847700715065002, + "learning_rate": 3.890723065785622e-05, + "loss": 0.9571, + "step": 3980 + }, + { + "epoch": 0.6258326946884395, + "grad_norm": 0.1271078735589981, + "learning_rate": 3.890209605262877e-05, + "loss": 1.067, + "step": 3981 + }, + { + "epoch": 0.6259898995853721, + "grad_norm": 0.12795038521289825, + "learning_rate": 3.889696059831418e-05, + "loss": 1.1389, + "step": 3982 + }, + { + "epoch": 0.6261471044823046, + "grad_norm": 0.14186857640743256, + "learning_rate": 3.8891824295226115e-05, + "loss": 1.0729, + "step": 3983 + }, + { + "epoch": 0.6263043093792372, + "grad_norm": 0.16656219959259033, + "learning_rate": 3.8886687143678275e-05, + "loss": 1.0958, + "step": 3984 + }, + { + "epoch": 0.6264615142761697, + "grad_norm": 0.13067491352558136, + "learning_rate": 3.888154914398442e-05, + "loss": 1.1101, + "step": 3985 + }, + { + "epoch": 0.6266187191731022, + "grad_norm": 0.14706555008888245, + "learning_rate": 3.887641029645836e-05, + "loss": 1.1012, + "step": 3986 + }, + { + "epoch": 0.6267759240700348, + "grad_norm": 0.14502152800559998, + "learning_rate": 3.887127060141396e-05, + "loss": 1.1221, + "step": 3987 + }, + { + "epoch": 0.6269331289669673, + "grad_norm": 0.13042514026165009, + "learning_rate": 3.8866130059165115e-05, + "loss": 1.2059, + "step": 3988 + }, + { + "epoch": 0.6270903338638999, + "grad_norm": 0.13619203865528107, + "learning_rate": 3.886098867002581e-05, + "loss": 1.0648, + "step": 3989 + }, + { + "epoch": 0.6272475387608324, + "grad_norm": 0.12742075324058533, + "learning_rate": 3.885584643431006e-05, + "loss": 1.1937, + "step": 3990 + }, + { + "epoch": 0.6274047436577649, + "grad_norm": 0.15529866516590118, + "learning_rate": 3.8850703352331925e-05, + "loss": 1.084, + "step": 3991 + }, + { + "epoch": 0.6275619485546975, + "grad_norm": 0.16348528861999512, + "learning_rate": 3.8845559424405534e-05, + "loss": 1.1566, + "step": 3992 + }, + { + "epoch": 0.62771915345163, + "grad_norm": 0.11867035925388336, + "learning_rate": 3.884041465084504e-05, + "loss": 0.9744, + "step": 3993 + }, + { + "epoch": 0.6278763583485626, + "grad_norm": 0.14471690356731415, + "learning_rate": 3.8835269031964685e-05, + "loss": 1.1047, + "step": 3994 + }, + { + "epoch": 0.6280335632454951, + "grad_norm": 0.14537517726421356, + "learning_rate": 3.883012256807873e-05, + "loss": 1.1548, + "step": 3995 + }, + { + "epoch": 0.6281907681424277, + "grad_norm": 0.1430378556251526, + "learning_rate": 3.882497525950152e-05, + "loss": 1.0705, + "step": 3996 + }, + { + "epoch": 0.6283479730393602, + "grad_norm": 0.1448555290699005, + "learning_rate": 3.881982710654741e-05, + "loss": 1.1347, + "step": 3997 + }, + { + "epoch": 0.6285051779362927, + "grad_norm": 0.14756466448307037, + "learning_rate": 3.881467810953085e-05, + "loss": 1.185, + "step": 3998 + }, + { + "epoch": 0.6286623828332253, + "grad_norm": 0.13334186375141144, + "learning_rate": 3.8809528268766304e-05, + "loss": 1.2077, + "step": 3999 + }, + { + "epoch": 0.6288195877301578, + "grad_norm": 0.16169632971286774, + "learning_rate": 3.8804377584568324e-05, + "loss": 1.0994, + "step": 4000 + }, + { + "epoch": 0.6288195877301578, + "eval_loss": 1.1022133827209473, + "eval_runtime": 2321.8951, + "eval_samples_per_second": 3.987, + "eval_steps_per_second": 1.994, + "step": 4000 + }, + { + "epoch": 0.6289767926270904, + "grad_norm": 0.12798309326171875, + "learning_rate": 3.879922605725148e-05, + "loss": 1.2744, + "step": 4001 + }, + { + "epoch": 0.6291339975240229, + "grad_norm": 0.1319848597049713, + "learning_rate": 3.8794073687130414e-05, + "loss": 1.0977, + "step": 4002 + }, + { + "epoch": 0.6292912024209554, + "grad_norm": 0.14508125185966492, + "learning_rate": 3.87889204745198e-05, + "loss": 1.0866, + "step": 4003 + }, + { + "epoch": 0.629448407317888, + "grad_norm": 0.1388920098543167, + "learning_rate": 3.878376641973439e-05, + "loss": 1.1524, + "step": 4004 + }, + { + "epoch": 0.6296056122148205, + "grad_norm": 0.14365117251873016, + "learning_rate": 3.8778611523088976e-05, + "loss": 1.1457, + "step": 4005 + }, + { + "epoch": 0.6297628171117531, + "grad_norm": 0.16236121952533722, + "learning_rate": 3.877345578489839e-05, + "loss": 1.13, + "step": 4006 + }, + { + "epoch": 0.6299200220086856, + "grad_norm": 0.14283150434494019, + "learning_rate": 3.876829920547753e-05, + "loss": 1.0183, + "step": 4007 + }, + { + "epoch": 0.6300772269056181, + "grad_norm": 0.16025055944919586, + "learning_rate": 3.876314178514134e-05, + "loss": 1.1702, + "step": 4008 + }, + { + "epoch": 0.6302344318025507, + "grad_norm": 0.15556633472442627, + "learning_rate": 3.875798352420482e-05, + "loss": 1.1579, + "step": 4009 + }, + { + "epoch": 0.6303916366994832, + "grad_norm": 0.16086506843566895, + "learning_rate": 3.875282442298301e-05, + "loss": 1.2713, + "step": 4010 + }, + { + "epoch": 0.6305488415964158, + "grad_norm": 0.14218272268772125, + "learning_rate": 3.8747664481791e-05, + "loss": 1.1606, + "step": 4011 + }, + { + "epoch": 0.6307060464933483, + "grad_norm": 0.13343515992164612, + "learning_rate": 3.874250370094397e-05, + "loss": 0.9663, + "step": 4012 + }, + { + "epoch": 0.6308632513902808, + "grad_norm": 0.13490377366542816, + "learning_rate": 3.873734208075709e-05, + "loss": 1.0989, + "step": 4013 + }, + { + "epoch": 0.6310204562872134, + "grad_norm": 0.13085541129112244, + "learning_rate": 3.873217962154562e-05, + "loss": 1.0909, + "step": 4014 + }, + { + "epoch": 0.6311776611841459, + "grad_norm": 0.182506263256073, + "learning_rate": 3.872701632362486e-05, + "loss": 1.1672, + "step": 4015 + }, + { + "epoch": 0.6313348660810785, + "grad_norm": 0.18165533244609833, + "learning_rate": 3.8721852187310184e-05, + "loss": 1.0913, + "step": 4016 + }, + { + "epoch": 0.631492070978011, + "grad_norm": 0.2167358249425888, + "learning_rate": 3.871668721291698e-05, + "loss": 1.1286, + "step": 4017 + }, + { + "epoch": 0.6316492758749435, + "grad_norm": 0.13552582263946533, + "learning_rate": 3.871152140076071e-05, + "loss": 1.0448, + "step": 4018 + }, + { + "epoch": 0.6318064807718761, + "grad_norm": 0.1412697434425354, + "learning_rate": 3.8706354751156875e-05, + "loss": 1.0886, + "step": 4019 + }, + { + "epoch": 0.6319636856688086, + "grad_norm": 0.18857620656490326, + "learning_rate": 3.8701187264421046e-05, + "loss": 1.1056, + "step": 4020 + }, + { + "epoch": 0.6321208905657412, + "grad_norm": 0.13617004454135895, + "learning_rate": 3.8696018940868835e-05, + "loss": 1.2482, + "step": 4021 + }, + { + "epoch": 0.6322780954626737, + "grad_norm": 0.15233376622200012, + "learning_rate": 3.869084978081589e-05, + "loss": 1.0117, + "step": 4022 + }, + { + "epoch": 0.6324353003596062, + "grad_norm": 0.1499616652727127, + "learning_rate": 3.868567978457793e-05, + "loss": 1.1485, + "step": 4023 + }, + { + "epoch": 0.6325925052565388, + "grad_norm": 0.15782010555267334, + "learning_rate": 3.8680508952470726e-05, + "loss": 0.9957, + "step": 4024 + }, + { + "epoch": 0.6327497101534713, + "grad_norm": 0.1355133205652237, + "learning_rate": 3.867533728481008e-05, + "loss": 1.1428, + "step": 4025 + }, + { + "epoch": 0.6329069150504039, + "grad_norm": 0.14103074371814728, + "learning_rate": 3.8670164781911864e-05, + "loss": 1.191, + "step": 4026 + }, + { + "epoch": 0.6330641199473364, + "grad_norm": 0.15501828491687775, + "learning_rate": 3.8664991444091994e-05, + "loss": 1.0547, + "step": 4027 + }, + { + "epoch": 0.6332213248442689, + "grad_norm": 0.16231434047222137, + "learning_rate": 3.865981727166644e-05, + "loss": 1.0961, + "step": 4028 + }, + { + "epoch": 0.6333785297412015, + "grad_norm": 0.13422849774360657, + "learning_rate": 3.8654642264951224e-05, + "loss": 1.1408, + "step": 4029 + }, + { + "epoch": 0.633535734638134, + "grad_norm": 0.13933904469013214, + "learning_rate": 3.86494664242624e-05, + "loss": 1.1404, + "step": 4030 + }, + { + "epoch": 0.6336929395350666, + "grad_norm": 0.12841267883777618, + "learning_rate": 3.8644289749916116e-05, + "loss": 0.9775, + "step": 4031 + }, + { + "epoch": 0.633850144431999, + "grad_norm": 0.13102753460407257, + "learning_rate": 3.863911224222851e-05, + "loss": 1.0729, + "step": 4032 + }, + { + "epoch": 0.6340073493289315, + "grad_norm": 0.12041983008384705, + "learning_rate": 3.8633933901515834e-05, + "loss": 1.1076, + "step": 4033 + }, + { + "epoch": 0.6341645542258642, + "grad_norm": 0.14172932505607605, + "learning_rate": 3.862875472809434e-05, + "loss": 1.1153, + "step": 4034 + }, + { + "epoch": 0.6343217591227966, + "grad_norm": 0.15133321285247803, + "learning_rate": 3.862357472228037e-05, + "loss": 1.0577, + "step": 4035 + }, + { + "epoch": 0.6344789640197293, + "grad_norm": 0.13426773250102997, + "learning_rate": 3.861839388439029e-05, + "loss": 1.1876, + "step": 4036 + }, + { + "epoch": 0.6346361689166617, + "grad_norm": 0.1450241208076477, + "learning_rate": 3.861321221474052e-05, + "loss": 1.1069, + "step": 4037 + }, + { + "epoch": 0.6347933738135942, + "grad_norm": 0.15062105655670166, + "learning_rate": 3.8608029713647545e-05, + "loss": 1.1346, + "step": 4038 + }, + { + "epoch": 0.6349505787105268, + "grad_norm": 0.15360552072525024, + "learning_rate": 3.860284638142789e-05, + "loss": 1.1386, + "step": 4039 + }, + { + "epoch": 0.6351077836074593, + "grad_norm": 0.15498320758342743, + "learning_rate": 3.8597662218398136e-05, + "loss": 1.0031, + "step": 4040 + }, + { + "epoch": 0.635264988504392, + "grad_norm": 0.15017934143543243, + "learning_rate": 3.85924772248749e-05, + "loss": 1.0386, + "step": 4041 + }, + { + "epoch": 0.6354221934013244, + "grad_norm": 0.14026129245758057, + "learning_rate": 3.8587291401174886e-05, + "loss": 1.1104, + "step": 4042 + }, + { + "epoch": 0.6355793982982569, + "grad_norm": 0.12707599997520447, + "learning_rate": 3.85821047476148e-05, + "loss": 1.1505, + "step": 4043 + }, + { + "epoch": 0.6357366031951895, + "grad_norm": 0.17259611189365387, + "learning_rate": 3.857691726451143e-05, + "loss": 1.0531, + "step": 4044 + }, + { + "epoch": 0.635893808092122, + "grad_norm": 0.13623571395874023, + "learning_rate": 3.857172895218162e-05, + "loss": 1.0596, + "step": 4045 + }, + { + "epoch": 0.6360510129890546, + "grad_norm": 0.1337416023015976, + "learning_rate": 3.856653981094224e-05, + "loss": 0.9934, + "step": 4046 + }, + { + "epoch": 0.6362082178859871, + "grad_norm": 0.16031776368618011, + "learning_rate": 3.8561349841110215e-05, + "loss": 1.109, + "step": 4047 + }, + { + "epoch": 0.6363654227829197, + "grad_norm": 0.1332205832004547, + "learning_rate": 3.855615904300255e-05, + "loss": 1.0332, + "step": 4048 + }, + { + "epoch": 0.6365226276798522, + "grad_norm": 0.1302390694618225, + "learning_rate": 3.855096741693627e-05, + "loss": 0.9843, + "step": 4049 + }, + { + "epoch": 0.6366798325767847, + "grad_norm": 0.13197927176952362, + "learning_rate": 3.854577496322845e-05, + "loss": 1.0397, + "step": 4050 + }, + { + "epoch": 0.6368370374737173, + "grad_norm": 0.14378191530704498, + "learning_rate": 3.854058168219624e-05, + "loss": 1.0665, + "step": 4051 + }, + { + "epoch": 0.6369942423706498, + "grad_norm": 0.137291818857193, + "learning_rate": 3.853538757415681e-05, + "loss": 1.0029, + "step": 4052 + }, + { + "epoch": 0.6371514472675824, + "grad_norm": 0.1508316695690155, + "learning_rate": 3.853019263942741e-05, + "loss": 0.9818, + "step": 4053 + }, + { + "epoch": 0.6373086521645149, + "grad_norm": 0.1240345910191536, + "learning_rate": 3.852499687832533e-05, + "loss": 1.0345, + "step": 4054 + }, + { + "epoch": 0.6374658570614474, + "grad_norm": 0.14110183715820312, + "learning_rate": 3.85198002911679e-05, + "loss": 1.0664, + "step": 4055 + }, + { + "epoch": 0.63762306195838, + "grad_norm": 0.12327904999256134, + "learning_rate": 3.8514602878272496e-05, + "loss": 1.0971, + "step": 4056 + }, + { + "epoch": 0.6377802668553125, + "grad_norm": 0.13333137333393097, + "learning_rate": 3.850940463995658e-05, + "loss": 1.0969, + "step": 4057 + }, + { + "epoch": 0.6379374717522451, + "grad_norm": 0.17208878695964813, + "learning_rate": 3.850420557653762e-05, + "loss": 1.0696, + "step": 4058 + }, + { + "epoch": 0.6380946766491776, + "grad_norm": 0.13075406849384308, + "learning_rate": 3.8499005688333165e-05, + "loss": 1.1611, + "step": 4059 + }, + { + "epoch": 0.6382518815461101, + "grad_norm": 0.13193902373313904, + "learning_rate": 3.849380497566081e-05, + "loss": 1.0657, + "step": 4060 + }, + { + "epoch": 0.6384090864430427, + "grad_norm": 0.14207960665225983, + "learning_rate": 3.848860343883818e-05, + "loss": 1.0669, + "step": 4061 + }, + { + "epoch": 0.6385662913399752, + "grad_norm": 0.12478265911340714, + "learning_rate": 3.848340107818298e-05, + "loss": 1.1441, + "step": 4062 + }, + { + "epoch": 0.6387234962369078, + "grad_norm": 0.14924664795398712, + "learning_rate": 3.847819789401294e-05, + "loss": 1.0968, + "step": 4063 + }, + { + "epoch": 0.6388807011338403, + "grad_norm": 0.15374234318733215, + "learning_rate": 3.847299388664585e-05, + "loss": 1.0329, + "step": 4064 + }, + { + "epoch": 0.6390379060307728, + "grad_norm": 0.1404561996459961, + "learning_rate": 3.8467789056399554e-05, + "loss": 1.1609, + "step": 4065 + }, + { + "epoch": 0.6391951109277054, + "grad_norm": 0.13482248783111572, + "learning_rate": 3.846258340359195e-05, + "loss": 1.1606, + "step": 4066 + }, + { + "epoch": 0.6393523158246379, + "grad_norm": 0.13773567974567413, + "learning_rate": 3.8457376928540966e-05, + "loss": 1.2042, + "step": 4067 + }, + { + "epoch": 0.6395095207215705, + "grad_norm": 0.14411531388759613, + "learning_rate": 3.8452169631564604e-05, + "loss": 1.1219, + "step": 4068 + }, + { + "epoch": 0.639666725618503, + "grad_norm": 0.12763367593288422, + "learning_rate": 3.8446961512980906e-05, + "loss": 1.0552, + "step": 4069 + }, + { + "epoch": 0.6398239305154355, + "grad_norm": 0.1240297257900238, + "learning_rate": 3.844175257310796e-05, + "loss": 1.128, + "step": 4070 + }, + { + "epoch": 0.6399811354123681, + "grad_norm": 0.132890522480011, + "learning_rate": 3.843654281226391e-05, + "loss": 1.1883, + "step": 4071 + }, + { + "epoch": 0.6401383403093006, + "grad_norm": 0.14669398963451385, + "learning_rate": 3.843133223076695e-05, + "loss": 1.1783, + "step": 4072 + }, + { + "epoch": 0.6402955452062332, + "grad_norm": 0.15040293335914612, + "learning_rate": 3.842612082893531e-05, + "loss": 1.0935, + "step": 4073 + }, + { + "epoch": 0.6404527501031657, + "grad_norm": 0.1313762068748474, + "learning_rate": 3.84209086070873e-05, + "loss": 1.1244, + "step": 4074 + }, + { + "epoch": 0.6406099550000982, + "grad_norm": 0.14827388525009155, + "learning_rate": 3.841569556554126e-05, + "loss": 1.1807, + "step": 4075 + }, + { + "epoch": 0.6407671598970308, + "grad_norm": 0.16525235772132874, + "learning_rate": 3.8410481704615574e-05, + "loss": 1.096, + "step": 4076 + }, + { + "epoch": 0.6409243647939633, + "grad_norm": 0.1288774460554123, + "learning_rate": 3.840526702462869e-05, + "loss": 1.1513, + "step": 4077 + }, + { + "epoch": 0.6410815696908959, + "grad_norm": 0.16332295536994934, + "learning_rate": 3.84000515258991e-05, + "loss": 1.1474, + "step": 4078 + }, + { + "epoch": 0.6412387745878284, + "grad_norm": 0.20217250287532806, + "learning_rate": 3.8394835208745343e-05, + "loss": 1.0317, + "step": 4079 + }, + { + "epoch": 0.6413959794847609, + "grad_norm": 0.12176761776208878, + "learning_rate": 3.838961807348602e-05, + "loss": 1.1813, + "step": 4080 + }, + { + "epoch": 0.6415531843816935, + "grad_norm": 0.13961894810199738, + "learning_rate": 3.838440012043977e-05, + "loss": 1.0547, + "step": 4081 + }, + { + "epoch": 0.641710389278626, + "grad_norm": 0.14181359112262726, + "learning_rate": 3.837918134992528e-05, + "loss": 1.0766, + "step": 4082 + }, + { + "epoch": 0.6418675941755586, + "grad_norm": 0.12520286440849304, + "learning_rate": 3.83739617622613e-05, + "loss": 1.2473, + "step": 4083 + }, + { + "epoch": 0.6420247990724911, + "grad_norm": 0.14975857734680176, + "learning_rate": 3.836874135776662e-05, + "loss": 1.1195, + "step": 4084 + }, + { + "epoch": 0.6421820039694236, + "grad_norm": 0.1481516808271408, + "learning_rate": 3.836352013676008e-05, + "loss": 1.0374, + "step": 4085 + }, + { + "epoch": 0.6423392088663562, + "grad_norm": 0.15783464908599854, + "learning_rate": 3.835829809956058e-05, + "loss": 1.165, + "step": 4086 + }, + { + "epoch": 0.6424964137632887, + "grad_norm": 0.14074595272541046, + "learning_rate": 3.8353075246487044e-05, + "loss": 1.1495, + "step": 4087 + }, + { + "epoch": 0.6426536186602213, + "grad_norm": 0.15384332835674286, + "learning_rate": 3.834785157785849e-05, + "loss": 1.0054, + "step": 4088 + }, + { + "epoch": 0.6428108235571538, + "grad_norm": 0.1430482268333435, + "learning_rate": 3.834262709399395e-05, + "loss": 1.0299, + "step": 4089 + }, + { + "epoch": 0.6429680284540863, + "grad_norm": 0.15135294198989868, + "learning_rate": 3.8337401795212514e-05, + "loss": 0.9536, + "step": 4090 + }, + { + "epoch": 0.6431252333510189, + "grad_norm": 0.13654224574565887, + "learning_rate": 3.833217568183331e-05, + "loss": 1.112, + "step": 4091 + }, + { + "epoch": 0.6432824382479514, + "grad_norm": 0.15030381083488464, + "learning_rate": 3.832694875417554e-05, + "loss": 1.0778, + "step": 4092 + }, + { + "epoch": 0.643439643144884, + "grad_norm": 0.14391788840293884, + "learning_rate": 3.8321721012558456e-05, + "loss": 1.1318, + "step": 4093 + }, + { + "epoch": 0.6435968480418165, + "grad_norm": 0.13817065954208374, + "learning_rate": 3.8316492457301334e-05, + "loss": 1.1262, + "step": 4094 + }, + { + "epoch": 0.643754052938749, + "grad_norm": 0.1388375461101532, + "learning_rate": 3.831126308872352e-05, + "loss": 1.0942, + "step": 4095 + }, + { + "epoch": 0.6439112578356816, + "grad_norm": 0.1540645956993103, + "learning_rate": 3.83060329071444e-05, + "loss": 1.1978, + "step": 4096 + }, + { + "epoch": 0.6440684627326141, + "grad_norm": 0.15924197435379028, + "learning_rate": 3.830080191288342e-05, + "loss": 1.1066, + "step": 4097 + }, + { + "epoch": 0.6442256676295467, + "grad_norm": 0.15522272884845734, + "learning_rate": 3.829557010626006e-05, + "loss": 1.1649, + "step": 4098 + }, + { + "epoch": 0.6443828725264792, + "grad_norm": 0.1274399310350418, + "learning_rate": 3.829033748759386e-05, + "loss": 1.1831, + "step": 4099 + }, + { + "epoch": 0.6445400774234118, + "grad_norm": 0.15869344770908356, + "learning_rate": 3.8285104057204426e-05, + "loss": 1.1169, + "step": 4100 + }, + { + "epoch": 0.6446972823203443, + "grad_norm": 0.12494777143001556, + "learning_rate": 3.827986981541138e-05, + "loss": 1.0256, + "step": 4101 + }, + { + "epoch": 0.6448544872172768, + "grad_norm": 0.1447725147008896, + "learning_rate": 3.8274634762534405e-05, + "loss": 1.1523, + "step": 4102 + }, + { + "epoch": 0.6450116921142094, + "grad_norm": 0.1429455280303955, + "learning_rate": 3.826939889889325e-05, + "loss": 1.1314, + "step": 4103 + }, + { + "epoch": 0.6451688970111419, + "grad_norm": 0.15775367617607117, + "learning_rate": 3.8264162224807696e-05, + "loss": 1.1693, + "step": 4104 + }, + { + "epoch": 0.6453261019080745, + "grad_norm": 0.13923032581806183, + "learning_rate": 3.825892474059758e-05, + "loss": 1.1584, + "step": 4105 + }, + { + "epoch": 0.645483306805007, + "grad_norm": 0.13384175300598145, + "learning_rate": 3.825368644658279e-05, + "loss": 1.1482, + "step": 4106 + }, + { + "epoch": 0.6456405117019395, + "grad_norm": 0.15843184292316437, + "learning_rate": 3.8248447343083255e-05, + "loss": 1.02, + "step": 4107 + }, + { + "epoch": 0.6457977165988721, + "grad_norm": 0.1540258228778839, + "learning_rate": 3.8243207430418965e-05, + "loss": 1.0609, + "step": 4108 + }, + { + "epoch": 0.6459549214958046, + "grad_norm": 0.13326622545719147, + "learning_rate": 3.823796670890996e-05, + "loss": 1.1519, + "step": 4109 + }, + { + "epoch": 0.6461121263927372, + "grad_norm": 0.18570023775100708, + "learning_rate": 3.823272517887631e-05, + "loss": 1.1447, + "step": 4110 + }, + { + "epoch": 0.6462693312896697, + "grad_norm": 0.1267216056585312, + "learning_rate": 3.8227482840638144e-05, + "loss": 1.1803, + "step": 4111 + }, + { + "epoch": 0.6464265361866022, + "grad_norm": 0.14555345475673676, + "learning_rate": 3.822223969451566e-05, + "loss": 1.2036, + "step": 4112 + }, + { + "epoch": 0.6465837410835348, + "grad_norm": 0.15878638625144958, + "learning_rate": 3.821699574082908e-05, + "loss": 1.0386, + "step": 4113 + }, + { + "epoch": 0.6467409459804673, + "grad_norm": 0.15997201204299927, + "learning_rate": 3.82117509798987e-05, + "loss": 1.1584, + "step": 4114 + }, + { + "epoch": 0.6468981508773999, + "grad_norm": 0.13046613335609436, + "learning_rate": 3.820650541204482e-05, + "loss": 1.1935, + "step": 4115 + }, + { + "epoch": 0.6470553557743324, + "grad_norm": 0.1658896803855896, + "learning_rate": 3.820125903758786e-05, + "loss": 1.1556, + "step": 4116 + }, + { + "epoch": 0.6472125606712649, + "grad_norm": 0.13119108974933624, + "learning_rate": 3.8196011856848204e-05, + "loss": 1.0129, + "step": 4117 + }, + { + "epoch": 0.6473697655681975, + "grad_norm": 0.13990801572799683, + "learning_rate": 3.8190763870146355e-05, + "loss": 1.1842, + "step": 4118 + }, + { + "epoch": 0.64752697046513, + "grad_norm": 0.15538953244686127, + "learning_rate": 3.818551507780284e-05, + "loss": 1.0767, + "step": 4119 + }, + { + "epoch": 0.6476841753620626, + "grad_norm": 0.1424800306558609, + "learning_rate": 3.8180265480138236e-05, + "loss": 1.0018, + "step": 4120 + }, + { + "epoch": 0.6478413802589951, + "grad_norm": 0.13954682648181915, + "learning_rate": 3.817501507747316e-05, + "loss": 1.072, + "step": 4121 + }, + { + "epoch": 0.6479985851559276, + "grad_norm": 0.18287043273448944, + "learning_rate": 3.8169763870128284e-05, + "loss": 1.1309, + "step": 4122 + }, + { + "epoch": 0.6481557900528602, + "grad_norm": 0.13996580243110657, + "learning_rate": 3.816451185842435e-05, + "loss": 1.0904, + "step": 4123 + }, + { + "epoch": 0.6483129949497927, + "grad_norm": 0.18551260232925415, + "learning_rate": 3.815925904268211e-05, + "loss": 1.0991, + "step": 4124 + }, + { + "epoch": 0.6484701998467253, + "grad_norm": 0.1717006415128708, + "learning_rate": 3.81540054232224e-05, + "loss": 1.1385, + "step": 4125 + }, + { + "epoch": 0.6486274047436578, + "grad_norm": 0.1935938596725464, + "learning_rate": 3.814875100036609e-05, + "loss": 1.0321, + "step": 4126 + }, + { + "epoch": 0.6487846096405903, + "grad_norm": 0.16522254049777985, + "learning_rate": 3.814349577443408e-05, + "loss": 1.1219, + "step": 4127 + }, + { + "epoch": 0.6489418145375229, + "grad_norm": 0.15041324496269226, + "learning_rate": 3.813823974574738e-05, + "loss": 1.0763, + "step": 4128 + }, + { + "epoch": 0.6490990194344554, + "grad_norm": 0.13009963929653168, + "learning_rate": 3.813298291462697e-05, + "loss": 1.2183, + "step": 4129 + }, + { + "epoch": 0.649256224331388, + "grad_norm": 0.1726483553647995, + "learning_rate": 3.812772528139394e-05, + "loss": 1.0157, + "step": 4130 + }, + { + "epoch": 0.6494134292283205, + "grad_norm": 0.1214425191283226, + "learning_rate": 3.812246684636939e-05, + "loss": 1.0584, + "step": 4131 + }, + { + "epoch": 0.649570634125253, + "grad_norm": 0.1466849148273468, + "learning_rate": 3.81172076098745e-05, + "loss": 0.9647, + "step": 4132 + }, + { + "epoch": 0.6497278390221856, + "grad_norm": 0.1390989124774933, + "learning_rate": 3.811194757223046e-05, + "loss": 1.1379, + "step": 4133 + }, + { + "epoch": 0.649885043919118, + "grad_norm": 0.16147755086421967, + "learning_rate": 3.810668673375856e-05, + "loss": 1.1648, + "step": 4134 + }, + { + "epoch": 0.6500422488160507, + "grad_norm": 0.13957399129867554, + "learning_rate": 3.810142509478011e-05, + "loss": 1.0383, + "step": 4135 + }, + { + "epoch": 0.6501994537129832, + "grad_norm": 0.14382591843605042, + "learning_rate": 3.809616265561645e-05, + "loss": 1.0436, + "step": 4136 + }, + { + "epoch": 0.6503566586099156, + "grad_norm": 0.13169275224208832, + "learning_rate": 3.809089941658901e-05, + "loss": 1.1344, + "step": 4137 + }, + { + "epoch": 0.6505138635068483, + "grad_norm": 0.14189770817756653, + "learning_rate": 3.808563537801924e-05, + "loss": 1.1493, + "step": 4138 + }, + { + "epoch": 0.6506710684037808, + "grad_norm": 0.15206053853034973, + "learning_rate": 3.808037054022865e-05, + "loss": 1.0069, + "step": 4139 + }, + { + "epoch": 0.6508282733007134, + "grad_norm": 0.18518586456775665, + "learning_rate": 3.8075104903538795e-05, + "loss": 1.0699, + "step": 4140 + }, + { + "epoch": 0.6509854781976459, + "grad_norm": 0.1503700166940689, + "learning_rate": 3.806983846827128e-05, + "loss": 1.0635, + "step": 4141 + }, + { + "epoch": 0.6511426830945783, + "grad_norm": 0.1372813582420349, + "learning_rate": 3.806457123474776e-05, + "loss": 1.017, + "step": 4142 + }, + { + "epoch": 0.651299887991511, + "grad_norm": 0.14780773222446442, + "learning_rate": 3.805930320328993e-05, + "loss": 1.1236, + "step": 4143 + }, + { + "epoch": 0.6514570928884434, + "grad_norm": 0.16201084852218628, + "learning_rate": 3.805403437421955e-05, + "loss": 0.9939, + "step": 4144 + }, + { + "epoch": 0.651614297785376, + "grad_norm": 0.13731682300567627, + "learning_rate": 3.804876474785842e-05, + "loss": 1.1471, + "step": 4145 + }, + { + "epoch": 0.6517715026823085, + "grad_norm": 0.15947549045085907, + "learning_rate": 3.804349432452838e-05, + "loss": 1.0879, + "step": 4146 + }, + { + "epoch": 0.651928707579241, + "grad_norm": 0.1373717188835144, + "learning_rate": 3.8038223104551344e-05, + "loss": 1.0824, + "step": 4147 + }, + { + "epoch": 0.6520859124761736, + "grad_norm": 0.13640908896923065, + "learning_rate": 3.8032951088249245e-05, + "loss": 1.1553, + "step": 4148 + }, + { + "epoch": 0.6522431173731061, + "grad_norm": 0.13748691976070404, + "learning_rate": 3.802767827594408e-05, + "loss": 1.0725, + "step": 4149 + }, + { + "epoch": 0.6524003222700387, + "grad_norm": 0.14089974761009216, + "learning_rate": 3.80224046679579e-05, + "loss": 1.1241, + "step": 4150 + }, + { + "epoch": 0.6525575271669712, + "grad_norm": 0.13118582963943481, + "learning_rate": 3.8017130264612775e-05, + "loss": 1.0418, + "step": 4151 + }, + { + "epoch": 0.6527147320639037, + "grad_norm": 0.1608380675315857, + "learning_rate": 3.8011855066230866e-05, + "loss": 1.1589, + "step": 4152 + }, + { + "epoch": 0.6528719369608363, + "grad_norm": 0.18779993057250977, + "learning_rate": 3.800657907313436e-05, + "loss": 1.162, + "step": 4153 + }, + { + "epoch": 0.6530291418577688, + "grad_norm": 0.1772323101758957, + "learning_rate": 3.800130228564549e-05, + "loss": 1.0781, + "step": 4154 + }, + { + "epoch": 0.6531863467547014, + "grad_norm": 0.13229519128799438, + "learning_rate": 3.799602470408654e-05, + "loss": 1.1686, + "step": 4155 + }, + { + "epoch": 0.6533435516516339, + "grad_norm": 0.1577441245317459, + "learning_rate": 3.799074632877985e-05, + "loss": 1.08, + "step": 4156 + }, + { + "epoch": 0.6535007565485665, + "grad_norm": 0.14009861648082733, + "learning_rate": 3.7985467160047804e-05, + "loss": 1.147, + "step": 4157 + }, + { + "epoch": 0.653657961445499, + "grad_norm": 0.1450236737728119, + "learning_rate": 3.798018719821283e-05, + "loss": 1.1681, + "step": 4158 + }, + { + "epoch": 0.6538151663424315, + "grad_norm": 0.153233140707016, + "learning_rate": 3.79749064435974e-05, + "loss": 1.0823, + "step": 4159 + }, + { + "epoch": 0.6539723712393641, + "grad_norm": 0.1304878294467926, + "learning_rate": 3.796962489652406e-05, + "loss": 1.058, + "step": 4160 + }, + { + "epoch": 0.6539723712393641, + "eval_loss": 1.1008325815200806, + "eval_runtime": 2357.0176, + "eval_samples_per_second": 3.928, + "eval_steps_per_second": 1.964, + "step": 4160 + }, + { + "epoch": 0.6541295761362966, + "grad_norm": 0.1540244072675705, + "learning_rate": 3.796434255731537e-05, + "loss": 1.0523, + "step": 4161 + }, + { + "epoch": 0.6542867810332292, + "grad_norm": 0.19584552943706512, + "learning_rate": 3.7959059426293964e-05, + "loss": 1.0906, + "step": 4162 + }, + { + "epoch": 0.6544439859301617, + "grad_norm": 0.1526932269334793, + "learning_rate": 3.795377550378252e-05, + "loss": 1.0324, + "step": 4163 + }, + { + "epoch": 0.6546011908270942, + "grad_norm": 0.15011286735534668, + "learning_rate": 3.794849079010375e-05, + "loss": 1.0482, + "step": 4164 + }, + { + "epoch": 0.6547583957240268, + "grad_norm": 0.14008505642414093, + "learning_rate": 3.794320528558044e-05, + "loss": 1.1551, + "step": 4165 + }, + { + "epoch": 0.6549156006209593, + "grad_norm": 0.12673290073871613, + "learning_rate": 3.7937918990535376e-05, + "loss": 1.0247, + "step": 4166 + }, + { + "epoch": 0.6550728055178919, + "grad_norm": 0.13903503119945526, + "learning_rate": 3.793263190529146e-05, + "loss": 1.0566, + "step": 4167 + }, + { + "epoch": 0.6552300104148244, + "grad_norm": 0.13928020000457764, + "learning_rate": 3.7927344030171584e-05, + "loss": 1.1154, + "step": 4168 + }, + { + "epoch": 0.6553872153117569, + "grad_norm": 0.13580933213233948, + "learning_rate": 3.7922055365498726e-05, + "loss": 1.1776, + "step": 4169 + }, + { + "epoch": 0.6555444202086895, + "grad_norm": 0.14068977534770966, + "learning_rate": 3.79167659115959e-05, + "loss": 1.0519, + "step": 4170 + }, + { + "epoch": 0.655701625105622, + "grad_norm": 0.12532474100589752, + "learning_rate": 3.7911475668786135e-05, + "loss": 1.216, + "step": 4171 + }, + { + "epoch": 0.6558588300025546, + "grad_norm": 1.0994434356689453, + "learning_rate": 3.790618463739258e-05, + "loss": 1.1033, + "step": 4172 + }, + { + "epoch": 0.6560160348994871, + "grad_norm": 0.22691218554973602, + "learning_rate": 3.790089281773837e-05, + "loss": 1.2606, + "step": 4173 + }, + { + "epoch": 0.6561732397964196, + "grad_norm": 0.19565100967884064, + "learning_rate": 3.7895600210146696e-05, + "loss": 1.0829, + "step": 4174 + }, + { + "epoch": 0.6563304446933522, + "grad_norm": 0.16726306080818176, + "learning_rate": 3.789030681494084e-05, + "loss": 1.0931, + "step": 4175 + }, + { + "epoch": 0.6564876495902847, + "grad_norm": 0.15301238000392914, + "learning_rate": 3.788501263244408e-05, + "loss": 1.2654, + "step": 4176 + }, + { + "epoch": 0.6566448544872173, + "grad_norm": 0.16582754254341125, + "learning_rate": 3.7879717662979785e-05, + "loss": 0.9883, + "step": 4177 + }, + { + "epoch": 0.6568020593841498, + "grad_norm": 0.16992676258087158, + "learning_rate": 3.787442190687133e-05, + "loss": 1.0806, + "step": 4178 + }, + { + "epoch": 0.6569592642810823, + "grad_norm": 0.46979808807373047, + "learning_rate": 3.786912536444217e-05, + "loss": 1.2047, + "step": 4179 + }, + { + "epoch": 0.6571164691780149, + "grad_norm": 0.21897472441196442, + "learning_rate": 3.7863828036015805e-05, + "loss": 1.0024, + "step": 4180 + }, + { + "epoch": 0.6572736740749474, + "grad_norm": 0.17460548877716064, + "learning_rate": 3.785852992191575e-05, + "loss": 1.1766, + "step": 4181 + }, + { + "epoch": 0.65743087897188, + "grad_norm": 0.16413532197475433, + "learning_rate": 3.785323102246562e-05, + "loss": 1.1307, + "step": 4182 + }, + { + "epoch": 0.6575880838688125, + "grad_norm": 0.24324509501457214, + "learning_rate": 3.784793133798904e-05, + "loss": 0.998, + "step": 4183 + }, + { + "epoch": 0.657745288765745, + "grad_norm": 0.2441328763961792, + "learning_rate": 3.78426308688097e-05, + "loss": 0.9856, + "step": 4184 + }, + { + "epoch": 0.6579024936626776, + "grad_norm": 0.16712550818920135, + "learning_rate": 3.7837329615251336e-05, + "loss": 1.0543, + "step": 4185 + }, + { + "epoch": 0.6580596985596101, + "grad_norm": 0.17282027006149292, + "learning_rate": 3.783202757763771e-05, + "loss": 1.0719, + "step": 4186 + }, + { + "epoch": 0.6582169034565427, + "grad_norm": 0.20528116822242737, + "learning_rate": 3.7826724756292666e-05, + "loss": 1.2393, + "step": 4187 + }, + { + "epoch": 0.6583741083534752, + "grad_norm": 0.2005816549062729, + "learning_rate": 3.7821421151540084e-05, + "loss": 0.9683, + "step": 4188 + }, + { + "epoch": 0.6585313132504077, + "grad_norm": 0.21068783104419708, + "learning_rate": 3.7816116763703874e-05, + "loss": 1.0571, + "step": 4189 + }, + { + "epoch": 0.6586885181473403, + "grad_norm": 0.17092068493366241, + "learning_rate": 3.781081159310801e-05, + "loss": 0.9867, + "step": 4190 + }, + { + "epoch": 0.6588457230442728, + "grad_norm": 0.19518724083900452, + "learning_rate": 3.780550564007652e-05, + "loss": 1.131, + "step": 4191 + }, + { + "epoch": 0.6590029279412054, + "grad_norm": 0.15224303305149078, + "learning_rate": 3.780019890493347e-05, + "loss": 1.1173, + "step": 4192 + }, + { + "epoch": 0.6591601328381379, + "grad_norm": 0.1946234405040741, + "learning_rate": 3.779489138800297e-05, + "loss": 1.149, + "step": 4193 + }, + { + "epoch": 0.6593173377350704, + "grad_norm": 0.16867277026176453, + "learning_rate": 3.778958308960919e-05, + "loss": 1.019, + "step": 4194 + }, + { + "epoch": 0.659474542632003, + "grad_norm": 0.21311645209789276, + "learning_rate": 3.778427401007632e-05, + "loss": 1.0941, + "step": 4195 + }, + { + "epoch": 0.6596317475289355, + "grad_norm": 0.18073083460330963, + "learning_rate": 3.777896414972866e-05, + "loss": 1.1035, + "step": 4196 + }, + { + "epoch": 0.6597889524258681, + "grad_norm": 0.19108764827251434, + "learning_rate": 3.7773653508890475e-05, + "loss": 1.1679, + "step": 4197 + }, + { + "epoch": 0.6599461573228006, + "grad_norm": 0.17400071024894714, + "learning_rate": 3.776834208788613e-05, + "loss": 1.0681, + "step": 4198 + }, + { + "epoch": 0.6601033622197331, + "grad_norm": 0.173418328166008, + "learning_rate": 3.776302988704004e-05, + "loss": 1.1563, + "step": 4199 + }, + { + "epoch": 0.6602605671166657, + "grad_norm": 0.18360170722007751, + "learning_rate": 3.775771690667665e-05, + "loss": 1.1159, + "step": 4200 + }, + { + "epoch": 0.6604177720135982, + "grad_norm": 0.17133785784244537, + "learning_rate": 3.775240314712043e-05, + "loss": 1.1207, + "step": 4201 + }, + { + "epoch": 0.6605749769105308, + "grad_norm": 0.1588124930858612, + "learning_rate": 3.7747088608695965e-05, + "loss": 1.0254, + "step": 4202 + }, + { + "epoch": 0.6607321818074633, + "grad_norm": 0.20954452455043793, + "learning_rate": 3.7741773291727815e-05, + "loss": 1.108, + "step": 4203 + }, + { + "epoch": 0.6608893867043958, + "grad_norm": 0.19729499518871307, + "learning_rate": 3.773645719654064e-05, + "loss": 1.1431, + "step": 4204 + }, + { + "epoch": 0.6610465916013284, + "grad_norm": 0.20525553822517395, + "learning_rate": 3.773114032345911e-05, + "loss": 1.0959, + "step": 4205 + }, + { + "epoch": 0.6612037964982609, + "grad_norm": 0.1458091139793396, + "learning_rate": 3.772582267280798e-05, + "loss": 1.1896, + "step": 4206 + }, + { + "epoch": 0.6613610013951935, + "grad_norm": 0.18808773159980774, + "learning_rate": 3.772050424491201e-05, + "loss": 1.0664, + "step": 4207 + }, + { + "epoch": 0.661518206292126, + "grad_norm": 0.1506444215774536, + "learning_rate": 3.7715185040096046e-05, + "loss": 1.1904, + "step": 4208 + }, + { + "epoch": 0.6616754111890586, + "grad_norm": 0.15354827046394348, + "learning_rate": 3.7709865058684944e-05, + "loss": 1.1939, + "step": 4209 + }, + { + "epoch": 0.6618326160859911, + "grad_norm": 0.15871380269527435, + "learning_rate": 3.770454430100365e-05, + "loss": 1.1425, + "step": 4210 + }, + { + "epoch": 0.6619898209829236, + "grad_norm": 0.17379145324230194, + "learning_rate": 3.7699222767377135e-05, + "loss": 1.1903, + "step": 4211 + }, + { + "epoch": 0.6621470258798562, + "grad_norm": 0.16655685007572174, + "learning_rate": 3.769390045813041e-05, + "loss": 1.0508, + "step": 4212 + }, + { + "epoch": 0.6623042307767887, + "grad_norm": 0.16712385416030884, + "learning_rate": 3.768857737358854e-05, + "loss": 1.144, + "step": 4213 + }, + { + "epoch": 0.6624614356737213, + "grad_norm": 0.20002847909927368, + "learning_rate": 3.768325351407664e-05, + "loss": 1.1763, + "step": 4214 + }, + { + "epoch": 0.6626186405706538, + "grad_norm": 0.19525060057640076, + "learning_rate": 3.7677928879919866e-05, + "loss": 1.0668, + "step": 4215 + }, + { + "epoch": 0.6627758454675863, + "grad_norm": 0.16536962985992432, + "learning_rate": 3.767260347144344e-05, + "loss": 1.1823, + "step": 4216 + }, + { + "epoch": 0.6629330503645189, + "grad_norm": 0.25156453251838684, + "learning_rate": 3.76672772889726e-05, + "loss": 0.9883, + "step": 4217 + }, + { + "epoch": 0.6630902552614514, + "grad_norm": 0.1674017310142517, + "learning_rate": 3.766195033283267e-05, + "loss": 1.0279, + "step": 4218 + }, + { + "epoch": 0.663247460158384, + "grad_norm": 0.14695732295513153, + "learning_rate": 3.765662260334899e-05, + "loss": 0.9986, + "step": 4219 + }, + { + "epoch": 0.6634046650553165, + "grad_norm": 0.17739038169384003, + "learning_rate": 3.765129410084694e-05, + "loss": 1.0437, + "step": 4220 + }, + { + "epoch": 0.663561869952249, + "grad_norm": 0.1540808528661728, + "learning_rate": 3.7645964825652e-05, + "loss": 1.0727, + "step": 4221 + }, + { + "epoch": 0.6637190748491816, + "grad_norm": 0.16241209208965302, + "learning_rate": 3.7640634778089635e-05, + "loss": 1.166, + "step": 4222 + }, + { + "epoch": 0.6638762797461141, + "grad_norm": 0.1566782295703888, + "learning_rate": 3.76353039584854e-05, + "loss": 1.1314, + "step": 4223 + }, + { + "epoch": 0.6640334846430467, + "grad_norm": 0.19483767449855804, + "learning_rate": 3.762997236716487e-05, + "loss": 1.0905, + "step": 4224 + }, + { + "epoch": 0.6641906895399792, + "grad_norm": 0.1881464421749115, + "learning_rate": 3.7624640004453674e-05, + "loss": 1.1489, + "step": 4225 + }, + { + "epoch": 0.6643478944369117, + "grad_norm": 0.15651607513427734, + "learning_rate": 3.76193068706775e-05, + "loss": 1.1743, + "step": 4226 + }, + { + "epoch": 0.6645050993338443, + "grad_norm": 0.17168089747428894, + "learning_rate": 3.761397296616208e-05, + "loss": 0.9814, + "step": 4227 + }, + { + "epoch": 0.6646623042307768, + "grad_norm": 0.18237245082855225, + "learning_rate": 3.760863829123319e-05, + "loss": 1.1592, + "step": 4228 + }, + { + "epoch": 0.6648195091277094, + "grad_norm": 0.1554557830095291, + "learning_rate": 3.760330284621664e-05, + "loss": 1.1879, + "step": 4229 + }, + { + "epoch": 0.6649767140246419, + "grad_norm": 0.17463083565235138, + "learning_rate": 3.759796663143831e-05, + "loss": 1.1055, + "step": 4230 + }, + { + "epoch": 0.6651339189215744, + "grad_norm": 0.2073536515235901, + "learning_rate": 3.75926296472241e-05, + "loss": 1.0116, + "step": 4231 + }, + { + "epoch": 0.665291123818507, + "grad_norm": 0.34921008348464966, + "learning_rate": 3.758729189389999e-05, + "loss": 1.1515, + "step": 4232 + }, + { + "epoch": 0.6654483287154395, + "grad_norm": 0.17489458620548248, + "learning_rate": 3.7581953371791985e-05, + "loss": 1.1912, + "step": 4233 + }, + { + "epoch": 0.6656055336123721, + "grad_norm": 0.16053788363933563, + "learning_rate": 3.757661408122614e-05, + "loss": 1.0192, + "step": 4234 + }, + { + "epoch": 0.6657627385093046, + "grad_norm": 0.14950202405452728, + "learning_rate": 3.757127402252855e-05, + "loss": 1.0427, + "step": 4235 + }, + { + "epoch": 0.6659199434062371, + "grad_norm": 0.19765916466712952, + "learning_rate": 3.756593319602537e-05, + "loss": 1.0708, + "step": 4236 + }, + { + "epoch": 0.6660771483031697, + "grad_norm": 0.16286489367485046, + "learning_rate": 3.756059160204281e-05, + "loss": 1.2351, + "step": 4237 + }, + { + "epoch": 0.6662343532001022, + "grad_norm": 0.17618192732334137, + "learning_rate": 3.755524924090711e-05, + "loss": 1.1426, + "step": 4238 + }, + { + "epoch": 0.6663915580970348, + "grad_norm": 0.14757370948791504, + "learning_rate": 3.7549906112944546e-05, + "loss": 1.0719, + "step": 4239 + }, + { + "epoch": 0.6665487629939673, + "grad_norm": 0.16622844338417053, + "learning_rate": 3.754456221848146e-05, + "loss": 1.0361, + "step": 4240 + }, + { + "epoch": 0.6667059678908998, + "grad_norm": 0.19668567180633545, + "learning_rate": 3.753921755784425e-05, + "loss": 1.0828, + "step": 4241 + }, + { + "epoch": 0.6668631727878324, + "grad_norm": 0.16749325394630432, + "learning_rate": 3.753387213135935e-05, + "loss": 1.1456, + "step": 4242 + }, + { + "epoch": 0.6670203776847649, + "grad_norm": 0.16815859079360962, + "learning_rate": 3.752852593935322e-05, + "loss": 1.1183, + "step": 4243 + }, + { + "epoch": 0.6671775825816975, + "grad_norm": 0.1528685837984085, + "learning_rate": 3.752317898215239e-05, + "loss": 1.1304, + "step": 4244 + }, + { + "epoch": 0.66733478747863, + "grad_norm": 0.15588803589344025, + "learning_rate": 3.751783126008344e-05, + "loss": 1.0748, + "step": 4245 + }, + { + "epoch": 0.6674919923755624, + "grad_norm": 0.16677920520305634, + "learning_rate": 3.751248277347298e-05, + "loss": 1.0492, + "step": 4246 + }, + { + "epoch": 0.667649197272495, + "grad_norm": 0.1643311083316803, + "learning_rate": 3.750713352264768e-05, + "loss": 1.2256, + "step": 4247 + }, + { + "epoch": 0.6678064021694275, + "grad_norm": 0.1470828801393509, + "learning_rate": 3.750178350793425e-05, + "loss": 1.0181, + "step": 4248 + }, + { + "epoch": 0.6679636070663602, + "grad_norm": 0.17522110044956207, + "learning_rate": 3.749643272965946e-05, + "loss": 1.145, + "step": 4249 + }, + { + "epoch": 0.6681208119632926, + "grad_norm": 0.17564639449119568, + "learning_rate": 3.749108118815009e-05, + "loss": 1.0818, + "step": 4250 + }, + { + "epoch": 0.6682780168602251, + "grad_norm": 0.18222050368785858, + "learning_rate": 3.748572888373302e-05, + "loss": 1.0535, + "step": 4251 + }, + { + "epoch": 0.6684352217571577, + "grad_norm": 0.1446535885334015, + "learning_rate": 3.748037581673513e-05, + "loss": 1.0991, + "step": 4252 + }, + { + "epoch": 0.6685924266540902, + "grad_norm": 0.20465463399887085, + "learning_rate": 3.747502198748336e-05, + "loss": 1.0645, + "step": 4253 + }, + { + "epoch": 0.6687496315510228, + "grad_norm": 0.1818159520626068, + "learning_rate": 3.746966739630473e-05, + "loss": 1.1188, + "step": 4254 + }, + { + "epoch": 0.6689068364479553, + "grad_norm": 0.19995780289173126, + "learning_rate": 3.746431204352624e-05, + "loss": 1.1536, + "step": 4255 + }, + { + "epoch": 0.6690640413448878, + "grad_norm": 0.20652185380458832, + "learning_rate": 3.7458955929475e-05, + "loss": 1.0296, + "step": 4256 + }, + { + "epoch": 0.6692212462418204, + "grad_norm": 0.14761137962341309, + "learning_rate": 3.745359905447814e-05, + "loss": 1.0662, + "step": 4257 + }, + { + "epoch": 0.6693784511387529, + "grad_norm": 0.18381749093532562, + "learning_rate": 3.744824141886283e-05, + "loss": 1.2777, + "step": 4258 + }, + { + "epoch": 0.6695356560356855, + "grad_norm": 0.16218112409114838, + "learning_rate": 3.7442883022956294e-05, + "loss": 1.1225, + "step": 4259 + }, + { + "epoch": 0.669692860932618, + "grad_norm": 0.13938245177268982, + "learning_rate": 3.7437523867085813e-05, + "loss": 1.0624, + "step": 4260 + }, + { + "epoch": 0.6698500658295506, + "grad_norm": 0.1756437122821808, + "learning_rate": 3.743216395157869e-05, + "loss": 1.1881, + "step": 4261 + }, + { + "epoch": 0.6700072707264831, + "grad_norm": 0.1338120400905609, + "learning_rate": 3.74268032767623e-05, + "loss": 1.1023, + "step": 4262 + }, + { + "epoch": 0.6701644756234156, + "grad_norm": 0.17289410531520844, + "learning_rate": 3.742144184296404e-05, + "loss": 1.0453, + "step": 4263 + }, + { + "epoch": 0.6703216805203482, + "grad_norm": 0.14961934089660645, + "learning_rate": 3.741607965051137e-05, + "loss": 1.1886, + "step": 4264 + }, + { + "epoch": 0.6704788854172807, + "grad_norm": 0.18203534185886383, + "learning_rate": 3.7410716699731805e-05, + "loss": 1.1251, + "step": 4265 + }, + { + "epoch": 0.6706360903142133, + "grad_norm": 0.17501042783260345, + "learning_rate": 3.740535299095287e-05, + "loss": 1.1852, + "step": 4266 + }, + { + "epoch": 0.6707932952111458, + "grad_norm": 0.2523060142993927, + "learning_rate": 3.739998852450218e-05, + "loss": 0.8827, + "step": 4267 + }, + { + "epoch": 0.6709505001080783, + "grad_norm": 0.1660834401845932, + "learning_rate": 3.7394623300707375e-05, + "loss": 1.1962, + "step": 4268 + }, + { + "epoch": 0.6711077050050109, + "grad_norm": 0.1615104079246521, + "learning_rate": 3.7389257319896135e-05, + "loss": 1.1052, + "step": 4269 + }, + { + "epoch": 0.6712649099019434, + "grad_norm": 0.1610841453075409, + "learning_rate": 3.73838905823962e-05, + "loss": 1.1558, + "step": 4270 + }, + { + "epoch": 0.671422114798876, + "grad_norm": 0.17448994517326355, + "learning_rate": 3.737852308853533e-05, + "loss": 1.1081, + "step": 4271 + }, + { + "epoch": 0.6715793196958085, + "grad_norm": 0.16625918447971344, + "learning_rate": 3.737315483864138e-05, + "loss": 0.9769, + "step": 4272 + }, + { + "epoch": 0.671736524592741, + "grad_norm": 0.1489565372467041, + "learning_rate": 3.736778583304221e-05, + "loss": 1.0799, + "step": 4273 + }, + { + "epoch": 0.6718937294896736, + "grad_norm": 0.1871234029531479, + "learning_rate": 3.736241607206573e-05, + "loss": 1.1174, + "step": 4274 + }, + { + "epoch": 0.6720509343866061, + "grad_norm": 0.15307629108428955, + "learning_rate": 3.735704555603992e-05, + "loss": 1.0429, + "step": 4275 + }, + { + "epoch": 0.6722081392835387, + "grad_norm": 0.18358135223388672, + "learning_rate": 3.7351674285292785e-05, + "loss": 1.0812, + "step": 4276 + }, + { + "epoch": 0.6723653441804712, + "grad_norm": 0.1625143587589264, + "learning_rate": 3.734630226015238e-05, + "loss": 0.9433, + "step": 4277 + }, + { + "epoch": 0.6725225490774037, + "grad_norm": 0.15999294817447662, + "learning_rate": 3.73409294809468e-05, + "loss": 1.1477, + "step": 4278 + }, + { + "epoch": 0.6726797539743363, + "grad_norm": 0.25717711448669434, + "learning_rate": 3.73355559480042e-05, + "loss": 1.0463, + "step": 4279 + }, + { + "epoch": 0.6728369588712688, + "grad_norm": 0.17935602366924286, + "learning_rate": 3.733018166165277e-05, + "loss": 1.1075, + "step": 4280 + }, + { + "epoch": 0.6729941637682014, + "grad_norm": 0.16021014750003815, + "learning_rate": 3.732480662222077e-05, + "loss": 1.0991, + "step": 4281 + }, + { + "epoch": 0.6731513686651339, + "grad_norm": 0.1721874475479126, + "learning_rate": 3.7319430830036475e-05, + "loss": 1.1008, + "step": 4282 + }, + { + "epoch": 0.6733085735620664, + "grad_norm": 0.18219947814941406, + "learning_rate": 3.731405428542821e-05, + "loss": 1.075, + "step": 4283 + }, + { + "epoch": 0.673465778458999, + "grad_norm": 0.1574048399925232, + "learning_rate": 3.7308676988724366e-05, + "loss": 1.2457, + "step": 4284 + }, + { + "epoch": 0.6736229833559315, + "grad_norm": 0.14693421125411987, + "learning_rate": 3.730329894025336e-05, + "loss": 1.118, + "step": 4285 + }, + { + "epoch": 0.6737801882528641, + "grad_norm": 0.24711671471595764, + "learning_rate": 3.729792014034367e-05, + "loss": 0.9959, + "step": 4286 + }, + { + "epoch": 0.6739373931497966, + "grad_norm": 0.17833338677883148, + "learning_rate": 3.729254058932381e-05, + "loss": 1.0796, + "step": 4287 + }, + { + "epoch": 0.6740945980467291, + "grad_norm": 0.17984211444854736, + "learning_rate": 3.728716028752234e-05, + "loss": 1.1121, + "step": 4288 + }, + { + "epoch": 0.6742518029436617, + "grad_norm": 0.19105543196201324, + "learning_rate": 3.728177923526786e-05, + "loss": 1.1225, + "step": 4289 + }, + { + "epoch": 0.6744090078405942, + "grad_norm": 0.14651957154273987, + "learning_rate": 3.727639743288904e-05, + "loss": 1.007, + "step": 4290 + }, + { + "epoch": 0.6745662127375268, + "grad_norm": 0.14845982193946838, + "learning_rate": 3.7271014880714577e-05, + "loss": 1.1094, + "step": 4291 + }, + { + "epoch": 0.6747234176344593, + "grad_norm": 0.1587723046541214, + "learning_rate": 3.726563157907321e-05, + "loss": 1.1088, + "step": 4292 + }, + { + "epoch": 0.6748806225313918, + "grad_norm": 0.16230261325836182, + "learning_rate": 3.726024752829373e-05, + "loss": 1.1123, + "step": 4293 + }, + { + "epoch": 0.6750378274283244, + "grad_norm": 0.1781100183725357, + "learning_rate": 3.725486272870498e-05, + "loss": 0.9464, + "step": 4294 + }, + { + "epoch": 0.6751950323252569, + "grad_norm": 0.13477860391139984, + "learning_rate": 3.724947718063585e-05, + "loss": 1.0361, + "step": 4295 + }, + { + "epoch": 0.6753522372221895, + "grad_norm": 0.1615627259016037, + "learning_rate": 3.724409088441525e-05, + "loss": 1.1301, + "step": 4296 + }, + { + "epoch": 0.675509442119122, + "grad_norm": 0.1631171554327011, + "learning_rate": 3.7238703840372166e-05, + "loss": 1.0858, + "step": 4297 + }, + { + "epoch": 0.6756666470160545, + "grad_norm": 0.1773926168680191, + "learning_rate": 3.7233316048835615e-05, + "loss": 0.9948, + "step": 4298 + }, + { + "epoch": 0.6758238519129871, + "grad_norm": 0.16207563877105713, + "learning_rate": 3.722792751013467e-05, + "loss": 1.1216, + "step": 4299 + }, + { + "epoch": 0.6759810568099196, + "grad_norm": 0.15376627445220947, + "learning_rate": 3.722253822459843e-05, + "loss": 1.0546, + "step": 4300 + }, + { + "epoch": 0.6761382617068522, + "grad_norm": 0.14326240122318268, + "learning_rate": 3.7217148192556065e-05, + "loss": 1.1124, + "step": 4301 + }, + { + "epoch": 0.6762954666037847, + "grad_norm": 0.13520918786525726, + "learning_rate": 3.7211757414336775e-05, + "loss": 1.1624, + "step": 4302 + }, + { + "epoch": 0.6764526715007172, + "grad_norm": 0.1636790782213211, + "learning_rate": 3.72063658902698e-05, + "loss": 1.0277, + "step": 4303 + }, + { + "epoch": 0.6766098763976498, + "grad_norm": 0.12657029926776886, + "learning_rate": 3.720097362068443e-05, + "loss": 1.1938, + "step": 4304 + }, + { + "epoch": 0.6767670812945823, + "grad_norm": 0.16188089549541473, + "learning_rate": 3.719558060591003e-05, + "loss": 1.065, + "step": 4305 + }, + { + "epoch": 0.6769242861915149, + "grad_norm": 0.20939183235168457, + "learning_rate": 3.7190186846275954e-05, + "loss": 1.1164, + "step": 4306 + }, + { + "epoch": 0.6770814910884474, + "grad_norm": 0.1548239290714264, + "learning_rate": 3.718479234211165e-05, + "loss": 1.1214, + "step": 4307 + }, + { + "epoch": 0.6772386959853799, + "grad_norm": 0.16959509253501892, + "learning_rate": 3.7179397093746595e-05, + "loss": 1.2162, + "step": 4308 + }, + { + "epoch": 0.6773959008823125, + "grad_norm": 0.1429489105939865, + "learning_rate": 3.7174001101510295e-05, + "loss": 1.1158, + "step": 4309 + }, + { + "epoch": 0.677553105779245, + "grad_norm": 0.15084104239940643, + "learning_rate": 3.716860436573234e-05, + "loss": 1.0594, + "step": 4310 + }, + { + "epoch": 0.6777103106761776, + "grad_norm": 0.1567157655954361, + "learning_rate": 3.716320688674232e-05, + "loss": 1.0448, + "step": 4311 + }, + { + "epoch": 0.6778675155731101, + "grad_norm": 0.13432151079177856, + "learning_rate": 3.71578086648699e-05, + "loss": 1.1063, + "step": 4312 + }, + { + "epoch": 0.6780247204700427, + "grad_norm": 0.14147740602493286, + "learning_rate": 3.715240970044479e-05, + "loss": 1.2635, + "step": 4313 + }, + { + "epoch": 0.6781819253669752, + "grad_norm": 0.14541368186473846, + "learning_rate": 3.7147009993796726e-05, + "loss": 1.167, + "step": 4314 + }, + { + "epoch": 0.6783391302639077, + "grad_norm": 0.15351711213588715, + "learning_rate": 3.714160954525551e-05, + "loss": 1.2112, + "step": 4315 + }, + { + "epoch": 0.6784963351608403, + "grad_norm": 0.1674315184354782, + "learning_rate": 3.713620835515098e-05, + "loss": 1.0714, + "step": 4316 + }, + { + "epoch": 0.6786535400577728, + "grad_norm": 0.1620711088180542, + "learning_rate": 3.713080642381303e-05, + "loss": 1.1816, + "step": 4317 + }, + { + "epoch": 0.6788107449547054, + "grad_norm": 0.16441158950328827, + "learning_rate": 3.7125403751571565e-05, + "loss": 1.1193, + "step": 4318 + }, + { + "epoch": 0.6789679498516379, + "grad_norm": 0.16214364767074585, + "learning_rate": 3.7120000338756574e-05, + "loss": 1.1117, + "step": 4319 + }, + { + "epoch": 0.6791251547485704, + "grad_norm": 0.1726810336112976, + "learning_rate": 3.711459618569808e-05, + "loss": 1.1549, + "step": 4320 + }, + { + "epoch": 0.6791251547485704, + "eval_loss": 1.1005306243896484, + "eval_runtime": 2322.7293, + "eval_samples_per_second": 3.986, + "eval_steps_per_second": 1.993, + "step": 4320 + }, + { + "epoch": 0.679282359645503, + "grad_norm": 0.1818498969078064, + "learning_rate": 3.710919129272614e-05, + "loss": 1.1902, + "step": 4321 + }, + { + "epoch": 0.6794395645424355, + "grad_norm": 0.13222762942314148, + "learning_rate": 3.710378566017087e-05, + "loss": 1.1281, + "step": 4322 + }, + { + "epoch": 0.6795967694393681, + "grad_norm": 0.22074824571609497, + "learning_rate": 3.709837928836242e-05, + "loss": 1.0326, + "step": 4323 + }, + { + "epoch": 0.6797539743363006, + "grad_norm": 0.20594502985477448, + "learning_rate": 3.7092972177631e-05, + "loss": 0.971, + "step": 4324 + }, + { + "epoch": 0.6799111792332331, + "grad_norm": 0.1570967435836792, + "learning_rate": 3.708756432830685e-05, + "loss": 1.1011, + "step": 4325 + }, + { + "epoch": 0.6800683841301657, + "grad_norm": 0.15465806424617767, + "learning_rate": 3.708215574072026e-05, + "loss": 1.1032, + "step": 4326 + }, + { + "epoch": 0.6802255890270982, + "grad_norm": 0.14896081387996674, + "learning_rate": 3.707674641520156e-05, + "loss": 1.1976, + "step": 4327 + }, + { + "epoch": 0.6803827939240308, + "grad_norm": 0.15382733941078186, + "learning_rate": 3.707133635208114e-05, + "loss": 1.1458, + "step": 4328 + }, + { + "epoch": 0.6805399988209633, + "grad_norm": 0.14555004239082336, + "learning_rate": 3.706592555168943e-05, + "loss": 1.1442, + "step": 4329 + }, + { + "epoch": 0.6806972037178958, + "grad_norm": 0.13018903136253357, + "learning_rate": 3.70605140143569e-05, + "loss": 1.1133, + "step": 4330 + }, + { + "epoch": 0.6808544086148284, + "grad_norm": 0.17980679869651794, + "learning_rate": 3.705510174041406e-05, + "loss": 1.1452, + "step": 4331 + }, + { + "epoch": 0.6810116135117609, + "grad_norm": 0.15200766921043396, + "learning_rate": 3.704968873019145e-05, + "loss": 1.0226, + "step": 4332 + }, + { + "epoch": 0.6811688184086935, + "grad_norm": 0.15224698185920715, + "learning_rate": 3.704427498401972e-05, + "loss": 1.2318, + "step": 4333 + }, + { + "epoch": 0.681326023305626, + "grad_norm": 0.15590931475162506, + "learning_rate": 3.70388605022295e-05, + "loss": 1.0923, + "step": 4334 + }, + { + "epoch": 0.6814832282025585, + "grad_norm": 0.16589437425136566, + "learning_rate": 3.703344528515147e-05, + "loss": 1.1986, + "step": 4335 + }, + { + "epoch": 0.6816404330994911, + "grad_norm": 0.15944154560565948, + "learning_rate": 3.7028029333116406e-05, + "loss": 1.1923, + "step": 4336 + }, + { + "epoch": 0.6817976379964236, + "grad_norm": 0.14723898470401764, + "learning_rate": 3.7022612646455064e-05, + "loss": 1.1159, + "step": 4337 + }, + { + "epoch": 0.6819548428933562, + "grad_norm": 0.1588645726442337, + "learning_rate": 3.701719522549828e-05, + "loss": 1.1289, + "step": 4338 + }, + { + "epoch": 0.6821120477902887, + "grad_norm": 0.1596093773841858, + "learning_rate": 3.701177707057694e-05, + "loss": 1.0452, + "step": 4339 + }, + { + "epoch": 0.6822692526872212, + "grad_norm": 0.17482325434684753, + "learning_rate": 3.700635818202196e-05, + "loss": 1.0804, + "step": 4340 + }, + { + "epoch": 0.6824264575841538, + "grad_norm": 0.16940900683403015, + "learning_rate": 3.70009385601643e-05, + "loss": 1.1467, + "step": 4341 + }, + { + "epoch": 0.6825836624810863, + "grad_norm": 0.18156111240386963, + "learning_rate": 3.699551820533498e-05, + "loss": 0.9255, + "step": 4342 + }, + { + "epoch": 0.6827408673780189, + "grad_norm": 0.16947069764137268, + "learning_rate": 3.6990097117865036e-05, + "loss": 1.1077, + "step": 4343 + }, + { + "epoch": 0.6828980722749514, + "grad_norm": 0.15319743752479553, + "learning_rate": 3.698467529808559e-05, + "loss": 1.1573, + "step": 4344 + }, + { + "epoch": 0.6830552771718839, + "grad_norm": 0.15475665032863617, + "learning_rate": 3.697925274632777e-05, + "loss": 1.1847, + "step": 4345 + }, + { + "epoch": 0.6832124820688165, + "grad_norm": 0.1482115238904953, + "learning_rate": 3.697382946292277e-05, + "loss": 1.0272, + "step": 4346 + }, + { + "epoch": 0.683369686965749, + "grad_norm": 0.16407650709152222, + "learning_rate": 3.696840544820182e-05, + "loss": 1.1283, + "step": 4347 + }, + { + "epoch": 0.6835268918626816, + "grad_norm": 0.165381520986557, + "learning_rate": 3.696298070249621e-05, + "loss": 1.174, + "step": 4348 + }, + { + "epoch": 0.6836840967596141, + "grad_norm": 0.15581317245960236, + "learning_rate": 3.6957555226137255e-05, + "loss": 1.1043, + "step": 4349 + }, + { + "epoch": 0.6838413016565466, + "grad_norm": 0.17735551297664642, + "learning_rate": 3.695212901945632e-05, + "loss": 1.1663, + "step": 4350 + }, + { + "epoch": 0.6839985065534792, + "grad_norm": 0.14878909289836884, + "learning_rate": 3.6946702082784815e-05, + "loss": 1.1423, + "step": 4351 + }, + { + "epoch": 0.6841557114504117, + "grad_norm": 0.1387113630771637, + "learning_rate": 3.694127441645421e-05, + "loss": 1.094, + "step": 4352 + }, + { + "epoch": 0.6843129163473443, + "grad_norm": 0.13688206672668457, + "learning_rate": 3.6935846020795986e-05, + "loss": 0.9774, + "step": 4353 + }, + { + "epoch": 0.6844701212442768, + "grad_norm": 0.1413954645395279, + "learning_rate": 3.6930416896141714e-05, + "loss": 1.0235, + "step": 4354 + }, + { + "epoch": 0.6846273261412092, + "grad_norm": 0.1530650556087494, + "learning_rate": 3.6924987042822964e-05, + "loss": 1.1557, + "step": 4355 + }, + { + "epoch": 0.6847845310381419, + "grad_norm": 0.16509467363357544, + "learning_rate": 3.691955646117137e-05, + "loss": 1.0712, + "step": 4356 + }, + { + "epoch": 0.6849417359350743, + "grad_norm": 0.1299772709608078, + "learning_rate": 3.691412515151863e-05, + "loss": 1.0695, + "step": 4357 + }, + { + "epoch": 0.685098940832007, + "grad_norm": 0.14452426135540009, + "learning_rate": 3.690869311419644e-05, + "loss": 1.1729, + "step": 4358 + }, + { + "epoch": 0.6852561457289394, + "grad_norm": 0.17244118452072144, + "learning_rate": 3.69032603495366e-05, + "loss": 1.0492, + "step": 4359 + }, + { + "epoch": 0.6854133506258719, + "grad_norm": 0.1782621592283249, + "learning_rate": 3.68978268578709e-05, + "loss": 1.0566, + "step": 4360 + }, + { + "epoch": 0.6855705555228045, + "grad_norm": 0.17903123795986176, + "learning_rate": 3.6892392639531204e-05, + "loss": 1.1614, + "step": 4361 + }, + { + "epoch": 0.685727760419737, + "grad_norm": 0.17313703894615173, + "learning_rate": 3.6886957694849414e-05, + "loss": 1.0283, + "step": 4362 + }, + { + "epoch": 0.6858849653166696, + "grad_norm": 0.13656044006347656, + "learning_rate": 3.688152202415747e-05, + "loss": 1.1608, + "step": 4363 + }, + { + "epoch": 0.6860421702136021, + "grad_norm": 0.18269652128219604, + "learning_rate": 3.6876085627787376e-05, + "loss": 1.0267, + "step": 4364 + }, + { + "epoch": 0.6861993751105347, + "grad_norm": 0.13547973334789276, + "learning_rate": 3.6870648506071154e-05, + "loss": 1.1485, + "step": 4365 + }, + { + "epoch": 0.6863565800074672, + "grad_norm": 0.13134713470935822, + "learning_rate": 3.686521065934089e-05, + "loss": 1.091, + "step": 4366 + }, + { + "epoch": 0.6865137849043997, + "grad_norm": 0.1429309844970703, + "learning_rate": 3.6859772087928694e-05, + "loss": 1.3158, + "step": 4367 + }, + { + "epoch": 0.6866709898013323, + "grad_norm": 0.14805296063423157, + "learning_rate": 3.6854332792166745e-05, + "loss": 1.1289, + "step": 4368 + }, + { + "epoch": 0.6868281946982648, + "grad_norm": 0.16160231828689575, + "learning_rate": 3.684889277238726e-05, + "loss": 1.1209, + "step": 4369 + }, + { + "epoch": 0.6869853995951974, + "grad_norm": 0.1638854295015335, + "learning_rate": 3.684345202892248e-05, + "loss": 1.188, + "step": 4370 + }, + { + "epoch": 0.6871426044921299, + "grad_norm": 0.1331411898136139, + "learning_rate": 3.683801056210471e-05, + "loss": 1.0899, + "step": 4371 + }, + { + "epoch": 0.6872998093890624, + "grad_norm": 0.1672300398349762, + "learning_rate": 3.6832568372266294e-05, + "loss": 1.1262, + "step": 4372 + }, + { + "epoch": 0.687457014285995, + "grad_norm": 0.19106687605381012, + "learning_rate": 3.682712545973963e-05, + "loss": 1.0008, + "step": 4373 + }, + { + "epoch": 0.6876142191829275, + "grad_norm": 0.18553289771080017, + "learning_rate": 3.682168182485713e-05, + "loss": 1.0962, + "step": 4374 + }, + { + "epoch": 0.6877714240798601, + "grad_norm": 0.1664271205663681, + "learning_rate": 3.681623746795129e-05, + "loss": 1.0545, + "step": 4375 + }, + { + "epoch": 0.6879286289767926, + "grad_norm": 0.16832974553108215, + "learning_rate": 3.681079238935463e-05, + "loss": 1.0419, + "step": 4376 + }, + { + "epoch": 0.6880858338737251, + "grad_norm": 0.14619651436805725, + "learning_rate": 3.6805346589399695e-05, + "loss": 1.071, + "step": 4377 + }, + { + "epoch": 0.6882430387706577, + "grad_norm": 0.16446255147457123, + "learning_rate": 3.679990006841911e-05, + "loss": 1.058, + "step": 4378 + }, + { + "epoch": 0.6884002436675902, + "grad_norm": 0.13051556050777435, + "learning_rate": 3.679445282674553e-05, + "loss": 1.1425, + "step": 4379 + }, + { + "epoch": 0.6885574485645228, + "grad_norm": 0.1575997918844223, + "learning_rate": 3.6789004864711644e-05, + "loss": 1.1478, + "step": 4380 + }, + { + "epoch": 0.6887146534614553, + "grad_norm": 0.15147651731967926, + "learning_rate": 3.67835561826502e-05, + "loss": 1.1867, + "step": 4381 + }, + { + "epoch": 0.6888718583583878, + "grad_norm": 0.17867539823055267, + "learning_rate": 3.677810678089397e-05, + "loss": 1.1013, + "step": 4382 + }, + { + "epoch": 0.6890290632553204, + "grad_norm": 0.16259698569774628, + "learning_rate": 3.677265665977579e-05, + "loss": 1.229, + "step": 4383 + }, + { + "epoch": 0.6891862681522529, + "grad_norm": 0.13512615859508514, + "learning_rate": 3.6767205819628534e-05, + "loss": 1.171, + "step": 4384 + }, + { + "epoch": 0.6893434730491855, + "grad_norm": 0.17127491533756256, + "learning_rate": 3.6761754260785126e-05, + "loss": 1.1358, + "step": 4385 + }, + { + "epoch": 0.689500677946118, + "grad_norm": 0.15448449552059174, + "learning_rate": 3.675630198357851e-05, + "loss": 1.1093, + "step": 4386 + }, + { + "epoch": 0.6896578828430505, + "grad_norm": 0.17164385318756104, + "learning_rate": 3.6750848988341704e-05, + "loss": 1.0478, + "step": 4387 + }, + { + "epoch": 0.6898150877399831, + "grad_norm": 0.17493198812007904, + "learning_rate": 3.6745395275407744e-05, + "loss": 1.1615, + "step": 4388 + }, + { + "epoch": 0.6899722926369156, + "grad_norm": 0.14925609529018402, + "learning_rate": 3.6739940845109735e-05, + "loss": 1.0931, + "step": 4389 + }, + { + "epoch": 0.6901294975338482, + "grad_norm": 0.12435558438301086, + "learning_rate": 3.6734485697780806e-05, + "loss": 1.137, + "step": 4390 + }, + { + "epoch": 0.6902867024307807, + "grad_norm": 0.16357684135437012, + "learning_rate": 3.672902983375413e-05, + "loss": 1.0787, + "step": 4391 + }, + { + "epoch": 0.6904439073277132, + "grad_norm": 0.1529734581708908, + "learning_rate": 3.6723573253362945e-05, + "loss": 1.111, + "step": 4392 + }, + { + "epoch": 0.6906011122246458, + "grad_norm": 0.19325505197048187, + "learning_rate": 3.671811595694051e-05, + "loss": 1.0739, + "step": 4393 + }, + { + "epoch": 0.6907583171215783, + "grad_norm": 0.14742138981819153, + "learning_rate": 3.6712657944820144e-05, + "loss": 1.1334, + "step": 4394 + }, + { + "epoch": 0.6909155220185109, + "grad_norm": 0.16689634323120117, + "learning_rate": 3.670719921733519e-05, + "loss": 0.9864, + "step": 4395 + }, + { + "epoch": 0.6910727269154434, + "grad_norm": 0.14415030181407928, + "learning_rate": 3.6701739774819046e-05, + "loss": 1.1618, + "step": 4396 + }, + { + "epoch": 0.6912299318123759, + "grad_norm": 0.15743502974510193, + "learning_rate": 3.6696279617605155e-05, + "loss": 1.0823, + "step": 4397 + }, + { + "epoch": 0.6913871367093085, + "grad_norm": 0.15715019404888153, + "learning_rate": 3.669081874602701e-05, + "loss": 1.093, + "step": 4398 + }, + { + "epoch": 0.691544341606241, + "grad_norm": 0.1599857062101364, + "learning_rate": 3.668535716041814e-05, + "loss": 1.1724, + "step": 4399 + }, + { + "epoch": 0.6917015465031736, + "grad_norm": 0.15201207995414734, + "learning_rate": 3.667989486111212e-05, + "loss": 1.1116, + "step": 4400 + }, + { + "epoch": 0.6918587514001061, + "grad_norm": 0.15862667560577393, + "learning_rate": 3.667443184844256e-05, + "loss": 1.1513, + "step": 4401 + }, + { + "epoch": 0.6920159562970386, + "grad_norm": 0.1607983559370041, + "learning_rate": 3.666896812274311e-05, + "loss": 1.0895, + "step": 4402 + }, + { + "epoch": 0.6921731611939712, + "grad_norm": 0.14757487177848816, + "learning_rate": 3.666350368434749e-05, + "loss": 1.0656, + "step": 4403 + }, + { + "epoch": 0.6923303660909037, + "grad_norm": 0.19606012105941772, + "learning_rate": 3.665803853358944e-05, + "loss": 1.1677, + "step": 4404 + }, + { + "epoch": 0.6924875709878363, + "grad_norm": 0.1427164077758789, + "learning_rate": 3.6652572670802754e-05, + "loss": 1.047, + "step": 4405 + }, + { + "epoch": 0.6926447758847688, + "grad_norm": 0.12801557779312134, + "learning_rate": 3.664710609632127e-05, + "loss": 1.1052, + "step": 4406 + }, + { + "epoch": 0.6928019807817013, + "grad_norm": 0.17031121253967285, + "learning_rate": 3.664163881047884e-05, + "loss": 1.0519, + "step": 4407 + }, + { + "epoch": 0.6929591856786339, + "grad_norm": 0.14045396447181702, + "learning_rate": 3.6636170813609425e-05, + "loss": 1.0799, + "step": 4408 + }, + { + "epoch": 0.6931163905755664, + "grad_norm": 0.16111011803150177, + "learning_rate": 3.663070210604697e-05, + "loss": 1.0628, + "step": 4409 + }, + { + "epoch": 0.693273595472499, + "grad_norm": 0.15665438771247864, + "learning_rate": 3.662523268812547e-05, + "loss": 1.2111, + "step": 4410 + }, + { + "epoch": 0.6934308003694315, + "grad_norm": 0.1392172873020172, + "learning_rate": 3.6619762560179006e-05, + "loss": 1.1297, + "step": 4411 + }, + { + "epoch": 0.693588005266364, + "grad_norm": 0.20096223056316376, + "learning_rate": 3.661429172254163e-05, + "loss": 1.1301, + "step": 4412 + }, + { + "epoch": 0.6937452101632966, + "grad_norm": 0.16046035289764404, + "learning_rate": 3.6608820175547526e-05, + "loss": 1.125, + "step": 4413 + }, + { + "epoch": 0.6939024150602291, + "grad_norm": 0.13595367968082428, + "learning_rate": 3.660334791953085e-05, + "loss": 1.0301, + "step": 4414 + }, + { + "epoch": 0.6940596199571617, + "grad_norm": 0.1444806009531021, + "learning_rate": 3.659787495482583e-05, + "loss": 1.1592, + "step": 4415 + }, + { + "epoch": 0.6942168248540942, + "grad_norm": 0.16791272163391113, + "learning_rate": 3.659240128176673e-05, + "loss": 1.1495, + "step": 4416 + }, + { + "epoch": 0.6943740297510267, + "grad_norm": 0.13716629147529602, + "learning_rate": 3.658692690068787e-05, + "loss": 1.127, + "step": 4417 + }, + { + "epoch": 0.6945312346479593, + "grad_norm": 0.12987127900123596, + "learning_rate": 3.6581451811923596e-05, + "loss": 1.2504, + "step": 4418 + }, + { + "epoch": 0.6946884395448918, + "grad_norm": 0.1585957407951355, + "learning_rate": 3.657597601580831e-05, + "loss": 1.0458, + "step": 4419 + }, + { + "epoch": 0.6948456444418244, + "grad_norm": 0.15420947968959808, + "learning_rate": 3.6570499512676465e-05, + "loss": 1.0563, + "step": 4420 + }, + { + "epoch": 0.6950028493387569, + "grad_norm": 0.12740959227085114, + "learning_rate": 3.6565022302862526e-05, + "loss": 1.0257, + "step": 4421 + }, + { + "epoch": 0.6951600542356895, + "grad_norm": 0.15865959227085114, + "learning_rate": 3.6559544386701015e-05, + "loss": 1.1003, + "step": 4422 + }, + { + "epoch": 0.695317259132622, + "grad_norm": 0.16942833364009857, + "learning_rate": 3.6554065764526524e-05, + "loss": 1.054, + "step": 4423 + }, + { + "epoch": 0.6954744640295545, + "grad_norm": 0.1516052782535553, + "learning_rate": 3.654858643667365e-05, + "loss": 1.0383, + "step": 4424 + }, + { + "epoch": 0.6956316689264871, + "grad_norm": 0.12675218284130096, + "learning_rate": 3.654310640347707e-05, + "loss": 1.1814, + "step": 4425 + }, + { + "epoch": 0.6957888738234196, + "grad_norm": 0.1705823838710785, + "learning_rate": 3.653762566527146e-05, + "loss": 1.1676, + "step": 4426 + }, + { + "epoch": 0.6959460787203522, + "grad_norm": 0.14712105691432953, + "learning_rate": 3.653214422239157e-05, + "loss": 1.2246, + "step": 4427 + }, + { + "epoch": 0.6961032836172847, + "grad_norm": 0.13124018907546997, + "learning_rate": 3.652666207517219e-05, + "loss": 1.1254, + "step": 4428 + }, + { + "epoch": 0.6962604885142172, + "grad_norm": 0.13952194154262543, + "learning_rate": 3.6521179223948153e-05, + "loss": 1.1184, + "step": 4429 + }, + { + "epoch": 0.6964176934111498, + "grad_norm": 0.16852107644081116, + "learning_rate": 3.651569566905432e-05, + "loss": 1.0189, + "step": 4430 + }, + { + "epoch": 0.6965748983080823, + "grad_norm": 0.15509940683841705, + "learning_rate": 3.6510211410825614e-05, + "loss": 1.018, + "step": 4431 + }, + { + "epoch": 0.6967321032050149, + "grad_norm": 0.15195202827453613, + "learning_rate": 3.650472644959698e-05, + "loss": 1.1238, + "step": 4432 + }, + { + "epoch": 0.6968893081019474, + "grad_norm": 0.18177133798599243, + "learning_rate": 3.6499240785703426e-05, + "loss": 1.2951, + "step": 4433 + }, + { + "epoch": 0.6970465129988799, + "grad_norm": 0.17335766553878784, + "learning_rate": 3.649375441948001e-05, + "loss": 1.1246, + "step": 4434 + }, + { + "epoch": 0.6972037178958125, + "grad_norm": 0.15300196409225464, + "learning_rate": 3.648826735126179e-05, + "loss": 1.0736, + "step": 4435 + }, + { + "epoch": 0.697360922792745, + "grad_norm": 0.1609075963497162, + "learning_rate": 3.648277958138392e-05, + "loss": 1.1375, + "step": 4436 + }, + { + "epoch": 0.6975181276896776, + "grad_norm": 0.1483910232782364, + "learning_rate": 3.647729111018156e-05, + "loss": 1.2151, + "step": 4437 + }, + { + "epoch": 0.6976753325866101, + "grad_norm": 0.16368529200553894, + "learning_rate": 3.647180193798992e-05, + "loss": 1.0947, + "step": 4438 + }, + { + "epoch": 0.6978325374835426, + "grad_norm": 0.16666406393051147, + "learning_rate": 3.646631206514427e-05, + "loss": 1.1475, + "step": 4439 + }, + { + "epoch": 0.6979897423804752, + "grad_norm": 0.17518015205860138, + "learning_rate": 3.646082149197991e-05, + "loss": 1.242, + "step": 4440 + }, + { + "epoch": 0.6981469472774077, + "grad_norm": 0.15665574371814728, + "learning_rate": 3.645533021883218e-05, + "loss": 1.1221, + "step": 4441 + }, + { + "epoch": 0.6983041521743403, + "grad_norm": 0.1375490128993988, + "learning_rate": 3.644983824603645e-05, + "loss": 0.9732, + "step": 4442 + }, + { + "epoch": 0.6984613570712728, + "grad_norm": 0.19858404994010925, + "learning_rate": 3.644434557392818e-05, + "loss": 1.0138, + "step": 4443 + }, + { + "epoch": 0.6986185619682053, + "grad_norm": 0.15151138603687286, + "learning_rate": 3.643885220284282e-05, + "loss": 1.0626, + "step": 4444 + }, + { + "epoch": 0.6987757668651379, + "grad_norm": 0.13844822347164154, + "learning_rate": 3.6433358133115884e-05, + "loss": 1.0452, + "step": 4445 + }, + { + "epoch": 0.6989329717620704, + "grad_norm": 0.1529357135295868, + "learning_rate": 3.642786336508294e-05, + "loss": 1.0192, + "step": 4446 + }, + { + "epoch": 0.699090176659003, + "grad_norm": 0.19727185368537903, + "learning_rate": 3.642236789907958e-05, + "loss": 1.0846, + "step": 4447 + }, + { + "epoch": 0.6992473815559355, + "grad_norm": 0.16505931317806244, + "learning_rate": 3.6416871735441446e-05, + "loss": 1.1061, + "step": 4448 + }, + { + "epoch": 0.699404586452868, + "grad_norm": 0.1713247001171112, + "learning_rate": 3.6411374874504236e-05, + "loss": 1.0497, + "step": 4449 + }, + { + "epoch": 0.6995617913498006, + "grad_norm": 0.14336654543876648, + "learning_rate": 3.640587731660366e-05, + "loss": 1.2052, + "step": 4450 + }, + { + "epoch": 0.6997189962467331, + "grad_norm": 0.15254364907741547, + "learning_rate": 3.640037906207549e-05, + "loss": 1.1133, + "step": 4451 + }, + { + "epoch": 0.6998762011436657, + "grad_norm": 0.17495720088481903, + "learning_rate": 3.639488011125553e-05, + "loss": 1.0429, + "step": 4452 + }, + { + "epoch": 0.7000334060405982, + "grad_norm": 0.14323261380195618, + "learning_rate": 3.638938046447967e-05, + "loss": 1.0586, + "step": 4453 + }, + { + "epoch": 0.7001906109375307, + "grad_norm": 0.1447620689868927, + "learning_rate": 3.6383880122083775e-05, + "loss": 1.121, + "step": 4454 + }, + { + "epoch": 0.7003478158344633, + "grad_norm": 0.15612784028053284, + "learning_rate": 3.6378379084403804e-05, + "loss": 1.09, + "step": 4455 + }, + { + "epoch": 0.7005050207313958, + "grad_norm": 0.14692716300487518, + "learning_rate": 3.637287735177571e-05, + "loss": 1.1235, + "step": 4456 + }, + { + "epoch": 0.7006622256283284, + "grad_norm": 0.14650915563106537, + "learning_rate": 3.6367374924535556e-05, + "loss": 1.0611, + "step": 4457 + }, + { + "epoch": 0.7008194305252609, + "grad_norm": 0.13822227716445923, + "learning_rate": 3.636187180301939e-05, + "loss": 1.1958, + "step": 4458 + }, + { + "epoch": 0.7009766354221934, + "grad_norm": 0.13127368688583374, + "learning_rate": 3.6356367987563316e-05, + "loss": 1.0414, + "step": 4459 + }, + { + "epoch": 0.701133840319126, + "grad_norm": 0.15684054791927338, + "learning_rate": 3.6350863478503505e-05, + "loss": 1.0821, + "step": 4460 + }, + { + "epoch": 0.7012910452160585, + "grad_norm": 0.1420765221118927, + "learning_rate": 3.634535827617612e-05, + "loss": 1.0357, + "step": 4461 + }, + { + "epoch": 0.7014482501129911, + "grad_norm": 0.1303970217704773, + "learning_rate": 3.633985238091744e-05, + "loss": 1.1562, + "step": 4462 + }, + { + "epoch": 0.7016054550099236, + "grad_norm": 0.1747986227273941, + "learning_rate": 3.633434579306371e-05, + "loss": 1.2047, + "step": 4463 + }, + { + "epoch": 0.701762659906856, + "grad_norm": 0.15176185965538025, + "learning_rate": 3.632883851295127e-05, + "loss": 0.9414, + "step": 4464 + }, + { + "epoch": 0.7019198648037887, + "grad_norm": 0.16095110774040222, + "learning_rate": 3.6323330540916474e-05, + "loss": 1.0578, + "step": 4465 + }, + { + "epoch": 0.7020770697007211, + "grad_norm": 0.14516471326351166, + "learning_rate": 3.6317821877295724e-05, + "loss": 0.99, + "step": 4466 + }, + { + "epoch": 0.7022342745976538, + "grad_norm": 0.1713501513004303, + "learning_rate": 3.631231252242549e-05, + "loss": 1.1529, + "step": 4467 + }, + { + "epoch": 0.7023914794945862, + "grad_norm": 0.13815224170684814, + "learning_rate": 3.630680247664223e-05, + "loss": 1.1611, + "step": 4468 + }, + { + "epoch": 0.7025486843915187, + "grad_norm": 0.15194745361804962, + "learning_rate": 3.63012917402825e-05, + "loss": 1.1695, + "step": 4469 + }, + { + "epoch": 0.7027058892884513, + "grad_norm": 0.15428611636161804, + "learning_rate": 3.629578031368288e-05, + "loss": 1.1073, + "step": 4470 + }, + { + "epoch": 0.7028630941853838, + "grad_norm": 0.1365196257829666, + "learning_rate": 3.6290268197179966e-05, + "loss": 1.1479, + "step": 4471 + }, + { + "epoch": 0.7030202990823164, + "grad_norm": 0.15833613276481628, + "learning_rate": 3.628475539111043e-05, + "loss": 1.1053, + "step": 4472 + }, + { + "epoch": 0.7031775039792489, + "grad_norm": 0.15446895360946655, + "learning_rate": 3.627924189581097e-05, + "loss": 1.0302, + "step": 4473 + }, + { + "epoch": 0.7033347088761815, + "grad_norm": 0.1326589435338974, + "learning_rate": 3.627372771161833e-05, + "loss": 1.1417, + "step": 4474 + }, + { + "epoch": 0.703491913773114, + "grad_norm": 0.13878080248832703, + "learning_rate": 3.62682128388693e-05, + "loss": 1.0901, + "step": 4475 + }, + { + "epoch": 0.7036491186700465, + "grad_norm": 0.19546456634998322, + "learning_rate": 3.6262697277900694e-05, + "loss": 1.0839, + "step": 4476 + }, + { + "epoch": 0.7038063235669791, + "grad_norm": 0.1378042995929718, + "learning_rate": 3.625718102904939e-05, + "loss": 1.1619, + "step": 4477 + }, + { + "epoch": 0.7039635284639116, + "grad_norm": 0.13764554262161255, + "learning_rate": 3.6251664092652305e-05, + "loss": 1.0876, + "step": 4478 + }, + { + "epoch": 0.7041207333608442, + "grad_norm": 0.1276378333568573, + "learning_rate": 3.624614646904638e-05, + "loss": 0.9421, + "step": 4479 + }, + { + "epoch": 0.7042779382577767, + "grad_norm": 0.1680934727191925, + "learning_rate": 3.624062815856862e-05, + "loss": 1.1248, + "step": 4480 + }, + { + "epoch": 0.7042779382577767, + "eval_loss": 1.0991666316986084, + "eval_runtime": 2304.3083, + "eval_samples_per_second": 4.018, + "eval_steps_per_second": 2.009, + "step": 4480 + }, + { + "epoch": 0.7044351431547092, + "grad_norm": 0.15627016127109528, + "learning_rate": 3.623510916155607e-05, + "loss": 1.19, + "step": 4481 + }, + { + "epoch": 0.7045923480516418, + "grad_norm": 0.1757209748029709, + "learning_rate": 3.622958947834579e-05, + "loss": 1.1513, + "step": 4482 + }, + { + "epoch": 0.7047495529485743, + "grad_norm": 0.17841759324073792, + "learning_rate": 3.6224069109274914e-05, + "loss": 1.1358, + "step": 4483 + }, + { + "epoch": 0.7049067578455069, + "grad_norm": 0.19921208918094635, + "learning_rate": 3.6218548054680595e-05, + "loss": 1.1385, + "step": 4484 + }, + { + "epoch": 0.7050639627424394, + "grad_norm": 0.13649779558181763, + "learning_rate": 3.6213026314900055e-05, + "loss": 1.0355, + "step": 4485 + }, + { + "epoch": 0.7052211676393719, + "grad_norm": 0.1654793620109558, + "learning_rate": 3.620750389027051e-05, + "loss": 1.1228, + "step": 4486 + }, + { + "epoch": 0.7053783725363045, + "grad_norm": 0.1438949704170227, + "learning_rate": 3.620198078112929e-05, + "loss": 1.1667, + "step": 4487 + }, + { + "epoch": 0.705535577433237, + "grad_norm": 0.14017674326896667, + "learning_rate": 3.6196456987813704e-05, + "loss": 1.0307, + "step": 4488 + }, + { + "epoch": 0.7056927823301696, + "grad_norm": 0.13663023710250854, + "learning_rate": 3.619093251066112e-05, + "loss": 1.0255, + "step": 4489 + }, + { + "epoch": 0.7058499872271021, + "grad_norm": 0.1306449919939041, + "learning_rate": 3.618540735000896e-05, + "loss": 0.9965, + "step": 4490 + }, + { + "epoch": 0.7060071921240346, + "grad_norm": 0.13589157164096832, + "learning_rate": 3.6179881506194665e-05, + "loss": 1.1561, + "step": 4491 + }, + { + "epoch": 0.7061643970209672, + "grad_norm": 0.15856143832206726, + "learning_rate": 3.6174354979555756e-05, + "loss": 0.9947, + "step": 4492 + }, + { + "epoch": 0.7063216019178997, + "grad_norm": 0.20200398564338684, + "learning_rate": 3.616882777042975e-05, + "loss": 1.1324, + "step": 4493 + }, + { + "epoch": 0.7064788068148323, + "grad_norm": 0.12902216613292694, + "learning_rate": 3.616329987915425e-05, + "loss": 1.1257, + "step": 4494 + }, + { + "epoch": 0.7066360117117648, + "grad_norm": 0.12593093514442444, + "learning_rate": 3.615777130606687e-05, + "loss": 1.0938, + "step": 4495 + }, + { + "epoch": 0.7067932166086973, + "grad_norm": 0.17199790477752686, + "learning_rate": 3.615224205150525e-05, + "loss": 1.013, + "step": 4496 + }, + { + "epoch": 0.7069504215056299, + "grad_norm": 0.12172434478998184, + "learning_rate": 3.6146712115807134e-05, + "loss": 1.0747, + "step": 4497 + }, + { + "epoch": 0.7071076264025624, + "grad_norm": 0.12670786678791046, + "learning_rate": 3.614118149931025e-05, + "loss": 1.0288, + "step": 4498 + }, + { + "epoch": 0.707264831299495, + "grad_norm": 0.14240224659442902, + "learning_rate": 3.613565020235239e-05, + "loss": 1.2203, + "step": 4499 + }, + { + "epoch": 0.7074220361964275, + "grad_norm": 0.15334568917751312, + "learning_rate": 3.613011822527138e-05, + "loss": 1.1555, + "step": 4500 + }, + { + "epoch": 0.70757924109336, + "grad_norm": 0.19086161255836487, + "learning_rate": 3.61245855684051e-05, + "loss": 0.9874, + "step": 4501 + }, + { + "epoch": 0.7077364459902926, + "grad_norm": 0.1404600292444229, + "learning_rate": 3.611905223209145e-05, + "loss": 1.1017, + "step": 4502 + }, + { + "epoch": 0.7078936508872251, + "grad_norm": 0.35620176792144775, + "learning_rate": 3.611351821666841e-05, + "loss": 1.0504, + "step": 4503 + }, + { + "epoch": 0.7080508557841577, + "grad_norm": 0.13687248528003693, + "learning_rate": 3.610798352247396e-05, + "loss": 1.0093, + "step": 4504 + }, + { + "epoch": 0.7082080606810902, + "grad_norm": 0.14304716885089874, + "learning_rate": 3.6102448149846125e-05, + "loss": 1.1835, + "step": 4505 + }, + { + "epoch": 0.7083652655780227, + "grad_norm": 0.14237457513809204, + "learning_rate": 3.609691209912302e-05, + "loss": 1.2334, + "step": 4506 + }, + { + "epoch": 0.7085224704749553, + "grad_norm": 0.15722329914569855, + "learning_rate": 3.609137537064272e-05, + "loss": 1.114, + "step": 4507 + }, + { + "epoch": 0.7086796753718878, + "grad_norm": 0.1474800407886505, + "learning_rate": 3.608583796474343e-05, + "loss": 1.0025, + "step": 4508 + }, + { + "epoch": 0.7088368802688204, + "grad_norm": 0.15540659427642822, + "learning_rate": 3.6080299881763336e-05, + "loss": 1.1032, + "step": 4509 + }, + { + "epoch": 0.7089940851657529, + "grad_norm": 0.14503051340579987, + "learning_rate": 3.6074761122040665e-05, + "loss": 1.1193, + "step": 4510 + }, + { + "epoch": 0.7091512900626854, + "grad_norm": 0.15747176110744476, + "learning_rate": 3.606922168591374e-05, + "loss": 1.0396, + "step": 4511 + }, + { + "epoch": 0.709308494959618, + "grad_norm": 0.1525149792432785, + "learning_rate": 3.606368157372087e-05, + "loss": 1.1112, + "step": 4512 + }, + { + "epoch": 0.7094656998565505, + "grad_norm": 0.20445869863033295, + "learning_rate": 3.605814078580042e-05, + "loss": 0.956, + "step": 4513 + }, + { + "epoch": 0.7096229047534831, + "grad_norm": 0.1403074860572815, + "learning_rate": 3.6052599322490805e-05, + "loss": 1.1497, + "step": 4514 + }, + { + "epoch": 0.7097801096504156, + "grad_norm": 0.16936302185058594, + "learning_rate": 3.604705718413047e-05, + "loss": 1.153, + "step": 4515 + }, + { + "epoch": 0.7099373145473481, + "grad_norm": 0.14044791460037231, + "learning_rate": 3.6041514371057916e-05, + "loss": 1.1045, + "step": 4516 + }, + { + "epoch": 0.7100945194442807, + "grad_norm": 0.13026654720306396, + "learning_rate": 3.6035970883611675e-05, + "loss": 1.0103, + "step": 4517 + }, + { + "epoch": 0.7102517243412132, + "grad_norm": 0.1425241082906723, + "learning_rate": 3.603042672213033e-05, + "loss": 1.1188, + "step": 4518 + }, + { + "epoch": 0.7104089292381458, + "grad_norm": 0.17885011434555054, + "learning_rate": 3.6024881886952474e-05, + "loss": 0.9175, + "step": 4519 + }, + { + "epoch": 0.7105661341350783, + "grad_norm": 0.1704678237438202, + "learning_rate": 3.601933637841679e-05, + "loss": 1.2171, + "step": 4520 + }, + { + "epoch": 0.7107233390320108, + "grad_norm": 0.1702074110507965, + "learning_rate": 3.601379019686196e-05, + "loss": 1.1282, + "step": 4521 + }, + { + "epoch": 0.7108805439289434, + "grad_norm": 0.14040248095989227, + "learning_rate": 3.6008243342626734e-05, + "loss": 1.1557, + "step": 4522 + }, + { + "epoch": 0.7110377488258759, + "grad_norm": 0.22482609748840332, + "learning_rate": 3.6002695816049884e-05, + "loss": 1.0938, + "step": 4523 + }, + { + "epoch": 0.7111949537228085, + "grad_norm": 0.16553983092308044, + "learning_rate": 3.599714761747024e-05, + "loss": 1.2154, + "step": 4524 + }, + { + "epoch": 0.711352158619741, + "grad_norm": 0.15201883018016815, + "learning_rate": 3.599159874722666e-05, + "loss": 1.0501, + "step": 4525 + }, + { + "epoch": 0.7115093635166736, + "grad_norm": 0.1263444870710373, + "learning_rate": 3.5986049205658046e-05, + "loss": 1.0935, + "step": 4526 + }, + { + "epoch": 0.7116665684136061, + "grad_norm": 0.1617809683084488, + "learning_rate": 3.598049899310335e-05, + "loss": 1.1585, + "step": 4527 + }, + { + "epoch": 0.7118237733105386, + "grad_norm": 0.14360898733139038, + "learning_rate": 3.5974948109901554e-05, + "loss": 1.1632, + "step": 4528 + }, + { + "epoch": 0.7119809782074712, + "grad_norm": 0.17009443044662476, + "learning_rate": 3.5969396556391674e-05, + "loss": 1.086, + "step": 4529 + }, + { + "epoch": 0.7121381831044037, + "grad_norm": 0.1356435865163803, + "learning_rate": 3.59638443329128e-05, + "loss": 1.0879, + "step": 4530 + }, + { + "epoch": 0.7122953880013363, + "grad_norm": 0.16926179826259613, + "learning_rate": 3.595829143980403e-05, + "loss": 1.0006, + "step": 4531 + }, + { + "epoch": 0.7124525928982688, + "grad_norm": 0.18381322920322418, + "learning_rate": 3.5952737877404506e-05, + "loss": 1.1344, + "step": 4532 + }, + { + "epoch": 0.7126097977952013, + "grad_norm": 0.15497080981731415, + "learning_rate": 3.594718364605342e-05, + "loss": 1.0882, + "step": 4533 + }, + { + "epoch": 0.7127670026921339, + "grad_norm": 0.14986175298690796, + "learning_rate": 3.5941628746090017e-05, + "loss": 1.1863, + "step": 4534 + }, + { + "epoch": 0.7129242075890664, + "grad_norm": 0.15243493020534515, + "learning_rate": 3.593607317785356e-05, + "loss": 1.0519, + "step": 4535 + }, + { + "epoch": 0.713081412485999, + "grad_norm": 0.15663480758666992, + "learning_rate": 3.593051694168336e-05, + "loss": 1.1227, + "step": 4536 + }, + { + "epoch": 0.7132386173829315, + "grad_norm": 0.15480025112628937, + "learning_rate": 3.5924960037918775e-05, + "loss": 1.1673, + "step": 4537 + }, + { + "epoch": 0.713395822279864, + "grad_norm": 0.13647949695587158, + "learning_rate": 3.5919402466899196e-05, + "loss": 1.1236, + "step": 4538 + }, + { + "epoch": 0.7135530271767966, + "grad_norm": 0.13751398026943207, + "learning_rate": 3.591384422896406e-05, + "loss": 1.1066, + "step": 4539 + }, + { + "epoch": 0.7137102320737291, + "grad_norm": 0.17009837925434113, + "learning_rate": 3.590828532445284e-05, + "loss": 1.113, + "step": 4540 + }, + { + "epoch": 0.7138674369706617, + "grad_norm": 0.14258937537670135, + "learning_rate": 3.590272575370506e-05, + "loss": 1.1194, + "step": 4541 + }, + { + "epoch": 0.7140246418675942, + "grad_norm": 0.16682106256484985, + "learning_rate": 3.5897165517060275e-05, + "loss": 0.9974, + "step": 4542 + }, + { + "epoch": 0.7141818467645267, + "grad_norm": 0.14075370132923126, + "learning_rate": 3.589160461485807e-05, + "loss": 1.1454, + "step": 4543 + }, + { + "epoch": 0.7143390516614593, + "grad_norm": 0.14308702945709229, + "learning_rate": 3.5886043047438107e-05, + "loss": 1.1113, + "step": 4544 + }, + { + "epoch": 0.7144962565583918, + "grad_norm": 0.16267889738082886, + "learning_rate": 3.5880480815140044e-05, + "loss": 1.0237, + "step": 4545 + }, + { + "epoch": 0.7146534614553244, + "grad_norm": 0.13113896548748016, + "learning_rate": 3.587491791830362e-05, + "loss": 1.0811, + "step": 4546 + }, + { + "epoch": 0.7148106663522569, + "grad_norm": 0.15190038084983826, + "learning_rate": 3.5869354357268583e-05, + "loss": 1.1487, + "step": 4547 + }, + { + "epoch": 0.7149678712491894, + "grad_norm": 0.15138080716133118, + "learning_rate": 3.5863790132374736e-05, + "loss": 1.0786, + "step": 4548 + }, + { + "epoch": 0.715125076146122, + "grad_norm": 0.18089433014392853, + "learning_rate": 3.585822524396192e-05, + "loss": 1.0491, + "step": 4549 + }, + { + "epoch": 0.7152822810430545, + "grad_norm": 0.14020849764347076, + "learning_rate": 3.5852659692370014e-05, + "loss": 1.147, + "step": 4550 + }, + { + "epoch": 0.7154394859399871, + "grad_norm": 0.13302665948867798, + "learning_rate": 3.5847093477938956e-05, + "loss": 1.127, + "step": 4551 + }, + { + "epoch": 0.7155966908369196, + "grad_norm": 0.12847675383090973, + "learning_rate": 3.584152660100869e-05, + "loss": 1.1067, + "step": 4552 + }, + { + "epoch": 0.7157538957338521, + "grad_norm": 0.14483784139156342, + "learning_rate": 3.583595906191924e-05, + "loss": 1.1292, + "step": 4553 + }, + { + "epoch": 0.7159111006307847, + "grad_norm": 0.21397073566913605, + "learning_rate": 3.583039086101063e-05, + "loss": 1.046, + "step": 4554 + }, + { + "epoch": 0.7160683055277172, + "grad_norm": 0.148904949426651, + "learning_rate": 3.582482199862295e-05, + "loss": 1.0681, + "step": 4555 + }, + { + "epoch": 0.7162255104246498, + "grad_norm": 0.15180173516273499, + "learning_rate": 3.5819252475096335e-05, + "loss": 1.1507, + "step": 4556 + }, + { + "epoch": 0.7163827153215823, + "grad_norm": 0.14372770488262177, + "learning_rate": 3.5813682290770944e-05, + "loss": 1.0862, + "step": 4557 + }, + { + "epoch": 0.7165399202185148, + "grad_norm": 0.1429951786994934, + "learning_rate": 3.580811144598698e-05, + "loss": 1.1292, + "step": 4558 + }, + { + "epoch": 0.7166971251154474, + "grad_norm": 0.13750120997428894, + "learning_rate": 3.580253994108469e-05, + "loss": 1.1985, + "step": 4559 + }, + { + "epoch": 0.7168543300123799, + "grad_norm": 0.1471010148525238, + "learning_rate": 3.579696777640436e-05, + "loss": 1.1356, + "step": 4560 + }, + { + "epoch": 0.7170115349093125, + "grad_norm": 0.1405636966228485, + "learning_rate": 3.579139495228632e-05, + "loss": 1.1764, + "step": 4561 + }, + { + "epoch": 0.717168739806245, + "grad_norm": 0.1493627279996872, + "learning_rate": 3.578582146907094e-05, + "loss": 0.9842, + "step": 4562 + }, + { + "epoch": 0.7173259447031775, + "grad_norm": 0.14724910259246826, + "learning_rate": 3.5780247327098614e-05, + "loss": 1.2037, + "step": 4563 + }, + { + "epoch": 0.7174831496001101, + "grad_norm": 0.15836381912231445, + "learning_rate": 3.5774672526709805e-05, + "loss": 1.1379, + "step": 4564 + }, + { + "epoch": 0.7176403544970426, + "grad_norm": 0.14133360981941223, + "learning_rate": 3.5769097068244985e-05, + "loss": 0.9887, + "step": 4565 + }, + { + "epoch": 0.7177975593939752, + "grad_norm": 0.15250210464000702, + "learning_rate": 3.576352095204469e-05, + "loss": 1.0526, + "step": 4566 + }, + { + "epoch": 0.7179547642909077, + "grad_norm": 0.15717212855815887, + "learning_rate": 3.575794417844949e-05, + "loss": 1.2018, + "step": 4567 + }, + { + "epoch": 0.7181119691878401, + "grad_norm": 0.15343016386032104, + "learning_rate": 3.5752366747799995e-05, + "loss": 1.1506, + "step": 4568 + }, + { + "epoch": 0.7182691740847728, + "grad_norm": 0.16115228831768036, + "learning_rate": 3.574678866043685e-05, + "loss": 1.0947, + "step": 4569 + }, + { + "epoch": 0.7184263789817052, + "grad_norm": 0.1491878181695938, + "learning_rate": 3.574120991670074e-05, + "loss": 1.014, + "step": 4570 + }, + { + "epoch": 0.7185835838786379, + "grad_norm": 0.16374894976615906, + "learning_rate": 3.573563051693238e-05, + "loss": 1.1004, + "step": 4571 + }, + { + "epoch": 0.7187407887755703, + "grad_norm": 0.2055385261774063, + "learning_rate": 3.573005046147258e-05, + "loss": 1.0941, + "step": 4572 + }, + { + "epoch": 0.7188979936725028, + "grad_norm": 0.14261139929294586, + "learning_rate": 3.572446975066211e-05, + "loss": 1.1006, + "step": 4573 + }, + { + "epoch": 0.7190551985694354, + "grad_norm": 0.14790108799934387, + "learning_rate": 3.571888838484183e-05, + "loss": 1.1123, + "step": 4574 + }, + { + "epoch": 0.7192124034663679, + "grad_norm": 0.13406376540660858, + "learning_rate": 3.571330636435263e-05, + "loss": 1.1246, + "step": 4575 + }, + { + "epoch": 0.7193696083633006, + "grad_norm": 0.1513693481683731, + "learning_rate": 3.570772368953545e-05, + "loss": 1.1425, + "step": 4576 + }, + { + "epoch": 0.719526813260233, + "grad_norm": 0.1454620510339737, + "learning_rate": 3.570214036073124e-05, + "loss": 1.2008, + "step": 4577 + }, + { + "epoch": 0.7196840181571657, + "grad_norm": 0.1506110578775406, + "learning_rate": 3.569655637828101e-05, + "loss": 1.31, + "step": 4578 + }, + { + "epoch": 0.7198412230540981, + "grad_norm": 0.12902851402759552, + "learning_rate": 3.569097174252582e-05, + "loss": 1.1247, + "step": 4579 + }, + { + "epoch": 0.7199984279510306, + "grad_norm": 0.14839795231819153, + "learning_rate": 3.568538645380675e-05, + "loss": 1.0916, + "step": 4580 + }, + { + "epoch": 0.7201556328479632, + "grad_norm": 0.14236797392368317, + "learning_rate": 3.567980051246494e-05, + "loss": 1.08, + "step": 4581 + }, + { + "epoch": 0.7203128377448957, + "grad_norm": 0.1483723521232605, + "learning_rate": 3.5674213918841534e-05, + "loss": 1.1429, + "step": 4582 + }, + { + "epoch": 0.7204700426418283, + "grad_norm": 0.15774306654930115, + "learning_rate": 3.566862667327777e-05, + "loss": 1.0761, + "step": 4583 + }, + { + "epoch": 0.7206272475387608, + "grad_norm": 0.1318364143371582, + "learning_rate": 3.566303877611487e-05, + "loss": 1.116, + "step": 4584 + }, + { + "epoch": 0.7207844524356933, + "grad_norm": 0.13808578252792358, + "learning_rate": 3.565745022769413e-05, + "loss": 1.0402, + "step": 4585 + }, + { + "epoch": 0.7209416573326259, + "grad_norm": 0.13911092281341553, + "learning_rate": 3.5651861028356884e-05, + "loss": 1.1032, + "step": 4586 + }, + { + "epoch": 0.7210988622295584, + "grad_norm": 0.14403727650642395, + "learning_rate": 3.56462711784445e-05, + "loss": 1.0561, + "step": 4587 + }, + { + "epoch": 0.721256067126491, + "grad_norm": 0.14054661989212036, + "learning_rate": 3.564068067829837e-05, + "loss": 1.1411, + "step": 4588 + }, + { + "epoch": 0.7214132720234235, + "grad_norm": 0.1546674221754074, + "learning_rate": 3.563508952825995e-05, + "loss": 1.0848, + "step": 4589 + }, + { + "epoch": 0.721570476920356, + "grad_norm": 0.1409955769777298, + "learning_rate": 3.5629497728670725e-05, + "loss": 1.0394, + "step": 4590 + }, + { + "epoch": 0.7217276818172886, + "grad_norm": 0.1401342898607254, + "learning_rate": 3.562390527987222e-05, + "loss": 1.1356, + "step": 4591 + }, + { + "epoch": 0.7218848867142211, + "grad_norm": 0.15182948112487793, + "learning_rate": 3.5618312182206006e-05, + "loss": 1.0383, + "step": 4592 + }, + { + "epoch": 0.7220420916111537, + "grad_norm": 0.14586354792118073, + "learning_rate": 3.561271843601369e-05, + "loss": 0.9826, + "step": 4593 + }, + { + "epoch": 0.7221992965080862, + "grad_norm": 0.14797888696193695, + "learning_rate": 3.56071240416369e-05, + "loss": 1.0666, + "step": 4594 + }, + { + "epoch": 0.7223565014050187, + "grad_norm": 0.13578853011131287, + "learning_rate": 3.560152899941733e-05, + "loss": 1.2295, + "step": 4595 + }, + { + "epoch": 0.7225137063019513, + "grad_norm": 0.12927323579788208, + "learning_rate": 3.559593330969671e-05, + "loss": 1.1353, + "step": 4596 + }, + { + "epoch": 0.7226709111988838, + "grad_norm": 0.14741744101047516, + "learning_rate": 3.559033697281679e-05, + "loss": 1.0358, + "step": 4597 + }, + { + "epoch": 0.7228281160958164, + "grad_norm": 0.14150522649288177, + "learning_rate": 3.5584739989119395e-05, + "loss": 1.1996, + "step": 4598 + }, + { + "epoch": 0.7229853209927489, + "grad_norm": 0.13429418206214905, + "learning_rate": 3.557914235894635e-05, + "loss": 1.0684, + "step": 4599 + }, + { + "epoch": 0.7231425258896814, + "grad_norm": 0.13873820006847382, + "learning_rate": 3.557354408263954e-05, + "loss": 0.9561, + "step": 4600 + }, + { + "epoch": 0.723299730786614, + "grad_norm": 0.147738978266716, + "learning_rate": 3.5567945160540884e-05, + "loss": 1.128, + "step": 4601 + }, + { + "epoch": 0.7234569356835465, + "grad_norm": 0.14353159070014954, + "learning_rate": 3.556234559299235e-05, + "loss": 1.0746, + "step": 4602 + }, + { + "epoch": 0.7236141405804791, + "grad_norm": 0.1310272216796875, + "learning_rate": 3.5556745380335934e-05, + "loss": 1.05, + "step": 4603 + }, + { + "epoch": 0.7237713454774116, + "grad_norm": 0.14752885699272156, + "learning_rate": 3.555114452291367e-05, + "loss": 1.1376, + "step": 4604 + }, + { + "epoch": 0.7239285503743441, + "grad_norm": 0.1673475056886673, + "learning_rate": 3.5545543021067645e-05, + "loss": 1.0393, + "step": 4605 + }, + { + "epoch": 0.7240857552712767, + "grad_norm": 0.15939272940158844, + "learning_rate": 3.553994087513998e-05, + "loss": 1.102, + "step": 4606 + }, + { + "epoch": 0.7242429601682092, + "grad_norm": 0.14392653107643127, + "learning_rate": 3.553433808547283e-05, + "loss": 1.1948, + "step": 4607 + }, + { + "epoch": 0.7244001650651418, + "grad_norm": 0.12796008586883545, + "learning_rate": 3.552873465240838e-05, + "loss": 1.2053, + "step": 4608 + }, + { + "epoch": 0.7245573699620743, + "grad_norm": 0.12037085741758347, + "learning_rate": 3.552313057628888e-05, + "loss": 1.0953, + "step": 4609 + }, + { + "epoch": 0.7247145748590068, + "grad_norm": 0.13031119108200073, + "learning_rate": 3.5517525857456604e-05, + "loss": 1.0254, + "step": 4610 + }, + { + "epoch": 0.7248717797559394, + "grad_norm": 0.13989228010177612, + "learning_rate": 3.551192049625387e-05, + "loss": 1.1947, + "step": 4611 + }, + { + "epoch": 0.7250289846528719, + "grad_norm": 0.17071443796157837, + "learning_rate": 3.550631449302302e-05, + "loss": 1.1944, + "step": 4612 + }, + { + "epoch": 0.7251861895498045, + "grad_norm": 0.13762187957763672, + "learning_rate": 3.550070784810646e-05, + "loss": 1.0757, + "step": 4613 + }, + { + "epoch": 0.725343394446737, + "grad_norm": 0.13957257568836212, + "learning_rate": 3.5495100561846615e-05, + "loss": 1.192, + "step": 4614 + }, + { + "epoch": 0.7255005993436695, + "grad_norm": 0.15514232218265533, + "learning_rate": 3.5489492634585955e-05, + "loss": 1.1077, + "step": 4615 + }, + { + "epoch": 0.7256578042406021, + "grad_norm": 0.1375376582145691, + "learning_rate": 3.5483884066667006e-05, + "loss": 1.1309, + "step": 4616 + }, + { + "epoch": 0.7258150091375346, + "grad_norm": 0.1555032879114151, + "learning_rate": 3.54782748584323e-05, + "loss": 1.1224, + "step": 4617 + }, + { + "epoch": 0.7259722140344672, + "grad_norm": 0.14163243770599365, + "learning_rate": 3.5472665010224434e-05, + "loss": 1.0983, + "step": 4618 + }, + { + "epoch": 0.7261294189313997, + "grad_norm": 0.13645827770233154, + "learning_rate": 3.546705452238603e-05, + "loss": 1.1141, + "step": 4619 + }, + { + "epoch": 0.7262866238283322, + "grad_norm": 0.154319167137146, + "learning_rate": 3.546144339525976e-05, + "loss": 1.1441, + "step": 4620 + }, + { + "epoch": 0.7264438287252648, + "grad_norm": 0.13194586336612701, + "learning_rate": 3.5455831629188343e-05, + "loss": 1.1506, + "step": 4621 + }, + { + "epoch": 0.7266010336221973, + "grad_norm": 0.14997638761997223, + "learning_rate": 3.5450219224514506e-05, + "loss": 1.1211, + "step": 4622 + }, + { + "epoch": 0.7267582385191299, + "grad_norm": 0.19300997257232666, + "learning_rate": 3.5444606181581034e-05, + "loss": 0.9954, + "step": 4623 + }, + { + "epoch": 0.7269154434160624, + "grad_norm": 0.12999020516872406, + "learning_rate": 3.543899250073075e-05, + "loss": 0.948, + "step": 4624 + }, + { + "epoch": 0.7270726483129949, + "grad_norm": 0.13173635303974152, + "learning_rate": 3.5433378182306534e-05, + "loss": 1.035, + "step": 4625 + }, + { + "epoch": 0.7272298532099275, + "grad_norm": 0.1882297694683075, + "learning_rate": 3.542776322665128e-05, + "loss": 1.0028, + "step": 4626 + }, + { + "epoch": 0.72738705810686, + "grad_norm": 0.1392630636692047, + "learning_rate": 3.54221476341079e-05, + "loss": 1.1616, + "step": 4627 + }, + { + "epoch": 0.7275442630037926, + "grad_norm": 0.1386382132768631, + "learning_rate": 3.5416531405019416e-05, + "loss": 0.9877, + "step": 4628 + }, + { + "epoch": 0.7277014679007251, + "grad_norm": 0.12188861519098282, + "learning_rate": 3.5410914539728827e-05, + "loss": 1.121, + "step": 4629 + }, + { + "epoch": 0.7278586727976577, + "grad_norm": 0.145427867770195, + "learning_rate": 3.540529703857918e-05, + "loss": 1.052, + "step": 4630 + }, + { + "epoch": 0.7280158776945902, + "grad_norm": 0.16741080582141876, + "learning_rate": 3.539967890191358e-05, + "loss": 1.1455, + "step": 4631 + }, + { + "epoch": 0.7281730825915227, + "grad_norm": 0.15054307878017426, + "learning_rate": 3.539406013007516e-05, + "loss": 1.133, + "step": 4632 + }, + { + "epoch": 0.7283302874884553, + "grad_norm": 0.15563537180423737, + "learning_rate": 3.5388440723407104e-05, + "loss": 1.0843, + "step": 4633 + }, + { + "epoch": 0.7284874923853878, + "grad_norm": 0.12449619174003601, + "learning_rate": 3.5382820682252605e-05, + "loss": 1.0446, + "step": 4634 + }, + { + "epoch": 0.7286446972823204, + "grad_norm": 0.1355639100074768, + "learning_rate": 3.5377200006954924e-05, + "loss": 1.1817, + "step": 4635 + }, + { + "epoch": 0.7288019021792529, + "grad_norm": 0.13719028234481812, + "learning_rate": 3.537157869785735e-05, + "loss": 1.1178, + "step": 4636 + }, + { + "epoch": 0.7289591070761854, + "grad_norm": 0.13218499720096588, + "learning_rate": 3.5365956755303216e-05, + "loss": 1.1066, + "step": 4637 + }, + { + "epoch": 0.729116311973118, + "grad_norm": 0.131890669465065, + "learning_rate": 3.536033417963587e-05, + "loss": 1.214, + "step": 4638 + }, + { + "epoch": 0.7292735168700505, + "grad_norm": 0.12651421129703522, + "learning_rate": 3.5354710971198744e-05, + "loss": 1.204, + "step": 4639 + }, + { + "epoch": 0.7294307217669831, + "grad_norm": 0.16133572161197662, + "learning_rate": 3.5349087130335265e-05, + "loss": 1.105, + "step": 4640 + }, + { + "epoch": 0.7294307217669831, + "eval_loss": 1.0980340242385864, + "eval_runtime": 2311.687, + "eval_samples_per_second": 4.005, + "eval_steps_per_second": 2.002, + "step": 4640 + }, + { + "epoch": 0.7295879266639156, + "grad_norm": 0.13650627434253693, + "learning_rate": 3.534346265738891e-05, + "loss": 1.0481, + "step": 4641 + }, + { + "epoch": 0.7297451315608481, + "grad_norm": 0.13792318105697632, + "learning_rate": 3.533783755270322e-05, + "loss": 1.1098, + "step": 4642 + }, + { + "epoch": 0.7299023364577807, + "grad_norm": 0.16713480651378632, + "learning_rate": 3.5332211816621744e-05, + "loss": 1.0176, + "step": 4643 + }, + { + "epoch": 0.7300595413547132, + "grad_norm": 0.15072934329509735, + "learning_rate": 3.532658544948809e-05, + "loss": 0.9831, + "step": 4644 + }, + { + "epoch": 0.7302167462516458, + "grad_norm": 0.17281381785869598, + "learning_rate": 3.532095845164588e-05, + "loss": 1.1876, + "step": 4645 + }, + { + "epoch": 0.7303739511485783, + "grad_norm": 0.1437559276819229, + "learning_rate": 3.531533082343878e-05, + "loss": 1.0361, + "step": 4646 + }, + { + "epoch": 0.7305311560455108, + "grad_norm": 0.13455119729042053, + "learning_rate": 3.530970256521055e-05, + "loss": 1.0722, + "step": 4647 + }, + { + "epoch": 0.7306883609424434, + "grad_norm": 0.14255169034004211, + "learning_rate": 3.530407367730489e-05, + "loss": 0.9882, + "step": 4648 + }, + { + "epoch": 0.7308455658393759, + "grad_norm": 0.14039060473442078, + "learning_rate": 3.5298444160065626e-05, + "loss": 1.0624, + "step": 4649 + }, + { + "epoch": 0.7310027707363085, + "grad_norm": 0.14412268996238708, + "learning_rate": 3.5292814013836575e-05, + "loss": 1.1823, + "step": 4650 + }, + { + "epoch": 0.731159975633241, + "grad_norm": 0.15416951477527618, + "learning_rate": 3.5287183238961605e-05, + "loss": 1.1518, + "step": 4651 + }, + { + "epoch": 0.7313171805301735, + "grad_norm": 0.14736409485340118, + "learning_rate": 3.528155183578462e-05, + "loss": 1.1206, + "step": 4652 + }, + { + "epoch": 0.7314743854271061, + "grad_norm": 0.13994631171226501, + "learning_rate": 3.5275919804649564e-05, + "loss": 1.1733, + "step": 4653 + }, + { + "epoch": 0.7316315903240386, + "grad_norm": 0.1334182471036911, + "learning_rate": 3.527028714590043e-05, + "loss": 1.0611, + "step": 4654 + }, + { + "epoch": 0.7317887952209712, + "grad_norm": 0.1282099187374115, + "learning_rate": 3.526465385988123e-05, + "loss": 1.049, + "step": 4655 + }, + { + "epoch": 0.7319460001179037, + "grad_norm": 0.16863442957401276, + "learning_rate": 3.525901994693603e-05, + "loss": 1.0266, + "step": 4656 + }, + { + "epoch": 0.7321032050148362, + "grad_norm": 0.15601807832717896, + "learning_rate": 3.5253385407408925e-05, + "loss": 1.1386, + "step": 4657 + }, + { + "epoch": 0.7322604099117688, + "grad_norm": 0.2561866044998169, + "learning_rate": 3.524775024164405e-05, + "loss": 1.0715, + "step": 4658 + }, + { + "epoch": 0.7324176148087013, + "grad_norm": 0.19980670511722565, + "learning_rate": 3.524211444998557e-05, + "loss": 1.0759, + "step": 4659 + }, + { + "epoch": 0.7325748197056339, + "grad_norm": 0.15558764338493347, + "learning_rate": 3.523647803277772e-05, + "loss": 1.0321, + "step": 4660 + }, + { + "epoch": 0.7327320246025664, + "grad_norm": 0.13551028072834015, + "learning_rate": 3.5230840990364736e-05, + "loss": 1.124, + "step": 4661 + }, + { + "epoch": 0.7328892294994989, + "grad_norm": 0.1487666517496109, + "learning_rate": 3.522520332309091e-05, + "loss": 1.0066, + "step": 4662 + }, + { + "epoch": 0.7330464343964315, + "grad_norm": 0.13485178351402283, + "learning_rate": 3.521956503130057e-05, + "loss": 1.0981, + "step": 4663 + }, + { + "epoch": 0.733203639293364, + "grad_norm": 0.1500270813703537, + "learning_rate": 3.521392611533808e-05, + "loss": 1.1002, + "step": 4664 + }, + { + "epoch": 0.7333608441902966, + "grad_norm": 0.14526516199111938, + "learning_rate": 3.520828657554785e-05, + "loss": 1.1052, + "step": 4665 + }, + { + "epoch": 0.7335180490872291, + "grad_norm": 0.14108894765377045, + "learning_rate": 3.520264641227431e-05, + "loss": 1.0931, + "step": 4666 + }, + { + "epoch": 0.7336752539841616, + "grad_norm": 0.17181095480918884, + "learning_rate": 3.519700562586195e-05, + "loss": 1.0911, + "step": 4667 + }, + { + "epoch": 0.7338324588810942, + "grad_norm": 0.1537475883960724, + "learning_rate": 3.519136421665528e-05, + "loss": 1.0408, + "step": 4668 + }, + { + "epoch": 0.7339896637780267, + "grad_norm": 0.1326746642589569, + "learning_rate": 3.518572218499885e-05, + "loss": 1.0928, + "step": 4669 + }, + { + "epoch": 0.7341468686749593, + "grad_norm": 0.14614926278591156, + "learning_rate": 3.5180079531237273e-05, + "loss": 1.1291, + "step": 4670 + }, + { + "epoch": 0.7343040735718918, + "grad_norm": 0.14450791478157043, + "learning_rate": 3.517443625571517e-05, + "loss": 1.0901, + "step": 4671 + }, + { + "epoch": 0.7344612784688243, + "grad_norm": 0.12363743782043457, + "learning_rate": 3.5168792358777216e-05, + "loss": 1.0032, + "step": 4672 + }, + { + "epoch": 0.7346184833657569, + "grad_norm": 0.14431117475032806, + "learning_rate": 3.5163147840768106e-05, + "loss": 1.0301, + "step": 4673 + }, + { + "epoch": 0.7347756882626894, + "grad_norm": 0.13479335606098175, + "learning_rate": 3.5157502702032605e-05, + "loss": 1.1135, + "step": 4674 + }, + { + "epoch": 0.734932893159622, + "grad_norm": 0.1340409368276596, + "learning_rate": 3.5151856942915475e-05, + "loss": 1.3005, + "step": 4675 + }, + { + "epoch": 0.7350900980565545, + "grad_norm": 0.12972420454025269, + "learning_rate": 3.514621056376155e-05, + "loss": 0.9629, + "step": 4676 + }, + { + "epoch": 0.735247302953487, + "grad_norm": 0.14823409914970398, + "learning_rate": 3.5140563564915694e-05, + "loss": 1.1048, + "step": 4677 + }, + { + "epoch": 0.7354045078504196, + "grad_norm": 0.12902098894119263, + "learning_rate": 3.513491594672279e-05, + "loss": 1.0985, + "step": 4678 + }, + { + "epoch": 0.735561712747352, + "grad_norm": 0.14853690564632416, + "learning_rate": 3.512926770952779e-05, + "loss": 1.1909, + "step": 4679 + }, + { + "epoch": 0.7357189176442847, + "grad_norm": 0.14038996398448944, + "learning_rate": 3.512361885367565e-05, + "loss": 1.0486, + "step": 4680 + }, + { + "epoch": 0.7358761225412171, + "grad_norm": 0.13920599222183228, + "learning_rate": 3.511796937951139e-05, + "loss": 1.1348, + "step": 4681 + }, + { + "epoch": 0.7360333274381498, + "grad_norm": 0.14314356446266174, + "learning_rate": 3.5112319287380055e-05, + "loss": 1.0857, + "step": 4682 + }, + { + "epoch": 0.7361905323350822, + "grad_norm": 0.12702102959156036, + "learning_rate": 3.510666857762673e-05, + "loss": 1.2012, + "step": 4683 + }, + { + "epoch": 0.7363477372320147, + "grad_norm": 0.16372568905353546, + "learning_rate": 3.5101017250596556e-05, + "loss": 1.0962, + "step": 4684 + }, + { + "epoch": 0.7365049421289473, + "grad_norm": 0.13005401194095612, + "learning_rate": 3.509536530663467e-05, + "loss": 1.0554, + "step": 4685 + }, + { + "epoch": 0.7366621470258798, + "grad_norm": 0.16866400837898254, + "learning_rate": 3.508971274608628e-05, + "loss": 1.1306, + "step": 4686 + }, + { + "epoch": 0.7368193519228124, + "grad_norm": 0.1414596289396286, + "learning_rate": 3.5084059569296624e-05, + "loss": 1.114, + "step": 4687 + }, + { + "epoch": 0.7369765568197449, + "grad_norm": 0.1379203200340271, + "learning_rate": 3.507840577661099e-05, + "loss": 1.094, + "step": 4688 + }, + { + "epoch": 0.7371337617166774, + "grad_norm": 0.15202675759792328, + "learning_rate": 3.5072751368374656e-05, + "loss": 1.0731, + "step": 4689 + }, + { + "epoch": 0.73729096661361, + "grad_norm": 0.19967341423034668, + "learning_rate": 3.5067096344933e-05, + "loss": 1.1763, + "step": 4690 + }, + { + "epoch": 0.7374481715105425, + "grad_norm": 0.16265927255153656, + "learning_rate": 3.5061440706631414e-05, + "loss": 1.0624, + "step": 4691 + }, + { + "epoch": 0.7376053764074751, + "grad_norm": 0.13775219023227692, + "learning_rate": 3.5055784453815296e-05, + "loss": 1.1432, + "step": 4692 + }, + { + "epoch": 0.7377625813044076, + "grad_norm": 0.142997145652771, + "learning_rate": 3.505012758683013e-05, + "loss": 1.1118, + "step": 4693 + }, + { + "epoch": 0.7379197862013401, + "grad_norm": 0.13343292474746704, + "learning_rate": 3.504447010602141e-05, + "loss": 1.2013, + "step": 4694 + }, + { + "epoch": 0.7380769910982727, + "grad_norm": 0.14398351311683655, + "learning_rate": 3.503881201173467e-05, + "loss": 1.0959, + "step": 4695 + }, + { + "epoch": 0.7382341959952052, + "grad_norm": 0.1524992734193802, + "learning_rate": 3.50331533043155e-05, + "loss": 1.1024, + "step": 4696 + }, + { + "epoch": 0.7383914008921378, + "grad_norm": 0.12862329185009003, + "learning_rate": 3.502749398410948e-05, + "loss": 1.2062, + "step": 4697 + }, + { + "epoch": 0.7385486057890703, + "grad_norm": 0.14112304151058197, + "learning_rate": 3.502183405146229e-05, + "loss": 1.1228, + "step": 4698 + }, + { + "epoch": 0.7387058106860028, + "grad_norm": 0.13773949444293976, + "learning_rate": 3.501617350671961e-05, + "loss": 1.0286, + "step": 4699 + }, + { + "epoch": 0.7388630155829354, + "grad_norm": 0.1456059068441391, + "learning_rate": 3.501051235022716e-05, + "loss": 1.1286, + "step": 4700 + }, + { + "epoch": 0.7390202204798679, + "grad_norm": 0.13958464562892914, + "learning_rate": 3.50048505823307e-05, + "loss": 1.1567, + "step": 4701 + }, + { + "epoch": 0.7391774253768005, + "grad_norm": 0.13271421194076538, + "learning_rate": 3.499918820337602e-05, + "loss": 1.1753, + "step": 4702 + }, + { + "epoch": 0.739334630273733, + "grad_norm": 0.152462437748909, + "learning_rate": 3.4993525213708986e-05, + "loss": 1.0905, + "step": 4703 + }, + { + "epoch": 0.7394918351706655, + "grad_norm": 0.13484671711921692, + "learning_rate": 3.498786161367544e-05, + "loss": 1.0469, + "step": 4704 + }, + { + "epoch": 0.7396490400675981, + "grad_norm": 0.12940473854541779, + "learning_rate": 3.498219740362133e-05, + "loss": 1.1719, + "step": 4705 + }, + { + "epoch": 0.7398062449645306, + "grad_norm": 0.16455109417438507, + "learning_rate": 3.497653258389256e-05, + "loss": 1.173, + "step": 4706 + }, + { + "epoch": 0.7399634498614632, + "grad_norm": 0.12805430591106415, + "learning_rate": 3.4970867154835153e-05, + "loss": 1.1369, + "step": 4707 + }, + { + "epoch": 0.7401206547583957, + "grad_norm": 0.16718123853206635, + "learning_rate": 3.496520111679511e-05, + "loss": 1.0984, + "step": 4708 + }, + { + "epoch": 0.7402778596553282, + "grad_norm": 0.1421629935503006, + "learning_rate": 3.49595344701185e-05, + "loss": 1.0621, + "step": 4709 + }, + { + "epoch": 0.7404350645522608, + "grad_norm": 0.12086508423089981, + "learning_rate": 3.495386721515142e-05, + "loss": 1.0634, + "step": 4710 + }, + { + "epoch": 0.7405922694491933, + "grad_norm": 0.15074370801448822, + "learning_rate": 3.494819935224e-05, + "loss": 1.1158, + "step": 4711 + }, + { + "epoch": 0.7407494743461259, + "grad_norm": 0.1683443784713745, + "learning_rate": 3.4942530881730414e-05, + "loss": 0.9991, + "step": 4712 + }, + { + "epoch": 0.7409066792430584, + "grad_norm": 0.12071124464273453, + "learning_rate": 3.4936861803968865e-05, + "loss": 1.0965, + "step": 4713 + }, + { + "epoch": 0.7410638841399909, + "grad_norm": 0.15003018081188202, + "learning_rate": 3.493119211930162e-05, + "loss": 1.0252, + "step": 4714 + }, + { + "epoch": 0.7412210890369235, + "grad_norm": 0.144414022564888, + "learning_rate": 3.4925521828074934e-05, + "loss": 1.2352, + "step": 4715 + }, + { + "epoch": 0.741378293933856, + "grad_norm": 0.13215228915214539, + "learning_rate": 3.491985093063514e-05, + "loss": 1.0834, + "step": 4716 + }, + { + "epoch": 0.7415354988307886, + "grad_norm": 0.14399638772010803, + "learning_rate": 3.491417942732859e-05, + "loss": 1.0894, + "step": 4717 + }, + { + "epoch": 0.7416927037277211, + "grad_norm": 0.12594108283519745, + "learning_rate": 3.490850731850169e-05, + "loss": 1.1273, + "step": 4718 + }, + { + "epoch": 0.7418499086246536, + "grad_norm": 0.14319142699241638, + "learning_rate": 3.490283460450086e-05, + "loss": 1.133, + "step": 4719 + }, + { + "epoch": 0.7420071135215862, + "grad_norm": 0.1428254246711731, + "learning_rate": 3.489716128567257e-05, + "loss": 1.0948, + "step": 4720 + }, + { + "epoch": 0.7421643184185187, + "grad_norm": 0.1410217434167862, + "learning_rate": 3.489148736236333e-05, + "loss": 1.1161, + "step": 4721 + }, + { + "epoch": 0.7423215233154513, + "grad_norm": 0.13551457226276398, + "learning_rate": 3.488581283491966e-05, + "loss": 1.16, + "step": 4722 + }, + { + "epoch": 0.7424787282123838, + "grad_norm": 0.15328136086463928, + "learning_rate": 3.488013770368817e-05, + "loss": 1.1669, + "step": 4723 + }, + { + "epoch": 0.7426359331093163, + "grad_norm": 0.13792365789413452, + "learning_rate": 3.487446196901546e-05, + "loss": 1.1167, + "step": 4724 + }, + { + "epoch": 0.7427931380062489, + "grad_norm": 0.148483008146286, + "learning_rate": 3.486878563124817e-05, + "loss": 1.1051, + "step": 4725 + }, + { + "epoch": 0.7429503429031814, + "grad_norm": 0.12647677958011627, + "learning_rate": 3.4863108690733024e-05, + "loss": 1.0418, + "step": 4726 + }, + { + "epoch": 0.743107547800114, + "grad_norm": 0.13564087450504303, + "learning_rate": 3.4857431147816696e-05, + "loss": 1.0032, + "step": 4727 + }, + { + "epoch": 0.7432647526970465, + "grad_norm": 0.1215532198548317, + "learning_rate": 3.4851753002846006e-05, + "loss": 1.1842, + "step": 4728 + }, + { + "epoch": 0.743421957593979, + "grad_norm": 0.14689278602600098, + "learning_rate": 3.4846074256167714e-05, + "loss": 1.0337, + "step": 4729 + }, + { + "epoch": 0.7435791624909116, + "grad_norm": 0.1328740268945694, + "learning_rate": 3.484039490812867e-05, + "loss": 0.9731, + "step": 4730 + }, + { + "epoch": 0.7437363673878441, + "grad_norm": 0.12815256416797638, + "learning_rate": 3.483471495907575e-05, + "loss": 1.1345, + "step": 4731 + }, + { + "epoch": 0.7438935722847767, + "grad_norm": 0.11724529415369034, + "learning_rate": 3.4829034409355846e-05, + "loss": 1.0029, + "step": 4732 + }, + { + "epoch": 0.7440507771817092, + "grad_norm": 0.16131702065467834, + "learning_rate": 3.4823353259315926e-05, + "loss": 1.1775, + "step": 4733 + }, + { + "epoch": 0.7442079820786417, + "grad_norm": 0.12095880508422852, + "learning_rate": 3.4817671509302965e-05, + "loss": 0.9847, + "step": 4734 + }, + { + "epoch": 0.7443651869755743, + "grad_norm": 0.12888216972351074, + "learning_rate": 3.4811989159663984e-05, + "loss": 1.1294, + "step": 4735 + }, + { + "epoch": 0.7445223918725068, + "grad_norm": 0.13063958287239075, + "learning_rate": 3.4806306210746035e-05, + "loss": 1.0426, + "step": 4736 + }, + { + "epoch": 0.7446795967694394, + "grad_norm": 0.13246606290340424, + "learning_rate": 3.480062266289621e-05, + "loss": 1.1522, + "step": 4737 + }, + { + "epoch": 0.7448368016663719, + "grad_norm": 0.1366496980190277, + "learning_rate": 3.479493851646164e-05, + "loss": 1.1525, + "step": 4738 + }, + { + "epoch": 0.7449940065633045, + "grad_norm": 0.16373786330223083, + "learning_rate": 3.47892537717895e-05, + "loss": 1.0834, + "step": 4739 + }, + { + "epoch": 0.745151211460237, + "grad_norm": 0.17399144172668457, + "learning_rate": 3.478356842922699e-05, + "loss": 1.1678, + "step": 4740 + }, + { + "epoch": 0.7453084163571695, + "grad_norm": 0.1297912895679474, + "learning_rate": 3.4777882489121325e-05, + "loss": 0.8803, + "step": 4741 + }, + { + "epoch": 0.7454656212541021, + "grad_norm": 0.1584755927324295, + "learning_rate": 3.477219595181981e-05, + "loss": 0.9579, + "step": 4742 + }, + { + "epoch": 0.7456228261510346, + "grad_norm": 0.16625195741653442, + "learning_rate": 3.476650881766975e-05, + "loss": 1.102, + "step": 4743 + }, + { + "epoch": 0.7457800310479672, + "grad_norm": 0.14477670192718506, + "learning_rate": 3.476082108701849e-05, + "loss": 1.1615, + "step": 4744 + }, + { + "epoch": 0.7459372359448997, + "grad_norm": 0.16770561039447784, + "learning_rate": 3.4755132760213415e-05, + "loss": 1.029, + "step": 4745 + }, + { + "epoch": 0.7460944408418322, + "grad_norm": 0.12708137929439545, + "learning_rate": 3.474944383760195e-05, + "loss": 1.089, + "step": 4746 + }, + { + "epoch": 0.7462516457387648, + "grad_norm": 0.1296709030866623, + "learning_rate": 3.474375431953154e-05, + "loss": 1.1327, + "step": 4747 + }, + { + "epoch": 0.7464088506356973, + "grad_norm": 0.126552015542984, + "learning_rate": 3.4738064206349694e-05, + "loss": 1.046, + "step": 4748 + }, + { + "epoch": 0.7465660555326299, + "grad_norm": 0.13531428575515747, + "learning_rate": 3.473237349840394e-05, + "loss": 1.1643, + "step": 4749 + }, + { + "epoch": 0.7467232604295624, + "grad_norm": 0.13297897577285767, + "learning_rate": 3.4726682196041844e-05, + "loss": 1.0452, + "step": 4750 + }, + { + "epoch": 0.7468804653264949, + "grad_norm": 0.12808465957641602, + "learning_rate": 3.4720990299611e-05, + "loss": 1.0513, + "step": 4751 + }, + { + "epoch": 0.7470376702234275, + "grad_norm": 0.1238451898097992, + "learning_rate": 3.4715297809459055e-05, + "loss": 1.1153, + "step": 4752 + }, + { + "epoch": 0.74719487512036, + "grad_norm": 0.15730857849121094, + "learning_rate": 3.4709604725933685e-05, + "loss": 1.0497, + "step": 4753 + }, + { + "epoch": 0.7473520800172926, + "grad_norm": 0.18751071393489838, + "learning_rate": 3.470391104938261e-05, + "loss": 1.0898, + "step": 4754 + }, + { + "epoch": 0.7475092849142251, + "grad_norm": 0.12530651688575745, + "learning_rate": 3.469821678015356e-05, + "loss": 1.0435, + "step": 4755 + }, + { + "epoch": 0.7476664898111576, + "grad_norm": 0.17319822311401367, + "learning_rate": 3.469252191859432e-05, + "loss": 1.172, + "step": 4756 + }, + { + "epoch": 0.7478236947080902, + "grad_norm": 0.12704682350158691, + "learning_rate": 3.468682646505273e-05, + "loss": 1.064, + "step": 4757 + }, + { + "epoch": 0.7479808996050227, + "grad_norm": 0.17880849540233612, + "learning_rate": 3.468113041987664e-05, + "loss": 1.1394, + "step": 4758 + }, + { + "epoch": 0.7481381045019553, + "grad_norm": 0.14523322880268097, + "learning_rate": 3.4675433783413936e-05, + "loss": 1.1393, + "step": 4759 + }, + { + "epoch": 0.7482953093988878, + "grad_norm": 0.1540546715259552, + "learning_rate": 3.466973655601254e-05, + "loss": 1.1145, + "step": 4760 + }, + { + "epoch": 0.7484525142958203, + "grad_norm": 0.1491522341966629, + "learning_rate": 3.466403873802043e-05, + "loss": 1.0209, + "step": 4761 + }, + { + "epoch": 0.7486097191927529, + "grad_norm": 0.135720893740654, + "learning_rate": 3.4658340329785606e-05, + "loss": 1.1023, + "step": 4762 + }, + { + "epoch": 0.7487669240896854, + "grad_norm": 0.1523171365261078, + "learning_rate": 3.46526413316561e-05, + "loss": 1.2391, + "step": 4763 + }, + { + "epoch": 0.748924128986618, + "grad_norm": 0.1411243975162506, + "learning_rate": 3.464694174397999e-05, + "loss": 1.1447, + "step": 4764 + }, + { + "epoch": 0.7490813338835505, + "grad_norm": 0.1597915142774582, + "learning_rate": 3.464124156710538e-05, + "loss": 1.164, + "step": 4765 + }, + { + "epoch": 0.749238538780483, + "grad_norm": 0.13481174409389496, + "learning_rate": 3.463554080138042e-05, + "loss": 1.0763, + "step": 4766 + }, + { + "epoch": 0.7493957436774156, + "grad_norm": 0.13792438805103302, + "learning_rate": 3.4629839447153286e-05, + "loss": 1.0839, + "step": 4767 + }, + { + "epoch": 0.7495529485743481, + "grad_norm": 0.1663919985294342, + "learning_rate": 3.46241375047722e-05, + "loss": 1.1405, + "step": 4768 + }, + { + "epoch": 0.7497101534712807, + "grad_norm": 0.13219061493873596, + "learning_rate": 3.461843497458541e-05, + "loss": 1.0549, + "step": 4769 + }, + { + "epoch": 0.7498673583682132, + "grad_norm": 0.1282496452331543, + "learning_rate": 3.461273185694121e-05, + "loss": 1.0713, + "step": 4770 + }, + { + "epoch": 0.7500245632651457, + "grad_norm": 0.12896636128425598, + "learning_rate": 3.460702815218792e-05, + "loss": 1.0364, + "step": 4771 + }, + { + "epoch": 0.7501817681620783, + "grad_norm": 0.13691388070583344, + "learning_rate": 3.4601323860673904e-05, + "loss": 1.038, + "step": 4772 + }, + { + "epoch": 0.7503389730590108, + "grad_norm": 0.12374941259622574, + "learning_rate": 3.459561898274756e-05, + "loss": 1.0836, + "step": 4773 + }, + { + "epoch": 0.7504961779559434, + "grad_norm": 0.14701367914676666, + "learning_rate": 3.4589913518757313e-05, + "loss": 1.2139, + "step": 4774 + }, + { + "epoch": 0.7506533828528759, + "grad_norm": 0.14951729774475098, + "learning_rate": 3.458420746905164e-05, + "loss": 1.0835, + "step": 4775 + }, + { + "epoch": 0.7508105877498084, + "grad_norm": 0.1360500305891037, + "learning_rate": 3.457850083397903e-05, + "loss": 1.1761, + "step": 4776 + }, + { + "epoch": 0.750967792646741, + "grad_norm": 0.14075034856796265, + "learning_rate": 3.4572793613888046e-05, + "loss": 1.1391, + "step": 4777 + }, + { + "epoch": 0.7511249975436735, + "grad_norm": 0.1566086709499359, + "learning_rate": 3.456708580912725e-05, + "loss": 1.0243, + "step": 4778 + }, + { + "epoch": 0.7512822024406061, + "grad_norm": 0.12710240483283997, + "learning_rate": 3.4561377420045246e-05, + "loss": 0.9759, + "step": 4779 + }, + { + "epoch": 0.7514394073375386, + "grad_norm": 0.17147888243198395, + "learning_rate": 3.455566844699069e-05, + "loss": 0.9989, + "step": 4780 + }, + { + "epoch": 0.751596612234471, + "grad_norm": 0.1665210723876953, + "learning_rate": 3.4549958890312256e-05, + "loss": 1.1576, + "step": 4781 + }, + { + "epoch": 0.7517538171314037, + "grad_norm": 0.17183759808540344, + "learning_rate": 3.4544248750358677e-05, + "loss": 1.1207, + "step": 4782 + }, + { + "epoch": 0.7519110220283362, + "grad_norm": 0.13655908405780792, + "learning_rate": 3.4538538027478696e-05, + "loss": 1.0299, + "step": 4783 + }, + { + "epoch": 0.7520682269252688, + "grad_norm": 0.13791604340076447, + "learning_rate": 3.45328267220211e-05, + "loss": 1.1945, + "step": 4784 + }, + { + "epoch": 0.7522254318222013, + "grad_norm": 0.1784014105796814, + "learning_rate": 3.4527114834334726e-05, + "loss": 1.0836, + "step": 4785 + }, + { + "epoch": 0.7523826367191337, + "grad_norm": 0.1311030387878418, + "learning_rate": 3.452140236476842e-05, + "loss": 1.1196, + "step": 4786 + }, + { + "epoch": 0.7525398416160664, + "grad_norm": 0.14000438153743744, + "learning_rate": 3.451568931367108e-05, + "loss": 1.1135, + "step": 4787 + }, + { + "epoch": 0.7526970465129988, + "grad_norm": 0.11728193610906601, + "learning_rate": 3.450997568139165e-05, + "loss": 1.0402, + "step": 4788 + }, + { + "epoch": 0.7528542514099315, + "grad_norm": 0.15927207469940186, + "learning_rate": 3.450426146827909e-05, + "loss": 1.0004, + "step": 4789 + }, + { + "epoch": 0.753011456306864, + "grad_norm": 0.12737083435058594, + "learning_rate": 3.44985466746824e-05, + "loss": 1.0429, + "step": 4790 + }, + { + "epoch": 0.7531686612037966, + "grad_norm": 0.13107572495937347, + "learning_rate": 3.449283130095061e-05, + "loss": 1.1063, + "step": 4791 + }, + { + "epoch": 0.753325866100729, + "grad_norm": 0.14925448596477509, + "learning_rate": 3.448711534743281e-05, + "loss": 1.1381, + "step": 4792 + }, + { + "epoch": 0.7534830709976615, + "grad_norm": 0.1519125998020172, + "learning_rate": 3.4481398814478096e-05, + "loss": 0.9555, + "step": 4793 + }, + { + "epoch": 0.7536402758945941, + "grad_norm": 0.13391917943954468, + "learning_rate": 3.447568170243562e-05, + "loss": 1.1197, + "step": 4794 + }, + { + "epoch": 0.7537974807915266, + "grad_norm": 0.13998793065547943, + "learning_rate": 3.446996401165455e-05, + "loss": 1.1089, + "step": 4795 + }, + { + "epoch": 0.7539546856884592, + "grad_norm": 0.12591344118118286, + "learning_rate": 3.446424574248412e-05, + "loss": 1.0845, + "step": 4796 + }, + { + "epoch": 0.7541118905853917, + "grad_norm": 0.14241157472133636, + "learning_rate": 3.445852689527356e-05, + "loss": 1.1163, + "step": 4797 + }, + { + "epoch": 0.7542690954823242, + "grad_norm": 0.14727558195590973, + "learning_rate": 3.445280747037217e-05, + "loss": 1.1268, + "step": 4798 + }, + { + "epoch": 0.7544263003792568, + "grad_norm": 0.13568533957004547, + "learning_rate": 3.444708746812927e-05, + "loss": 1.1031, + "step": 4799 + }, + { + "epoch": 0.7545835052761893, + "grad_norm": 0.16703443229198456, + "learning_rate": 3.44413668888942e-05, + "loss": 1.0572, + "step": 4800 + }, + { + "epoch": 0.7545835052761893, + "eval_loss": 1.096881628036499, + "eval_runtime": 2329.3288, + "eval_samples_per_second": 3.975, + "eval_steps_per_second": 1.987, + "step": 4800 + }, + { + "epoch": 0.7547407101731219, + "grad_norm": 0.137963205575943, + "learning_rate": 3.443564573301637e-05, + "loss": 1.2142, + "step": 4801 + }, + { + "epoch": 0.7548979150700544, + "grad_norm": 0.14127731323242188, + "learning_rate": 3.4429924000845196e-05, + "loss": 1.1542, + "step": 4802 + }, + { + "epoch": 0.7550551199669869, + "grad_norm": 0.1367490291595459, + "learning_rate": 3.442420169273015e-05, + "loss": 1.0598, + "step": 4803 + }, + { + "epoch": 0.7552123248639195, + "grad_norm": 0.155628502368927, + "learning_rate": 3.441847880902071e-05, + "loss": 1.0234, + "step": 4804 + }, + { + "epoch": 0.755369529760852, + "grad_norm": 0.1460132598876953, + "learning_rate": 3.4412755350066425e-05, + "loss": 1.0925, + "step": 4805 + }, + { + "epoch": 0.7555267346577846, + "grad_norm": 0.15177485346794128, + "learning_rate": 3.4407031316216856e-05, + "loss": 1.1072, + "step": 4806 + }, + { + "epoch": 0.7556839395547171, + "grad_norm": 0.1640729159116745, + "learning_rate": 3.440130670782161e-05, + "loss": 1.1392, + "step": 4807 + }, + { + "epoch": 0.7558411444516496, + "grad_norm": 0.13145369291305542, + "learning_rate": 3.439558152523031e-05, + "loss": 1.1667, + "step": 4808 + }, + { + "epoch": 0.7559983493485822, + "grad_norm": 0.12419599294662476, + "learning_rate": 3.438985576879265e-05, + "loss": 0.9409, + "step": 4809 + }, + { + "epoch": 0.7561555542455147, + "grad_norm": 0.14216910302639008, + "learning_rate": 3.4384129438858315e-05, + "loss": 1.1439, + "step": 4810 + }, + { + "epoch": 0.7563127591424473, + "grad_norm": 0.14362764358520508, + "learning_rate": 3.4378402535777065e-05, + "loss": 1.0857, + "step": 4811 + }, + { + "epoch": 0.7564699640393798, + "grad_norm": 0.12433180958032608, + "learning_rate": 3.4372675059898676e-05, + "loss": 1.171, + "step": 4812 + }, + { + "epoch": 0.7566271689363123, + "grad_norm": 0.14561037719249725, + "learning_rate": 3.436694701157294e-05, + "loss": 1.035, + "step": 4813 + }, + { + "epoch": 0.7567843738332449, + "grad_norm": 0.13551869988441467, + "learning_rate": 3.436121839114973e-05, + "loss": 1.1413, + "step": 4814 + }, + { + "epoch": 0.7569415787301774, + "grad_norm": 0.14296530187129974, + "learning_rate": 3.4355489198978905e-05, + "loss": 1.1458, + "step": 4815 + }, + { + "epoch": 0.75709878362711, + "grad_norm": 0.12551701068878174, + "learning_rate": 3.434975943541041e-05, + "loss": 1.1601, + "step": 4816 + }, + { + "epoch": 0.7572559885240425, + "grad_norm": 0.13226158916950226, + "learning_rate": 3.434402910079418e-05, + "loss": 1.0699, + "step": 4817 + }, + { + "epoch": 0.757413193420975, + "grad_norm": 0.1264914870262146, + "learning_rate": 3.43382981954802e-05, + "loss": 1.1619, + "step": 4818 + }, + { + "epoch": 0.7575703983179076, + "grad_norm": 0.12864720821380615, + "learning_rate": 3.4332566719818496e-05, + "loss": 1.1159, + "step": 4819 + }, + { + "epoch": 0.7577276032148401, + "grad_norm": 0.15113262832164764, + "learning_rate": 3.4326834674159124e-05, + "loss": 1.1242, + "step": 4820 + }, + { + "epoch": 0.7578848081117727, + "grad_norm": 0.14772279560565948, + "learning_rate": 3.432110205885218e-05, + "loss": 1.2477, + "step": 4821 + }, + { + "epoch": 0.7580420130087052, + "grad_norm": 0.12309836596250534, + "learning_rate": 3.431536887424779e-05, + "loss": 0.9678, + "step": 4822 + }, + { + "epoch": 0.7581992179056377, + "grad_norm": 0.14076700806617737, + "learning_rate": 3.4309635120696107e-05, + "loss": 1.0875, + "step": 4823 + }, + { + "epoch": 0.7583564228025703, + "grad_norm": 0.1606525033712387, + "learning_rate": 3.4303900798547326e-05, + "loss": 1.039, + "step": 4824 + }, + { + "epoch": 0.7585136276995028, + "grad_norm": 0.17205810546875, + "learning_rate": 3.429816590815169e-05, + "loss": 0.9445, + "step": 4825 + }, + { + "epoch": 0.7586708325964354, + "grad_norm": 0.1370888203382492, + "learning_rate": 3.429243044985946e-05, + "loss": 1.0914, + "step": 4826 + }, + { + "epoch": 0.7588280374933679, + "grad_norm": 0.13176867365837097, + "learning_rate": 3.428669442402093e-05, + "loss": 1.1661, + "step": 4827 + }, + { + "epoch": 0.7589852423903004, + "grad_norm": 0.1363096684217453, + "learning_rate": 3.428095783098644e-05, + "loss": 1.1407, + "step": 4828 + }, + { + "epoch": 0.759142447287233, + "grad_norm": 0.14611127972602844, + "learning_rate": 3.4275220671106354e-05, + "loss": 1.1294, + "step": 4829 + }, + { + "epoch": 0.7592996521841655, + "grad_norm": 0.1419224739074707, + "learning_rate": 3.4269482944731074e-05, + "loss": 1.1525, + "step": 4830 + }, + { + "epoch": 0.7594568570810981, + "grad_norm": 0.12255913764238358, + "learning_rate": 3.4263744652211055e-05, + "loss": 1.0963, + "step": 4831 + }, + { + "epoch": 0.7596140619780306, + "grad_norm": 0.13457930088043213, + "learning_rate": 3.425800579389675e-05, + "loss": 1.0591, + "step": 4832 + }, + { + "epoch": 0.7597712668749631, + "grad_norm": 0.13149945437908173, + "learning_rate": 3.4252266370138684e-05, + "loss": 1.1598, + "step": 4833 + }, + { + "epoch": 0.7599284717718957, + "grad_norm": 0.1281062662601471, + "learning_rate": 3.424652638128739e-05, + "loss": 1.1028, + "step": 4834 + }, + { + "epoch": 0.7600856766688282, + "grad_norm": 0.12758252024650574, + "learning_rate": 3.4240785827693435e-05, + "loss": 1.1538, + "step": 4835 + }, + { + "epoch": 0.7602428815657608, + "grad_norm": 0.13412296772003174, + "learning_rate": 3.423504470970745e-05, + "loss": 1.2111, + "step": 4836 + }, + { + "epoch": 0.7604000864626933, + "grad_norm": 0.13403989374637604, + "learning_rate": 3.422930302768007e-05, + "loss": 1.0381, + "step": 4837 + }, + { + "epoch": 0.7605572913596258, + "grad_norm": 0.14702297747135162, + "learning_rate": 3.422356078196198e-05, + "loss": 1.096, + "step": 4838 + }, + { + "epoch": 0.7607144962565584, + "grad_norm": 0.1320294737815857, + "learning_rate": 3.421781797290389e-05, + "loss": 1.1756, + "step": 4839 + }, + { + "epoch": 0.7608717011534909, + "grad_norm": 0.12777256965637207, + "learning_rate": 3.4212074600856536e-05, + "loss": 0.9583, + "step": 4840 + }, + { + "epoch": 0.7610289060504235, + "grad_norm": 0.15317098796367645, + "learning_rate": 3.4206330666170725e-05, + "loss": 1.0876, + "step": 4841 + }, + { + "epoch": 0.761186110947356, + "grad_norm": 0.1403750777244568, + "learning_rate": 3.4200586169197265e-05, + "loss": 1.1199, + "step": 4842 + }, + { + "epoch": 0.7613433158442886, + "grad_norm": 0.13177213072776794, + "learning_rate": 3.4194841110287016e-05, + "loss": 1.0894, + "step": 4843 + }, + { + "epoch": 0.7615005207412211, + "grad_norm": 0.13566197454929352, + "learning_rate": 3.418909548979084e-05, + "loss": 1.1449, + "step": 4844 + }, + { + "epoch": 0.7616577256381536, + "grad_norm": 0.1385362595319748, + "learning_rate": 3.418334930805968e-05, + "loss": 1.128, + "step": 4845 + }, + { + "epoch": 0.7618149305350862, + "grad_norm": 0.12482000142335892, + "learning_rate": 3.417760256544449e-05, + "loss": 1.0858, + "step": 4846 + }, + { + "epoch": 0.7619721354320187, + "grad_norm": 0.11928500980138779, + "learning_rate": 3.417185526229625e-05, + "loss": 1.1047, + "step": 4847 + }, + { + "epoch": 0.7621293403289513, + "grad_norm": 0.12555529177188873, + "learning_rate": 3.4166107398965987e-05, + "loss": 1.087, + "step": 4848 + }, + { + "epoch": 0.7622865452258838, + "grad_norm": 0.1462515890598297, + "learning_rate": 3.4160358975804755e-05, + "loss": 1.0634, + "step": 4849 + }, + { + "epoch": 0.7624437501228163, + "grad_norm": 0.1577952355146408, + "learning_rate": 3.415460999316366e-05, + "loss": 1.1449, + "step": 4850 + }, + { + "epoch": 0.7626009550197489, + "grad_norm": 0.14062662422657013, + "learning_rate": 3.41488604513938e-05, + "loss": 1.0793, + "step": 4851 + }, + { + "epoch": 0.7627581599166814, + "grad_norm": 0.13146206736564636, + "learning_rate": 3.414311035084637e-05, + "loss": 1.139, + "step": 4852 + }, + { + "epoch": 0.762915364813614, + "grad_norm": 0.12374719232320786, + "learning_rate": 3.413735969187254e-05, + "loss": 1.1886, + "step": 4853 + }, + { + "epoch": 0.7630725697105465, + "grad_norm": 0.1335606724023819, + "learning_rate": 3.413160847482355e-05, + "loss": 1.1146, + "step": 4854 + }, + { + "epoch": 0.763229774607479, + "grad_norm": 0.1613902747631073, + "learning_rate": 3.4125856700050645e-05, + "loss": 1.0692, + "step": 4855 + }, + { + "epoch": 0.7633869795044116, + "grad_norm": 0.15744736790657043, + "learning_rate": 3.4120104367905145e-05, + "loss": 1.1159, + "step": 4856 + }, + { + "epoch": 0.7635441844013441, + "grad_norm": 0.1367904543876648, + "learning_rate": 3.411435147873837e-05, + "loss": 1.132, + "step": 4857 + }, + { + "epoch": 0.7637013892982767, + "grad_norm": 0.16840605437755585, + "learning_rate": 3.410859803290168e-05, + "loss": 1.1439, + "step": 4858 + }, + { + "epoch": 0.7638585941952092, + "grad_norm": 0.13757860660552979, + "learning_rate": 3.4102844030746485e-05, + "loss": 1.0952, + "step": 4859 + }, + { + "epoch": 0.7640157990921417, + "grad_norm": 0.1161893904209137, + "learning_rate": 3.40970894726242e-05, + "loss": 1.1324, + "step": 4860 + }, + { + "epoch": 0.7641730039890743, + "grad_norm": 0.13835035264492035, + "learning_rate": 3.409133435888632e-05, + "loss": 1.1272, + "step": 4861 + }, + { + "epoch": 0.7643302088860068, + "grad_norm": 0.12219596654176712, + "learning_rate": 3.408557868988431e-05, + "loss": 1.0751, + "step": 4862 + }, + { + "epoch": 0.7644874137829394, + "grad_norm": 0.1328803300857544, + "learning_rate": 3.407982246596972e-05, + "loss": 1.0784, + "step": 4863 + }, + { + "epoch": 0.7646446186798719, + "grad_norm": 0.1300496906042099, + "learning_rate": 3.407406568749414e-05, + "loss": 1.0277, + "step": 4864 + }, + { + "epoch": 0.7648018235768044, + "grad_norm": 0.1289186179637909, + "learning_rate": 3.4068308354809134e-05, + "loss": 1.1356, + "step": 4865 + }, + { + "epoch": 0.764959028473737, + "grad_norm": 0.12275593727827072, + "learning_rate": 3.406255046826637e-05, + "loss": 1.1002, + "step": 4866 + }, + { + "epoch": 0.7651162333706695, + "grad_norm": 0.13451841473579407, + "learning_rate": 3.4056792028217494e-05, + "loss": 1.0173, + "step": 4867 + }, + { + "epoch": 0.7652734382676021, + "grad_norm": 0.14932623505592346, + "learning_rate": 3.405103303501422e-05, + "loss": 1.1156, + "step": 4868 + }, + { + "epoch": 0.7654306431645346, + "grad_norm": 0.13008396327495575, + "learning_rate": 3.404527348900829e-05, + "loss": 1.1443, + "step": 4869 + }, + { + "epoch": 0.7655878480614671, + "grad_norm": 0.15099525451660156, + "learning_rate": 3.403951339055147e-05, + "loss": 1.1328, + "step": 4870 + }, + { + "epoch": 0.7657450529583997, + "grad_norm": 0.12122808396816254, + "learning_rate": 3.4033752739995563e-05, + "loss": 1.0865, + "step": 4871 + }, + { + "epoch": 0.7659022578553322, + "grad_norm": 0.12879222631454468, + "learning_rate": 3.402799153769241e-05, + "loss": 1.0814, + "step": 4872 + }, + { + "epoch": 0.7660594627522648, + "grad_norm": 0.17125335335731506, + "learning_rate": 3.402222978399389e-05, + "loss": 1.076, + "step": 4873 + }, + { + "epoch": 0.7662166676491973, + "grad_norm": 0.11798808723688126, + "learning_rate": 3.401646747925189e-05, + "loss": 0.9454, + "step": 4874 + }, + { + "epoch": 0.7663738725461298, + "grad_norm": 0.1341928094625473, + "learning_rate": 3.401070462381837e-05, + "loss": 1.1656, + "step": 4875 + }, + { + "epoch": 0.7665310774430624, + "grad_norm": 0.14401687681674957, + "learning_rate": 3.40049412180453e-05, + "loss": 1.1035, + "step": 4876 + }, + { + "epoch": 0.7666882823399949, + "grad_norm": 0.13822953402996063, + "learning_rate": 3.399917726228466e-05, + "loss": 1.111, + "step": 4877 + }, + { + "epoch": 0.7668454872369275, + "grad_norm": 0.12841646373271942, + "learning_rate": 3.3993412756888535e-05, + "loss": 1.0493, + "step": 4878 + }, + { + "epoch": 0.76700269213386, + "grad_norm": 0.15926672518253326, + "learning_rate": 3.398764770220896e-05, + "loss": 1.1567, + "step": 4879 + }, + { + "epoch": 0.7671598970307925, + "grad_norm": 0.16407644748687744, + "learning_rate": 3.3981882098598075e-05, + "loss": 1.1008, + "step": 4880 + }, + { + "epoch": 0.7673171019277251, + "grad_norm": 0.1251910775899887, + "learning_rate": 3.3976115946408e-05, + "loss": 1.1036, + "step": 4881 + }, + { + "epoch": 0.7674743068246576, + "grad_norm": 0.1440126895904541, + "learning_rate": 3.397034924599091e-05, + "loss": 1.0757, + "step": 4882 + }, + { + "epoch": 0.7676315117215902, + "grad_norm": 0.1343274861574173, + "learning_rate": 3.3964581997699026e-05, + "loss": 1.139, + "step": 4883 + }, + { + "epoch": 0.7677887166185227, + "grad_norm": 0.13377784192562103, + "learning_rate": 3.395881420188457e-05, + "loss": 1.2354, + "step": 4884 + }, + { + "epoch": 0.7679459215154552, + "grad_norm": 0.1382710486650467, + "learning_rate": 3.395304585889984e-05, + "loss": 1.0949, + "step": 4885 + }, + { + "epoch": 0.7681031264123878, + "grad_norm": 0.12571008503437042, + "learning_rate": 3.3947276969097124e-05, + "loss": 1.1223, + "step": 4886 + }, + { + "epoch": 0.7682603313093203, + "grad_norm": 0.15523536503314972, + "learning_rate": 3.394150753282878e-05, + "loss": 1.0913, + "step": 4887 + }, + { + "epoch": 0.7684175362062529, + "grad_norm": 0.14620168507099152, + "learning_rate": 3.3935737550447175e-05, + "loss": 1.1561, + "step": 4888 + }, + { + "epoch": 0.7685747411031854, + "grad_norm": 0.156593456864357, + "learning_rate": 3.3929967022304714e-05, + "loss": 1.0524, + "step": 4889 + }, + { + "epoch": 0.7687319460001178, + "grad_norm": 0.13538812100887299, + "learning_rate": 3.392419594875385e-05, + "loss": 1.1074, + "step": 4890 + }, + { + "epoch": 0.7688891508970505, + "grad_norm": 0.1342657506465912, + "learning_rate": 3.3918424330147045e-05, + "loss": 1.0716, + "step": 4891 + }, + { + "epoch": 0.769046355793983, + "grad_norm": 0.1275315284729004, + "learning_rate": 3.391265216683682e-05, + "loss": 1.038, + "step": 4892 + }, + { + "epoch": 0.7692035606909156, + "grad_norm": 0.14354261755943298, + "learning_rate": 3.390687945917571e-05, + "loss": 1.08, + "step": 4893 + }, + { + "epoch": 0.769360765587848, + "grad_norm": 0.1632268726825714, + "learning_rate": 3.3901106207516285e-05, + "loss": 1.0238, + "step": 4894 + }, + { + "epoch": 0.7695179704847807, + "grad_norm": 0.14514832198619843, + "learning_rate": 3.389533241221117e-05, + "loss": 1.1181, + "step": 4895 + }, + { + "epoch": 0.7696751753817132, + "grad_norm": 0.15159910917282104, + "learning_rate": 3.388955807361299e-05, + "loss": 1.1007, + "step": 4896 + }, + { + "epoch": 0.7698323802786456, + "grad_norm": 0.13943949341773987, + "learning_rate": 3.388378319207443e-05, + "loss": 0.9868, + "step": 4897 + }, + { + "epoch": 0.7699895851755783, + "grad_norm": 0.12545351684093475, + "learning_rate": 3.387800776794818e-05, + "loss": 1.1619, + "step": 4898 + }, + { + "epoch": 0.7701467900725107, + "grad_norm": 0.11740878969430923, + "learning_rate": 3.387223180158701e-05, + "loss": 1.0607, + "step": 4899 + }, + { + "epoch": 0.7703039949694434, + "grad_norm": 0.13455021381378174, + "learning_rate": 3.3866455293343666e-05, + "loss": 1.0897, + "step": 4900 + }, + { + "epoch": 0.7704611998663758, + "grad_norm": 0.12867052853107452, + "learning_rate": 3.386067824357098e-05, + "loss": 1.1811, + "step": 4901 + }, + { + "epoch": 0.7706184047633083, + "grad_norm": 0.1307537704706192, + "learning_rate": 3.385490065262176e-05, + "loss": 1.0823, + "step": 4902 + }, + { + "epoch": 0.770775609660241, + "grad_norm": 0.13356691598892212, + "learning_rate": 3.3849122520848915e-05, + "loss": 1.245, + "step": 4903 + }, + { + "epoch": 0.7709328145571734, + "grad_norm": 0.13139644265174866, + "learning_rate": 3.384334384860533e-05, + "loss": 1.04, + "step": 4904 + }, + { + "epoch": 0.771090019454106, + "grad_norm": 0.13805650174617767, + "learning_rate": 3.3837564636243944e-05, + "loss": 1.0628, + "step": 4905 + }, + { + "epoch": 0.7712472243510385, + "grad_norm": 0.13503144681453705, + "learning_rate": 3.383178488411775e-05, + "loss": 1.0897, + "step": 4906 + }, + { + "epoch": 0.771404429247971, + "grad_norm": 0.1391112506389618, + "learning_rate": 3.382600459257973e-05, + "loss": 1.034, + "step": 4907 + }, + { + "epoch": 0.7715616341449036, + "grad_norm": 0.1330227553844452, + "learning_rate": 3.3820223761982926e-05, + "loss": 1.1307, + "step": 4908 + }, + { + "epoch": 0.7717188390418361, + "grad_norm": 0.13399213552474976, + "learning_rate": 3.381444239268041e-05, + "loss": 1.0304, + "step": 4909 + }, + { + "epoch": 0.7718760439387687, + "grad_norm": 0.11677687615156174, + "learning_rate": 3.380866048502531e-05, + "loss": 1.103, + "step": 4910 + }, + { + "epoch": 0.7720332488357012, + "grad_norm": 0.17897523939609528, + "learning_rate": 3.380287803937072e-05, + "loss": 1.0233, + "step": 4911 + }, + { + "epoch": 0.7721904537326337, + "grad_norm": 0.12963220477104187, + "learning_rate": 3.3797095056069836e-05, + "loss": 1.0871, + "step": 4912 + }, + { + "epoch": 0.7723476586295663, + "grad_norm": 0.13949821889400482, + "learning_rate": 3.379131153547587e-05, + "loss": 1.2007, + "step": 4913 + }, + { + "epoch": 0.7725048635264988, + "grad_norm": 0.13470464944839478, + "learning_rate": 3.378552747794203e-05, + "loss": 1.0367, + "step": 4914 + }, + { + "epoch": 0.7726620684234314, + "grad_norm": 0.1272779107093811, + "learning_rate": 3.377974288382161e-05, + "loss": 1.0967, + "step": 4915 + }, + { + "epoch": 0.7728192733203639, + "grad_norm": 0.13421685993671417, + "learning_rate": 3.377395775346789e-05, + "loss": 1.1023, + "step": 4916 + }, + { + "epoch": 0.7729764782172964, + "grad_norm": 0.12207071483135223, + "learning_rate": 3.376817208723422e-05, + "loss": 1.1236, + "step": 4917 + }, + { + "epoch": 0.773133683114229, + "grad_norm": 0.12212815135717392, + "learning_rate": 3.376238588547396e-05, + "loss": 1.0593, + "step": 4918 + }, + { + "epoch": 0.7732908880111615, + "grad_norm": 0.1423274278640747, + "learning_rate": 3.3756599148540494e-05, + "loss": 1.1556, + "step": 4919 + }, + { + "epoch": 0.7734480929080941, + "grad_norm": 0.12456110864877701, + "learning_rate": 3.375081187678729e-05, + "loss": 0.9687, + "step": 4920 + }, + { + "epoch": 0.7736052978050266, + "grad_norm": 0.13654069602489471, + "learning_rate": 3.3745024070567774e-05, + "loss": 1.1634, + "step": 4921 + }, + { + "epoch": 0.7737625027019591, + "grad_norm": 0.13371238112449646, + "learning_rate": 3.373923573023547e-05, + "loss": 1.1374, + "step": 4922 + }, + { + "epoch": 0.7739197075988917, + "grad_norm": 0.14525705575942993, + "learning_rate": 3.3733446856143895e-05, + "loss": 1.0809, + "step": 4923 + }, + { + "epoch": 0.7740769124958242, + "grad_norm": 0.1339891105890274, + "learning_rate": 3.3727657448646614e-05, + "loss": 1.1174, + "step": 4924 + }, + { + "epoch": 0.7742341173927568, + "grad_norm": 0.13140228390693665, + "learning_rate": 3.372186750809723e-05, + "loss": 1.1715, + "step": 4925 + }, + { + "epoch": 0.7743913222896893, + "grad_norm": 0.17299087345600128, + "learning_rate": 3.3716077034849345e-05, + "loss": 0.95, + "step": 4926 + }, + { + "epoch": 0.7745485271866218, + "grad_norm": 0.15158572793006897, + "learning_rate": 3.371028602925666e-05, + "loss": 1.1738, + "step": 4927 + }, + { + "epoch": 0.7747057320835544, + "grad_norm": 0.13169889152050018, + "learning_rate": 3.3704494491672837e-05, + "loss": 1.0398, + "step": 4928 + }, + { + "epoch": 0.7748629369804869, + "grad_norm": 0.13216854631900787, + "learning_rate": 3.36987024224516e-05, + "loss": 1.1112, + "step": 4929 + }, + { + "epoch": 0.7750201418774195, + "grad_norm": 0.14911629259586334, + "learning_rate": 3.369290982194671e-05, + "loss": 0.9443, + "step": 4930 + }, + { + "epoch": 0.775177346774352, + "grad_norm": 0.13613444566726685, + "learning_rate": 3.368711669051198e-05, + "loss": 1.0903, + "step": 4931 + }, + { + "epoch": 0.7753345516712845, + "grad_norm": 0.12087051570415497, + "learning_rate": 3.368132302850121e-05, + "loss": 1.1767, + "step": 4932 + }, + { + "epoch": 0.7754917565682171, + "grad_norm": 0.13154035806655884, + "learning_rate": 3.367552883626825e-05, + "loss": 1.0404, + "step": 4933 + }, + { + "epoch": 0.7756489614651496, + "grad_norm": 0.13925373554229736, + "learning_rate": 3.366973411416702e-05, + "loss": 0.9788, + "step": 4934 + }, + { + "epoch": 0.7758061663620822, + "grad_norm": 0.1517360657453537, + "learning_rate": 3.366393886255139e-05, + "loss": 1.1265, + "step": 4935 + }, + { + "epoch": 0.7759633712590147, + "grad_norm": 0.1432923823595047, + "learning_rate": 3.365814308177536e-05, + "loss": 1.088, + "step": 4936 + }, + { + "epoch": 0.7761205761559472, + "grad_norm": 0.12432681024074554, + "learning_rate": 3.365234677219288e-05, + "loss": 1.0405, + "step": 4937 + }, + { + "epoch": 0.7762777810528798, + "grad_norm": 0.1287374645471573, + "learning_rate": 3.364654993415798e-05, + "loss": 1.1894, + "step": 4938 + }, + { + "epoch": 0.7764349859498123, + "grad_norm": 0.1255466192960739, + "learning_rate": 3.3640752568024706e-05, + "loss": 1.1598, + "step": 4939 + }, + { + "epoch": 0.7765921908467449, + "grad_norm": 0.13246101140975952, + "learning_rate": 3.3634954674147146e-05, + "loss": 1.0111, + "step": 4940 + }, + { + "epoch": 0.7767493957436774, + "grad_norm": 0.1282433420419693, + "learning_rate": 3.362915625287941e-05, + "loss": 1.1041, + "step": 4941 + }, + { + "epoch": 0.7769066006406099, + "grad_norm": 0.13137435913085938, + "learning_rate": 3.362335730457564e-05, + "loss": 1.1897, + "step": 4942 + }, + { + "epoch": 0.7770638055375425, + "grad_norm": 0.15391333401203156, + "learning_rate": 3.3617557829590015e-05, + "loss": 1.0739, + "step": 4943 + }, + { + "epoch": 0.777221010434475, + "grad_norm": 0.13495557010173798, + "learning_rate": 3.361175782827675e-05, + "loss": 1.1336, + "step": 4944 + }, + { + "epoch": 0.7773782153314076, + "grad_norm": 0.12705542147159576, + "learning_rate": 3.3605957300990073e-05, + "loss": 1.0736, + "step": 4945 + }, + { + "epoch": 0.7775354202283401, + "grad_norm": 0.12318447977304459, + "learning_rate": 3.360015624808427e-05, + "loss": 1.0195, + "step": 4946 + }, + { + "epoch": 0.7776926251252727, + "grad_norm": 0.1322765052318573, + "learning_rate": 3.3594354669913654e-05, + "loss": 1.1146, + "step": 4947 + }, + { + "epoch": 0.7778498300222052, + "grad_norm": 0.14443641901016235, + "learning_rate": 3.3588552566832545e-05, + "loss": 1.0598, + "step": 4948 + }, + { + "epoch": 0.7780070349191377, + "grad_norm": 0.11805952340364456, + "learning_rate": 3.358274993919532e-05, + "loss": 0.9953, + "step": 4949 + }, + { + "epoch": 0.7781642398160703, + "grad_norm": 0.12899643182754517, + "learning_rate": 3.357694678735639e-05, + "loss": 1.1646, + "step": 4950 + }, + { + "epoch": 0.7783214447130028, + "grad_norm": 0.14672844111919403, + "learning_rate": 3.3571143111670187e-05, + "loss": 1.0165, + "step": 4951 + }, + { + "epoch": 0.7784786496099354, + "grad_norm": 0.1444154977798462, + "learning_rate": 3.3565338912491165e-05, + "loss": 1.0543, + "step": 4952 + }, + { + "epoch": 0.7786358545068679, + "grad_norm": 0.12198904156684875, + "learning_rate": 3.3559534190173834e-05, + "loss": 1.0839, + "step": 4953 + }, + { + "epoch": 0.7787930594038004, + "grad_norm": 0.13399964570999146, + "learning_rate": 3.355372894507272e-05, + "loss": 1.0607, + "step": 4954 + }, + { + "epoch": 0.778950264300733, + "grad_norm": 0.13722872734069824, + "learning_rate": 3.3547923177542396e-05, + "loss": 1.1332, + "step": 4955 + }, + { + "epoch": 0.7791074691976655, + "grad_norm": 0.12063074856996536, + "learning_rate": 3.354211688793744e-05, + "loss": 1.0272, + "step": 4956 + }, + { + "epoch": 0.7792646740945981, + "grad_norm": 0.13837161660194397, + "learning_rate": 3.353631007661249e-05, + "loss": 1.1066, + "step": 4957 + }, + { + "epoch": 0.7794218789915306, + "grad_norm": 0.1389688104391098, + "learning_rate": 3.353050274392219e-05, + "loss": 1.0007, + "step": 4958 + }, + { + "epoch": 0.7795790838884631, + "grad_norm": 0.11978980898857117, + "learning_rate": 3.3524694890221244e-05, + "loss": 1.1153, + "step": 4959 + }, + { + "epoch": 0.7797362887853957, + "grad_norm": 0.13067257404327393, + "learning_rate": 3.3518886515864366e-05, + "loss": 1.1104, + "step": 4960 + }, + { + "epoch": 0.7797362887853957, + "eval_loss": 1.0962234735488892, + "eval_runtime": 2341.8407, + "eval_samples_per_second": 3.953, + "eval_steps_per_second": 1.977, + "step": 4960 + }, + { + "epoch": 0.7798934936823282, + "grad_norm": 0.14436736702919006, + "learning_rate": 3.351307762120631e-05, + "loss": 1.1186, + "step": 4961 + }, + { + "epoch": 0.7800506985792608, + "grad_norm": 0.13643218576908112, + "learning_rate": 3.350726820660187e-05, + "loss": 1.2117, + "step": 4962 + }, + { + "epoch": 0.7802079034761933, + "grad_norm": 0.15105631947517395, + "learning_rate": 3.350145827240585e-05, + "loss": 1.082, + "step": 4963 + }, + { + "epoch": 0.7803651083731258, + "grad_norm": 0.1495438516139984, + "learning_rate": 3.349564781897311e-05, + "loss": 1.1437, + "step": 4964 + }, + { + "epoch": 0.7805223132700584, + "grad_norm": 0.13507118821144104, + "learning_rate": 3.348983684665852e-05, + "loss": 1.0525, + "step": 4965 + }, + { + "epoch": 0.7806795181669909, + "grad_norm": 0.1323672980070114, + "learning_rate": 3.348402535581701e-05, + "loss": 1.0791, + "step": 4966 + }, + { + "epoch": 0.7808367230639235, + "grad_norm": 0.15458525717258453, + "learning_rate": 3.34782133468035e-05, + "loss": 1.0797, + "step": 4967 + }, + { + "epoch": 0.780993927960856, + "grad_norm": 0.1274881511926651, + "learning_rate": 3.347240081997297e-05, + "loss": 1.0706, + "step": 4968 + }, + { + "epoch": 0.7811511328577885, + "grad_norm": 0.19001010060310364, + "learning_rate": 3.3466587775680444e-05, + "loss": 1.1347, + "step": 4969 + }, + { + "epoch": 0.7813083377547211, + "grad_norm": 0.13060776889324188, + "learning_rate": 3.3460774214280944e-05, + "loss": 1.1397, + "step": 4970 + }, + { + "epoch": 0.7814655426516536, + "grad_norm": 0.11697187274694443, + "learning_rate": 3.345496013612955e-05, + "loss": 1.0687, + "step": 4971 + }, + { + "epoch": 0.7816227475485862, + "grad_norm": 0.13726262748241425, + "learning_rate": 3.344914554158136e-05, + "loss": 1.0154, + "step": 4972 + }, + { + "epoch": 0.7817799524455187, + "grad_norm": 0.12287222594022751, + "learning_rate": 3.3443330430991506e-05, + "loss": 0.9618, + "step": 4973 + }, + { + "epoch": 0.7819371573424512, + "grad_norm": 0.14394275844097137, + "learning_rate": 3.343751480471515e-05, + "loss": 1.1896, + "step": 4974 + }, + { + "epoch": 0.7820943622393838, + "grad_norm": 0.13699425756931305, + "learning_rate": 3.3431698663107496e-05, + "loss": 1.1249, + "step": 4975 + }, + { + "epoch": 0.7822515671363163, + "grad_norm": 0.13724492490291595, + "learning_rate": 3.3425882006523765e-05, + "loss": 1.1185, + "step": 4976 + }, + { + "epoch": 0.7824087720332489, + "grad_norm": 0.13086111843585968, + "learning_rate": 3.3420064835319224e-05, + "loss": 1.122, + "step": 4977 + }, + { + "epoch": 0.7825659769301814, + "grad_norm": 0.13510918617248535, + "learning_rate": 3.341424714984916e-05, + "loss": 1.117, + "step": 4978 + }, + { + "epoch": 0.7827231818271139, + "grad_norm": 0.13024483621120453, + "learning_rate": 3.340842895046889e-05, + "loss": 1.0512, + "step": 4979 + }, + { + "epoch": 0.7828803867240465, + "grad_norm": 0.12385273724794388, + "learning_rate": 3.340261023753377e-05, + "loss": 0.9738, + "step": 4980 + }, + { + "epoch": 0.783037591620979, + "grad_norm": 0.1568519026041031, + "learning_rate": 3.33967910113992e-05, + "loss": 1.1059, + "step": 4981 + }, + { + "epoch": 0.7831947965179116, + "grad_norm": 0.1357090175151825, + "learning_rate": 3.339097127242057e-05, + "loss": 1.0016, + "step": 4982 + }, + { + "epoch": 0.7833520014148441, + "grad_norm": 0.13885211944580078, + "learning_rate": 3.3385151020953345e-05, + "loss": 1.1129, + "step": 4983 + }, + { + "epoch": 0.7835092063117766, + "grad_norm": 0.15067574381828308, + "learning_rate": 3.337933025735299e-05, + "loss": 1.1112, + "step": 4984 + }, + { + "epoch": 0.7836664112087092, + "grad_norm": 0.22319741547107697, + "learning_rate": 3.337350898197504e-05, + "loss": 1.0822, + "step": 4985 + }, + { + "epoch": 0.7838236161056417, + "grad_norm": 0.13410452008247375, + "learning_rate": 3.3367687195175006e-05, + "loss": 1.1652, + "step": 4986 + }, + { + "epoch": 0.7839808210025743, + "grad_norm": 0.15852157771587372, + "learning_rate": 3.3361864897308484e-05, + "loss": 1.0636, + "step": 4987 + }, + { + "epoch": 0.7841380258995068, + "grad_norm": 0.18127258121967316, + "learning_rate": 3.335604208873106e-05, + "loss": 1.099, + "step": 4988 + }, + { + "epoch": 0.7842952307964393, + "grad_norm": 0.14409101009368896, + "learning_rate": 3.335021876979838e-05, + "loss": 1.0831, + "step": 4989 + }, + { + "epoch": 0.7844524356933719, + "grad_norm": 0.15617966651916504, + "learning_rate": 3.334439494086612e-05, + "loss": 1.0458, + "step": 4990 + }, + { + "epoch": 0.7846096405903044, + "grad_norm": 0.12643630802631378, + "learning_rate": 3.333857060228995e-05, + "loss": 1.1234, + "step": 4991 + }, + { + "epoch": 0.784766845487237, + "grad_norm": 0.14369435608386993, + "learning_rate": 3.3332745754425626e-05, + "loss": 1.1506, + "step": 4992 + }, + { + "epoch": 0.7849240503841695, + "grad_norm": 0.12761743366718292, + "learning_rate": 3.332692039762889e-05, + "loss": 1.1512, + "step": 4993 + }, + { + "epoch": 0.785081255281102, + "grad_norm": 0.1430346816778183, + "learning_rate": 3.332109453225554e-05, + "loss": 0.99, + "step": 4994 + }, + { + "epoch": 0.7852384601780346, + "grad_norm": 0.12227287888526917, + "learning_rate": 3.3315268158661396e-05, + "loss": 1.1705, + "step": 4995 + }, + { + "epoch": 0.785395665074967, + "grad_norm": 0.1261235624551773, + "learning_rate": 3.33094412772023e-05, + "loss": 1.0471, + "step": 4996 + }, + { + "epoch": 0.7855528699718997, + "grad_norm": 0.14919154345989227, + "learning_rate": 3.330361388823416e-05, + "loss": 1.0884, + "step": 4997 + }, + { + "epoch": 0.7857100748688322, + "grad_norm": 0.1251683533191681, + "learning_rate": 3.329778599211287e-05, + "loss": 0.9639, + "step": 4998 + }, + { + "epoch": 0.7858672797657648, + "grad_norm": 0.13637672364711761, + "learning_rate": 3.3291957589194385e-05, + "loss": 1.1214, + "step": 4999 + }, + { + "epoch": 0.7860244846626973, + "grad_norm": 0.16353115439414978, + "learning_rate": 3.328612867983468e-05, + "loss": 1.0878, + "step": 5000 + }, + { + "epoch": 0.7861816895596297, + "grad_norm": 0.12601494789123535, + "learning_rate": 3.328029926438976e-05, + "loss": 1.1214, + "step": 5001 + }, + { + "epoch": 0.7863388944565624, + "grad_norm": 0.13055525720119476, + "learning_rate": 3.3274469343215666e-05, + "loss": 1.1049, + "step": 5002 + }, + { + "epoch": 0.7864960993534948, + "grad_norm": 0.14286868274211884, + "learning_rate": 3.326863891666846e-05, + "loss": 1.2236, + "step": 5003 + }, + { + "epoch": 0.7866533042504275, + "grad_norm": 0.12616075575351715, + "learning_rate": 3.326280798510426e-05, + "loss": 1.1382, + "step": 5004 + }, + { + "epoch": 0.78681050914736, + "grad_norm": 0.12813489139080048, + "learning_rate": 3.3256976548879184e-05, + "loss": 1.0599, + "step": 5005 + }, + { + "epoch": 0.7869677140442924, + "grad_norm": 0.12909750640392303, + "learning_rate": 3.325114460834939e-05, + "loss": 0.9082, + "step": 5006 + }, + { + "epoch": 0.787124918941225, + "grad_norm": 0.12364039570093155, + "learning_rate": 3.324531216387108e-05, + "loss": 1.0784, + "step": 5007 + }, + { + "epoch": 0.7872821238381575, + "grad_norm": 0.13159489631652832, + "learning_rate": 3.3239479215800476e-05, + "loss": 1.1308, + "step": 5008 + }, + { + "epoch": 0.7874393287350901, + "grad_norm": 0.1293668895959854, + "learning_rate": 3.323364576449383e-05, + "loss": 1.0157, + "step": 5009 + }, + { + "epoch": 0.7875965336320226, + "grad_norm": 0.13375996053218842, + "learning_rate": 3.3227811810307426e-05, + "loss": 1.1795, + "step": 5010 + }, + { + "epoch": 0.7877537385289551, + "grad_norm": 0.13124480843544006, + "learning_rate": 3.322197735359759e-05, + "loss": 0.9774, + "step": 5011 + }, + { + "epoch": 0.7879109434258877, + "grad_norm": 0.1319187879562378, + "learning_rate": 3.321614239472064e-05, + "loss": 1.0874, + "step": 5012 + }, + { + "epoch": 0.7880681483228202, + "grad_norm": 0.1557384580373764, + "learning_rate": 3.3210306934032995e-05, + "loss": 1.0408, + "step": 5013 + }, + { + "epoch": 0.7882253532197528, + "grad_norm": 0.17597688734531403, + "learning_rate": 3.3204470971891026e-05, + "loss": 1.0429, + "step": 5014 + }, + { + "epoch": 0.7883825581166853, + "grad_norm": 0.1346769481897354, + "learning_rate": 3.31986345086512e-05, + "loss": 1.1029, + "step": 5015 + }, + { + "epoch": 0.7885397630136178, + "grad_norm": 0.16205650568008423, + "learning_rate": 3.319279754466996e-05, + "loss": 1.1131, + "step": 5016 + }, + { + "epoch": 0.7886969679105504, + "grad_norm": 0.17671433091163635, + "learning_rate": 3.3186960080303816e-05, + "loss": 1.0771, + "step": 5017 + }, + { + "epoch": 0.7888541728074829, + "grad_norm": 0.15653863549232483, + "learning_rate": 3.31811221159093e-05, + "loss": 1.1959, + "step": 5018 + }, + { + "epoch": 0.7890113777044155, + "grad_norm": 0.14297321438789368, + "learning_rate": 3.317528365184298e-05, + "loss": 1.1692, + "step": 5019 + }, + { + "epoch": 0.789168582601348, + "grad_norm": 0.16521745920181274, + "learning_rate": 3.316944468846144e-05, + "loss": 1.0477, + "step": 5020 + }, + { + "epoch": 0.7893257874982805, + "grad_norm": 0.13730360567569733, + "learning_rate": 3.3163605226121296e-05, + "loss": 1.1641, + "step": 5021 + }, + { + "epoch": 0.7894829923952131, + "grad_norm": 0.16981300711631775, + "learning_rate": 3.315776526517921e-05, + "loss": 1.106, + "step": 5022 + }, + { + "epoch": 0.7896401972921456, + "grad_norm": 0.1734813153743744, + "learning_rate": 3.315192480599185e-05, + "loss": 1.1008, + "step": 5023 + }, + { + "epoch": 0.7897974021890782, + "grad_norm": 0.14239849150180817, + "learning_rate": 3.314608384891594e-05, + "loss": 1.0765, + "step": 5024 + }, + { + "epoch": 0.7899546070860107, + "grad_norm": 0.14138491451740265, + "learning_rate": 3.314024239430824e-05, + "loss": 1.1847, + "step": 5025 + }, + { + "epoch": 0.7901118119829432, + "grad_norm": 0.1398213654756546, + "learning_rate": 3.313440044252549e-05, + "loss": 1.1479, + "step": 5026 + }, + { + "epoch": 0.7902690168798758, + "grad_norm": 0.12144920229911804, + "learning_rate": 3.3128557993924516e-05, + "loss": 1.0971, + "step": 5027 + }, + { + "epoch": 0.7904262217768083, + "grad_norm": 0.14630118012428284, + "learning_rate": 3.312271504886214e-05, + "loss": 1.102, + "step": 5028 + }, + { + "epoch": 0.7905834266737409, + "grad_norm": 0.14643485844135284, + "learning_rate": 3.311687160769524e-05, + "loss": 1.0097, + "step": 5029 + }, + { + "epoch": 0.7907406315706734, + "grad_norm": 0.12107902765274048, + "learning_rate": 3.311102767078071e-05, + "loss": 1.1308, + "step": 5030 + }, + { + "epoch": 0.7908978364676059, + "grad_norm": 0.13668377697467804, + "learning_rate": 3.3105183238475455e-05, + "loss": 1.0909, + "step": 5031 + }, + { + "epoch": 0.7910550413645385, + "grad_norm": 0.13928672671318054, + "learning_rate": 3.309933831113646e-05, + "loss": 1.0461, + "step": 5032 + }, + { + "epoch": 0.791212246261471, + "grad_norm": 0.13744144141674042, + "learning_rate": 3.309349288912069e-05, + "loss": 0.9517, + "step": 5033 + }, + { + "epoch": 0.7913694511584036, + "grad_norm": 0.14722764492034912, + "learning_rate": 3.308764697278518e-05, + "loss": 1.1708, + "step": 5034 + }, + { + "epoch": 0.7915266560553361, + "grad_norm": 0.13956117630004883, + "learning_rate": 3.3081800562486946e-05, + "loss": 1.043, + "step": 5035 + }, + { + "epoch": 0.7916838609522686, + "grad_norm": 0.1356237679719925, + "learning_rate": 3.307595365858309e-05, + "loss": 1.1325, + "step": 5036 + }, + { + "epoch": 0.7918410658492012, + "grad_norm": 0.1514151692390442, + "learning_rate": 3.307010626143071e-05, + "loss": 1.177, + "step": 5037 + }, + { + "epoch": 0.7919982707461337, + "grad_norm": 0.11895293742418289, + "learning_rate": 3.306425837138695e-05, + "loss": 1.1792, + "step": 5038 + }, + { + "epoch": 0.7921554756430663, + "grad_norm": 0.13698509335517883, + "learning_rate": 3.305840998880897e-05, + "loss": 1.0816, + "step": 5039 + }, + { + "epoch": 0.7923126805399988, + "grad_norm": 0.13310359418392181, + "learning_rate": 3.3052561114053965e-05, + "loss": 1.0018, + "step": 5040 + }, + { + "epoch": 0.7924698854369313, + "grad_norm": 0.1219170093536377, + "learning_rate": 3.3046711747479166e-05, + "loss": 0.9989, + "step": 5041 + }, + { + "epoch": 0.7926270903338639, + "grad_norm": 0.13483086228370667, + "learning_rate": 3.304086188944183e-05, + "loss": 1.114, + "step": 5042 + }, + { + "epoch": 0.7927842952307964, + "grad_norm": 0.1272246390581131, + "learning_rate": 3.3035011540299245e-05, + "loss": 1.1405, + "step": 5043 + }, + { + "epoch": 0.792941500127729, + "grad_norm": 0.16075541079044342, + "learning_rate": 3.302916070040872e-05, + "loss": 1.1148, + "step": 5044 + }, + { + "epoch": 0.7930987050246615, + "grad_norm": 0.12627846002578735, + "learning_rate": 3.302330937012761e-05, + "loss": 1.095, + "step": 5045 + }, + { + "epoch": 0.793255909921594, + "grad_norm": 0.13641822338104248, + "learning_rate": 3.3017457549813304e-05, + "loss": 1.0507, + "step": 5046 + }, + { + "epoch": 0.7934131148185266, + "grad_norm": 0.14821748435497284, + "learning_rate": 3.301160523982317e-05, + "loss": 1.0466, + "step": 5047 + }, + { + "epoch": 0.7935703197154591, + "grad_norm": 0.13757331669330597, + "learning_rate": 3.3005752440514694e-05, + "loss": 1.1447, + "step": 5048 + }, + { + "epoch": 0.7937275246123917, + "grad_norm": 0.1334858238697052, + "learning_rate": 3.299989915224531e-05, + "loss": 1.0936, + "step": 5049 + }, + { + "epoch": 0.7938847295093242, + "grad_norm": 0.13829028606414795, + "learning_rate": 3.299404537537252e-05, + "loss": 1.1703, + "step": 5050 + }, + { + "epoch": 0.7940419344062567, + "grad_norm": 0.1311250925064087, + "learning_rate": 3.2988191110253866e-05, + "loss": 1.0749, + "step": 5051 + }, + { + "epoch": 0.7941991393031893, + "grad_norm": 0.13316184282302856, + "learning_rate": 3.2982336357246875e-05, + "loss": 1.1073, + "step": 5052 + }, + { + "epoch": 0.7943563442001218, + "grad_norm": 0.1327652633190155, + "learning_rate": 3.297648111670916e-05, + "loss": 0.986, + "step": 5053 + }, + { + "epoch": 0.7945135490970544, + "grad_norm": 0.13477909564971924, + "learning_rate": 3.297062538899832e-05, + "loss": 1.1242, + "step": 5054 + }, + { + "epoch": 0.7946707539939869, + "grad_norm": 0.1262103021144867, + "learning_rate": 3.296476917447202e-05, + "loss": 1.0395, + "step": 5055 + }, + { + "epoch": 0.7948279588909195, + "grad_norm": 0.1377694308757782, + "learning_rate": 3.295891247348791e-05, + "loss": 1.0956, + "step": 5056 + }, + { + "epoch": 0.794985163787852, + "grad_norm": 0.14717347919940948, + "learning_rate": 3.295305528640371e-05, + "loss": 1.1344, + "step": 5057 + }, + { + "epoch": 0.7951423686847845, + "grad_norm": 0.13811689615249634, + "learning_rate": 3.2947197613577156e-05, + "loss": 1.0311, + "step": 5058 + }, + { + "epoch": 0.7952995735817171, + "grad_norm": 0.13427940011024475, + "learning_rate": 3.294133945536601e-05, + "loss": 0.9746, + "step": 5059 + }, + { + "epoch": 0.7954567784786496, + "grad_norm": 0.14605213701725006, + "learning_rate": 3.293548081212807e-05, + "loss": 1.0583, + "step": 5060 + }, + { + "epoch": 0.7956139833755822, + "grad_norm": 0.11798227578401566, + "learning_rate": 3.292962168422114e-05, + "loss": 1.1211, + "step": 5061 + }, + { + "epoch": 0.7957711882725147, + "grad_norm": 0.1400463730096817, + "learning_rate": 3.2923762072003094e-05, + "loss": 1.1712, + "step": 5062 + }, + { + "epoch": 0.7959283931694472, + "grad_norm": 0.1203552633523941, + "learning_rate": 3.291790197583181e-05, + "loss": 1.0153, + "step": 5063 + }, + { + "epoch": 0.7960855980663798, + "grad_norm": 0.14843995869159698, + "learning_rate": 3.2912041396065205e-05, + "loss": 1.0718, + "step": 5064 + }, + { + "epoch": 0.7962428029633123, + "grad_norm": 0.12414611876010895, + "learning_rate": 3.290618033306121e-05, + "loss": 1.1268, + "step": 5065 + }, + { + "epoch": 0.7964000078602449, + "grad_norm": 0.13710583746433258, + "learning_rate": 3.2900318787177794e-05, + "loss": 1.0722, + "step": 5066 + }, + { + "epoch": 0.7965572127571774, + "grad_norm": 0.14502912759780884, + "learning_rate": 3.2894456758772976e-05, + "loss": 1.1507, + "step": 5067 + }, + { + "epoch": 0.7967144176541099, + "grad_norm": 0.14237187802791595, + "learning_rate": 3.288859424820477e-05, + "loss": 1.0388, + "step": 5068 + }, + { + "epoch": 0.7968716225510425, + "grad_norm": 0.1451392024755478, + "learning_rate": 3.288273125583124e-05, + "loss": 1.1638, + "step": 5069 + }, + { + "epoch": 0.797028827447975, + "grad_norm": 0.16580519080162048, + "learning_rate": 3.2876867782010477e-05, + "loss": 1.036, + "step": 5070 + }, + { + "epoch": 0.7971860323449076, + "grad_norm": 0.22899524867534637, + "learning_rate": 3.28710038271006e-05, + "loss": 1.0015, + "step": 5071 + }, + { + "epoch": 0.7973432372418401, + "grad_norm": 0.2474881261587143, + "learning_rate": 3.2865139391459764e-05, + "loss": 1.0732, + "step": 5072 + }, + { + "epoch": 0.7975004421387726, + "grad_norm": 0.15084420144557953, + "learning_rate": 3.2859274475446134e-05, + "loss": 1.0875, + "step": 5073 + }, + { + "epoch": 0.7976576470357052, + "grad_norm": 0.36738401651382446, + "learning_rate": 3.2853409079417915e-05, + "loss": 1.143, + "step": 5074 + }, + { + "epoch": 0.7978148519326377, + "grad_norm": 0.25699618458747864, + "learning_rate": 3.284754320373336e-05, + "loss": 1.1118, + "step": 5075 + }, + { + "epoch": 0.7979720568295703, + "grad_norm": 0.24444612860679626, + "learning_rate": 3.284167684875072e-05, + "loss": 1.047, + "step": 5076 + }, + { + "epoch": 0.7981292617265028, + "grad_norm": 0.1936510056257248, + "learning_rate": 3.283581001482829e-05, + "loss": 1.2441, + "step": 5077 + }, + { + "epoch": 0.7982864666234353, + "grad_norm": 0.23741251230239868, + "learning_rate": 3.28299427023244e-05, + "loss": 1.0017, + "step": 5078 + }, + { + "epoch": 0.7984436715203679, + "grad_norm": 0.13592703640460968, + "learning_rate": 3.28240749115974e-05, + "loss": 1.1265, + "step": 5079 + }, + { + "epoch": 0.7986008764173004, + "grad_norm": 0.16751481592655182, + "learning_rate": 3.2818206643005675e-05, + "loss": 1.0984, + "step": 5080 + }, + { + "epoch": 0.798758081314233, + "grad_norm": 0.1578437089920044, + "learning_rate": 3.2812337896907635e-05, + "loss": 1.1604, + "step": 5081 + }, + { + "epoch": 0.7989152862111655, + "grad_norm": 0.13940554857254028, + "learning_rate": 3.280646867366172e-05, + "loss": 0.9019, + "step": 5082 + }, + { + "epoch": 0.799072491108098, + "grad_norm": 0.17531779408454895, + "learning_rate": 3.28005989736264e-05, + "loss": 1.1489, + "step": 5083 + }, + { + "epoch": 0.7992296960050306, + "grad_norm": 0.15064075589179993, + "learning_rate": 3.279472879716017e-05, + "loss": 1.195, + "step": 5084 + }, + { + "epoch": 0.7993869009019631, + "grad_norm": 0.13430656492710114, + "learning_rate": 3.278885814462157e-05, + "loss": 1.106, + "step": 5085 + }, + { + "epoch": 0.7995441057988957, + "grad_norm": 0.15016251802444458, + "learning_rate": 3.278298701636914e-05, + "loss": 1.1596, + "step": 5086 + }, + { + "epoch": 0.7997013106958282, + "grad_norm": 0.13735370337963104, + "learning_rate": 3.277711541276148e-05, + "loss": 1.1242, + "step": 5087 + }, + { + "epoch": 0.7998585155927607, + "grad_norm": 0.13054972887039185, + "learning_rate": 3.277124333415721e-05, + "loss": 1.2209, + "step": 5088 + }, + { + "epoch": 0.8000157204896933, + "grad_norm": 0.13098429143428802, + "learning_rate": 3.276537078091495e-05, + "loss": 1.1635, + "step": 5089 + }, + { + "epoch": 0.8001729253866258, + "grad_norm": 0.14684736728668213, + "learning_rate": 3.275949775339339e-05, + "loss": 1.0943, + "step": 5090 + }, + { + "epoch": 0.8003301302835584, + "grad_norm": 0.12680160999298096, + "learning_rate": 3.2753624251951234e-05, + "loss": 1.1548, + "step": 5091 + }, + { + "epoch": 0.8004873351804909, + "grad_norm": 0.12368624657392502, + "learning_rate": 3.274775027694721e-05, + "loss": 1.1061, + "step": 5092 + }, + { + "epoch": 0.8006445400774234, + "grad_norm": 0.12971846759319305, + "learning_rate": 3.274187582874008e-05, + "loss": 1.0889, + "step": 5093 + }, + { + "epoch": 0.800801744974356, + "grad_norm": 0.1387680470943451, + "learning_rate": 3.2736000907688624e-05, + "loss": 1.0168, + "step": 5094 + }, + { + "epoch": 0.8009589498712885, + "grad_norm": 0.17290528118610382, + "learning_rate": 3.273012551415168e-05, + "loss": 1.0913, + "step": 5095 + }, + { + "epoch": 0.8011161547682211, + "grad_norm": 0.14590062201023102, + "learning_rate": 3.272424964848806e-05, + "loss": 1.0417, + "step": 5096 + }, + { + "epoch": 0.8012733596651536, + "grad_norm": 0.12929320335388184, + "learning_rate": 3.271837331105667e-05, + "loss": 1.2211, + "step": 5097 + }, + { + "epoch": 0.8014305645620861, + "grad_norm": 0.1567511260509491, + "learning_rate": 3.27124965022164e-05, + "loss": 1.1004, + "step": 5098 + }, + { + "epoch": 0.8015877694590187, + "grad_norm": 0.12611301243305206, + "learning_rate": 3.2706619222326194e-05, + "loss": 1.1534, + "step": 5099 + }, + { + "epoch": 0.8017449743559512, + "grad_norm": 0.12038855254650116, + "learning_rate": 3.2700741471745014e-05, + "loss": 0.9957, + "step": 5100 + }, + { + "epoch": 0.8019021792528838, + "grad_norm": 0.13091440498828888, + "learning_rate": 3.269486325083183e-05, + "loss": 1.1319, + "step": 5101 + }, + { + "epoch": 0.8020593841498163, + "grad_norm": 0.1311814785003662, + "learning_rate": 3.2688984559945686e-05, + "loss": 1.1079, + "step": 5102 + }, + { + "epoch": 0.8022165890467488, + "grad_norm": 0.14918267726898193, + "learning_rate": 3.268310539944561e-05, + "loss": 1.2066, + "step": 5103 + }, + { + "epoch": 0.8023737939436814, + "grad_norm": 0.14358769357204437, + "learning_rate": 3.2677225769690695e-05, + "loss": 1.1974, + "step": 5104 + }, + { + "epoch": 0.8025309988406139, + "grad_norm": 0.16421452164649963, + "learning_rate": 3.267134567104004e-05, + "loss": 1.1073, + "step": 5105 + }, + { + "epoch": 0.8026882037375465, + "grad_norm": 0.13539628684520721, + "learning_rate": 3.266546510385278e-05, + "loss": 1.0468, + "step": 5106 + }, + { + "epoch": 0.802845408634479, + "grad_norm": 0.12118083983659744, + "learning_rate": 3.265958406848807e-05, + "loss": 1.0258, + "step": 5107 + }, + { + "epoch": 0.8030026135314116, + "grad_norm": 0.12715213000774384, + "learning_rate": 3.2653702565305114e-05, + "loss": 1.152, + "step": 5108 + }, + { + "epoch": 0.803159818428344, + "grad_norm": 0.12841090559959412, + "learning_rate": 3.264782059466313e-05, + "loss": 1.091, + "step": 5109 + }, + { + "epoch": 0.8033170233252765, + "grad_norm": 0.14065606892108917, + "learning_rate": 3.264193815692136e-05, + "loss": 1.1107, + "step": 5110 + }, + { + "epoch": 0.8034742282222092, + "grad_norm": 0.12508265674114227, + "learning_rate": 3.2636055252439075e-05, + "loss": 1.219, + "step": 5111 + }, + { + "epoch": 0.8036314331191416, + "grad_norm": 0.14038169384002686, + "learning_rate": 3.2630171881575587e-05, + "loss": 1.1776, + "step": 5112 + }, + { + "epoch": 0.8037886380160743, + "grad_norm": 0.18364542722702026, + "learning_rate": 3.2624288044690244e-05, + "loss": 1.0645, + "step": 5113 + }, + { + "epoch": 0.8039458429130067, + "grad_norm": 0.13592836260795593, + "learning_rate": 3.261840374214239e-05, + "loss": 1.1253, + "step": 5114 + }, + { + "epoch": 0.8041030478099392, + "grad_norm": 0.16061215102672577, + "learning_rate": 3.261251897429142e-05, + "loss": 1.1096, + "step": 5115 + }, + { + "epoch": 0.8042602527068718, + "grad_norm": 0.13377389311790466, + "learning_rate": 3.260663374149675e-05, + "loss": 1.0646, + "step": 5116 + }, + { + "epoch": 0.8044174576038043, + "grad_norm": 0.13679906725883484, + "learning_rate": 3.260074804411784e-05, + "loss": 1.0069, + "step": 5117 + }, + { + "epoch": 0.804574662500737, + "grad_norm": 0.1492318958044052, + "learning_rate": 3.2594861882514156e-05, + "loss": 1.0913, + "step": 5118 + }, + { + "epoch": 0.8047318673976694, + "grad_norm": 0.13761930167675018, + "learning_rate": 3.2588975257045207e-05, + "loss": 1.0727, + "step": 5119 + }, + { + "epoch": 0.8048890722946019, + "grad_norm": 0.13349227607250214, + "learning_rate": 3.258308816807052e-05, + "loss": 1.1991, + "step": 5120 + }, + { + "epoch": 0.8048890722946019, + "eval_loss": 1.0947285890579224, + "eval_runtime": 2315.7281, + "eval_samples_per_second": 3.998, + "eval_steps_per_second": 1.999, + "step": 5120 + }, + { + "epoch": 0.8050462771915345, + "grad_norm": 0.1252872198820114, + "learning_rate": 3.2577200615949664e-05, + "loss": 1.185, + "step": 5121 + }, + { + "epoch": 0.805203482088467, + "grad_norm": 0.1414654552936554, + "learning_rate": 3.2571312601042217e-05, + "loss": 1.0538, + "step": 5122 + }, + { + "epoch": 0.8053606869853996, + "grad_norm": 0.1252065896987915, + "learning_rate": 3.256542412370781e-05, + "loss": 1.082, + "step": 5123 + }, + { + "epoch": 0.8055178918823321, + "grad_norm": 0.12409151345491409, + "learning_rate": 3.2559535184306076e-05, + "loss": 1.0973, + "step": 5124 + }, + { + "epoch": 0.8056750967792646, + "grad_norm": 0.14512157440185547, + "learning_rate": 3.25536457831967e-05, + "loss": 1.1707, + "step": 5125 + }, + { + "epoch": 0.8058323016761972, + "grad_norm": 0.12563149631023407, + "learning_rate": 3.254775592073937e-05, + "loss": 1.1245, + "step": 5126 + }, + { + "epoch": 0.8059895065731297, + "grad_norm": 0.14275604486465454, + "learning_rate": 3.254186559729384e-05, + "loss": 1.1181, + "step": 5127 + }, + { + "epoch": 0.8061467114700623, + "grad_norm": 0.13207076489925385, + "learning_rate": 3.2535974813219857e-05, + "loss": 1.1979, + "step": 5128 + }, + { + "epoch": 0.8063039163669948, + "grad_norm": 0.1410505622625351, + "learning_rate": 3.253008356887719e-05, + "loss": 1.0827, + "step": 5129 + }, + { + "epoch": 0.8064611212639273, + "grad_norm": 0.12954916059970856, + "learning_rate": 3.252419186462568e-05, + "loss": 1.0511, + "step": 5130 + }, + { + "epoch": 0.8066183261608599, + "grad_norm": 0.1363353431224823, + "learning_rate": 3.251829970082515e-05, + "loss": 1.0479, + "step": 5131 + }, + { + "epoch": 0.8067755310577924, + "grad_norm": 0.1394612044095993, + "learning_rate": 3.251240707783548e-05, + "loss": 1.1188, + "step": 5132 + }, + { + "epoch": 0.806932735954725, + "grad_norm": 0.1582544445991516, + "learning_rate": 3.2506513996016576e-05, + "loss": 1.0661, + "step": 5133 + }, + { + "epoch": 0.8070899408516575, + "grad_norm": 0.1461925208568573, + "learning_rate": 3.250062045572835e-05, + "loss": 1.1376, + "step": 5134 + }, + { + "epoch": 0.80724714574859, + "grad_norm": 0.13171181082725525, + "learning_rate": 3.249472645733076e-05, + "loss": 1.1843, + "step": 5135 + }, + { + "epoch": 0.8074043506455226, + "grad_norm": 0.12942472100257874, + "learning_rate": 3.24888320011838e-05, + "loss": 1.1095, + "step": 5136 + }, + { + "epoch": 0.8075615555424551, + "grad_norm": 0.1437719464302063, + "learning_rate": 3.248293708764748e-05, + "loss": 1.1822, + "step": 5137 + }, + { + "epoch": 0.8077187604393877, + "grad_norm": 0.12577079236507416, + "learning_rate": 3.247704171708183e-05, + "loss": 1.1092, + "step": 5138 + }, + { + "epoch": 0.8078759653363202, + "grad_norm": 0.15389268100261688, + "learning_rate": 3.247114588984692e-05, + "loss": 1.078, + "step": 5139 + }, + { + "epoch": 0.8080331702332527, + "grad_norm": 0.12741175293922424, + "learning_rate": 3.246524960630284e-05, + "loss": 1.1281, + "step": 5140 + }, + { + "epoch": 0.8081903751301853, + "grad_norm": 0.13801594078540802, + "learning_rate": 3.245935286680972e-05, + "loss": 0.9421, + "step": 5141 + }, + { + "epoch": 0.8083475800271178, + "grad_norm": 0.1279487907886505, + "learning_rate": 3.245345567172771e-05, + "loss": 1.1482, + "step": 5142 + }, + { + "epoch": 0.8085047849240504, + "grad_norm": 0.13442997634410858, + "learning_rate": 3.244755802141699e-05, + "loss": 1.1681, + "step": 5143 + }, + { + "epoch": 0.8086619898209829, + "grad_norm": 0.13333237171173096, + "learning_rate": 3.2441659916237754e-05, + "loss": 1.0835, + "step": 5144 + }, + { + "epoch": 0.8088191947179154, + "grad_norm": 0.12792964279651642, + "learning_rate": 3.2435761356550244e-05, + "loss": 1.0747, + "step": 5145 + }, + { + "epoch": 0.808976399614848, + "grad_norm": 0.1448972523212433, + "learning_rate": 3.242986234271473e-05, + "loss": 1.1094, + "step": 5146 + }, + { + "epoch": 0.8091336045117805, + "grad_norm": 0.12385497987270355, + "learning_rate": 3.2423962875091486e-05, + "loss": 1.0085, + "step": 5147 + }, + { + "epoch": 0.8092908094087131, + "grad_norm": 0.13567715883255005, + "learning_rate": 3.241806295404084e-05, + "loss": 1.0447, + "step": 5148 + }, + { + "epoch": 0.8094480143056456, + "grad_norm": 0.1407817155122757, + "learning_rate": 3.241216257992313e-05, + "loss": 1.1313, + "step": 5149 + }, + { + "epoch": 0.8096052192025781, + "grad_norm": 0.16297848522663116, + "learning_rate": 3.240626175309873e-05, + "loss": 1.1056, + "step": 5150 + }, + { + "epoch": 0.8097624240995107, + "grad_norm": 0.14074040949344635, + "learning_rate": 3.2400360473928046e-05, + "loss": 1.082, + "step": 5151 + }, + { + "epoch": 0.8099196289964432, + "grad_norm": 0.14632785320281982, + "learning_rate": 3.2394458742771494e-05, + "loss": 1.0199, + "step": 5152 + }, + { + "epoch": 0.8100768338933758, + "grad_norm": 0.1421334147453308, + "learning_rate": 3.238855655998954e-05, + "loss": 1.1271, + "step": 5153 + }, + { + "epoch": 0.8102340387903083, + "grad_norm": 0.1565493792295456, + "learning_rate": 3.238265392594266e-05, + "loss": 1.0984, + "step": 5154 + }, + { + "epoch": 0.8103912436872408, + "grad_norm": 0.14498156309127808, + "learning_rate": 3.237675084099137e-05, + "loss": 1.186, + "step": 5155 + }, + { + "epoch": 0.8105484485841734, + "grad_norm": 0.13381299376487732, + "learning_rate": 3.23708473054962e-05, + "loss": 1.0743, + "step": 5156 + }, + { + "epoch": 0.8107056534811059, + "grad_norm": 0.14723144471645355, + "learning_rate": 3.236494331981773e-05, + "loss": 1.0761, + "step": 5157 + }, + { + "epoch": 0.8108628583780385, + "grad_norm": 0.12565001845359802, + "learning_rate": 3.235903888431654e-05, + "loss": 1.0359, + "step": 5158 + }, + { + "epoch": 0.811020063274971, + "grad_norm": 0.14763836562633514, + "learning_rate": 3.235313399935326e-05, + "loss": 1.1093, + "step": 5159 + }, + { + "epoch": 0.8111772681719036, + "grad_norm": 0.18552212417125702, + "learning_rate": 3.234722866528852e-05, + "loss": 1.0349, + "step": 5160 + }, + { + "epoch": 0.8113344730688361, + "grad_norm": 0.14310771226882935, + "learning_rate": 3.234132288248302e-05, + "loss": 1.0869, + "step": 5161 + }, + { + "epoch": 0.8114916779657686, + "grad_norm": 0.1727200150489807, + "learning_rate": 3.2335416651297436e-05, + "loss": 1.1094, + "step": 5162 + }, + { + "epoch": 0.8116488828627012, + "grad_norm": 0.1451343148946762, + "learning_rate": 3.2329509972092524e-05, + "loss": 1.1691, + "step": 5163 + }, + { + "epoch": 0.8118060877596337, + "grad_norm": 0.13956843316555023, + "learning_rate": 3.232360284522903e-05, + "loss": 0.9013, + "step": 5164 + }, + { + "epoch": 0.8119632926565663, + "grad_norm": 0.16323736310005188, + "learning_rate": 3.2317695271067725e-05, + "loss": 1.1315, + "step": 5165 + }, + { + "epoch": 0.8121204975534988, + "grad_norm": 0.2190149426460266, + "learning_rate": 3.231178724996945e-05, + "loss": 1.2037, + "step": 5166 + }, + { + "epoch": 0.8122777024504313, + "grad_norm": 0.14855903387069702, + "learning_rate": 3.230587878229502e-05, + "loss": 1.1752, + "step": 5167 + }, + { + "epoch": 0.8124349073473639, + "grad_norm": 0.13913439214229584, + "learning_rate": 3.2299969868405324e-05, + "loss": 1.1198, + "step": 5168 + }, + { + "epoch": 0.8125921122442964, + "grad_norm": 0.13375224173069, + "learning_rate": 3.2294060508661225e-05, + "loss": 1.0448, + "step": 5169 + }, + { + "epoch": 0.812749317141229, + "grad_norm": 0.16045208275318146, + "learning_rate": 3.228815070342368e-05, + "loss": 1.1786, + "step": 5170 + }, + { + "epoch": 0.8129065220381615, + "grad_norm": 0.1577804535627365, + "learning_rate": 3.22822404530536e-05, + "loss": 1.1587, + "step": 5171 + }, + { + "epoch": 0.813063726935094, + "grad_norm": 0.15006138384342194, + "learning_rate": 3.2276329757912e-05, + "loss": 1.0087, + "step": 5172 + }, + { + "epoch": 0.8132209318320266, + "grad_norm": 0.13959920406341553, + "learning_rate": 3.227041861835985e-05, + "loss": 1.0801, + "step": 5173 + }, + { + "epoch": 0.8133781367289591, + "grad_norm": 0.130475252866745, + "learning_rate": 3.2264507034758195e-05, + "loss": 1.1616, + "step": 5174 + }, + { + "epoch": 0.8135353416258917, + "grad_norm": 0.12751032412052155, + "learning_rate": 3.2258595007468096e-05, + "loss": 1.1243, + "step": 5175 + }, + { + "epoch": 0.8136925465228242, + "grad_norm": 0.13321606814861298, + "learning_rate": 3.225268253685062e-05, + "loss": 1.1581, + "step": 5176 + }, + { + "epoch": 0.8138497514197567, + "grad_norm": 0.14121800661087036, + "learning_rate": 3.224676962326691e-05, + "loss": 1.1686, + "step": 5177 + }, + { + "epoch": 0.8140069563166893, + "grad_norm": 0.13203032314777374, + "learning_rate": 3.2240856267078065e-05, + "loss": 1.0636, + "step": 5178 + }, + { + "epoch": 0.8141641612136218, + "grad_norm": 0.15097764134407043, + "learning_rate": 3.223494246864527e-05, + "loss": 1.0259, + "step": 5179 + }, + { + "epoch": 0.8143213661105544, + "grad_norm": 0.13323961198329926, + "learning_rate": 3.2229028228329714e-05, + "loss": 1.1322, + "step": 5180 + }, + { + "epoch": 0.8144785710074869, + "grad_norm": 0.15312063694000244, + "learning_rate": 3.222311354649263e-05, + "loss": 0.9622, + "step": 5181 + }, + { + "epoch": 0.8146357759044194, + "grad_norm": 0.13417083024978638, + "learning_rate": 3.2217198423495245e-05, + "loss": 1.0706, + "step": 5182 + }, + { + "epoch": 0.814792980801352, + "grad_norm": 0.13406860828399658, + "learning_rate": 3.2211282859698846e-05, + "loss": 1.0491, + "step": 5183 + }, + { + "epoch": 0.8149501856982845, + "grad_norm": 0.15189844369888306, + "learning_rate": 3.220536685546472e-05, + "loss": 1.2357, + "step": 5184 + }, + { + "epoch": 0.8151073905952171, + "grad_norm": 0.1376749873161316, + "learning_rate": 3.21994504111542e-05, + "loss": 1.1067, + "step": 5185 + }, + { + "epoch": 0.8152645954921496, + "grad_norm": 0.15771864354610443, + "learning_rate": 3.2193533527128654e-05, + "loss": 1.1533, + "step": 5186 + }, + { + "epoch": 0.8154218003890821, + "grad_norm": 0.1384822279214859, + "learning_rate": 3.218761620374944e-05, + "loss": 1.2118, + "step": 5187 + }, + { + "epoch": 0.8155790052860147, + "grad_norm": 0.16346484422683716, + "learning_rate": 3.218169844137797e-05, + "loss": 1.0801, + "step": 5188 + }, + { + "epoch": 0.8157362101829472, + "grad_norm": 0.1343608945608139, + "learning_rate": 3.2175780240375695e-05, + "loss": 1.0816, + "step": 5189 + }, + { + "epoch": 0.8158934150798798, + "grad_norm": 0.14006014168262482, + "learning_rate": 3.216986160110406e-05, + "loss": 0.9928, + "step": 5190 + }, + { + "epoch": 0.8160506199768123, + "grad_norm": 0.1414293646812439, + "learning_rate": 3.216394252392456e-05, + "loss": 1.2733, + "step": 5191 + }, + { + "epoch": 0.8162078248737448, + "grad_norm": 0.11960668116807938, + "learning_rate": 3.2158023009198706e-05, + "loss": 1.0705, + "step": 5192 + }, + { + "epoch": 0.8163650297706774, + "grad_norm": 0.13784858584403992, + "learning_rate": 3.2152103057288045e-05, + "loss": 1.0802, + "step": 5193 + }, + { + "epoch": 0.8165222346676099, + "grad_norm": 0.15226618945598602, + "learning_rate": 3.214618266855413e-05, + "loss": 1.1768, + "step": 5194 + }, + { + "epoch": 0.8166794395645425, + "grad_norm": 0.1458420753479004, + "learning_rate": 3.2140261843358574e-05, + "loss": 1.0452, + "step": 5195 + }, + { + "epoch": 0.816836644461475, + "grad_norm": 0.12530644237995148, + "learning_rate": 3.2134340582062994e-05, + "loss": 0.9575, + "step": 5196 + }, + { + "epoch": 0.8169938493584075, + "grad_norm": 0.13370977342128754, + "learning_rate": 3.2128418885029034e-05, + "loss": 1.1113, + "step": 5197 + }, + { + "epoch": 0.8171510542553401, + "grad_norm": 0.13998764753341675, + "learning_rate": 3.212249675261838e-05, + "loss": 1.152, + "step": 5198 + }, + { + "epoch": 0.8173082591522726, + "grad_norm": 0.16114076972007751, + "learning_rate": 3.21165741851927e-05, + "loss": 1.0632, + "step": 5199 + }, + { + "epoch": 0.8174654640492052, + "grad_norm": 0.14692504703998566, + "learning_rate": 3.211065118311377e-05, + "loss": 1.1338, + "step": 5200 + }, + { + "epoch": 0.8176226689461377, + "grad_norm": 0.17466497421264648, + "learning_rate": 3.21047277467433e-05, + "loss": 1.0663, + "step": 5201 + }, + { + "epoch": 0.8177798738430702, + "grad_norm": 0.1547803282737732, + "learning_rate": 3.2098803876443103e-05, + "loss": 1.048, + "step": 5202 + }, + { + "epoch": 0.8179370787400028, + "grad_norm": 0.14291557669639587, + "learning_rate": 3.2092879572574975e-05, + "loss": 1.0921, + "step": 5203 + }, + { + "epoch": 0.8180942836369353, + "grad_norm": 0.13960640132427216, + "learning_rate": 3.2086954835500736e-05, + "loss": 1.1639, + "step": 5204 + }, + { + "epoch": 0.8182514885338679, + "grad_norm": 0.17912665009498596, + "learning_rate": 3.2081029665582274e-05, + "loss": 1.0639, + "step": 5205 + }, + { + "epoch": 0.8184086934308004, + "grad_norm": 0.14091219007968903, + "learning_rate": 3.207510406318146e-05, + "loss": 1.1496, + "step": 5206 + }, + { + "epoch": 0.8185658983277329, + "grad_norm": 0.14809809625148773, + "learning_rate": 3.206917802866021e-05, + "loss": 1.1601, + "step": 5207 + }, + { + "epoch": 0.8187231032246655, + "grad_norm": 0.1427210122346878, + "learning_rate": 3.206325156238045e-05, + "loss": 1.1745, + "step": 5208 + }, + { + "epoch": 0.818880308121598, + "grad_norm": 0.1384812444448471, + "learning_rate": 3.205732466470417e-05, + "loss": 1.1849, + "step": 5209 + }, + { + "epoch": 0.8190375130185306, + "grad_norm": 0.14473986625671387, + "learning_rate": 3.2051397335993347e-05, + "loss": 1.0299, + "step": 5210 + }, + { + "epoch": 0.819194717915463, + "grad_norm": 0.1633637249469757, + "learning_rate": 3.204546957661001e-05, + "loss": 1.0674, + "step": 5211 + }, + { + "epoch": 0.8193519228123957, + "grad_norm": 0.1735648661851883, + "learning_rate": 3.203954138691619e-05, + "loss": 1.1624, + "step": 5212 + }, + { + "epoch": 0.8195091277093282, + "grad_norm": 0.13555194437503815, + "learning_rate": 3.203361276727397e-05, + "loss": 1.1849, + "step": 5213 + }, + { + "epoch": 0.8196663326062607, + "grad_norm": 0.14566943049430847, + "learning_rate": 3.202768371804544e-05, + "loss": 1.0632, + "step": 5214 + }, + { + "epoch": 0.8198235375031933, + "grad_norm": 0.15393325686454773, + "learning_rate": 3.202175423959272e-05, + "loss": 1.0622, + "step": 5215 + }, + { + "epoch": 0.8199807424001258, + "grad_norm": 0.1464414745569229, + "learning_rate": 3.201582433227798e-05, + "loss": 1.1167, + "step": 5216 + }, + { + "epoch": 0.8201379472970584, + "grad_norm": 0.14175230264663696, + "learning_rate": 3.2009893996463384e-05, + "loss": 1.0924, + "step": 5217 + }, + { + "epoch": 0.8202951521939909, + "grad_norm": 0.1299310177564621, + "learning_rate": 3.200396323251112e-05, + "loss": 1.0429, + "step": 5218 + }, + { + "epoch": 0.8204523570909233, + "grad_norm": 0.15049515664577484, + "learning_rate": 3.199803204078344e-05, + "loss": 1.1402, + "step": 5219 + }, + { + "epoch": 0.820609561987856, + "grad_norm": 0.13316011428833008, + "learning_rate": 3.199210042164259e-05, + "loss": 1.1151, + "step": 5220 + }, + { + "epoch": 0.8207667668847884, + "grad_norm": 0.13929545879364014, + "learning_rate": 3.1986168375450845e-05, + "loss": 1.0497, + "step": 5221 + }, + { + "epoch": 0.820923971781721, + "grad_norm": 0.14065782725811005, + "learning_rate": 3.198023590257052e-05, + "loss": 1.1291, + "step": 5222 + }, + { + "epoch": 0.8210811766786535, + "grad_norm": 0.13490915298461914, + "learning_rate": 3.197430300336394e-05, + "loss": 1.1718, + "step": 5223 + }, + { + "epoch": 0.821238381575586, + "grad_norm": 0.1275462955236435, + "learning_rate": 3.196836967819347e-05, + "loss": 1.1754, + "step": 5224 + }, + { + "epoch": 0.8213955864725186, + "grad_norm": 0.1298583745956421, + "learning_rate": 3.196243592742148e-05, + "loss": 0.9097, + "step": 5225 + }, + { + "epoch": 0.8215527913694511, + "grad_norm": 0.13181264698505402, + "learning_rate": 3.1956501751410416e-05, + "loss": 1.2361, + "step": 5226 + }, + { + "epoch": 0.8217099962663837, + "grad_norm": 0.12378641217947006, + "learning_rate": 3.195056715052268e-05, + "loss": 0.9561, + "step": 5227 + }, + { + "epoch": 0.8218672011633162, + "grad_norm": 0.13846050202846527, + "learning_rate": 3.194463212512075e-05, + "loss": 1.1274, + "step": 5228 + }, + { + "epoch": 0.8220244060602487, + "grad_norm": 0.13270175457000732, + "learning_rate": 3.1938696675567114e-05, + "loss": 1.1153, + "step": 5229 + }, + { + "epoch": 0.8221816109571813, + "grad_norm": 0.13590794801712036, + "learning_rate": 3.1932760802224285e-05, + "loss": 1.1178, + "step": 5230 + }, + { + "epoch": 0.8223388158541138, + "grad_norm": 0.13039462268352509, + "learning_rate": 3.192682450545481e-05, + "loss": 1.1305, + "step": 5231 + }, + { + "epoch": 0.8224960207510464, + "grad_norm": 0.13966499269008636, + "learning_rate": 3.1920887785621235e-05, + "loss": 1.1426, + "step": 5232 + }, + { + "epoch": 0.8226532256479789, + "grad_norm": 0.1593773514032364, + "learning_rate": 3.191495064308618e-05, + "loss": 1.0802, + "step": 5233 + }, + { + "epoch": 0.8228104305449114, + "grad_norm": 0.14048096537590027, + "learning_rate": 3.1909013078212235e-05, + "loss": 1.0785, + "step": 5234 + }, + { + "epoch": 0.822967635441844, + "grad_norm": 0.1394653022289276, + "learning_rate": 3.190307509136207e-05, + "loss": 1.0112, + "step": 5235 + }, + { + "epoch": 0.8231248403387765, + "grad_norm": 0.15474697947502136, + "learning_rate": 3.189713668289834e-05, + "loss": 1.1056, + "step": 5236 + }, + { + "epoch": 0.8232820452357091, + "grad_norm": 0.15830832719802856, + "learning_rate": 3.1891197853183744e-05, + "loss": 1.2166, + "step": 5237 + }, + { + "epoch": 0.8234392501326416, + "grad_norm": 0.1479511260986328, + "learning_rate": 3.1885258602581e-05, + "loss": 0.9892, + "step": 5238 + }, + { + "epoch": 0.8235964550295741, + "grad_norm": 0.1471664309501648, + "learning_rate": 3.187931893145285e-05, + "loss": 1.0284, + "step": 5239 + }, + { + "epoch": 0.8237536599265067, + "grad_norm": 0.15496090054512024, + "learning_rate": 3.1873378840162086e-05, + "loss": 1.0534, + "step": 5240 + }, + { + "epoch": 0.8239108648234392, + "grad_norm": 0.1371716558933258, + "learning_rate": 3.186743832907149e-05, + "loss": 1.1141, + "step": 5241 + }, + { + "epoch": 0.8240680697203718, + "grad_norm": 0.1419869214296341, + "learning_rate": 3.186149739854389e-05, + "loss": 1.0923, + "step": 5242 + }, + { + "epoch": 0.8242252746173043, + "grad_norm": 0.13275684416294098, + "learning_rate": 3.185555604894214e-05, + "loss": 1.2053, + "step": 5243 + }, + { + "epoch": 0.8243824795142368, + "grad_norm": 0.14531686902046204, + "learning_rate": 3.1849614280629096e-05, + "loss": 1.0359, + "step": 5244 + }, + { + "epoch": 0.8245396844111694, + "grad_norm": 0.1412031352519989, + "learning_rate": 3.1843672093967685e-05, + "loss": 1.0295, + "step": 5245 + }, + { + "epoch": 0.8246968893081019, + "grad_norm": 0.13067127764225006, + "learning_rate": 3.183772948932082e-05, + "loss": 1.0293, + "step": 5246 + }, + { + "epoch": 0.8248540942050345, + "grad_norm": 0.1574161797761917, + "learning_rate": 3.183178646705146e-05, + "loss": 1.1704, + "step": 5247 + }, + { + "epoch": 0.825011299101967, + "grad_norm": 0.14514204859733582, + "learning_rate": 3.1825843027522554e-05, + "loss": 1.0959, + "step": 5248 + }, + { + "epoch": 0.8251685039988995, + "grad_norm": 0.1517404168844223, + "learning_rate": 3.1819899171097146e-05, + "loss": 1.035, + "step": 5249 + }, + { + "epoch": 0.8253257088958321, + "grad_norm": 0.13299009203910828, + "learning_rate": 3.181395489813824e-05, + "loss": 1.0324, + "step": 5250 + }, + { + "epoch": 0.8254829137927646, + "grad_norm": 0.14681966602802277, + "learning_rate": 3.180801020900889e-05, + "loss": 1.1329, + "step": 5251 + }, + { + "epoch": 0.8256401186896972, + "grad_norm": 0.11702141910791397, + "learning_rate": 3.180206510407218e-05, + "loss": 1.0655, + "step": 5252 + }, + { + "epoch": 0.8257973235866297, + "grad_norm": 0.12597863376140594, + "learning_rate": 3.1796119583691214e-05, + "loss": 1.1097, + "step": 5253 + }, + { + "epoch": 0.8259545284835622, + "grad_norm": 0.13226962089538574, + "learning_rate": 3.179017364822913e-05, + "loss": 1.0156, + "step": 5254 + }, + { + "epoch": 0.8261117333804948, + "grad_norm": 0.13883601129055023, + "learning_rate": 3.178422729804906e-05, + "loss": 1.1682, + "step": 5255 + }, + { + "epoch": 0.8262689382774273, + "grad_norm": 0.12312841415405273, + "learning_rate": 3.177828053351421e-05, + "loss": 1.1061, + "step": 5256 + }, + { + "epoch": 0.8264261431743599, + "grad_norm": 0.1382024735212326, + "learning_rate": 3.177233335498777e-05, + "loss": 1.062, + "step": 5257 + }, + { + "epoch": 0.8265833480712924, + "grad_norm": 0.138153076171875, + "learning_rate": 3.176638576283298e-05, + "loss": 1.1267, + "step": 5258 + }, + { + "epoch": 0.8267405529682249, + "grad_norm": 0.15296030044555664, + "learning_rate": 3.176043775741308e-05, + "loss": 1.1957, + "step": 5259 + }, + { + "epoch": 0.8268977578651575, + "grad_norm": 0.1348961442708969, + "learning_rate": 3.175448933909138e-05, + "loss": 1.1818, + "step": 5260 + }, + { + "epoch": 0.82705496276209, + "grad_norm": 0.14340707659721375, + "learning_rate": 3.1748540508231165e-05, + "loss": 1.1147, + "step": 5261 + }, + { + "epoch": 0.8272121676590226, + "grad_norm": 0.14010478556156158, + "learning_rate": 3.174259126519576e-05, + "loss": 1.0012, + "step": 5262 + }, + { + "epoch": 0.8273693725559551, + "grad_norm": 0.13827413320541382, + "learning_rate": 3.173664161034855e-05, + "loss": 1.0526, + "step": 5263 + }, + { + "epoch": 0.8275265774528877, + "grad_norm": 0.1358298659324646, + "learning_rate": 3.1730691544052894e-05, + "loss": 1.1188, + "step": 5264 + }, + { + "epoch": 0.8276837823498202, + "grad_norm": 0.13605011999607086, + "learning_rate": 3.172474106667221e-05, + "loss": 1.1694, + "step": 5265 + }, + { + "epoch": 0.8278409872467527, + "grad_norm": 0.13801071047782898, + "learning_rate": 3.171879017856993e-05, + "loss": 1.1848, + "step": 5266 + }, + { + "epoch": 0.8279981921436853, + "grad_norm": 0.15380828082561493, + "learning_rate": 3.1712838880109506e-05, + "loss": 1.087, + "step": 5267 + }, + { + "epoch": 0.8281553970406178, + "grad_norm": 0.13930104672908783, + "learning_rate": 3.170688717165442e-05, + "loss": 1.0753, + "step": 5268 + }, + { + "epoch": 0.8283126019375504, + "grad_norm": 0.14911097288131714, + "learning_rate": 3.170093505356819e-05, + "loss": 1.0268, + "step": 5269 + }, + { + "epoch": 0.8284698068344829, + "grad_norm": 0.14786234498023987, + "learning_rate": 3.169498252621434e-05, + "loss": 1.1067, + "step": 5270 + }, + { + "epoch": 0.8286270117314154, + "grad_norm": 0.12906505167484283, + "learning_rate": 3.168902958995643e-05, + "loss": 1.0424, + "step": 5271 + }, + { + "epoch": 0.828784216628348, + "grad_norm": 0.144206240773201, + "learning_rate": 3.1683076245158036e-05, + "loss": 1.2012, + "step": 5272 + }, + { + "epoch": 0.8289414215252805, + "grad_norm": 0.1476116180419922, + "learning_rate": 3.167712249218278e-05, + "loss": 1.0983, + "step": 5273 + }, + { + "epoch": 0.8290986264222131, + "grad_norm": 0.13781993091106415, + "learning_rate": 3.167116833139428e-05, + "loss": 1.0777, + "step": 5274 + }, + { + "epoch": 0.8292558313191456, + "grad_norm": 0.1404123455286026, + "learning_rate": 3.166521376315621e-05, + "loss": 1.0654, + "step": 5275 + }, + { + "epoch": 0.8294130362160781, + "grad_norm": 0.15808063745498657, + "learning_rate": 3.1659258787832245e-05, + "loss": 1.093, + "step": 5276 + }, + { + "epoch": 0.8295702411130107, + "grad_norm": 0.14129871129989624, + "learning_rate": 3.165330340578608e-05, + "loss": 1.0843, + "step": 5277 + }, + { + "epoch": 0.8297274460099432, + "grad_norm": 0.1428525745868683, + "learning_rate": 3.1647347617381464e-05, + "loss": 1.067, + "step": 5278 + }, + { + "epoch": 0.8298846509068758, + "grad_norm": 0.12404271215200424, + "learning_rate": 3.164139142298214e-05, + "loss": 1.048, + "step": 5279 + }, + { + "epoch": 0.8300418558038083, + "grad_norm": 0.12726539373397827, + "learning_rate": 3.16354348229519e-05, + "loss": 1.0153, + "step": 5280 + }, + { + "epoch": 0.8300418558038083, + "eval_loss": 1.0945003032684326, + "eval_runtime": 2386.806, + "eval_samples_per_second": 3.879, + "eval_steps_per_second": 1.939, + "step": 5280 + }, + { + "epoch": 0.8301990607007408, + "grad_norm": 0.13088205456733704, + "learning_rate": 3.1629477817654554e-05, + "loss": 1.1087, + "step": 5281 + }, + { + "epoch": 0.8303562655976734, + "grad_norm": 0.12422250211238861, + "learning_rate": 3.1623520407453925e-05, + "loss": 1.0774, + "step": 5282 + }, + { + "epoch": 0.8305134704946059, + "grad_norm": 0.14573022723197937, + "learning_rate": 3.1617562592713856e-05, + "loss": 1.1796, + "step": 5283 + }, + { + "epoch": 0.8306706753915385, + "grad_norm": 0.13225945830345154, + "learning_rate": 3.161160437379826e-05, + "loss": 1.0677, + "step": 5284 + }, + { + "epoch": 0.830827880288471, + "grad_norm": 0.132116436958313, + "learning_rate": 3.160564575107102e-05, + "loss": 1.1573, + "step": 5285 + }, + { + "epoch": 0.8309850851854035, + "grad_norm": 0.17886874079704285, + "learning_rate": 3.159968672489606e-05, + "loss": 0.948, + "step": 5286 + }, + { + "epoch": 0.8311422900823361, + "grad_norm": 0.14379629492759705, + "learning_rate": 3.159372729563736e-05, + "loss": 1.1689, + "step": 5287 + }, + { + "epoch": 0.8312994949792686, + "grad_norm": 0.12328069657087326, + "learning_rate": 3.158776746365887e-05, + "loss": 0.9509, + "step": 5288 + }, + { + "epoch": 0.8314566998762012, + "grad_norm": 0.1336534470319748, + "learning_rate": 3.158180722932461e-05, + "loss": 1.1863, + "step": 5289 + }, + { + "epoch": 0.8316139047731337, + "grad_norm": 0.1478208303451538, + "learning_rate": 3.1575846592998614e-05, + "loss": 1.1907, + "step": 5290 + }, + { + "epoch": 0.8317711096700662, + "grad_norm": 0.12341664731502533, + "learning_rate": 3.156988555504492e-05, + "loss": 1.125, + "step": 5291 + }, + { + "epoch": 0.8319283145669988, + "grad_norm": 0.13853976130485535, + "learning_rate": 3.1563924115827626e-05, + "loss": 1.1494, + "step": 5292 + }, + { + "epoch": 0.8320855194639313, + "grad_norm": 0.13517549633979797, + "learning_rate": 3.1557962275710804e-05, + "loss": 1.0341, + "step": 5293 + }, + { + "epoch": 0.8322427243608639, + "grad_norm": 0.12771376967430115, + "learning_rate": 3.155200003505861e-05, + "loss": 1.0591, + "step": 5294 + }, + { + "epoch": 0.8323999292577964, + "grad_norm": 0.14621414244174957, + "learning_rate": 3.1546037394235175e-05, + "loss": 1.1466, + "step": 5295 + }, + { + "epoch": 0.8325571341547289, + "grad_norm": 0.1306588351726532, + "learning_rate": 3.1540074353604694e-05, + "loss": 1.1094, + "step": 5296 + }, + { + "epoch": 0.8327143390516615, + "grad_norm": 0.14737164974212646, + "learning_rate": 3.153411091353134e-05, + "loss": 1.146, + "step": 5297 + }, + { + "epoch": 0.832871543948594, + "grad_norm": 0.12756302952766418, + "learning_rate": 3.152814707437937e-05, + "loss": 1.0389, + "step": 5298 + }, + { + "epoch": 0.8330287488455266, + "grad_norm": 0.18175055086612701, + "learning_rate": 3.152218283651299e-05, + "loss": 0.9774, + "step": 5299 + }, + { + "epoch": 0.8331859537424591, + "grad_norm": 0.1390320062637329, + "learning_rate": 3.1516218200296516e-05, + "loss": 1.1942, + "step": 5300 + }, + { + "epoch": 0.8333431586393916, + "grad_norm": 0.13662287592887878, + "learning_rate": 3.151025316609423e-05, + "loss": 1.2145, + "step": 5301 + }, + { + "epoch": 0.8335003635363242, + "grad_norm": 0.13008885085582733, + "learning_rate": 3.150428773427043e-05, + "loss": 1.1021, + "step": 5302 + }, + { + "epoch": 0.8336575684332567, + "grad_norm": 0.15316307544708252, + "learning_rate": 3.14983219051895e-05, + "loss": 1.0765, + "step": 5303 + }, + { + "epoch": 0.8338147733301893, + "grad_norm": 0.12470224499702454, + "learning_rate": 3.1492355679215785e-05, + "loss": 1.0923, + "step": 5304 + }, + { + "epoch": 0.8339719782271218, + "grad_norm": 0.1416996568441391, + "learning_rate": 3.148638905671369e-05, + "loss": 1.1026, + "step": 5305 + }, + { + "epoch": 0.8341291831240543, + "grad_norm": 0.12227781116962433, + "learning_rate": 3.1480422038047634e-05, + "loss": 1.06, + "step": 5306 + }, + { + "epoch": 0.8342863880209869, + "grad_norm": 0.14249075949192047, + "learning_rate": 3.147445462358204e-05, + "loss": 1.1334, + "step": 5307 + }, + { + "epoch": 0.8344435929179194, + "grad_norm": 0.13149915635585785, + "learning_rate": 3.146848681368141e-05, + "loss": 1.1708, + "step": 5308 + }, + { + "epoch": 0.834600797814852, + "grad_norm": 0.131791353225708, + "learning_rate": 3.1462518608710214e-05, + "loss": 1.1299, + "step": 5309 + }, + { + "epoch": 0.8347580027117845, + "grad_norm": 0.15127769112586975, + "learning_rate": 3.145655000903297e-05, + "loss": 1.0058, + "step": 5310 + }, + { + "epoch": 0.834915207608717, + "grad_norm": 0.1310853809118271, + "learning_rate": 3.145058101501421e-05, + "loss": 1.1463, + "step": 5311 + }, + { + "epoch": 0.8350724125056496, + "grad_norm": 0.11817453801631927, + "learning_rate": 3.144461162701851e-05, + "loss": 1.0382, + "step": 5312 + }, + { + "epoch": 0.8352296174025821, + "grad_norm": 0.13717181980609894, + "learning_rate": 3.1438641845410445e-05, + "loss": 1.103, + "step": 5313 + }, + { + "epoch": 0.8353868222995147, + "grad_norm": 0.12618355453014374, + "learning_rate": 3.143267167055464e-05, + "loss": 1.1123, + "step": 5314 + }, + { + "epoch": 0.8355440271964472, + "grad_norm": 0.14461813867092133, + "learning_rate": 3.1426701102815717e-05, + "loss": 1.1436, + "step": 5315 + }, + { + "epoch": 0.8357012320933797, + "grad_norm": 0.12690824270248413, + "learning_rate": 3.1420730142558346e-05, + "loss": 1.1202, + "step": 5316 + }, + { + "epoch": 0.8358584369903123, + "grad_norm": 0.13442037999629974, + "learning_rate": 3.1414758790147205e-05, + "loss": 1.096, + "step": 5317 + }, + { + "epoch": 0.8360156418872448, + "grad_norm": 0.13896392285823822, + "learning_rate": 3.1408787045947005e-05, + "loss": 1.1199, + "step": 5318 + }, + { + "epoch": 0.8361728467841774, + "grad_norm": 0.12997986376285553, + "learning_rate": 3.1402814910322485e-05, + "loss": 1.0643, + "step": 5319 + }, + { + "epoch": 0.8363300516811099, + "grad_norm": 0.15869256854057312, + "learning_rate": 3.1396842383638385e-05, + "loss": 1.0611, + "step": 5320 + }, + { + "epoch": 0.8364872565780425, + "grad_norm": 0.14344894886016846, + "learning_rate": 3.1390869466259484e-05, + "loss": 1.1521, + "step": 5321 + }, + { + "epoch": 0.836644461474975, + "grad_norm": 0.13265429437160492, + "learning_rate": 3.13848961585506e-05, + "loss": 1.1979, + "step": 5322 + }, + { + "epoch": 0.8368016663719074, + "grad_norm": 0.14370469748973846, + "learning_rate": 3.137892246087655e-05, + "loss": 1.0703, + "step": 5323 + }, + { + "epoch": 0.83695887126884, + "grad_norm": 0.13036467134952545, + "learning_rate": 3.137294837360219e-05, + "loss": 1.0914, + "step": 5324 + }, + { + "epoch": 0.8371160761657725, + "grad_norm": 0.14057908952236176, + "learning_rate": 3.136697389709239e-05, + "loss": 1.2288, + "step": 5325 + }, + { + "epoch": 0.8372732810627052, + "grad_norm": 0.13568785786628723, + "learning_rate": 3.1360999031712043e-05, + "loss": 1.1753, + "step": 5326 + }, + { + "epoch": 0.8374304859596376, + "grad_norm": 0.13155247271060944, + "learning_rate": 3.1355023777826076e-05, + "loss": 1.1589, + "step": 5327 + }, + { + "epoch": 0.8375876908565701, + "grad_norm": 0.12807098031044006, + "learning_rate": 3.1349048135799445e-05, + "loss": 0.8532, + "step": 5328 + }, + { + "epoch": 0.8377448957535027, + "grad_norm": 0.13014431297779083, + "learning_rate": 3.13430721059971e-05, + "loss": 1.0457, + "step": 5329 + }, + { + "epoch": 0.8379021006504352, + "grad_norm": 0.13080735504627228, + "learning_rate": 3.133709568878405e-05, + "loss": 1.1771, + "step": 5330 + }, + { + "epoch": 0.8380593055473678, + "grad_norm": 0.14092925190925598, + "learning_rate": 3.1331118884525314e-05, + "loss": 1.1508, + "step": 5331 + }, + { + "epoch": 0.8382165104443003, + "grad_norm": 0.1221790760755539, + "learning_rate": 3.132514169358591e-05, + "loss": 1.0459, + "step": 5332 + }, + { + "epoch": 0.8383737153412328, + "grad_norm": 0.13303996622562408, + "learning_rate": 3.131916411633092e-05, + "loss": 1.1042, + "step": 5333 + }, + { + "epoch": 0.8385309202381654, + "grad_norm": 0.14398670196533203, + "learning_rate": 3.131318615312544e-05, + "loss": 1.1293, + "step": 5334 + }, + { + "epoch": 0.8386881251350979, + "grad_norm": 0.12707754969596863, + "learning_rate": 3.130720780433456e-05, + "loss": 1.1123, + "step": 5335 + }, + { + "epoch": 0.8388453300320305, + "grad_norm": 0.13046690821647644, + "learning_rate": 3.130122907032343e-05, + "loss": 1.0741, + "step": 5336 + }, + { + "epoch": 0.839002534928963, + "grad_norm": 0.1457493156194687, + "learning_rate": 3.1295249951457194e-05, + "loss": 1.091, + "step": 5337 + }, + { + "epoch": 0.8391597398258955, + "grad_norm": 0.12993571162223816, + "learning_rate": 3.128927044810105e-05, + "loss": 0.9597, + "step": 5338 + }, + { + "epoch": 0.8393169447228281, + "grad_norm": 0.12592121958732605, + "learning_rate": 3.12832905606202e-05, + "loss": 1.1182, + "step": 5339 + }, + { + "epoch": 0.8394741496197606, + "grad_norm": 0.1494685560464859, + "learning_rate": 3.1277310289379855e-05, + "loss": 1.1964, + "step": 5340 + }, + { + "epoch": 0.8396313545166932, + "grad_norm": 0.1290110945701599, + "learning_rate": 3.12713296347453e-05, + "loss": 1.2448, + "step": 5341 + }, + { + "epoch": 0.8397885594136257, + "grad_norm": 0.15891002118587494, + "learning_rate": 3.126534859708177e-05, + "loss": 0.9689, + "step": 5342 + }, + { + "epoch": 0.8399457643105582, + "grad_norm": 0.14195381104946136, + "learning_rate": 3.1259367176754604e-05, + "loss": 1.1326, + "step": 5343 + }, + { + "epoch": 0.8401029692074908, + "grad_norm": 0.15046872198581696, + "learning_rate": 3.12533853741291e-05, + "loss": 1.251, + "step": 5344 + }, + { + "epoch": 0.8402601741044233, + "grad_norm": 0.1452338993549347, + "learning_rate": 3.124740318957062e-05, + "loss": 1.1473, + "step": 5345 + }, + { + "epoch": 0.8404173790013559, + "grad_norm": 0.163826584815979, + "learning_rate": 3.12414206234445e-05, + "loss": 1.1462, + "step": 5346 + }, + { + "epoch": 0.8405745838982884, + "grad_norm": 0.1200469508767128, + "learning_rate": 3.1235437676116176e-05, + "loss": 0.9745, + "step": 5347 + }, + { + "epoch": 0.8407317887952209, + "grad_norm": 0.1339074671268463, + "learning_rate": 3.122945434795103e-05, + "loss": 1.1068, + "step": 5348 + }, + { + "epoch": 0.8408889936921535, + "grad_norm": 0.13477961719036102, + "learning_rate": 3.1223470639314524e-05, + "loss": 0.973, + "step": 5349 + }, + { + "epoch": 0.841046198589086, + "grad_norm": 0.14580164849758148, + "learning_rate": 3.1217486550572106e-05, + "loss": 1.1526, + "step": 5350 + }, + { + "epoch": 0.8412034034860186, + "grad_norm": 0.17580059170722961, + "learning_rate": 3.121150208208926e-05, + "loss": 1.2484, + "step": 5351 + }, + { + "epoch": 0.8413606083829511, + "grad_norm": 0.1546156108379364, + "learning_rate": 3.120551723423151e-05, + "loss": 1.0801, + "step": 5352 + }, + { + "epoch": 0.8415178132798836, + "grad_norm": 0.12587282061576843, + "learning_rate": 3.119953200736437e-05, + "loss": 0.9925, + "step": 5353 + }, + { + "epoch": 0.8416750181768162, + "grad_norm": 0.159348264336586, + "learning_rate": 3.11935464018534e-05, + "loss": 1.1751, + "step": 5354 + }, + { + "epoch": 0.8418322230737487, + "grad_norm": 0.12547065317630768, + "learning_rate": 3.1187560418064194e-05, + "loss": 0.9741, + "step": 5355 + }, + { + "epoch": 0.8419894279706813, + "grad_norm": 0.13250605762004852, + "learning_rate": 3.1181574056362326e-05, + "loss": 1.0199, + "step": 5356 + }, + { + "epoch": 0.8421466328676138, + "grad_norm": 0.13810336589813232, + "learning_rate": 3.117558731711344e-05, + "loss": 1.0869, + "step": 5357 + }, + { + "epoch": 0.8423038377645463, + "grad_norm": 0.12813332676887512, + "learning_rate": 3.116960020068318e-05, + "loss": 1.0874, + "step": 5358 + }, + { + "epoch": 0.8424610426614789, + "grad_norm": 0.15823881328105927, + "learning_rate": 3.1163612707437215e-05, + "loss": 1.1293, + "step": 5359 + }, + { + "epoch": 0.8426182475584114, + "grad_norm": 0.1422443985939026, + "learning_rate": 3.1157624837741227e-05, + "loss": 1.1145, + "step": 5360 + }, + { + "epoch": 0.842775452455344, + "grad_norm": 0.15155455470085144, + "learning_rate": 3.115163659196095e-05, + "loss": 1.0931, + "step": 5361 + }, + { + "epoch": 0.8429326573522765, + "grad_norm": 0.14358840882778168, + "learning_rate": 3.114564797046211e-05, + "loss": 1.0439, + "step": 5362 + }, + { + "epoch": 0.843089862249209, + "grad_norm": 0.14389050006866455, + "learning_rate": 3.1139658973610476e-05, + "loss": 1.1341, + "step": 5363 + }, + { + "epoch": 0.8432470671461416, + "grad_norm": 0.1381431668996811, + "learning_rate": 3.113366960177184e-05, + "loss": 1.0978, + "step": 5364 + }, + { + "epoch": 0.8434042720430741, + "grad_norm": 0.149738609790802, + "learning_rate": 3.112767985531199e-05, + "loss": 1.0735, + "step": 5365 + }, + { + "epoch": 0.8435614769400067, + "grad_norm": 0.11222139745950699, + "learning_rate": 3.112168973459678e-05, + "loss": 1.0173, + "step": 5366 + }, + { + "epoch": 0.8437186818369392, + "grad_norm": 0.15191705524921417, + "learning_rate": 3.111569923999204e-05, + "loss": 1.0776, + "step": 5367 + }, + { + "epoch": 0.8438758867338717, + "grad_norm": 0.12807922065258026, + "learning_rate": 3.1109708371863666e-05, + "loss": 1.1728, + "step": 5368 + }, + { + "epoch": 0.8440330916308043, + "grad_norm": 0.11920659989118576, + "learning_rate": 3.1103717130577554e-05, + "loss": 0.9683, + "step": 5369 + }, + { + "epoch": 0.8441902965277368, + "grad_norm": 0.1410108059644699, + "learning_rate": 3.109772551649962e-05, + "loss": 1.0232, + "step": 5370 + }, + { + "epoch": 0.8443475014246694, + "grad_norm": 0.13245229423046112, + "learning_rate": 3.10917335299958e-05, + "loss": 1.1438, + "step": 5371 + }, + { + "epoch": 0.8445047063216019, + "grad_norm": 0.1411040872335434, + "learning_rate": 3.108574117143209e-05, + "loss": 1.145, + "step": 5372 + }, + { + "epoch": 0.8446619112185345, + "grad_norm": 0.12620750069618225, + "learning_rate": 3.107974844117446e-05, + "loss": 1.0968, + "step": 5373 + }, + { + "epoch": 0.844819116115467, + "grad_norm": 0.13534575700759888, + "learning_rate": 3.107375533958892e-05, + "loss": 1.045, + "step": 5374 + }, + { + "epoch": 0.8449763210123995, + "grad_norm": 0.11987775564193726, + "learning_rate": 3.106776186704152e-05, + "loss": 1.127, + "step": 5375 + }, + { + "epoch": 0.8451335259093321, + "grad_norm": 0.12598861753940582, + "learning_rate": 3.106176802389831e-05, + "loss": 1.1279, + "step": 5376 + }, + { + "epoch": 0.8452907308062646, + "grad_norm": 0.12267209589481354, + "learning_rate": 3.1055773810525374e-05, + "loss": 1.0636, + "step": 5377 + }, + { + "epoch": 0.8454479357031972, + "grad_norm": 0.14591240882873535, + "learning_rate": 3.1049779227288814e-05, + "loss": 1.0918, + "step": 5378 + }, + { + "epoch": 0.8456051406001297, + "grad_norm": 0.13399077951908112, + "learning_rate": 3.104378427455476e-05, + "loss": 1.1301, + "step": 5379 + }, + { + "epoch": 0.8457623454970622, + "grad_norm": 0.13426455855369568, + "learning_rate": 3.1037788952689354e-05, + "loss": 1.0888, + "step": 5380 + }, + { + "epoch": 0.8459195503939948, + "grad_norm": 0.14700891077518463, + "learning_rate": 3.1031793262058764e-05, + "loss": 1.1236, + "step": 5381 + }, + { + "epoch": 0.8460767552909273, + "grad_norm": 0.11843100935220718, + "learning_rate": 3.10257972030292e-05, + "loss": 1.0882, + "step": 5382 + }, + { + "epoch": 0.8462339601878599, + "grad_norm": 0.12139568477869034, + "learning_rate": 3.101980077596687e-05, + "loss": 1.0898, + "step": 5383 + }, + { + "epoch": 0.8463911650847924, + "grad_norm": 0.12524093687534332, + "learning_rate": 3.1013803981238005e-05, + "loss": 1.0751, + "step": 5384 + }, + { + "epoch": 0.8465483699817249, + "grad_norm": 0.1385330855846405, + "learning_rate": 3.100780681920888e-05, + "loss": 1.1185, + "step": 5385 + }, + { + "epoch": 0.8467055748786575, + "grad_norm": 0.11674893647432327, + "learning_rate": 3.100180929024576e-05, + "loss": 0.9345, + "step": 5386 + }, + { + "epoch": 0.84686277977559, + "grad_norm": 0.14611515402793884, + "learning_rate": 3.0995811394714984e-05, + "loss": 1.1097, + "step": 5387 + }, + { + "epoch": 0.8470199846725226, + "grad_norm": 0.13174444437026978, + "learning_rate": 3.0989813132982846e-05, + "loss": 1.0707, + "step": 5388 + }, + { + "epoch": 0.8471771895694551, + "grad_norm": 0.14666838943958282, + "learning_rate": 3.098381450541572e-05, + "loss": 1.1851, + "step": 5389 + }, + { + "epoch": 0.8473343944663876, + "grad_norm": 0.13270537555217743, + "learning_rate": 3.097781551237997e-05, + "loss": 1.212, + "step": 5390 + }, + { + "epoch": 0.8474915993633202, + "grad_norm": 0.1427568793296814, + "learning_rate": 3.097181615424199e-05, + "loss": 1.0267, + "step": 5391 + }, + { + "epoch": 0.8476488042602527, + "grad_norm": 0.13215166330337524, + "learning_rate": 3.09658164313682e-05, + "loss": 1.1913, + "step": 5392 + }, + { + "epoch": 0.8478060091571853, + "grad_norm": 0.13114792108535767, + "learning_rate": 3.095981634412504e-05, + "loss": 1.114, + "step": 5393 + }, + { + "epoch": 0.8479632140541178, + "grad_norm": 0.132401704788208, + "learning_rate": 3.095381589287898e-05, + "loss": 1.1754, + "step": 5394 + }, + { + "epoch": 0.8481204189510503, + "grad_norm": 0.13039790093898773, + "learning_rate": 3.09478150779965e-05, + "loss": 1.0717, + "step": 5395 + }, + { + "epoch": 0.8482776238479829, + "grad_norm": 0.13499532639980316, + "learning_rate": 3.0941813899844094e-05, + "loss": 1.0281, + "step": 5396 + }, + { + "epoch": 0.8484348287449154, + "grad_norm": 0.13586722314357758, + "learning_rate": 3.0935812358788305e-05, + "loss": 1.1253, + "step": 5397 + }, + { + "epoch": 0.848592033641848, + "grad_norm": 0.12624286115169525, + "learning_rate": 3.092981045519568e-05, + "loss": 1.1136, + "step": 5398 + }, + { + "epoch": 0.8487492385387805, + "grad_norm": 0.1326322853565216, + "learning_rate": 3.09238081894328e-05, + "loss": 1.1286, + "step": 5399 + }, + { + "epoch": 0.848906443435713, + "grad_norm": 0.12387052923440933, + "learning_rate": 3.0917805561866245e-05, + "loss": 1.0677, + "step": 5400 + }, + { + "epoch": 0.8490636483326456, + "grad_norm": 0.12564332783222198, + "learning_rate": 3.091180257286265e-05, + "loss": 1.1143, + "step": 5401 + }, + { + "epoch": 0.8492208532295781, + "grad_norm": 0.15920014679431915, + "learning_rate": 3.090579922278865e-05, + "loss": 1.1023, + "step": 5402 + }, + { + "epoch": 0.8493780581265107, + "grad_norm": 0.1484300047159195, + "learning_rate": 3.0899795512010896e-05, + "loss": 1.0745, + "step": 5403 + }, + { + "epoch": 0.8495352630234432, + "grad_norm": 0.1515434831380844, + "learning_rate": 3.089379144089608e-05, + "loss": 1.2058, + "step": 5404 + }, + { + "epoch": 0.8496924679203757, + "grad_norm": 0.147175595164299, + "learning_rate": 3.088778700981091e-05, + "loss": 1.1041, + "step": 5405 + }, + { + "epoch": 0.8498496728173083, + "grad_norm": 0.132612407207489, + "learning_rate": 3.088178221912211e-05, + "loss": 1.1286, + "step": 5406 + }, + { + "epoch": 0.8500068777142408, + "grad_norm": 0.13999781012535095, + "learning_rate": 3.087577706919643e-05, + "loss": 1.1865, + "step": 5407 + }, + { + "epoch": 0.8501640826111734, + "grad_norm": 0.13010193407535553, + "learning_rate": 3.0869771560400645e-05, + "loss": 1.1399, + "step": 5408 + }, + { + "epoch": 0.8503212875081059, + "grad_norm": 0.1409631073474884, + "learning_rate": 3.086376569310154e-05, + "loss": 1.0495, + "step": 5409 + }, + { + "epoch": 0.8504784924050384, + "grad_norm": 0.16881372034549713, + "learning_rate": 3.085775946766594e-05, + "loss": 1.0261, + "step": 5410 + }, + { + "epoch": 0.850635697301971, + "grad_norm": 0.1193733662366867, + "learning_rate": 3.085175288446068e-05, + "loss": 0.9864, + "step": 5411 + }, + { + "epoch": 0.8507929021989035, + "grad_norm": 0.13321813941001892, + "learning_rate": 3.084574594385262e-05, + "loss": 1.0589, + "step": 5412 + }, + { + "epoch": 0.8509501070958361, + "grad_norm": 0.1246391087770462, + "learning_rate": 3.083973864620864e-05, + "loss": 1.0761, + "step": 5413 + }, + { + "epoch": 0.8511073119927686, + "grad_norm": 0.13496242463588715, + "learning_rate": 3.083373099189564e-05, + "loss": 1.1504, + "step": 5414 + }, + { + "epoch": 0.8512645168897011, + "grad_norm": 0.13265138864517212, + "learning_rate": 3.0827722981280544e-05, + "loss": 1.0905, + "step": 5415 + }, + { + "epoch": 0.8514217217866337, + "grad_norm": 0.13705994188785553, + "learning_rate": 3.08217146147303e-05, + "loss": 1.1943, + "step": 5416 + }, + { + "epoch": 0.8515789266835662, + "grad_norm": 0.14216330647468567, + "learning_rate": 3.081570589261188e-05, + "loss": 1.176, + "step": 5417 + }, + { + "epoch": 0.8517361315804988, + "grad_norm": 0.14198951423168182, + "learning_rate": 3.0809696815292285e-05, + "loss": 1.022, + "step": 5418 + }, + { + "epoch": 0.8518933364774313, + "grad_norm": 0.12915819883346558, + "learning_rate": 3.08036873831385e-05, + "loss": 1.087, + "step": 5419 + }, + { + "epoch": 0.8520505413743638, + "grad_norm": 0.12581247091293335, + "learning_rate": 3.079767759651757e-05, + "loss": 1.1603, + "step": 5420 + }, + { + "epoch": 0.8522077462712964, + "grad_norm": 0.13584332168102264, + "learning_rate": 3.0791667455796555e-05, + "loss": 1.1442, + "step": 5421 + }, + { + "epoch": 0.8523649511682289, + "grad_norm": 0.14215992391109467, + "learning_rate": 3.078565696134252e-05, + "loss": 1.1093, + "step": 5422 + }, + { + "epoch": 0.8525221560651615, + "grad_norm": 0.1295357495546341, + "learning_rate": 3.077964611352258e-05, + "loss": 1.0629, + "step": 5423 + }, + { + "epoch": 0.852679360962094, + "grad_norm": 0.13530464470386505, + "learning_rate": 3.0773634912703845e-05, + "loss": 1.0584, + "step": 5424 + }, + { + "epoch": 0.8528365658590266, + "grad_norm": 0.12629027664661407, + "learning_rate": 3.076762335925345e-05, + "loss": 1.2017, + "step": 5425 + }, + { + "epoch": 0.8529937707559591, + "grad_norm": 0.1429501473903656, + "learning_rate": 3.076161145353857e-05, + "loss": 1.0904, + "step": 5426 + }, + { + "epoch": 0.8531509756528916, + "grad_norm": 0.12334276735782623, + "learning_rate": 3.075559919592639e-05, + "loss": 1.0963, + "step": 5427 + }, + { + "epoch": 0.8533081805498242, + "grad_norm": 0.15736989676952362, + "learning_rate": 3.0749586586784096e-05, + "loss": 1.1083, + "step": 5428 + }, + { + "epoch": 0.8534653854467567, + "grad_norm": 0.1402183324098587, + "learning_rate": 3.074357362647894e-05, + "loss": 1.161, + "step": 5429 + }, + { + "epoch": 0.8536225903436893, + "grad_norm": 0.13921000063419342, + "learning_rate": 3.073756031537815e-05, + "loss": 1.1324, + "step": 5430 + }, + { + "epoch": 0.8537797952406218, + "grad_norm": 0.1283966600894928, + "learning_rate": 3.073154665384901e-05, + "loss": 1.0291, + "step": 5431 + }, + { + "epoch": 0.8539370001375542, + "grad_norm": 0.1352003812789917, + "learning_rate": 3.072553264225881e-05, + "loss": 1.151, + "step": 5432 + }, + { + "epoch": 0.8540942050344869, + "grad_norm": 0.12936684489250183, + "learning_rate": 3.0719518280974854e-05, + "loss": 1.112, + "step": 5433 + }, + { + "epoch": 0.8542514099314193, + "grad_norm": 0.1397724449634552, + "learning_rate": 3.07135035703645e-05, + "loss": 1.0103, + "step": 5434 + }, + { + "epoch": 0.854408614828352, + "grad_norm": 0.1389678418636322, + "learning_rate": 3.070748851079507e-05, + "loss": 1.0327, + "step": 5435 + }, + { + "epoch": 0.8545658197252844, + "grad_norm": 0.140939861536026, + "learning_rate": 3.070147310263396e-05, + "loss": 1.0129, + "step": 5436 + }, + { + "epoch": 0.8547230246222169, + "grad_norm": 0.14864715933799744, + "learning_rate": 3.069545734624857e-05, + "loss": 1.1009, + "step": 5437 + }, + { + "epoch": 0.8548802295191495, + "grad_norm": 0.13071830570697784, + "learning_rate": 3.068944124200631e-05, + "loss": 1.0967, + "step": 5438 + }, + { + "epoch": 0.855037434416082, + "grad_norm": 0.1281632035970688, + "learning_rate": 3.068342479027463e-05, + "loss": 1.0902, + "step": 5439 + }, + { + "epoch": 0.8551946393130146, + "grad_norm": 0.1176469698548317, + "learning_rate": 3.067740799142098e-05, + "loss": 1.0927, + "step": 5440 + }, + { + "epoch": 0.8551946393130146, + "eval_loss": 1.0936750173568726, + "eval_runtime": 2349.1606, + "eval_samples_per_second": 3.941, + "eval_steps_per_second": 1.97, + "step": 5440 + }, + { + "epoch": 0.8553518442099471, + "grad_norm": 0.14160789549350739, + "learning_rate": 3.0671390845812866e-05, + "loss": 1.0995, + "step": 5441 + }, + { + "epoch": 0.8555090491068796, + "grad_norm": 0.14717133343219757, + "learning_rate": 3.0665373353817763e-05, + "loss": 1.0685, + "step": 5442 + }, + { + "epoch": 0.8556662540038122, + "grad_norm": 0.1374252736568451, + "learning_rate": 3.065935551580322e-05, + "loss": 1.1388, + "step": 5443 + }, + { + "epoch": 0.8558234589007447, + "grad_norm": 0.13222144544124603, + "learning_rate": 3.0653337332136765e-05, + "loss": 1.1279, + "step": 5444 + }, + { + "epoch": 0.8559806637976773, + "grad_norm": 0.1340630054473877, + "learning_rate": 3.0647318803185985e-05, + "loss": 1.1498, + "step": 5445 + }, + { + "epoch": 0.8561378686946098, + "grad_norm": 0.1354343742132187, + "learning_rate": 3.064129992931845e-05, + "loss": 1.1056, + "step": 5446 + }, + { + "epoch": 0.8562950735915423, + "grad_norm": 0.13388031721115112, + "learning_rate": 3.063528071090179e-05, + "loss": 1.1328, + "step": 5447 + }, + { + "epoch": 0.8564522784884749, + "grad_norm": 0.13545604050159454, + "learning_rate": 3.062926114830362e-05, + "loss": 1.0219, + "step": 5448 + }, + { + "epoch": 0.8566094833854074, + "grad_norm": 0.13546516001224518, + "learning_rate": 3.062324124189159e-05, + "loss": 1.14, + "step": 5449 + }, + { + "epoch": 0.85676668828234, + "grad_norm": 0.12476568669080734, + "learning_rate": 3.061722099203339e-05, + "loss": 1.0946, + "step": 5450 + }, + { + "epoch": 0.8569238931792725, + "grad_norm": 0.15816162526607513, + "learning_rate": 3.06112003990967e-05, + "loss": 1.1427, + "step": 5451 + }, + { + "epoch": 0.857081098076205, + "grad_norm": 0.11912461370229721, + "learning_rate": 3.060517946344924e-05, + "loss": 1.0007, + "step": 5452 + }, + { + "epoch": 0.8572383029731376, + "grad_norm": 0.11991431564092636, + "learning_rate": 3.059915818545874e-05, + "loss": 0.9457, + "step": 5453 + }, + { + "epoch": 0.8573955078700701, + "grad_norm": 0.12637083232402802, + "learning_rate": 3.059313656549297e-05, + "loss": 1.0484, + "step": 5454 + }, + { + "epoch": 0.8575527127670027, + "grad_norm": 0.1313098669052124, + "learning_rate": 3.0587114603919694e-05, + "loss": 1.1909, + "step": 5455 + }, + { + "epoch": 0.8577099176639352, + "grad_norm": 0.15286266803741455, + "learning_rate": 3.058109230110671e-05, + "loss": 1.1285, + "step": 5456 + }, + { + "epoch": 0.8578671225608677, + "grad_norm": 0.12987251579761505, + "learning_rate": 3.057506965742186e-05, + "loss": 1.0106, + "step": 5457 + }, + { + "epoch": 0.8580243274578003, + "grad_norm": 0.13223089277744293, + "learning_rate": 3.056904667323295e-05, + "loss": 1.1985, + "step": 5458 + }, + { + "epoch": 0.8581815323547328, + "grad_norm": 0.1345953494310379, + "learning_rate": 3.056302334890786e-05, + "loss": 0.9578, + "step": 5459 + }, + { + "epoch": 0.8583387372516654, + "grad_norm": 0.12239735573530197, + "learning_rate": 3.055699968481447e-05, + "loss": 1.102, + "step": 5460 + }, + { + "epoch": 0.8584959421485979, + "grad_norm": 0.1293540745973587, + "learning_rate": 3.055097568132069e-05, + "loss": 1.1148, + "step": 5461 + }, + { + "epoch": 0.8586531470455304, + "grad_norm": 0.13322846591472626, + "learning_rate": 3.0544951338794435e-05, + "loss": 1.1197, + "step": 5462 + }, + { + "epoch": 0.858810351942463, + "grad_norm": 0.13431444764137268, + "learning_rate": 3.053892665760364e-05, + "loss": 1.0264, + "step": 5463 + }, + { + "epoch": 0.8589675568393955, + "grad_norm": 0.12476301938295364, + "learning_rate": 3.053290163811629e-05, + "loss": 1.0053, + "step": 5464 + }, + { + "epoch": 0.8591247617363281, + "grad_norm": 0.13603773713111877, + "learning_rate": 3.052687628070035e-05, + "loss": 1.0254, + "step": 5465 + }, + { + "epoch": 0.8592819666332606, + "grad_norm": 0.13530340790748596, + "learning_rate": 3.0520850585723836e-05, + "loss": 1.2159, + "step": 5466 + }, + { + "epoch": 0.8594391715301931, + "grad_norm": 0.15153460204601288, + "learning_rate": 3.051482455355478e-05, + "loss": 1.0566, + "step": 5467 + }, + { + "epoch": 0.8595963764271257, + "grad_norm": 0.11691222339868546, + "learning_rate": 3.050879818456122e-05, + "loss": 1.0906, + "step": 5468 + }, + { + "epoch": 0.8597535813240582, + "grad_norm": 0.1401914656162262, + "learning_rate": 3.0502771479111237e-05, + "loss": 0.9768, + "step": 5469 + }, + { + "epoch": 0.8599107862209908, + "grad_norm": 0.1299663484096527, + "learning_rate": 3.049674443757289e-05, + "loss": 1.1304, + "step": 5470 + }, + { + "epoch": 0.8600679911179233, + "grad_norm": 0.13918045163154602, + "learning_rate": 3.0490717060314315e-05, + "loss": 1.0823, + "step": 5471 + }, + { + "epoch": 0.8602251960148558, + "grad_norm": 0.14035938680171967, + "learning_rate": 3.0484689347703633e-05, + "loss": 1.0413, + "step": 5472 + }, + { + "epoch": 0.8603824009117884, + "grad_norm": 0.13718706369400024, + "learning_rate": 3.047866130010899e-05, + "loss": 1.0863, + "step": 5473 + }, + { + "epoch": 0.8605396058087209, + "grad_norm": 0.12892606854438782, + "learning_rate": 3.047263291789857e-05, + "loss": 1.1313, + "step": 5474 + }, + { + "epoch": 0.8606968107056535, + "grad_norm": 0.1236448660492897, + "learning_rate": 3.0466604201440533e-05, + "loss": 1.1341, + "step": 5475 + }, + { + "epoch": 0.860854015602586, + "grad_norm": 0.12461274117231369, + "learning_rate": 3.0460575151103132e-05, + "loss": 1.1213, + "step": 5476 + }, + { + "epoch": 0.8610112204995186, + "grad_norm": 0.13777461647987366, + "learning_rate": 3.0454545767254567e-05, + "loss": 1.055, + "step": 5477 + }, + { + "epoch": 0.8611684253964511, + "grad_norm": 0.15131552517414093, + "learning_rate": 3.04485160502631e-05, + "loss": 1.1431, + "step": 5478 + }, + { + "epoch": 0.8613256302933836, + "grad_norm": 0.11675114184617996, + "learning_rate": 3.0442486000497005e-05, + "loss": 1.0713, + "step": 5479 + }, + { + "epoch": 0.8614828351903162, + "grad_norm": 0.1256312131881714, + "learning_rate": 3.0436455618324567e-05, + "loss": 1.0094, + "step": 5480 + }, + { + "epoch": 0.8616400400872487, + "grad_norm": 0.14991901814937592, + "learning_rate": 3.0430424904114113e-05, + "loss": 1.0932, + "step": 5481 + }, + { + "epoch": 0.8617972449841813, + "grad_norm": 0.1254003494977951, + "learning_rate": 3.0424393858233962e-05, + "loss": 1.1138, + "step": 5482 + }, + { + "epoch": 0.8619544498811138, + "grad_norm": 0.11838214099407196, + "learning_rate": 3.0418362481052477e-05, + "loss": 1.0147, + "step": 5483 + }, + { + "epoch": 0.8621116547780463, + "grad_norm": 0.15180039405822754, + "learning_rate": 3.041233077293802e-05, + "loss": 1.1562, + "step": 5484 + }, + { + "epoch": 0.8622688596749789, + "grad_norm": 0.12819436192512512, + "learning_rate": 3.0406298734259e-05, + "loss": 0.972, + "step": 5485 + }, + { + "epoch": 0.8624260645719114, + "grad_norm": 0.12701740860939026, + "learning_rate": 3.040026636538381e-05, + "loss": 1.0347, + "step": 5486 + }, + { + "epoch": 0.862583269468844, + "grad_norm": 0.13513225317001343, + "learning_rate": 3.039423366668091e-05, + "loss": 0.9998, + "step": 5487 + }, + { + "epoch": 0.8627404743657765, + "grad_norm": 0.13709568977355957, + "learning_rate": 3.0388200638518736e-05, + "loss": 1.1749, + "step": 5488 + }, + { + "epoch": 0.862897679262709, + "grad_norm": 0.13355977833271027, + "learning_rate": 3.0382167281265766e-05, + "loss": 1.0943, + "step": 5489 + }, + { + "epoch": 0.8630548841596416, + "grad_norm": 0.13331088423728943, + "learning_rate": 3.0376133595290502e-05, + "loss": 1.0732, + "step": 5490 + }, + { + "epoch": 0.8632120890565741, + "grad_norm": 0.13878341019153595, + "learning_rate": 3.0370099580961446e-05, + "loss": 0.9917, + "step": 5491 + }, + { + "epoch": 0.8633692939535067, + "grad_norm": 0.12891429662704468, + "learning_rate": 3.036406523864715e-05, + "loss": 1.0314, + "step": 5492 + }, + { + "epoch": 0.8635264988504392, + "grad_norm": 0.13124819099903107, + "learning_rate": 3.035803056871615e-05, + "loss": 1.0451, + "step": 5493 + }, + { + "epoch": 0.8636837037473717, + "grad_norm": 0.1321152299642563, + "learning_rate": 3.035199557153703e-05, + "loss": 1.1865, + "step": 5494 + }, + { + "epoch": 0.8638409086443043, + "grad_norm": 0.1386841982603073, + "learning_rate": 3.0345960247478383e-05, + "loss": 1.164, + "step": 5495 + }, + { + "epoch": 0.8639981135412368, + "grad_norm": 0.1330791562795639, + "learning_rate": 3.033992459690882e-05, + "loss": 1.1475, + "step": 5496 + }, + { + "epoch": 0.8641553184381694, + "grad_norm": 0.14237408339977264, + "learning_rate": 3.0333888620196994e-05, + "loss": 1.0093, + "step": 5497 + }, + { + "epoch": 0.8643125233351019, + "grad_norm": 0.14803604781627655, + "learning_rate": 3.0327852317711532e-05, + "loss": 1.0605, + "step": 5498 + }, + { + "epoch": 0.8644697282320344, + "grad_norm": 0.13252919912338257, + "learning_rate": 3.0321815689821124e-05, + "loss": 1.0748, + "step": 5499 + }, + { + "epoch": 0.864626933128967, + "grad_norm": 0.14394722878932953, + "learning_rate": 3.031577873689446e-05, + "loss": 1.1368, + "step": 5500 + }, + { + "epoch": 0.8647841380258995, + "grad_norm": 0.1400299370288849, + "learning_rate": 3.0309741459300262e-05, + "loss": 1.1587, + "step": 5501 + }, + { + "epoch": 0.8649413429228321, + "grad_norm": 0.13285602629184723, + "learning_rate": 3.0303703857407255e-05, + "loss": 1.1335, + "step": 5502 + }, + { + "epoch": 0.8650985478197646, + "grad_norm": 0.13549081981182098, + "learning_rate": 3.0297665931584197e-05, + "loss": 1.1105, + "step": 5503 + }, + { + "epoch": 0.8652557527166971, + "grad_norm": 0.12438593804836273, + "learning_rate": 3.029162768219986e-05, + "loss": 1.0722, + "step": 5504 + }, + { + "epoch": 0.8654129576136297, + "grad_norm": 0.12310291081666946, + "learning_rate": 3.0285589109623035e-05, + "loss": 1.0132, + "step": 5505 + }, + { + "epoch": 0.8655701625105622, + "grad_norm": 0.1295916885137558, + "learning_rate": 3.027955021422254e-05, + "loss": 1.1142, + "step": 5506 + }, + { + "epoch": 0.8657273674074948, + "grad_norm": 0.173175647854805, + "learning_rate": 3.0273510996367205e-05, + "loss": 1.0724, + "step": 5507 + }, + { + "epoch": 0.8658845723044273, + "grad_norm": 0.1283375471830368, + "learning_rate": 3.0267471456425877e-05, + "loss": 0.9609, + "step": 5508 + }, + { + "epoch": 0.8660417772013598, + "grad_norm": 0.13712435960769653, + "learning_rate": 3.0261431594767443e-05, + "loss": 1.1551, + "step": 5509 + }, + { + "epoch": 0.8661989820982924, + "grad_norm": 0.11773756146430969, + "learning_rate": 3.025539141176078e-05, + "loss": 1.1755, + "step": 5510 + }, + { + "epoch": 0.8663561869952249, + "grad_norm": 0.12601979076862335, + "learning_rate": 3.024935090777481e-05, + "loss": 1.0711, + "step": 5511 + }, + { + "epoch": 0.8665133918921575, + "grad_norm": 0.13084006309509277, + "learning_rate": 3.0243310083178455e-05, + "loss": 1.0349, + "step": 5512 + }, + { + "epoch": 0.86667059678909, + "grad_norm": 0.1331014335155487, + "learning_rate": 3.0237268938340668e-05, + "loss": 1.1105, + "step": 5513 + }, + { + "epoch": 0.8668278016860225, + "grad_norm": 0.2072014957666397, + "learning_rate": 3.023122747363042e-05, + "loss": 1.0971, + "step": 5514 + }, + { + "epoch": 0.8669850065829551, + "grad_norm": 0.14520519971847534, + "learning_rate": 3.0225185689416703e-05, + "loss": 1.028, + "step": 5515 + }, + { + "epoch": 0.8671422114798876, + "grad_norm": 0.13639235496520996, + "learning_rate": 3.021914358606853e-05, + "loss": 1.1286, + "step": 5516 + }, + { + "epoch": 0.8672994163768202, + "grad_norm": 0.12936940789222717, + "learning_rate": 3.0213101163954923e-05, + "loss": 1.0371, + "step": 5517 + }, + { + "epoch": 0.8674566212737527, + "grad_norm": 0.12748393416404724, + "learning_rate": 3.0207058423444933e-05, + "loss": 0.9518, + "step": 5518 + }, + { + "epoch": 0.8676138261706852, + "grad_norm": 0.14864744246006012, + "learning_rate": 3.0201015364907613e-05, + "loss": 1.1465, + "step": 5519 + }, + { + "epoch": 0.8677710310676178, + "grad_norm": 0.14203327894210815, + "learning_rate": 3.0194971988712078e-05, + "loss": 1.0514, + "step": 5520 + }, + { + "epoch": 0.8679282359645503, + "grad_norm": 0.1269863098859787, + "learning_rate": 3.018892829522741e-05, + "loss": 1.1623, + "step": 5521 + }, + { + "epoch": 0.8680854408614829, + "grad_norm": 0.14576047658920288, + "learning_rate": 3.0182884284822748e-05, + "loss": 1.1095, + "step": 5522 + }, + { + "epoch": 0.8682426457584154, + "grad_norm": 0.13361620903015137, + "learning_rate": 3.0176839957867238e-05, + "loss": 0.9394, + "step": 5523 + }, + { + "epoch": 0.8683998506553479, + "grad_norm": 0.13372379541397095, + "learning_rate": 3.017079531473003e-05, + "loss": 1.0522, + "step": 5524 + }, + { + "epoch": 0.8685570555522805, + "grad_norm": 0.13323388993740082, + "learning_rate": 3.0164750355780326e-05, + "loss": 0.9998, + "step": 5525 + }, + { + "epoch": 0.868714260449213, + "grad_norm": 0.1420932561159134, + "learning_rate": 3.0158705081387317e-05, + "loss": 1.1981, + "step": 5526 + }, + { + "epoch": 0.8688714653461456, + "grad_norm": 0.1274615079164505, + "learning_rate": 3.0152659491920227e-05, + "loss": 0.974, + "step": 5527 + }, + { + "epoch": 0.8690286702430781, + "grad_norm": 0.15061865746974945, + "learning_rate": 3.0146613587748308e-05, + "loss": 0.9766, + "step": 5528 + }, + { + "epoch": 0.8691858751400107, + "grad_norm": 0.14399336278438568, + "learning_rate": 3.0140567369240802e-05, + "loss": 1.0955, + "step": 5529 + }, + { + "epoch": 0.8693430800369432, + "grad_norm": 0.13892965018749237, + "learning_rate": 3.0134520836767012e-05, + "loss": 1.1415, + "step": 5530 + }, + { + "epoch": 0.8695002849338757, + "grad_norm": 0.14336195588111877, + "learning_rate": 3.0128473990696214e-05, + "loss": 1.1325, + "step": 5531 + }, + { + "epoch": 0.8696574898308083, + "grad_norm": 0.13188253343105316, + "learning_rate": 3.012242683139774e-05, + "loss": 1.047, + "step": 5532 + }, + { + "epoch": 0.8698146947277408, + "grad_norm": 0.1784677803516388, + "learning_rate": 3.011637935924093e-05, + "loss": 1.0963, + "step": 5533 + }, + { + "epoch": 0.8699718996246734, + "grad_norm": 0.14768408238887787, + "learning_rate": 3.011033157459513e-05, + "loss": 1.1515, + "step": 5534 + }, + { + "epoch": 0.8701291045216059, + "grad_norm": 0.1321885734796524, + "learning_rate": 3.010428347782972e-05, + "loss": 1.0597, + "step": 5535 + }, + { + "epoch": 0.8702863094185384, + "grad_norm": 0.13586130738258362, + "learning_rate": 3.0098235069314097e-05, + "loss": 1.1279, + "step": 5536 + }, + { + "epoch": 0.870443514315471, + "grad_norm": 0.1359790563583374, + "learning_rate": 3.0092186349417677e-05, + "loss": 1.0931, + "step": 5537 + }, + { + "epoch": 0.8706007192124035, + "grad_norm": 0.13127247989177704, + "learning_rate": 3.0086137318509888e-05, + "loss": 1.1395, + "step": 5538 + }, + { + "epoch": 0.8707579241093361, + "grad_norm": 0.14404800534248352, + "learning_rate": 3.008008797696018e-05, + "loss": 1.0059, + "step": 5539 + }, + { + "epoch": 0.8709151290062686, + "grad_norm": 0.14222590625286102, + "learning_rate": 3.0074038325138026e-05, + "loss": 1.03, + "step": 5540 + }, + { + "epoch": 0.871072333903201, + "grad_norm": 0.12845419347286224, + "learning_rate": 3.0067988363412918e-05, + "loss": 1.1652, + "step": 5541 + }, + { + "epoch": 0.8712295388001337, + "grad_norm": 0.13392123579978943, + "learning_rate": 3.0061938092154367e-05, + "loss": 1.0313, + "step": 5542 + }, + { + "epoch": 0.8713867436970661, + "grad_norm": 0.13119371235370636, + "learning_rate": 3.0055887511731888e-05, + "loss": 1.0816, + "step": 5543 + }, + { + "epoch": 0.8715439485939988, + "grad_norm": 0.1327655166387558, + "learning_rate": 3.004983662251505e-05, + "loss": 1.0581, + "step": 5544 + }, + { + "epoch": 0.8717011534909312, + "grad_norm": 0.13374944031238556, + "learning_rate": 3.0043785424873394e-05, + "loss": 1.1971, + "step": 5545 + }, + { + "epoch": 0.8718583583878637, + "grad_norm": 0.13144607841968536, + "learning_rate": 3.0037733919176525e-05, + "loss": 1.1351, + "step": 5546 + }, + { + "epoch": 0.8720155632847963, + "grad_norm": 0.14060606062412262, + "learning_rate": 3.0031682105794022e-05, + "loss": 1.1078, + "step": 5547 + }, + { + "epoch": 0.8721727681817288, + "grad_norm": 0.14446894824504852, + "learning_rate": 3.002562998509553e-05, + "loss": 1.036, + "step": 5548 + }, + { + "epoch": 0.8723299730786614, + "grad_norm": 0.13659563660621643, + "learning_rate": 3.0019577557450672e-05, + "loss": 1.1279, + "step": 5549 + }, + { + "epoch": 0.8724871779755939, + "grad_norm": 0.13352961838245392, + "learning_rate": 3.0013524823229118e-05, + "loss": 1.0542, + "step": 5550 + }, + { + "epoch": 0.8726443828725264, + "grad_norm": 0.1410629004240036, + "learning_rate": 3.0007471782800546e-05, + "loss": 1.067, + "step": 5551 + }, + { + "epoch": 0.872801587769459, + "grad_norm": 0.12700222432613373, + "learning_rate": 3.000141843653465e-05, + "loss": 1.0184, + "step": 5552 + }, + { + "epoch": 0.8729587926663915, + "grad_norm": 0.12503810226917267, + "learning_rate": 2.9995364784801137e-05, + "loss": 0.9738, + "step": 5553 + }, + { + "epoch": 0.8731159975633241, + "grad_norm": 0.12451665848493576, + "learning_rate": 2.9989310827969758e-05, + "loss": 1.008, + "step": 5554 + }, + { + "epoch": 0.8732732024602566, + "grad_norm": 0.1338919848203659, + "learning_rate": 2.9983256566410255e-05, + "loss": 1.1498, + "step": 5555 + }, + { + "epoch": 0.8734304073571891, + "grad_norm": 0.13377127051353455, + "learning_rate": 2.9977202000492398e-05, + "loss": 1.0195, + "step": 5556 + }, + { + "epoch": 0.8735876122541217, + "grad_norm": 0.13620005548000336, + "learning_rate": 2.997114713058598e-05, + "loss": 1.1029, + "step": 5557 + }, + { + "epoch": 0.8737448171510542, + "grad_norm": 0.1268072873353958, + "learning_rate": 2.996509195706081e-05, + "loss": 1.093, + "step": 5558 + }, + { + "epoch": 0.8739020220479868, + "grad_norm": 0.14982236921787262, + "learning_rate": 2.9959036480286716e-05, + "loss": 1.0551, + "step": 5559 + }, + { + "epoch": 0.8740592269449193, + "grad_norm": 0.12541651725769043, + "learning_rate": 2.9952980700633543e-05, + "loss": 1.0325, + "step": 5560 + }, + { + "epoch": 0.8742164318418518, + "grad_norm": 0.12023629993200302, + "learning_rate": 2.994692461847115e-05, + "loss": 1.0384, + "step": 5561 + }, + { + "epoch": 0.8743736367387844, + "grad_norm": 0.1331315040588379, + "learning_rate": 2.9940868234169423e-05, + "loss": 1.0521, + "step": 5562 + }, + { + "epoch": 0.8745308416357169, + "grad_norm": 0.14275115728378296, + "learning_rate": 2.9934811548098263e-05, + "loss": 1.0653, + "step": 5563 + }, + { + "epoch": 0.8746880465326495, + "grad_norm": 0.1589425504207611, + "learning_rate": 2.9928754560627592e-05, + "loss": 1.1964, + "step": 5564 + }, + { + "epoch": 0.874845251429582, + "grad_norm": 0.1386173516511917, + "learning_rate": 2.9922697272127348e-05, + "loss": 1.1132, + "step": 5565 + }, + { + "epoch": 0.8750024563265145, + "grad_norm": 0.13489043712615967, + "learning_rate": 2.9916639682967477e-05, + "loss": 1.1132, + "step": 5566 + }, + { + "epoch": 0.8751596612234471, + "grad_norm": 0.14840570092201233, + "learning_rate": 2.9910581793517967e-05, + "loss": 1.1113, + "step": 5567 + }, + { + "epoch": 0.8753168661203796, + "grad_norm": 0.13752590119838715, + "learning_rate": 2.99045236041488e-05, + "loss": 1.0738, + "step": 5568 + }, + { + "epoch": 0.8754740710173122, + "grad_norm": 0.21968597173690796, + "learning_rate": 2.9898465115229996e-05, + "loss": 1.0295, + "step": 5569 + }, + { + "epoch": 0.8756312759142447, + "grad_norm": 0.14655014872550964, + "learning_rate": 2.9892406327131573e-05, + "loss": 1.1608, + "step": 5570 + }, + { + "epoch": 0.8757884808111772, + "grad_norm": 0.12686704099178314, + "learning_rate": 2.988634724022359e-05, + "loss": 1.0242, + "step": 5571 + }, + { + "epoch": 0.8759456857081098, + "grad_norm": 0.13236230611801147, + "learning_rate": 2.9880287854876117e-05, + "loss": 1.0938, + "step": 5572 + }, + { + "epoch": 0.8761028906050423, + "grad_norm": 0.12991240620613098, + "learning_rate": 2.987422817145921e-05, + "loss": 1.1035, + "step": 5573 + }, + { + "epoch": 0.8762600955019749, + "grad_norm": 0.17237314581871033, + "learning_rate": 2.986816819034301e-05, + "loss": 1.1487, + "step": 5574 + }, + { + "epoch": 0.8764173003989074, + "grad_norm": 0.11916259676218033, + "learning_rate": 2.9862107911897615e-05, + "loss": 1.0189, + "step": 5575 + }, + { + "epoch": 0.8765745052958399, + "grad_norm": 0.1443348228931427, + "learning_rate": 2.9856047336493164e-05, + "loss": 1.0019, + "step": 5576 + }, + { + "epoch": 0.8767317101927725, + "grad_norm": 0.12370751053094864, + "learning_rate": 2.9849986464499823e-05, + "loss": 0.831, + "step": 5577 + }, + { + "epoch": 0.876888915089705, + "grad_norm": 0.13544991612434387, + "learning_rate": 2.984392529628775e-05, + "loss": 0.9465, + "step": 5578 + }, + { + "epoch": 0.8770461199866376, + "grad_norm": 0.13607020676136017, + "learning_rate": 2.9837863832227164e-05, + "loss": 1.2025, + "step": 5579 + }, + { + "epoch": 0.8772033248835701, + "grad_norm": 0.13577814400196075, + "learning_rate": 2.9831802072688254e-05, + "loss": 0.9317, + "step": 5580 + }, + { + "epoch": 0.8773605297805027, + "grad_norm": 0.15547913312911987, + "learning_rate": 2.9825740018041264e-05, + "loss": 1.1532, + "step": 5581 + }, + { + "epoch": 0.8775177346774352, + "grad_norm": 0.13382650911808014, + "learning_rate": 2.981967766865643e-05, + "loss": 1.1313, + "step": 5582 + }, + { + "epoch": 0.8776749395743677, + "grad_norm": 0.13078917562961578, + "learning_rate": 2.981361502490402e-05, + "loss": 1.125, + "step": 5583 + }, + { + "epoch": 0.8778321444713003, + "grad_norm": 0.1317652016878128, + "learning_rate": 2.9807552087154318e-05, + "loss": 1.1222, + "step": 5584 + }, + { + "epoch": 0.8779893493682328, + "grad_norm": 0.121950164437294, + "learning_rate": 2.9801488855777627e-05, + "loss": 1.0486, + "step": 5585 + }, + { + "epoch": 0.8781465542651654, + "grad_norm": 0.13826113939285278, + "learning_rate": 2.9795425331144272e-05, + "loss": 1.122, + "step": 5586 + }, + { + "epoch": 0.8783037591620979, + "grad_norm": 0.12719380855560303, + "learning_rate": 2.9789361513624574e-05, + "loss": 1.0332, + "step": 5587 + }, + { + "epoch": 0.8784609640590304, + "grad_norm": 0.1368698924779892, + "learning_rate": 2.97832974035889e-05, + "loss": 1.1888, + "step": 5588 + }, + { + "epoch": 0.878618168955963, + "grad_norm": 0.13354262709617615, + "learning_rate": 2.9777233001407623e-05, + "loss": 1.1043, + "step": 5589 + }, + { + "epoch": 0.8787753738528955, + "grad_norm": 0.1467258483171463, + "learning_rate": 2.9771168307451126e-05, + "loss": 1.1014, + "step": 5590 + }, + { + "epoch": 0.8789325787498281, + "grad_norm": 0.12892787158489227, + "learning_rate": 2.9765103322089827e-05, + "loss": 1.1119, + "step": 5591 + }, + { + "epoch": 0.8790897836467606, + "grad_norm": 0.13993503153324127, + "learning_rate": 2.9759038045694143e-05, + "loss": 1.0177, + "step": 5592 + }, + { + "epoch": 0.8792469885436931, + "grad_norm": 0.12599575519561768, + "learning_rate": 2.975297247863452e-05, + "loss": 1.0052, + "step": 5593 + }, + { + "epoch": 0.8794041934406257, + "grad_norm": 0.1355002373456955, + "learning_rate": 2.974690662128143e-05, + "loss": 1.0763, + "step": 5594 + }, + { + "epoch": 0.8795613983375582, + "grad_norm": 0.12462567538022995, + "learning_rate": 2.9740840474005337e-05, + "loss": 1.0454, + "step": 5595 + }, + { + "epoch": 0.8797186032344908, + "grad_norm": 0.13172924518585205, + "learning_rate": 2.9734774037176748e-05, + "loss": 1.186, + "step": 5596 + }, + { + "epoch": 0.8798758081314233, + "grad_norm": 0.13119898736476898, + "learning_rate": 2.9728707311166177e-05, + "loss": 1.1055, + "step": 5597 + }, + { + "epoch": 0.8800330130283558, + "grad_norm": 0.16086314618587494, + "learning_rate": 2.972264029634415e-05, + "loss": 1.16, + "step": 5598 + }, + { + "epoch": 0.8801902179252884, + "grad_norm": 0.1285528838634491, + "learning_rate": 2.9716572993081227e-05, + "loss": 1.151, + "step": 5599 + }, + { + "epoch": 0.8803474228222209, + "grad_norm": 0.15503568947315216, + "learning_rate": 2.9710505401747974e-05, + "loss": 1.2136, + "step": 5600 + }, + { + "epoch": 0.8803474228222209, + "eval_loss": 1.092179536819458, + "eval_runtime": 2318.8274, + "eval_samples_per_second": 3.993, + "eval_steps_per_second": 1.996, + "step": 5600 + }, + { + "epoch": 0.8805046277191535, + "grad_norm": 0.13900360465049744, + "learning_rate": 2.9704437522714966e-05, + "loss": 1.1117, + "step": 5601 + }, + { + "epoch": 0.880661832616086, + "grad_norm": 0.14534588158130646, + "learning_rate": 2.9698369356352824e-05, + "loss": 1.0728, + "step": 5602 + }, + { + "epoch": 0.8808190375130185, + "grad_norm": 0.1375589370727539, + "learning_rate": 2.9692300903032146e-05, + "loss": 1.0369, + "step": 5603 + }, + { + "epoch": 0.8809762424099511, + "grad_norm": 0.14678697288036346, + "learning_rate": 2.9686232163123585e-05, + "loss": 1.1632, + "step": 5604 + }, + { + "epoch": 0.8811334473068836, + "grad_norm": 0.1398497372865677, + "learning_rate": 2.9680163136997797e-05, + "loss": 1.0731, + "step": 5605 + }, + { + "epoch": 0.8812906522038162, + "grad_norm": 0.13608963787555695, + "learning_rate": 2.9674093825025444e-05, + "loss": 1.012, + "step": 5606 + }, + { + "epoch": 0.8814478571007487, + "grad_norm": 0.12462413311004639, + "learning_rate": 2.9668024227577228e-05, + "loss": 1.1281, + "step": 5607 + }, + { + "epoch": 0.8816050619976812, + "grad_norm": 0.1289960741996765, + "learning_rate": 2.9661954345023857e-05, + "loss": 1.0557, + "step": 5608 + }, + { + "epoch": 0.8817622668946138, + "grad_norm": 0.14277970790863037, + "learning_rate": 2.965588417773605e-05, + "loss": 1.1804, + "step": 5609 + }, + { + "epoch": 0.8819194717915463, + "grad_norm": 0.14598487317562103, + "learning_rate": 2.964981372608455e-05, + "loss": 1.167, + "step": 5610 + }, + { + "epoch": 0.8820766766884789, + "grad_norm": 0.13253039121627808, + "learning_rate": 2.964374299044012e-05, + "loss": 1.107, + "step": 5611 + }, + { + "epoch": 0.8822338815854114, + "grad_norm": 0.1302221566438675, + "learning_rate": 2.9637671971173537e-05, + "loss": 1.1797, + "step": 5612 + }, + { + "epoch": 0.8823910864823439, + "grad_norm": 0.135704904794693, + "learning_rate": 2.963160066865559e-05, + "loss": 1.1462, + "step": 5613 + }, + { + "epoch": 0.8825482913792765, + "grad_norm": 0.11983178555965424, + "learning_rate": 2.962552908325711e-05, + "loss": 1.1256, + "step": 5614 + }, + { + "epoch": 0.882705496276209, + "grad_norm": 0.1335216611623764, + "learning_rate": 2.96194572153489e-05, + "loss": 1.1394, + "step": 5615 + }, + { + "epoch": 0.8828627011731416, + "grad_norm": 0.12216528505086899, + "learning_rate": 2.9613385065301825e-05, + "loss": 1.1634, + "step": 5616 + }, + { + "epoch": 0.8830199060700741, + "grad_norm": 0.11962614208459854, + "learning_rate": 2.9607312633486732e-05, + "loss": 1.0316, + "step": 5617 + }, + { + "epoch": 0.8831771109670066, + "grad_norm": 0.13816697895526886, + "learning_rate": 2.9601239920274525e-05, + "loss": 1.0586, + "step": 5618 + }, + { + "epoch": 0.8833343158639392, + "grad_norm": 0.16819511353969574, + "learning_rate": 2.9595166926036088e-05, + "loss": 1.0594, + "step": 5619 + }, + { + "epoch": 0.8834915207608717, + "grad_norm": 0.13210850954055786, + "learning_rate": 2.9589093651142336e-05, + "loss": 1.0821, + "step": 5620 + }, + { + "epoch": 0.8836487256578043, + "grad_norm": 0.13068845868110657, + "learning_rate": 2.9583020095964207e-05, + "loss": 1.1491, + "step": 5621 + }, + { + "epoch": 0.8838059305547368, + "grad_norm": 0.13062553107738495, + "learning_rate": 2.9576946260872647e-05, + "loss": 1.052, + "step": 5622 + }, + { + "epoch": 0.8839631354516693, + "grad_norm": 0.13623276352882385, + "learning_rate": 2.9570872146238627e-05, + "loss": 1.0872, + "step": 5623 + }, + { + "epoch": 0.8841203403486019, + "grad_norm": 0.12047271430492401, + "learning_rate": 2.9564797752433116e-05, + "loss": 1.0985, + "step": 5624 + }, + { + "epoch": 0.8842775452455344, + "grad_norm": 0.12949922680854797, + "learning_rate": 2.9558723079827134e-05, + "loss": 1.0161, + "step": 5625 + }, + { + "epoch": 0.884434750142467, + "grad_norm": 0.14409802854061127, + "learning_rate": 2.9552648128791692e-05, + "loss": 1.2198, + "step": 5626 + }, + { + "epoch": 0.8845919550393995, + "grad_norm": 0.1382865011692047, + "learning_rate": 2.9546572899697823e-05, + "loss": 1.0821, + "step": 5627 + }, + { + "epoch": 0.884749159936332, + "grad_norm": 0.12341055274009705, + "learning_rate": 2.9540497392916576e-05, + "loss": 1.0665, + "step": 5628 + }, + { + "epoch": 0.8849063648332646, + "grad_norm": 0.13860835134983063, + "learning_rate": 2.953442160881903e-05, + "loss": 1.1157, + "step": 5629 + }, + { + "epoch": 0.8850635697301971, + "grad_norm": 0.14050939679145813, + "learning_rate": 2.9528345547776264e-05, + "loss": 1.0045, + "step": 5630 + }, + { + "epoch": 0.8852207746271297, + "grad_norm": 0.1365601122379303, + "learning_rate": 2.952226921015938e-05, + "loss": 1.0824, + "step": 5631 + }, + { + "epoch": 0.8853779795240622, + "grad_norm": 0.13242600858211517, + "learning_rate": 2.9516192596339498e-05, + "loss": 1.0734, + "step": 5632 + }, + { + "epoch": 0.8855351844209947, + "grad_norm": 0.12390458583831787, + "learning_rate": 2.951011570668775e-05, + "loss": 1.0952, + "step": 5633 + }, + { + "epoch": 0.8856923893179273, + "grad_norm": 0.12306761741638184, + "learning_rate": 2.95040385415753e-05, + "loss": 1.1909, + "step": 5634 + }, + { + "epoch": 0.8858495942148598, + "grad_norm": 0.12778042256832123, + "learning_rate": 2.9497961101373318e-05, + "loss": 1.0, + "step": 5635 + }, + { + "epoch": 0.8860067991117924, + "grad_norm": 0.12413736432790756, + "learning_rate": 2.949188338645298e-05, + "loss": 1.0075, + "step": 5636 + }, + { + "epoch": 0.8861640040087249, + "grad_norm": 0.13342618942260742, + "learning_rate": 2.948580539718549e-05, + "loss": 0.9862, + "step": 5637 + }, + { + "epoch": 0.8863212089056575, + "grad_norm": 0.14185477793216705, + "learning_rate": 2.9479727133942074e-05, + "loss": 1.0782, + "step": 5638 + }, + { + "epoch": 0.88647841380259, + "grad_norm": 0.1387973427772522, + "learning_rate": 2.9473648597093972e-05, + "loss": 1.1845, + "step": 5639 + }, + { + "epoch": 0.8866356186995225, + "grad_norm": 0.11654473096132278, + "learning_rate": 2.9467569787012444e-05, + "loss": 1.137, + "step": 5640 + }, + { + "epoch": 0.8867928235964551, + "grad_norm": 0.12042026221752167, + "learning_rate": 2.946149070406874e-05, + "loss": 1.1053, + "step": 5641 + }, + { + "epoch": 0.8869500284933876, + "grad_norm": 0.12750862538814545, + "learning_rate": 2.9455411348634166e-05, + "loss": 1.1555, + "step": 5642 + }, + { + "epoch": 0.8871072333903202, + "grad_norm": 0.12096679210662842, + "learning_rate": 2.944933172108001e-05, + "loss": 1.0446, + "step": 5643 + }, + { + "epoch": 0.8872644382872527, + "grad_norm": 0.12263115495443344, + "learning_rate": 2.9443251821777612e-05, + "loss": 1.1041, + "step": 5644 + }, + { + "epoch": 0.8874216431841851, + "grad_norm": 0.1340923011302948, + "learning_rate": 2.943717165109829e-05, + "loss": 1.1432, + "step": 5645 + }, + { + "epoch": 0.8875788480811178, + "grad_norm": 0.13965152204036713, + "learning_rate": 2.9431091209413408e-05, + "loss": 1.0197, + "step": 5646 + }, + { + "epoch": 0.8877360529780502, + "grad_norm": 0.1291363686323166, + "learning_rate": 2.942501049709433e-05, + "loss": 1.0755, + "step": 5647 + }, + { + "epoch": 0.8878932578749829, + "grad_norm": 0.12972478568553925, + "learning_rate": 2.941892951451245e-05, + "loss": 1.1581, + "step": 5648 + }, + { + "epoch": 0.8880504627719153, + "grad_norm": 0.1353209912776947, + "learning_rate": 2.941284826203917e-05, + "loss": 1.0234, + "step": 5649 + }, + { + "epoch": 0.8882076676688478, + "grad_norm": 0.14914408326148987, + "learning_rate": 2.940676674004591e-05, + "loss": 1.1458, + "step": 5650 + }, + { + "epoch": 0.8883648725657805, + "grad_norm": 0.13796725869178772, + "learning_rate": 2.94006849489041e-05, + "loss": 1.1501, + "step": 5651 + }, + { + "epoch": 0.888522077462713, + "grad_norm": 0.1332174837589264, + "learning_rate": 2.9394602888985195e-05, + "loss": 1.1728, + "step": 5652 + }, + { + "epoch": 0.8886792823596456, + "grad_norm": 0.12382815033197403, + "learning_rate": 2.9388520560660672e-05, + "loss": 1.0498, + "step": 5653 + }, + { + "epoch": 0.888836487256578, + "grad_norm": 0.12109874933958054, + "learning_rate": 2.9382437964302012e-05, + "loss": 1.1484, + "step": 5654 + }, + { + "epoch": 0.8889936921535105, + "grad_norm": 0.12309691309928894, + "learning_rate": 2.9376355100280707e-05, + "loss": 1.1729, + "step": 5655 + }, + { + "epoch": 0.8891508970504431, + "grad_norm": 0.1355372965335846, + "learning_rate": 2.937027196896829e-05, + "loss": 1.0669, + "step": 5656 + }, + { + "epoch": 0.8893081019473756, + "grad_norm": 0.12167185544967651, + "learning_rate": 2.9364188570736278e-05, + "loss": 1.1057, + "step": 5657 + }, + { + "epoch": 0.8894653068443082, + "grad_norm": 0.14521557092666626, + "learning_rate": 2.9358104905956246e-05, + "loss": 1.1579, + "step": 5658 + }, + { + "epoch": 0.8896225117412407, + "grad_norm": 0.13225121796131134, + "learning_rate": 2.9352020974999745e-05, + "loss": 1.2134, + "step": 5659 + }, + { + "epoch": 0.8897797166381732, + "grad_norm": 0.18582189083099365, + "learning_rate": 2.9345936778238352e-05, + "loss": 1.1533, + "step": 5660 + }, + { + "epoch": 0.8899369215351058, + "grad_norm": 0.11820016801357269, + "learning_rate": 2.9339852316043692e-05, + "loss": 1.0359, + "step": 5661 + }, + { + "epoch": 0.8900941264320383, + "grad_norm": 0.14417026937007904, + "learning_rate": 2.9333767588787347e-05, + "loss": 1.0174, + "step": 5662 + }, + { + "epoch": 0.8902513313289709, + "grad_norm": 0.12442802637815475, + "learning_rate": 2.9327682596840978e-05, + "loss": 1.0564, + "step": 5663 + }, + { + "epoch": 0.8904085362259034, + "grad_norm": 0.12792329490184784, + "learning_rate": 2.932159734057622e-05, + "loss": 1.0533, + "step": 5664 + }, + { + "epoch": 0.8905657411228359, + "grad_norm": 0.14213603734970093, + "learning_rate": 2.931551182036474e-05, + "loss": 1.0509, + "step": 5665 + }, + { + "epoch": 0.8907229460197685, + "grad_norm": 0.3698267638683319, + "learning_rate": 2.930942603657821e-05, + "loss": 1.0973, + "step": 5666 + }, + { + "epoch": 0.890880150916701, + "grad_norm": 0.13819676637649536, + "learning_rate": 2.930333998958834e-05, + "loss": 1.0142, + "step": 5667 + }, + { + "epoch": 0.8910373558136336, + "grad_norm": 0.13877476751804352, + "learning_rate": 2.9297253679766833e-05, + "loss": 1.1646, + "step": 5668 + }, + { + "epoch": 0.8911945607105661, + "grad_norm": 0.12779487669467926, + "learning_rate": 2.929116710748543e-05, + "loss": 1.0479, + "step": 5669 + }, + { + "epoch": 0.8913517656074986, + "grad_norm": 0.1457994282245636, + "learning_rate": 2.928508027311586e-05, + "loss": 1.106, + "step": 5670 + }, + { + "epoch": 0.8915089705044312, + "grad_norm": 0.13743549585342407, + "learning_rate": 2.9278993177029886e-05, + "loss": 1.201, + "step": 5671 + }, + { + "epoch": 0.8916661754013637, + "grad_norm": 0.1481037139892578, + "learning_rate": 2.92729058195993e-05, + "loss": 1.1667, + "step": 5672 + }, + { + "epoch": 0.8918233802982963, + "grad_norm": 0.1308559626340866, + "learning_rate": 2.9266818201195877e-05, + "loss": 1.1738, + "step": 5673 + }, + { + "epoch": 0.8919805851952288, + "grad_norm": 0.12915688753128052, + "learning_rate": 2.9260730322191433e-05, + "loss": 1.1074, + "step": 5674 + }, + { + "epoch": 0.8921377900921613, + "grad_norm": 0.13497284054756165, + "learning_rate": 2.9254642182957796e-05, + "loss": 1.0524, + "step": 5675 + }, + { + "epoch": 0.8922949949890939, + "grad_norm": 0.1505495011806488, + "learning_rate": 2.9248553783866795e-05, + "loss": 0.9744, + "step": 5676 + }, + { + "epoch": 0.8924521998860264, + "grad_norm": 0.14345882833003998, + "learning_rate": 2.9242465125290296e-05, + "loss": 1.1452, + "step": 5677 + }, + { + "epoch": 0.892609404782959, + "grad_norm": 0.14089809358119965, + "learning_rate": 2.923637620760017e-05, + "loss": 1.1527, + "step": 5678 + }, + { + "epoch": 0.8927666096798915, + "grad_norm": 0.13136571645736694, + "learning_rate": 2.9230287031168313e-05, + "loss": 1.1374, + "step": 5679 + }, + { + "epoch": 0.892923814576824, + "grad_norm": 0.13048121333122253, + "learning_rate": 2.922419759636661e-05, + "loss": 1.1425, + "step": 5680 + }, + { + "epoch": 0.8930810194737566, + "grad_norm": 0.1330985426902771, + "learning_rate": 2.9218107903566982e-05, + "loss": 1.0982, + "step": 5681 + }, + { + "epoch": 0.8932382243706891, + "grad_norm": 0.1535046398639679, + "learning_rate": 2.921201795314138e-05, + "loss": 1.0162, + "step": 5682 + }, + { + "epoch": 0.8933954292676217, + "grad_norm": 0.13158921897411346, + "learning_rate": 2.920592774546175e-05, + "loss": 1.0662, + "step": 5683 + }, + { + "epoch": 0.8935526341645542, + "grad_norm": 0.14994926750659943, + "learning_rate": 2.9199837280900055e-05, + "loss": 1.1095, + "step": 5684 + }, + { + "epoch": 0.8937098390614867, + "grad_norm": 0.14015507698059082, + "learning_rate": 2.9193746559828277e-05, + "loss": 1.0476, + "step": 5685 + }, + { + "epoch": 0.8938670439584193, + "grad_norm": 0.14255230128765106, + "learning_rate": 2.918765558261841e-05, + "loss": 1.0836, + "step": 5686 + }, + { + "epoch": 0.8940242488553518, + "grad_norm": 0.14916080236434937, + "learning_rate": 2.918156434964248e-05, + "loss": 1.1232, + "step": 5687 + }, + { + "epoch": 0.8941814537522844, + "grad_norm": 0.13430987298488617, + "learning_rate": 2.917547286127251e-05, + "loss": 1.0157, + "step": 5688 + }, + { + "epoch": 0.8943386586492169, + "grad_norm": 0.15098822116851807, + "learning_rate": 2.916938111788054e-05, + "loss": 1.0752, + "step": 5689 + }, + { + "epoch": 0.8944958635461495, + "grad_norm": 0.13318057358264923, + "learning_rate": 2.9163289119838637e-05, + "loss": 1.128, + "step": 5690 + }, + { + "epoch": 0.894653068443082, + "grad_norm": 0.13592058420181274, + "learning_rate": 2.915719686751887e-05, + "loss": 1.1154, + "step": 5691 + }, + { + "epoch": 0.8948102733400145, + "grad_norm": 0.12344607710838318, + "learning_rate": 2.9151104361293342e-05, + "loss": 1.1124, + "step": 5692 + }, + { + "epoch": 0.8949674782369471, + "grad_norm": 0.14875905215740204, + "learning_rate": 2.9145011601534156e-05, + "loss": 1.1848, + "step": 5693 + }, + { + "epoch": 0.8951246831338796, + "grad_norm": 0.119333416223526, + "learning_rate": 2.9138918588613423e-05, + "loss": 1.1708, + "step": 5694 + }, + { + "epoch": 0.8952818880308122, + "grad_norm": 0.1300589144229889, + "learning_rate": 2.913282532290329e-05, + "loss": 1.0361, + "step": 5695 + }, + { + "epoch": 0.8954390929277447, + "grad_norm": 0.12374424189329147, + "learning_rate": 2.9126731804775915e-05, + "loss": 0.9735, + "step": 5696 + }, + { + "epoch": 0.8955962978246772, + "grad_norm": 0.12504424154758453, + "learning_rate": 2.9120638034603465e-05, + "loss": 1.1243, + "step": 5697 + }, + { + "epoch": 0.8957535027216098, + "grad_norm": 0.14099591970443726, + "learning_rate": 2.9114544012758123e-05, + "loss": 1.0982, + "step": 5698 + }, + { + "epoch": 0.8959107076185423, + "grad_norm": 0.12895767390727997, + "learning_rate": 2.910844973961208e-05, + "loss": 0.9725, + "step": 5699 + }, + { + "epoch": 0.8960679125154749, + "grad_norm": 0.17303088307380676, + "learning_rate": 2.9102355215537564e-05, + "loss": 1.1134, + "step": 5700 + }, + { + "epoch": 0.8962251174124074, + "grad_norm": 0.13230673968791962, + "learning_rate": 2.9096260440906797e-05, + "loss": 1.1241, + "step": 5701 + }, + { + "epoch": 0.8963823223093399, + "grad_norm": 0.14133276045322418, + "learning_rate": 2.9090165416092026e-05, + "loss": 1.1413, + "step": 5702 + }, + { + "epoch": 0.8965395272062725, + "grad_norm": 0.13716180622577667, + "learning_rate": 2.908407014146552e-05, + "loss": 1.171, + "step": 5703 + }, + { + "epoch": 0.896696732103205, + "grad_norm": 0.15286429226398468, + "learning_rate": 2.9077974617399544e-05, + "loss": 1.0598, + "step": 5704 + }, + { + "epoch": 0.8968539370001376, + "grad_norm": 0.13907144963741302, + "learning_rate": 2.9071878844266397e-05, + "loss": 1.1922, + "step": 5705 + }, + { + "epoch": 0.8970111418970701, + "grad_norm": 0.14333570003509521, + "learning_rate": 2.9065782822438376e-05, + "loss": 1.2253, + "step": 5706 + }, + { + "epoch": 0.8971683467940026, + "grad_norm": 0.13578782975673676, + "learning_rate": 2.9059686552287817e-05, + "loss": 1.0356, + "step": 5707 + }, + { + "epoch": 0.8973255516909352, + "grad_norm": 0.14451970160007477, + "learning_rate": 2.9053590034187044e-05, + "loss": 1.0968, + "step": 5708 + }, + { + "epoch": 0.8974827565878677, + "grad_norm": 0.16025343537330627, + "learning_rate": 2.9047493268508418e-05, + "loss": 1.1268, + "step": 5709 + }, + { + "epoch": 0.8976399614848003, + "grad_norm": 0.12989938259124756, + "learning_rate": 2.904139625562431e-05, + "loss": 1.04, + "step": 5710 + }, + { + "epoch": 0.8977971663817328, + "grad_norm": 0.12494029849767685, + "learning_rate": 2.903529899590708e-05, + "loss": 1.1163, + "step": 5711 + }, + { + "epoch": 0.8979543712786653, + "grad_norm": 0.16564156115055084, + "learning_rate": 2.9029201489729154e-05, + "loss": 1.165, + "step": 5712 + }, + { + "epoch": 0.8981115761755979, + "grad_norm": 0.13529354333877563, + "learning_rate": 2.902310373746292e-05, + "loss": 1.1381, + "step": 5713 + }, + { + "epoch": 0.8982687810725304, + "grad_norm": 0.1375499963760376, + "learning_rate": 2.9017005739480817e-05, + "loss": 1.1211, + "step": 5714 + }, + { + "epoch": 0.898425985969463, + "grad_norm": 0.13199225068092346, + "learning_rate": 2.9010907496155298e-05, + "loss": 1.0222, + "step": 5715 + }, + { + "epoch": 0.8985831908663955, + "grad_norm": 0.12772680819034576, + "learning_rate": 2.9004809007858797e-05, + "loss": 1.0434, + "step": 5716 + }, + { + "epoch": 0.898740395763328, + "grad_norm": 0.1435181349515915, + "learning_rate": 2.899871027496381e-05, + "loss": 1.1251, + "step": 5717 + }, + { + "epoch": 0.8988976006602606, + "grad_norm": 0.14025722444057465, + "learning_rate": 2.899261129784281e-05, + "loss": 1.1731, + "step": 5718 + }, + { + "epoch": 0.8990548055571931, + "grad_norm": 0.13375252485275269, + "learning_rate": 2.8986512076868305e-05, + "loss": 1.2054, + "step": 5719 + }, + { + "epoch": 0.8992120104541257, + "grad_norm": 0.1311463713645935, + "learning_rate": 2.8980412612412804e-05, + "loss": 1.1628, + "step": 5720 + }, + { + "epoch": 0.8993692153510582, + "grad_norm": 0.15299151837825775, + "learning_rate": 2.897431290484885e-05, + "loss": 1.1187, + "step": 5721 + }, + { + "epoch": 0.8995264202479907, + "grad_norm": 0.13497400283813477, + "learning_rate": 2.8968212954548978e-05, + "loss": 1.1027, + "step": 5722 + }, + { + "epoch": 0.8996836251449233, + "grad_norm": 0.12568099796772003, + "learning_rate": 2.896211276188576e-05, + "loss": 1.1713, + "step": 5723 + }, + { + "epoch": 0.8998408300418558, + "grad_norm": 0.1380302459001541, + "learning_rate": 2.895601232723178e-05, + "loss": 0.9251, + "step": 5724 + }, + { + "epoch": 0.8999980349387884, + "grad_norm": 0.1408771425485611, + "learning_rate": 2.8949911650959605e-05, + "loss": 1.1332, + "step": 5725 + }, + { + "epoch": 0.9001552398357209, + "grad_norm": 0.16044233739376068, + "learning_rate": 2.894381073344186e-05, + "loss": 0.9742, + "step": 5726 + }, + { + "epoch": 0.9003124447326534, + "grad_norm": 0.1280396729707718, + "learning_rate": 2.893770957505116e-05, + "loss": 1.0877, + "step": 5727 + }, + { + "epoch": 0.900469649629586, + "grad_norm": 0.14867059886455536, + "learning_rate": 2.8931608176160147e-05, + "loss": 1.1679, + "step": 5728 + }, + { + "epoch": 0.9006268545265185, + "grad_norm": 0.12422164529561996, + "learning_rate": 2.8925506537141457e-05, + "loss": 0.998, + "step": 5729 + }, + { + "epoch": 0.9007840594234511, + "grad_norm": 0.13027115166187286, + "learning_rate": 2.8919404658367767e-05, + "loss": 1.0958, + "step": 5730 + }, + { + "epoch": 0.9009412643203836, + "grad_norm": 0.13427293300628662, + "learning_rate": 2.8913302540211744e-05, + "loss": 1.1546, + "step": 5731 + }, + { + "epoch": 0.9010984692173161, + "grad_norm": 0.13530409336090088, + "learning_rate": 2.89072001830461e-05, + "loss": 1.1248, + "step": 5732 + }, + { + "epoch": 0.9012556741142487, + "grad_norm": 0.13067205250263214, + "learning_rate": 2.8901097587243543e-05, + "loss": 1.1047, + "step": 5733 + }, + { + "epoch": 0.9014128790111812, + "grad_norm": 0.12409019470214844, + "learning_rate": 2.8894994753176774e-05, + "loss": 1.0545, + "step": 5734 + }, + { + "epoch": 0.9015700839081138, + "grad_norm": 0.12260918319225311, + "learning_rate": 2.888889168121855e-05, + "loss": 1.0372, + "step": 5735 + }, + { + "epoch": 0.9017272888050463, + "grad_norm": 0.16670134663581848, + "learning_rate": 2.8882788371741616e-05, + "loss": 1.0605, + "step": 5736 + }, + { + "epoch": 0.9018844937019788, + "grad_norm": 0.13768966495990753, + "learning_rate": 2.8876684825118745e-05, + "loss": 1.1089, + "step": 5737 + }, + { + "epoch": 0.9020416985989114, + "grad_norm": 0.1394687443971634, + "learning_rate": 2.8870581041722718e-05, + "loss": 1.0188, + "step": 5738 + }, + { + "epoch": 0.9021989034958439, + "grad_norm": 0.13164588809013367, + "learning_rate": 2.8864477021926317e-05, + "loss": 1.1304, + "step": 5739 + }, + { + "epoch": 0.9023561083927765, + "grad_norm": 0.1338716596364975, + "learning_rate": 2.885837276610237e-05, + "loss": 1.1424, + "step": 5740 + }, + { + "epoch": 0.902513313289709, + "grad_norm": 0.1532304883003235, + "learning_rate": 2.8852268274623688e-05, + "loss": 1.0907, + "step": 5741 + }, + { + "epoch": 0.9026705181866416, + "grad_norm": 0.13141104578971863, + "learning_rate": 2.8846163547863127e-05, + "loss": 1.1975, + "step": 5742 + }, + { + "epoch": 0.9028277230835741, + "grad_norm": 0.13862882554531097, + "learning_rate": 2.8840058586193526e-05, + "loss": 1.1301, + "step": 5743 + }, + { + "epoch": 0.9029849279805066, + "grad_norm": 0.12489652633666992, + "learning_rate": 2.8833953389987755e-05, + "loss": 1.0293, + "step": 5744 + }, + { + "epoch": 0.9031421328774392, + "grad_norm": 0.1392124742269516, + "learning_rate": 2.8827847959618697e-05, + "loss": 1.1501, + "step": 5745 + }, + { + "epoch": 0.9032993377743717, + "grad_norm": 0.13289155066013336, + "learning_rate": 2.882174229545925e-05, + "loss": 1.1389, + "step": 5746 + }, + { + "epoch": 0.9034565426713043, + "grad_norm": 0.15226851403713226, + "learning_rate": 2.881563639788233e-05, + "loss": 1.1321, + "step": 5747 + }, + { + "epoch": 0.9036137475682368, + "grad_norm": 0.14896303415298462, + "learning_rate": 2.8809530267260855e-05, + "loss": 1.233, + "step": 5748 + }, + { + "epoch": 0.9037709524651693, + "grad_norm": 0.14482875168323517, + "learning_rate": 2.8803423903967757e-05, + "loss": 1.1535, + "step": 5749 + }, + { + "epoch": 0.9039281573621019, + "grad_norm": 0.13182827830314636, + "learning_rate": 2.8797317308376005e-05, + "loss": 1.0509, + "step": 5750 + }, + { + "epoch": 0.9040853622590344, + "grad_norm": 0.14121131598949432, + "learning_rate": 2.879121048085856e-05, + "loss": 1.1413, + "step": 5751 + }, + { + "epoch": 0.904242567155967, + "grad_norm": 0.14073173701763153, + "learning_rate": 2.8785103421788406e-05, + "loss": 1.1655, + "step": 5752 + }, + { + "epoch": 0.9043997720528995, + "grad_norm": 0.13899986445903778, + "learning_rate": 2.8778996131538534e-05, + "loss": 1.0742, + "step": 5753 + }, + { + "epoch": 0.904556976949832, + "grad_norm": 0.12695467472076416, + "learning_rate": 2.877288861048196e-05, + "loss": 1.0693, + "step": 5754 + }, + { + "epoch": 0.9047141818467646, + "grad_norm": 0.12788677215576172, + "learning_rate": 2.8766780858991686e-05, + "loss": 1.0867, + "step": 5755 + }, + { + "epoch": 0.904871386743697, + "grad_norm": 0.14149411022663116, + "learning_rate": 2.876067287744079e-05, + "loss": 1.2043, + "step": 5756 + }, + { + "epoch": 0.9050285916406297, + "grad_norm": 0.152032271027565, + "learning_rate": 2.8754564666202295e-05, + "loss": 1.1418, + "step": 5757 + }, + { + "epoch": 0.9051857965375621, + "grad_norm": 0.1416063904762268, + "learning_rate": 2.8748456225649273e-05, + "loss": 1.0548, + "step": 5758 + }, + { + "epoch": 0.9053430014344946, + "grad_norm": 0.13762247562408447, + "learning_rate": 2.8742347556154812e-05, + "loss": 1.1292, + "step": 5759 + }, + { + "epoch": 0.9055002063314272, + "grad_norm": 0.1333901286125183, + "learning_rate": 2.8736238658091998e-05, + "loss": 1.054, + "step": 5760 + }, + { + "epoch": 0.9055002063314272, + "eval_loss": 1.0913219451904297, + "eval_runtime": 2381.0688, + "eval_samples_per_second": 3.888, + "eval_steps_per_second": 1.944, + "step": 5760 + }, + { + "epoch": 0.9056574112283597, + "grad_norm": 0.12779124081134796, + "learning_rate": 2.8730129531833943e-05, + "loss": 1.0889, + "step": 5761 + }, + { + "epoch": 0.9058146161252923, + "grad_norm": 0.12788347899913788, + "learning_rate": 2.872402017775377e-05, + "loss": 1.0804, + "step": 5762 + }, + { + "epoch": 0.9059718210222248, + "grad_norm": 0.1475571095943451, + "learning_rate": 2.8717910596224613e-05, + "loss": 1.0516, + "step": 5763 + }, + { + "epoch": 0.9061290259191573, + "grad_norm": 0.12808263301849365, + "learning_rate": 2.8711800787619625e-05, + "loss": 1.229, + "step": 5764 + }, + { + "epoch": 0.9062862308160899, + "grad_norm": 0.1640312373638153, + "learning_rate": 2.8705690752311954e-05, + "loss": 1.0579, + "step": 5765 + }, + { + "epoch": 0.9064434357130224, + "grad_norm": 0.12811321020126343, + "learning_rate": 2.8699580490674803e-05, + "loss": 1.0869, + "step": 5766 + }, + { + "epoch": 0.906600640609955, + "grad_norm": 0.13901269435882568, + "learning_rate": 2.869347000308135e-05, + "loss": 1.0646, + "step": 5767 + }, + { + "epoch": 0.9067578455068875, + "grad_norm": 0.12225610762834549, + "learning_rate": 2.868735928990481e-05, + "loss": 1.0325, + "step": 5768 + }, + { + "epoch": 0.90691505040382, + "grad_norm": 0.13094498217105865, + "learning_rate": 2.8681248351518382e-05, + "loss": 1.1776, + "step": 5769 + }, + { + "epoch": 0.9070722553007526, + "grad_norm": 0.12676210701465607, + "learning_rate": 2.867513718829531e-05, + "loss": 1.0929, + "step": 5770 + }, + { + "epoch": 0.9072294601976851, + "grad_norm": 0.13293857872486115, + "learning_rate": 2.8669025800608845e-05, + "loss": 1.0235, + "step": 5771 + }, + { + "epoch": 0.9073866650946177, + "grad_norm": 0.14774401485919952, + "learning_rate": 2.8662914188832244e-05, + "loss": 1.0696, + "step": 5772 + }, + { + "epoch": 0.9075438699915502, + "grad_norm": 0.14324910938739777, + "learning_rate": 2.8656802353338786e-05, + "loss": 1.2142, + "step": 5773 + }, + { + "epoch": 0.9077010748884827, + "grad_norm": 0.14124560356140137, + "learning_rate": 2.8650690294501747e-05, + "loss": 1.0529, + "step": 5774 + }, + { + "epoch": 0.9078582797854153, + "grad_norm": 0.1517721563577652, + "learning_rate": 2.864457801269444e-05, + "loss": 1.1425, + "step": 5775 + }, + { + "epoch": 0.9080154846823478, + "grad_norm": 0.17718786001205444, + "learning_rate": 2.8638465508290168e-05, + "loss": 1.1572, + "step": 5776 + }, + { + "epoch": 0.9081726895792804, + "grad_norm": 0.14374038577079773, + "learning_rate": 2.863235278166227e-05, + "loss": 1.1278, + "step": 5777 + }, + { + "epoch": 0.9083298944762129, + "grad_norm": 0.12687242031097412, + "learning_rate": 2.8626239833184088e-05, + "loss": 1.096, + "step": 5778 + }, + { + "epoch": 0.9084870993731454, + "grad_norm": 0.12965981662273407, + "learning_rate": 2.8620126663228968e-05, + "loss": 1.264, + "step": 5779 + }, + { + "epoch": 0.908644304270078, + "grad_norm": 0.13415074348449707, + "learning_rate": 2.8614013272170286e-05, + "loss": 1.0826, + "step": 5780 + }, + { + "epoch": 0.9088015091670105, + "grad_norm": 0.1368751972913742, + "learning_rate": 2.8607899660381423e-05, + "loss": 1.0569, + "step": 5781 + }, + { + "epoch": 0.9089587140639431, + "grad_norm": 0.15209050476551056, + "learning_rate": 2.8601785828235782e-05, + "loss": 1.0354, + "step": 5782 + }, + { + "epoch": 0.9091159189608756, + "grad_norm": 0.1487579345703125, + "learning_rate": 2.8595671776106757e-05, + "loss": 1.0684, + "step": 5783 + }, + { + "epoch": 0.9092731238578081, + "grad_norm": 0.14560696482658386, + "learning_rate": 2.8589557504367785e-05, + "loss": 1.1637, + "step": 5784 + }, + { + "epoch": 0.9094303287547407, + "grad_norm": 0.12533873319625854, + "learning_rate": 2.85834430133923e-05, + "loss": 1.0534, + "step": 5785 + }, + { + "epoch": 0.9095875336516732, + "grad_norm": 0.12900641560554504, + "learning_rate": 2.8577328303553747e-05, + "loss": 1.1495, + "step": 5786 + }, + { + "epoch": 0.9097447385486058, + "grad_norm": 0.1582890748977661, + "learning_rate": 2.8571213375225598e-05, + "loss": 1.1261, + "step": 5787 + }, + { + "epoch": 0.9099019434455383, + "grad_norm": 0.12620587646961212, + "learning_rate": 2.856509822878132e-05, + "loss": 1.0539, + "step": 5788 + }, + { + "epoch": 0.9100591483424708, + "grad_norm": 0.1520870178937912, + "learning_rate": 2.8558982864594403e-05, + "loss": 1.0509, + "step": 5789 + }, + { + "epoch": 0.9102163532394034, + "grad_norm": 0.13580292463302612, + "learning_rate": 2.8552867283038355e-05, + "loss": 1.0996, + "step": 5790 + }, + { + "epoch": 0.9103735581363359, + "grad_norm": 0.1273009032011032, + "learning_rate": 2.85467514844867e-05, + "loss": 1.2277, + "step": 5791 + }, + { + "epoch": 0.9105307630332685, + "grad_norm": 0.12350128591060638, + "learning_rate": 2.8540635469312948e-05, + "loss": 1.0113, + "step": 5792 + }, + { + "epoch": 0.910687967930201, + "grad_norm": 0.12338768690824509, + "learning_rate": 2.8534519237890655e-05, + "loss": 1.1468, + "step": 5793 + }, + { + "epoch": 0.9108451728271336, + "grad_norm": 0.12832117080688477, + "learning_rate": 2.852840279059338e-05, + "loss": 1.1789, + "step": 5794 + }, + { + "epoch": 0.9110023777240661, + "grad_norm": 0.13606378436088562, + "learning_rate": 2.8522286127794678e-05, + "loss": 1.1277, + "step": 5795 + }, + { + "epoch": 0.9111595826209986, + "grad_norm": 0.13515432178974152, + "learning_rate": 2.851616924986815e-05, + "loss": 1.0243, + "step": 5796 + }, + { + "epoch": 0.9113167875179312, + "grad_norm": 0.14000850915908813, + "learning_rate": 2.851005215718738e-05, + "loss": 1.1349, + "step": 5797 + }, + { + "epoch": 0.9114739924148637, + "grad_norm": 0.1517215222120285, + "learning_rate": 2.8503934850125972e-05, + "loss": 1.1119, + "step": 5798 + }, + { + "epoch": 0.9116311973117963, + "grad_norm": 0.12336322665214539, + "learning_rate": 2.8497817329057565e-05, + "loss": 1.0401, + "step": 5799 + }, + { + "epoch": 0.9117884022087288, + "grad_norm": 0.1374485194683075, + "learning_rate": 2.8491699594355776e-05, + "loss": 1.1117, + "step": 5800 + }, + { + "epoch": 0.9119456071056613, + "grad_norm": 0.1384294331073761, + "learning_rate": 2.8485581646394266e-05, + "loss": 1.0481, + "step": 5801 + }, + { + "epoch": 0.9121028120025939, + "grad_norm": 0.13905028998851776, + "learning_rate": 2.8479463485546688e-05, + "loss": 1.1051, + "step": 5802 + }, + { + "epoch": 0.9122600168995264, + "grad_norm": 0.1367316097021103, + "learning_rate": 2.8473345112186722e-05, + "loss": 1.0943, + "step": 5803 + }, + { + "epoch": 0.912417221796459, + "grad_norm": 0.14246809482574463, + "learning_rate": 2.8467226526688046e-05, + "loss": 1.1433, + "step": 5804 + }, + { + "epoch": 0.9125744266933915, + "grad_norm": 0.128557026386261, + "learning_rate": 2.8461107729424374e-05, + "loss": 1.1241, + "step": 5805 + }, + { + "epoch": 0.912731631590324, + "grad_norm": 0.12136935442686081, + "learning_rate": 2.8454988720769403e-05, + "loss": 1.151, + "step": 5806 + }, + { + "epoch": 0.9128888364872566, + "grad_norm": 0.1438896656036377, + "learning_rate": 2.8448869501096868e-05, + "loss": 1.2422, + "step": 5807 + }, + { + "epoch": 0.9130460413841891, + "grad_norm": 0.13560909032821655, + "learning_rate": 2.844275007078051e-05, + "loss": 1.1132, + "step": 5808 + }, + { + "epoch": 0.9132032462811217, + "grad_norm": 0.14087963104248047, + "learning_rate": 2.843663043019407e-05, + "loss": 1.0373, + "step": 5809 + }, + { + "epoch": 0.9133604511780542, + "grad_norm": 0.14995509386062622, + "learning_rate": 2.8430510579711327e-05, + "loss": 1.0702, + "step": 5810 + }, + { + "epoch": 0.9135176560749867, + "grad_norm": 0.14328469336032867, + "learning_rate": 2.8424390519706044e-05, + "loss": 1.0609, + "step": 5811 + }, + { + "epoch": 0.9136748609719193, + "grad_norm": 0.14439690113067627, + "learning_rate": 2.8418270250552014e-05, + "loss": 1.1415, + "step": 5812 + }, + { + "epoch": 0.9138320658688518, + "grad_norm": 0.12316414713859558, + "learning_rate": 2.8412149772623054e-05, + "loss": 1.0635, + "step": 5813 + }, + { + "epoch": 0.9139892707657844, + "grad_norm": 0.13240422308444977, + "learning_rate": 2.8406029086292957e-05, + "loss": 0.9615, + "step": 5814 + }, + { + "epoch": 0.9141464756627169, + "grad_norm": 0.12478620558977127, + "learning_rate": 2.839990819193557e-05, + "loss": 1.0586, + "step": 5815 + }, + { + "epoch": 0.9143036805596494, + "grad_norm": 0.13056431710720062, + "learning_rate": 2.839378708992472e-05, + "loss": 1.0356, + "step": 5816 + }, + { + "epoch": 0.914460885456582, + "grad_norm": 0.11709750443696976, + "learning_rate": 2.8387665780634275e-05, + "loss": 1.0724, + "step": 5817 + }, + { + "epoch": 0.9146180903535145, + "grad_norm": 0.12349488586187363, + "learning_rate": 2.8381544264438092e-05, + "loss": 1.1358, + "step": 5818 + }, + { + "epoch": 0.9147752952504471, + "grad_norm": 0.12479537725448608, + "learning_rate": 2.8375422541710045e-05, + "loss": 1.0634, + "step": 5819 + }, + { + "epoch": 0.9149325001473796, + "grad_norm": 0.12252938747406006, + "learning_rate": 2.8369300612824034e-05, + "loss": 1.1054, + "step": 5820 + }, + { + "epoch": 0.9150897050443121, + "grad_norm": 0.13676980137825012, + "learning_rate": 2.836317847815396e-05, + "loss": 1.082, + "step": 5821 + }, + { + "epoch": 0.9152469099412447, + "grad_norm": 0.12986131012439728, + "learning_rate": 2.8357056138073746e-05, + "loss": 1.0068, + "step": 5822 + }, + { + "epoch": 0.9154041148381772, + "grad_norm": 0.13949459791183472, + "learning_rate": 2.8350933592957314e-05, + "loss": 1.1275, + "step": 5823 + }, + { + "epoch": 0.9155613197351098, + "grad_norm": 0.13199569284915924, + "learning_rate": 2.8344810843178603e-05, + "loss": 1.0789, + "step": 5824 + }, + { + "epoch": 0.9157185246320423, + "grad_norm": 0.1328863799571991, + "learning_rate": 2.8338687889111577e-05, + "loss": 1.0018, + "step": 5825 + }, + { + "epoch": 0.9158757295289748, + "grad_norm": 0.13771453499794006, + "learning_rate": 2.833256473113019e-05, + "loss": 1.1348, + "step": 5826 + }, + { + "epoch": 0.9160329344259074, + "grad_norm": 0.11951122432947159, + "learning_rate": 2.8326441369608443e-05, + "loss": 1.1216, + "step": 5827 + }, + { + "epoch": 0.9161901393228399, + "grad_norm": 0.12968111038208008, + "learning_rate": 2.83203178049203e-05, + "loss": 1.0872, + "step": 5828 + }, + { + "epoch": 0.9163473442197725, + "grad_norm": 0.13724130392074585, + "learning_rate": 2.8314194037439786e-05, + "loss": 1.2194, + "step": 5829 + }, + { + "epoch": 0.916504549116705, + "grad_norm": 0.1234157383441925, + "learning_rate": 2.8308070067540905e-05, + "loss": 1.0796, + "step": 5830 + }, + { + "epoch": 0.9166617540136375, + "grad_norm": 0.1210472360253334, + "learning_rate": 2.8301945895597694e-05, + "loss": 1.0127, + "step": 5831 + }, + { + "epoch": 0.9168189589105701, + "grad_norm": 0.13493800163269043, + "learning_rate": 2.8295821521984188e-05, + "loss": 1.0607, + "step": 5832 + }, + { + "epoch": 0.9169761638075026, + "grad_norm": 0.1553046554327011, + "learning_rate": 2.8289696947074447e-05, + "loss": 1.0886, + "step": 5833 + }, + { + "epoch": 0.9171333687044352, + "grad_norm": 0.13993187248706818, + "learning_rate": 2.8283572171242528e-05, + "loss": 1.0531, + "step": 5834 + }, + { + "epoch": 0.9172905736013677, + "grad_norm": 0.1344640552997589, + "learning_rate": 2.8277447194862518e-05, + "loss": 1.0535, + "step": 5835 + }, + { + "epoch": 0.9174477784983002, + "grad_norm": 0.13232530653476715, + "learning_rate": 2.82713220183085e-05, + "loss": 1.0514, + "step": 5836 + }, + { + "epoch": 0.9176049833952328, + "grad_norm": 0.12874165177345276, + "learning_rate": 2.8265196641954582e-05, + "loss": 1.0815, + "step": 5837 + }, + { + "epoch": 0.9177621882921653, + "grad_norm": 0.13770556449890137, + "learning_rate": 2.8259071066174882e-05, + "loss": 1.0686, + "step": 5838 + }, + { + "epoch": 0.9179193931890979, + "grad_norm": 0.12717744708061218, + "learning_rate": 2.825294529134351e-05, + "loss": 1.1387, + "step": 5839 + }, + { + "epoch": 0.9180765980860304, + "grad_norm": 0.1538948118686676, + "learning_rate": 2.8246819317834623e-05, + "loss": 1.0717, + "step": 5840 + }, + { + "epoch": 0.9182338029829629, + "grad_norm": 0.13529753684997559, + "learning_rate": 2.824069314602236e-05, + "loss": 1.0865, + "step": 5841 + }, + { + "epoch": 0.9183910078798955, + "grad_norm": 0.1670359969139099, + "learning_rate": 2.8234566776280898e-05, + "loss": 1.0697, + "step": 5842 + }, + { + "epoch": 0.918548212776828, + "grad_norm": 0.1306200921535492, + "learning_rate": 2.8228440208984402e-05, + "loss": 1.2538, + "step": 5843 + }, + { + "epoch": 0.9187054176737606, + "grad_norm": 0.1301957219839096, + "learning_rate": 2.8222313444507053e-05, + "loss": 1.0753, + "step": 5844 + }, + { + "epoch": 0.9188626225706931, + "grad_norm": 0.13786588609218597, + "learning_rate": 2.8216186483223068e-05, + "loss": 1.0855, + "step": 5845 + }, + { + "epoch": 0.9190198274676257, + "grad_norm": 0.13362693786621094, + "learning_rate": 2.8210059325506644e-05, + "loss": 1.146, + "step": 5846 + }, + { + "epoch": 0.9191770323645582, + "grad_norm": 0.12979008257389069, + "learning_rate": 2.820393197173201e-05, + "loss": 1.0117, + "step": 5847 + }, + { + "epoch": 0.9193342372614907, + "grad_norm": 0.1249472126364708, + "learning_rate": 2.8197804422273406e-05, + "loss": 1.0241, + "step": 5848 + }, + { + "epoch": 0.9194914421584233, + "grad_norm": 0.13277561962604523, + "learning_rate": 2.819167667750507e-05, + "loss": 1.0737, + "step": 5849 + }, + { + "epoch": 0.9196486470553558, + "grad_norm": 0.12641245126724243, + "learning_rate": 2.818554873780127e-05, + "loss": 1.0097, + "step": 5850 + }, + { + "epoch": 0.9198058519522884, + "grad_norm": 0.13647952675819397, + "learning_rate": 2.8179420603536272e-05, + "loss": 1.0617, + "step": 5851 + }, + { + "epoch": 0.9199630568492209, + "grad_norm": 0.13099946081638336, + "learning_rate": 2.8173292275084363e-05, + "loss": 1.2114, + "step": 5852 + }, + { + "epoch": 0.9201202617461534, + "grad_norm": 0.13287284970283508, + "learning_rate": 2.8167163752819826e-05, + "loss": 1.0427, + "step": 5853 + }, + { + "epoch": 0.920277466643086, + "grad_norm": 0.14015206694602966, + "learning_rate": 2.816103503711699e-05, + "loss": 1.1068, + "step": 5854 + }, + { + "epoch": 0.9204346715400185, + "grad_norm": 0.1290660947561264, + "learning_rate": 2.815490612835015e-05, + "loss": 1.0849, + "step": 5855 + }, + { + "epoch": 0.9205918764369511, + "grad_norm": 0.13064253330230713, + "learning_rate": 2.8148777026893653e-05, + "loss": 1.165, + "step": 5856 + }, + { + "epoch": 0.9207490813338836, + "grad_norm": 0.13498087227344513, + "learning_rate": 2.814264773312184e-05, + "loss": 1.0333, + "step": 5857 + }, + { + "epoch": 0.920906286230816, + "grad_norm": 0.11860901862382889, + "learning_rate": 2.813651824740905e-05, + "loss": 1.0343, + "step": 5858 + }, + { + "epoch": 0.9210634911277487, + "grad_norm": 0.12405065447092056, + "learning_rate": 2.8130388570129662e-05, + "loss": 1.1688, + "step": 5859 + }, + { + "epoch": 0.9212206960246812, + "grad_norm": 0.1312331110239029, + "learning_rate": 2.812425870165805e-05, + "loss": 1.1117, + "step": 5860 + }, + { + "epoch": 0.9213779009216138, + "grad_norm": 0.15156623721122742, + "learning_rate": 2.811812864236861e-05, + "loss": 1.2279, + "step": 5861 + }, + { + "epoch": 0.9215351058185463, + "grad_norm": 0.1340988129377365, + "learning_rate": 2.8111998392635735e-05, + "loss": 1.144, + "step": 5862 + }, + { + "epoch": 0.9216923107154787, + "grad_norm": 0.16027574241161346, + "learning_rate": 2.810586795283383e-05, + "loss": 0.9969, + "step": 5863 + }, + { + "epoch": 0.9218495156124114, + "grad_norm": 0.1477731466293335, + "learning_rate": 2.8099737323337337e-05, + "loss": 1.0804, + "step": 5864 + }, + { + "epoch": 0.9220067205093438, + "grad_norm": 0.14026421308517456, + "learning_rate": 2.8093606504520675e-05, + "loss": 0.9982, + "step": 5865 + }, + { + "epoch": 0.9221639254062765, + "grad_norm": 0.12011507898569107, + "learning_rate": 2.8087475496758308e-05, + "loss": 1.0439, + "step": 5866 + }, + { + "epoch": 0.922321130303209, + "grad_norm": 0.12446123361587524, + "learning_rate": 2.808134430042468e-05, + "loss": 1.0308, + "step": 5867 + }, + { + "epoch": 0.9224783352001414, + "grad_norm": 0.13376714289188385, + "learning_rate": 2.807521291589426e-05, + "loss": 1.0662, + "step": 5868 + }, + { + "epoch": 0.922635540097074, + "grad_norm": 0.13055849075317383, + "learning_rate": 2.8069081343541537e-05, + "loss": 1.1211, + "step": 5869 + }, + { + "epoch": 0.9227927449940065, + "grad_norm": 0.14766715466976166, + "learning_rate": 2.8062949583741005e-05, + "loss": 1.1205, + "step": 5870 + }, + { + "epoch": 0.9229499498909391, + "grad_norm": 0.15254098176956177, + "learning_rate": 2.805681763686717e-05, + "loss": 1.0811, + "step": 5871 + }, + { + "epoch": 0.9231071547878716, + "grad_norm": 0.13679249584674835, + "learning_rate": 2.8050685503294537e-05, + "loss": 1.1751, + "step": 5872 + }, + { + "epoch": 0.9232643596848041, + "grad_norm": 0.17285694181919098, + "learning_rate": 2.804455318339765e-05, + "loss": 1.0943, + "step": 5873 + }, + { + "epoch": 0.9234215645817367, + "grad_norm": 0.1367441713809967, + "learning_rate": 2.8038420677551027e-05, + "loss": 1.0196, + "step": 5874 + }, + { + "epoch": 0.9235787694786692, + "grad_norm": 0.14842143654823303, + "learning_rate": 2.8032287986129236e-05, + "loss": 1.0186, + "step": 5875 + }, + { + "epoch": 0.9237359743756018, + "grad_norm": 0.1617974489927292, + "learning_rate": 2.8026155109506835e-05, + "loss": 1.0159, + "step": 5876 + }, + { + "epoch": 0.9238931792725343, + "grad_norm": 0.12496098130941391, + "learning_rate": 2.8020022048058386e-05, + "loss": 1.1676, + "step": 5877 + }, + { + "epoch": 0.9240503841694668, + "grad_norm": 0.12146230041980743, + "learning_rate": 2.8013888802158483e-05, + "loss": 1.1056, + "step": 5878 + }, + { + "epoch": 0.9242075890663994, + "grad_norm": 0.1262625753879547, + "learning_rate": 2.800775537218172e-05, + "loss": 1.1292, + "step": 5879 + }, + { + "epoch": 0.9243647939633319, + "grad_norm": 0.13556911051273346, + "learning_rate": 2.8001621758502704e-05, + "loss": 0.9448, + "step": 5880 + }, + { + "epoch": 0.9245219988602645, + "grad_norm": 0.13289503753185272, + "learning_rate": 2.7995487961496054e-05, + "loss": 1.0163, + "step": 5881 + }, + { + "epoch": 0.924679203757197, + "grad_norm": 0.14427532255649567, + "learning_rate": 2.7989353981536394e-05, + "loss": 1.1987, + "step": 5882 + }, + { + "epoch": 0.9248364086541295, + "grad_norm": 0.13093596696853638, + "learning_rate": 2.7983219818998364e-05, + "loss": 1.0336, + "step": 5883 + }, + { + "epoch": 0.9249936135510621, + "grad_norm": 0.12032842636108398, + "learning_rate": 2.7977085474256616e-05, + "loss": 1.0426, + "step": 5884 + }, + { + "epoch": 0.9251508184479946, + "grad_norm": 0.13053454458713531, + "learning_rate": 2.797095094768582e-05, + "loss": 1.0823, + "step": 5885 + }, + { + "epoch": 0.9253080233449272, + "grad_norm": 0.12247161567211151, + "learning_rate": 2.796481623966064e-05, + "loss": 1.1532, + "step": 5886 + }, + { + "epoch": 0.9254652282418597, + "grad_norm": 0.13273625075817108, + "learning_rate": 2.7958681350555765e-05, + "loss": 1.1202, + "step": 5887 + }, + { + "epoch": 0.9256224331387922, + "grad_norm": 0.12845084071159363, + "learning_rate": 2.795254628074589e-05, + "loss": 1.0923, + "step": 5888 + }, + { + "epoch": 0.9257796380357248, + "grad_norm": 0.13342683017253876, + "learning_rate": 2.7946411030605724e-05, + "loss": 1.0369, + "step": 5889 + }, + { + "epoch": 0.9259368429326573, + "grad_norm": 0.14172682166099548, + "learning_rate": 2.7940275600509984e-05, + "loss": 1.1097, + "step": 5890 + }, + { + "epoch": 0.9260940478295899, + "grad_norm": 0.14476054906845093, + "learning_rate": 2.7934139990833393e-05, + "loss": 1.0546, + "step": 5891 + }, + { + "epoch": 0.9262512527265224, + "grad_norm": 0.1423303782939911, + "learning_rate": 2.7928004201950702e-05, + "loss": 1.0276, + "step": 5892 + }, + { + "epoch": 0.9264084576234549, + "grad_norm": 0.1325106918811798, + "learning_rate": 2.792186823423664e-05, + "loss": 1.1484, + "step": 5893 + }, + { + "epoch": 0.9265656625203875, + "grad_norm": 0.15245391428470612, + "learning_rate": 2.7915732088066e-05, + "loss": 1.0228, + "step": 5894 + }, + { + "epoch": 0.92672286741732, + "grad_norm": 0.12855403125286102, + "learning_rate": 2.790959576381353e-05, + "loss": 1.0859, + "step": 5895 + }, + { + "epoch": 0.9268800723142526, + "grad_norm": 0.1401538997888565, + "learning_rate": 2.790345926185402e-05, + "loss": 1.0592, + "step": 5896 + }, + { + "epoch": 0.9270372772111851, + "grad_norm": 0.1314084231853485, + "learning_rate": 2.789732258256227e-05, + "loss": 1.0552, + "step": 5897 + }, + { + "epoch": 0.9271944821081176, + "grad_norm": 0.1389852911233902, + "learning_rate": 2.7891185726313067e-05, + "loss": 1.0712, + "step": 5898 + }, + { + "epoch": 0.9273516870050502, + "grad_norm": 0.15014514327049255, + "learning_rate": 2.788504869348126e-05, + "loss": 1.0415, + "step": 5899 + }, + { + "epoch": 0.9275088919019827, + "grad_norm": 0.15944015979766846, + "learning_rate": 2.7878911484441638e-05, + "loss": 1.0502, + "step": 5900 + }, + { + "epoch": 0.9276660967989153, + "grad_norm": 0.1378808170557022, + "learning_rate": 2.7872774099569073e-05, + "loss": 1.1293, + "step": 5901 + }, + { + "epoch": 0.9278233016958478, + "grad_norm": 0.13429345190525055, + "learning_rate": 2.7866636539238383e-05, + "loss": 1.0598, + "step": 5902 + }, + { + "epoch": 0.9279805065927804, + "grad_norm": 0.15235687792301178, + "learning_rate": 2.786049880382444e-05, + "loss": 1.1065, + "step": 5903 + }, + { + "epoch": 0.9281377114897129, + "grad_norm": 0.13226331770420074, + "learning_rate": 2.785436089370212e-05, + "loss": 1.0603, + "step": 5904 + }, + { + "epoch": 0.9282949163866454, + "grad_norm": 0.15453143417835236, + "learning_rate": 2.7848222809246295e-05, + "loss": 1.0911, + "step": 5905 + }, + { + "epoch": 0.928452121283578, + "grad_norm": 0.16839811205863953, + "learning_rate": 2.784208455083186e-05, + "loss": 1.1159, + "step": 5906 + }, + { + "epoch": 0.9286093261805105, + "grad_norm": 0.13449357450008392, + "learning_rate": 2.783594611883371e-05, + "loss": 1.2091, + "step": 5907 + }, + { + "epoch": 0.9287665310774431, + "grad_norm": 0.1276775449514389, + "learning_rate": 2.782980751362676e-05, + "loss": 1.0628, + "step": 5908 + }, + { + "epoch": 0.9289237359743756, + "grad_norm": 0.12461748719215393, + "learning_rate": 2.7823668735585934e-05, + "loss": 1.0725, + "step": 5909 + }, + { + "epoch": 0.9290809408713081, + "grad_norm": 0.13275814056396484, + "learning_rate": 2.7817529785086167e-05, + "loss": 1.098, + "step": 5910 + }, + { + "epoch": 0.9292381457682407, + "grad_norm": 0.1452742964029312, + "learning_rate": 2.7811390662502407e-05, + "loss": 1.1648, + "step": 5911 + }, + { + "epoch": 0.9293953506651732, + "grad_norm": 0.13696105778217316, + "learning_rate": 2.7805251368209585e-05, + "loss": 1.2319, + "step": 5912 + }, + { + "epoch": 0.9295525555621058, + "grad_norm": 0.12167777866125107, + "learning_rate": 2.7799111902582696e-05, + "loss": 1.0486, + "step": 5913 + }, + { + "epoch": 0.9297097604590383, + "grad_norm": 0.1411798894405365, + "learning_rate": 2.7792972265996704e-05, + "loss": 1.0378, + "step": 5914 + }, + { + "epoch": 0.9298669653559708, + "grad_norm": 0.12194959074258804, + "learning_rate": 2.778683245882659e-05, + "loss": 1.129, + "step": 5915 + }, + { + "epoch": 0.9300241702529034, + "grad_norm": 0.13540031015872955, + "learning_rate": 2.7780692481447345e-05, + "loss": 1.0871, + "step": 5916 + }, + { + "epoch": 0.9301813751498359, + "grad_norm": 0.14042246341705322, + "learning_rate": 2.777455233423399e-05, + "loss": 1.0133, + "step": 5917 + }, + { + "epoch": 0.9303385800467685, + "grad_norm": 0.14951664209365845, + "learning_rate": 2.7768412017561534e-05, + "loss": 1.1248, + "step": 5918 + }, + { + "epoch": 0.930495784943701, + "grad_norm": 0.1268850564956665, + "learning_rate": 2.7762271531805002e-05, + "loss": 0.9968, + "step": 5919 + }, + { + "epoch": 0.9306529898406335, + "grad_norm": 0.14185664057731628, + "learning_rate": 2.7756130877339443e-05, + "loss": 1.1533, + "step": 5920 + }, + { + "epoch": 0.9306529898406335, + "eval_loss": 1.0906589031219482, + "eval_runtime": 2347.279, + "eval_samples_per_second": 3.944, + "eval_steps_per_second": 1.972, + "step": 5920 + }, + { + "epoch": 0.9308101947375661, + "grad_norm": 0.14762406051158905, + "learning_rate": 2.774999005453989e-05, + "loss": 1.0303, + "step": 5921 + }, + { + "epoch": 0.9309673996344986, + "grad_norm": 0.14337274432182312, + "learning_rate": 2.7743849063781402e-05, + "loss": 0.9713, + "step": 5922 + }, + { + "epoch": 0.9311246045314312, + "grad_norm": 0.13716350495815277, + "learning_rate": 2.773770790543906e-05, + "loss": 1.1897, + "step": 5923 + }, + { + "epoch": 0.9312818094283637, + "grad_norm": 0.13537666201591492, + "learning_rate": 2.7731566579887935e-05, + "loss": 1.0438, + "step": 5924 + }, + { + "epoch": 0.9314390143252962, + "grad_norm": 0.1292024552822113, + "learning_rate": 2.772542508750312e-05, + "loss": 1.1203, + "step": 5925 + }, + { + "epoch": 0.9315962192222288, + "grad_norm": 0.13635164499282837, + "learning_rate": 2.7719283428659703e-05, + "loss": 1.0948, + "step": 5926 + }, + { + "epoch": 0.9317534241191613, + "grad_norm": 0.13263124227523804, + "learning_rate": 2.7713141603732802e-05, + "loss": 1.0804, + "step": 5927 + }, + { + "epoch": 0.9319106290160939, + "grad_norm": 0.1261124163866043, + "learning_rate": 2.7706999613097533e-05, + "loss": 1.0164, + "step": 5928 + }, + { + "epoch": 0.9320678339130264, + "grad_norm": 0.121151864528656, + "learning_rate": 2.7700857457129038e-05, + "loss": 1.1652, + "step": 5929 + }, + { + "epoch": 0.9322250388099589, + "grad_norm": 0.15104156732559204, + "learning_rate": 2.7694715136202432e-05, + "loss": 0.9362, + "step": 5930 + }, + { + "epoch": 0.9323822437068915, + "grad_norm": 0.12918272614479065, + "learning_rate": 2.7688572650692883e-05, + "loss": 1.1307, + "step": 5931 + }, + { + "epoch": 0.932539448603824, + "grad_norm": 0.12568433582782745, + "learning_rate": 2.7682430000975545e-05, + "loss": 1.1169, + "step": 5932 + }, + { + "epoch": 0.9326966535007566, + "grad_norm": 0.11823861300945282, + "learning_rate": 2.767628718742559e-05, + "loss": 0.9237, + "step": 5933 + }, + { + "epoch": 0.9328538583976891, + "grad_norm": 0.1292971968650818, + "learning_rate": 2.7670144210418202e-05, + "loss": 1.0329, + "step": 5934 + }, + { + "epoch": 0.9330110632946216, + "grad_norm": 0.13379380106925964, + "learning_rate": 2.7664001070328554e-05, + "loss": 1.1778, + "step": 5935 + }, + { + "epoch": 0.9331682681915542, + "grad_norm": 0.14074085652828217, + "learning_rate": 2.7657857767531865e-05, + "loss": 1.0906, + "step": 5936 + }, + { + "epoch": 0.9333254730884867, + "grad_norm": 0.13014034926891327, + "learning_rate": 2.7651714302403332e-05, + "loss": 1.0965, + "step": 5937 + }, + { + "epoch": 0.9334826779854193, + "grad_norm": 0.12991267442703247, + "learning_rate": 2.764557067531818e-05, + "loss": 1.1594, + "step": 5938 + }, + { + "epoch": 0.9336398828823518, + "grad_norm": 0.12371101975440979, + "learning_rate": 2.7639426886651636e-05, + "loss": 1.1176, + "step": 5939 + }, + { + "epoch": 0.9337970877792843, + "grad_norm": 0.13973870873451233, + "learning_rate": 2.7633282936778937e-05, + "loss": 1.0586, + "step": 5940 + }, + { + "epoch": 0.9339542926762169, + "grad_norm": 0.1414852738380432, + "learning_rate": 2.7627138826075345e-05, + "loss": 1.1858, + "step": 5941 + }, + { + "epoch": 0.9341114975731494, + "grad_norm": 0.13373959064483643, + "learning_rate": 2.7620994554916097e-05, + "loss": 1.0933, + "step": 5942 + }, + { + "epoch": 0.934268702470082, + "grad_norm": 0.1334017515182495, + "learning_rate": 2.7614850123676482e-05, + "loss": 1.1826, + "step": 5943 + }, + { + "epoch": 0.9344259073670145, + "grad_norm": 0.11887276917695999, + "learning_rate": 2.7608705532731766e-05, + "loss": 0.9777, + "step": 5944 + }, + { + "epoch": 0.934583112263947, + "grad_norm": 0.126699298620224, + "learning_rate": 2.7602560782457242e-05, + "loss": 1.1145, + "step": 5945 + }, + { + "epoch": 0.9347403171608796, + "grad_norm": 0.13699033856391907, + "learning_rate": 2.7596415873228214e-05, + "loss": 1.3064, + "step": 5946 + }, + { + "epoch": 0.9348975220578121, + "grad_norm": 0.133773073554039, + "learning_rate": 2.759027080541997e-05, + "loss": 1.0474, + "step": 5947 + }, + { + "epoch": 0.9350547269547447, + "grad_norm": 0.12632089853286743, + "learning_rate": 2.7584125579407854e-05, + "loss": 1.1138, + "step": 5948 + }, + { + "epoch": 0.9352119318516772, + "grad_norm": 0.14024488627910614, + "learning_rate": 2.7577980195567173e-05, + "loss": 1.1289, + "step": 5949 + }, + { + "epoch": 0.9353691367486097, + "grad_norm": 0.14063555002212524, + "learning_rate": 2.7571834654273272e-05, + "loss": 1.1547, + "step": 5950 + }, + { + "epoch": 0.9355263416455423, + "grad_norm": 0.1355229765176773, + "learning_rate": 2.75656889559015e-05, + "loss": 1.0998, + "step": 5951 + }, + { + "epoch": 0.9356835465424748, + "grad_norm": 0.14299823343753815, + "learning_rate": 2.755954310082719e-05, + "loss": 1.1242, + "step": 5952 + }, + { + "epoch": 0.9358407514394074, + "grad_norm": 0.13115337491035461, + "learning_rate": 2.755339708942575e-05, + "loss": 0.9714, + "step": 5953 + }, + { + "epoch": 0.9359979563363399, + "grad_norm": 0.12538568675518036, + "learning_rate": 2.7547250922072514e-05, + "loss": 1.0528, + "step": 5954 + }, + { + "epoch": 0.9361551612332725, + "grad_norm": 0.1327783763408661, + "learning_rate": 2.754110459914289e-05, + "loss": 1.0557, + "step": 5955 + }, + { + "epoch": 0.936312366130205, + "grad_norm": 0.13240036368370056, + "learning_rate": 2.7534958121012265e-05, + "loss": 1.06, + "step": 5956 + }, + { + "epoch": 0.9364695710271375, + "grad_norm": 0.13960221409797668, + "learning_rate": 2.7528811488056044e-05, + "loss": 1.0654, + "step": 5957 + }, + { + "epoch": 0.9366267759240701, + "grad_norm": 0.13258858025074005, + "learning_rate": 2.7522664700649636e-05, + "loss": 1.0193, + "step": 5958 + }, + { + "epoch": 0.9367839808210026, + "grad_norm": 0.13239261507987976, + "learning_rate": 2.7516517759168464e-05, + "loss": 1.1384, + "step": 5959 + }, + { + "epoch": 0.9369411857179352, + "grad_norm": 0.12734214961528778, + "learning_rate": 2.7510370663987973e-05, + "loss": 1.117, + "step": 5960 + }, + { + "epoch": 0.9370983906148677, + "grad_norm": 0.13943642377853394, + "learning_rate": 2.7504223415483582e-05, + "loss": 1.1862, + "step": 5961 + }, + { + "epoch": 0.9372555955118002, + "grad_norm": 0.1283179223537445, + "learning_rate": 2.7498076014030756e-05, + "loss": 1.0944, + "step": 5962 + }, + { + "epoch": 0.9374128004087328, + "grad_norm": 0.1538512259721756, + "learning_rate": 2.7491928460004952e-05, + "loss": 1.0519, + "step": 5963 + }, + { + "epoch": 0.9375700053056653, + "grad_norm": 0.1428849995136261, + "learning_rate": 2.7485780753781642e-05, + "loss": 1.079, + "step": 5964 + }, + { + "epoch": 0.9377272102025979, + "grad_norm": 0.12558673322200775, + "learning_rate": 2.7479632895736296e-05, + "loss": 1.0957, + "step": 5965 + }, + { + "epoch": 0.9378844150995304, + "grad_norm": 0.13500721752643585, + "learning_rate": 2.7473484886244406e-05, + "loss": 1.0686, + "step": 5966 + }, + { + "epoch": 0.9380416199964628, + "grad_norm": 0.12610837817192078, + "learning_rate": 2.7467336725681477e-05, + "loss": 1.0099, + "step": 5967 + }, + { + "epoch": 0.9381988248933955, + "grad_norm": 0.13855506479740143, + "learning_rate": 2.7461188414423e-05, + "loss": 1.1038, + "step": 5968 + }, + { + "epoch": 0.938356029790328, + "grad_norm": 0.11587034165859222, + "learning_rate": 2.7455039952844507e-05, + "loss": 1.1082, + "step": 5969 + }, + { + "epoch": 0.9385132346872606, + "grad_norm": 0.12418701499700546, + "learning_rate": 2.7448891341321507e-05, + "loss": 1.0648, + "step": 5970 + }, + { + "epoch": 0.938670439584193, + "grad_norm": 0.1330374926328659, + "learning_rate": 2.7442742580229548e-05, + "loss": 1.1212, + "step": 5971 + }, + { + "epoch": 0.9388276444811255, + "grad_norm": 0.12268505990505219, + "learning_rate": 2.743659366994416e-05, + "loss": 1.131, + "step": 5972 + }, + { + "epoch": 0.9389848493780582, + "grad_norm": 0.1336454302072525, + "learning_rate": 2.7430444610840904e-05, + "loss": 1.1284, + "step": 5973 + }, + { + "epoch": 0.9391420542749906, + "grad_norm": 0.14830490946769714, + "learning_rate": 2.7424295403295348e-05, + "loss": 0.9071, + "step": 5974 + }, + { + "epoch": 0.9392992591719233, + "grad_norm": 0.13443098962306976, + "learning_rate": 2.7418146047683042e-05, + "loss": 1.1436, + "step": 5975 + }, + { + "epoch": 0.9394564640688557, + "grad_norm": 0.13682928681373596, + "learning_rate": 2.7411996544379575e-05, + "loss": 1.0893, + "step": 5976 + }, + { + "epoch": 0.9396136689657882, + "grad_norm": 0.13836413621902466, + "learning_rate": 2.7405846893760533e-05, + "loss": 1.1783, + "step": 5977 + }, + { + "epoch": 0.9397708738627208, + "grad_norm": 0.12278363853693008, + "learning_rate": 2.7399697096201533e-05, + "loss": 1.1166, + "step": 5978 + }, + { + "epoch": 0.9399280787596533, + "grad_norm": 0.15446826815605164, + "learning_rate": 2.739354715207815e-05, + "loss": 1.0532, + "step": 5979 + }, + { + "epoch": 0.940085283656586, + "grad_norm": 0.133811354637146, + "learning_rate": 2.738739706176602e-05, + "loss": 1.0008, + "step": 5980 + }, + { + "epoch": 0.9402424885535184, + "grad_norm": 0.13732394576072693, + "learning_rate": 2.7381246825640755e-05, + "loss": 1.2071, + "step": 5981 + }, + { + "epoch": 0.9403996934504509, + "grad_norm": 0.1425594836473465, + "learning_rate": 2.7375096444077996e-05, + "loss": 1.0522, + "step": 5982 + }, + { + "epoch": 0.9405568983473835, + "grad_norm": 0.13414670526981354, + "learning_rate": 2.7368945917453387e-05, + "loss": 1.0864, + "step": 5983 + }, + { + "epoch": 0.940714103244316, + "grad_norm": 0.1373647302389145, + "learning_rate": 2.7362795246142568e-05, + "loss": 1.1594, + "step": 5984 + }, + { + "epoch": 0.9408713081412486, + "grad_norm": 0.13399659097194672, + "learning_rate": 2.735664443052121e-05, + "loss": 1.1394, + "step": 5985 + }, + { + "epoch": 0.9410285130381811, + "grad_norm": 0.12571309506893158, + "learning_rate": 2.735049347096498e-05, + "loss": 1.0476, + "step": 5986 + }, + { + "epoch": 0.9411857179351136, + "grad_norm": 0.12670376896858215, + "learning_rate": 2.734434236784954e-05, + "loss": 1.1336, + "step": 5987 + }, + { + "epoch": 0.9413429228320462, + "grad_norm": 0.12558923661708832, + "learning_rate": 2.73381911215506e-05, + "loss": 1.1723, + "step": 5988 + }, + { + "epoch": 0.9415001277289787, + "grad_norm": 0.13416606187820435, + "learning_rate": 2.733203973244384e-05, + "loss": 0.9964, + "step": 5989 + }, + { + "epoch": 0.9416573326259113, + "grad_norm": 0.13717839121818542, + "learning_rate": 2.7325888200904966e-05, + "loss": 1.1231, + "step": 5990 + }, + { + "epoch": 0.9418145375228438, + "grad_norm": 0.17096485197544098, + "learning_rate": 2.731973652730968e-05, + "loss": 1.1715, + "step": 5991 + }, + { + "epoch": 0.9419717424197763, + "grad_norm": 0.13871747255325317, + "learning_rate": 2.7313584712033726e-05, + "loss": 1.1094, + "step": 5992 + }, + { + "epoch": 0.9421289473167089, + "grad_norm": 0.1268129050731659, + "learning_rate": 2.7307432755452815e-05, + "loss": 1.0558, + "step": 5993 + }, + { + "epoch": 0.9422861522136414, + "grad_norm": 0.1336231827735901, + "learning_rate": 2.7301280657942692e-05, + "loss": 1.0604, + "step": 5994 + }, + { + "epoch": 0.942443357110574, + "grad_norm": 0.13423700630664825, + "learning_rate": 2.7295128419879103e-05, + "loss": 1.0102, + "step": 5995 + }, + { + "epoch": 0.9426005620075065, + "grad_norm": 0.13563703000545502, + "learning_rate": 2.7288976041637792e-05, + "loss": 1.0981, + "step": 5996 + }, + { + "epoch": 0.942757766904439, + "grad_norm": 0.14337728917598724, + "learning_rate": 2.7282823523594547e-05, + "loss": 1.0548, + "step": 5997 + }, + { + "epoch": 0.9429149718013716, + "grad_norm": 0.14261376857757568, + "learning_rate": 2.727667086612512e-05, + "loss": 1.1155, + "step": 5998 + }, + { + "epoch": 0.9430721766983041, + "grad_norm": 0.1305425465106964, + "learning_rate": 2.72705180696053e-05, + "loss": 1.0848, + "step": 5999 + }, + { + "epoch": 0.9432293815952367, + "grad_norm": 0.14552736282348633, + "learning_rate": 2.7264365134410878e-05, + "loss": 1.1096, + "step": 6000 + }, + { + "epoch": 0.9433865864921692, + "grad_norm": 0.142582967877388, + "learning_rate": 2.7258212060917643e-05, + "loss": 1.1045, + "step": 6001 + }, + { + "epoch": 0.9435437913891017, + "grad_norm": 0.12845946848392487, + "learning_rate": 2.7252058849501416e-05, + "loss": 1.0095, + "step": 6002 + }, + { + "epoch": 0.9437009962860343, + "grad_norm": 0.13173560798168182, + "learning_rate": 2.7245905500537992e-05, + "loss": 1.1181, + "step": 6003 + }, + { + "epoch": 0.9438582011829668, + "grad_norm": 0.13160601258277893, + "learning_rate": 2.7239752014403218e-05, + "loss": 1.0954, + "step": 6004 + }, + { + "epoch": 0.9440154060798994, + "grad_norm": 0.12833614647388458, + "learning_rate": 2.7233598391472903e-05, + "loss": 1.1083, + "step": 6005 + }, + { + "epoch": 0.9441726109768319, + "grad_norm": 0.134452223777771, + "learning_rate": 2.72274446321229e-05, + "loss": 1.0622, + "step": 6006 + }, + { + "epoch": 0.9443298158737645, + "grad_norm": 0.1359095722436905, + "learning_rate": 2.722129073672905e-05, + "loss": 0.8811, + "step": 6007 + }, + { + "epoch": 0.944487020770697, + "grad_norm": 0.1305527538061142, + "learning_rate": 2.7215136705667216e-05, + "loss": 1.0987, + "step": 6008 + }, + { + "epoch": 0.9446442256676295, + "grad_norm": 0.13655494153499603, + "learning_rate": 2.7208982539313264e-05, + "loss": 1.0505, + "step": 6009 + }, + { + "epoch": 0.9448014305645621, + "grad_norm": 0.13174045085906982, + "learning_rate": 2.720282823804306e-05, + "loss": 1.2056, + "step": 6010 + }, + { + "epoch": 0.9449586354614946, + "grad_norm": 0.14271695911884308, + "learning_rate": 2.7196673802232486e-05, + "loss": 1.1816, + "step": 6011 + }, + { + "epoch": 0.9451158403584272, + "grad_norm": 0.125487819314003, + "learning_rate": 2.7190519232257432e-05, + "loss": 1.0736, + "step": 6012 + }, + { + "epoch": 0.9452730452553597, + "grad_norm": 0.12941718101501465, + "learning_rate": 2.71843645284938e-05, + "loss": 1.1029, + "step": 6013 + }, + { + "epoch": 0.9454302501522922, + "grad_norm": 0.16499774158000946, + "learning_rate": 2.71782096913175e-05, + "loss": 1.1181, + "step": 6014 + }, + { + "epoch": 0.9455874550492248, + "grad_norm": 0.13209642469882965, + "learning_rate": 2.7172054721104435e-05, + "loss": 1.0738, + "step": 6015 + }, + { + "epoch": 0.9457446599461573, + "grad_norm": 0.1435754895210266, + "learning_rate": 2.7165899618230528e-05, + "loss": 1.0761, + "step": 6016 + }, + { + "epoch": 0.9459018648430899, + "grad_norm": 0.13407909870147705, + "learning_rate": 2.7159744383071718e-05, + "loss": 1.1451, + "step": 6017 + }, + { + "epoch": 0.9460590697400224, + "grad_norm": 0.1326179802417755, + "learning_rate": 2.715358901600394e-05, + "loss": 1.0337, + "step": 6018 + }, + { + "epoch": 0.9462162746369549, + "grad_norm": 0.13564349710941315, + "learning_rate": 2.714743351740313e-05, + "loss": 1.0482, + "step": 6019 + }, + { + "epoch": 0.9463734795338875, + "grad_norm": 0.12801401317119598, + "learning_rate": 2.7141277887645254e-05, + "loss": 1.002, + "step": 6020 + }, + { + "epoch": 0.94653068443082, + "grad_norm": 0.1299794614315033, + "learning_rate": 2.7135122127106276e-05, + "loss": 1.1075, + "step": 6021 + }, + { + "epoch": 0.9466878893277526, + "grad_norm": 0.13797403872013092, + "learning_rate": 2.7128966236162158e-05, + "loss": 1.1547, + "step": 6022 + }, + { + "epoch": 0.9468450942246851, + "grad_norm": 0.14355377852916718, + "learning_rate": 2.7122810215188888e-05, + "loss": 1.1562, + "step": 6023 + }, + { + "epoch": 0.9470022991216176, + "grad_norm": 0.13126687705516815, + "learning_rate": 2.7116654064562445e-05, + "loss": 1.0477, + "step": 6024 + }, + { + "epoch": 0.9471595040185502, + "grad_norm": 0.12429299205541611, + "learning_rate": 2.711049778465883e-05, + "loss": 1.0903, + "step": 6025 + }, + { + "epoch": 0.9473167089154827, + "grad_norm": 0.1281753033399582, + "learning_rate": 2.7104341375854024e-05, + "loss": 1.0305, + "step": 6026 + }, + { + "epoch": 0.9474739138124153, + "grad_norm": 0.14104846119880676, + "learning_rate": 2.7098184838524065e-05, + "loss": 1.0619, + "step": 6027 + }, + { + "epoch": 0.9476311187093478, + "grad_norm": 0.1359645128250122, + "learning_rate": 2.709202817304496e-05, + "loss": 1.0584, + "step": 6028 + }, + { + "epoch": 0.9477883236062803, + "grad_norm": 0.1438485085964203, + "learning_rate": 2.7085871379792726e-05, + "loss": 1.0572, + "step": 6029 + }, + { + "epoch": 0.9479455285032129, + "grad_norm": 0.13046479225158691, + "learning_rate": 2.7079714459143417e-05, + "loss": 1.1666, + "step": 6030 + }, + { + "epoch": 0.9481027334001454, + "grad_norm": 0.12564343214035034, + "learning_rate": 2.707355741147305e-05, + "loss": 1.0413, + "step": 6031 + }, + { + "epoch": 0.948259938297078, + "grad_norm": 0.1311103254556656, + "learning_rate": 2.7067400237157693e-05, + "loss": 0.9933, + "step": 6032 + }, + { + "epoch": 0.9484171431940105, + "grad_norm": 0.1461622565984726, + "learning_rate": 2.706124293657339e-05, + "loss": 1.0575, + "step": 6033 + }, + { + "epoch": 0.948574348090943, + "grad_norm": 0.13953527808189392, + "learning_rate": 2.705508551009621e-05, + "loss": 1.1293, + "step": 6034 + }, + { + "epoch": 0.9487315529878756, + "grad_norm": 0.1280178278684616, + "learning_rate": 2.704892795810223e-05, + "loss": 1.053, + "step": 6035 + }, + { + "epoch": 0.9488887578848081, + "grad_norm": 0.13811422884464264, + "learning_rate": 2.7042770280967523e-05, + "loss": 1.145, + "step": 6036 + }, + { + "epoch": 0.9490459627817407, + "grad_norm": 0.15218695998191833, + "learning_rate": 2.7036612479068185e-05, + "loss": 1.1136, + "step": 6037 + }, + { + "epoch": 0.9492031676786732, + "grad_norm": 0.13221168518066406, + "learning_rate": 2.7030454552780304e-05, + "loss": 1.1037, + "step": 6038 + }, + { + "epoch": 0.9493603725756057, + "grad_norm": 0.12861289083957672, + "learning_rate": 2.7024296502479986e-05, + "loss": 1.0381, + "step": 6039 + }, + { + "epoch": 0.9495175774725383, + "grad_norm": 0.12132981419563293, + "learning_rate": 2.701813832854333e-05, + "loss": 1.0931, + "step": 6040 + }, + { + "epoch": 0.9496747823694708, + "grad_norm": 0.13624799251556396, + "learning_rate": 2.7011980031346474e-05, + "loss": 1.1435, + "step": 6041 + }, + { + "epoch": 0.9498319872664034, + "grad_norm": 0.13776588439941406, + "learning_rate": 2.7005821611265523e-05, + "loss": 1.0719, + "step": 6042 + }, + { + "epoch": 0.9499891921633359, + "grad_norm": 0.1409396529197693, + "learning_rate": 2.699966306867663e-05, + "loss": 1.1888, + "step": 6043 + }, + { + "epoch": 0.9501463970602684, + "grad_norm": 0.12747706472873688, + "learning_rate": 2.699350440395592e-05, + "loss": 1.0563, + "step": 6044 + }, + { + "epoch": 0.950303601957201, + "grad_norm": 0.14183779060840607, + "learning_rate": 2.6987345617479542e-05, + "loss": 1.1842, + "step": 6045 + }, + { + "epoch": 0.9504608068541335, + "grad_norm": 0.12934386730194092, + "learning_rate": 2.698118670962366e-05, + "loss": 1.0775, + "step": 6046 + }, + { + "epoch": 0.9506180117510661, + "grad_norm": 0.13298103213310242, + "learning_rate": 2.697502768076443e-05, + "loss": 1.1511, + "step": 6047 + }, + { + "epoch": 0.9507752166479986, + "grad_norm": 0.12333855777978897, + "learning_rate": 2.6968868531278026e-05, + "loss": 1.0139, + "step": 6048 + }, + { + "epoch": 0.9509324215449311, + "grad_norm": 0.13036374747753143, + "learning_rate": 2.6962709261540626e-05, + "loss": 1.1394, + "step": 6049 + }, + { + "epoch": 0.9510896264418637, + "grad_norm": 0.1290486752986908, + "learning_rate": 2.69565498719284e-05, + "loss": 1.1441, + "step": 6050 + }, + { + "epoch": 0.9512468313387962, + "grad_norm": 0.13396979868412018, + "learning_rate": 2.6950390362817562e-05, + "loss": 1.028, + "step": 6051 + }, + { + "epoch": 0.9514040362357288, + "grad_norm": 0.1336859166622162, + "learning_rate": 2.6944230734584296e-05, + "loss": 1.0234, + "step": 6052 + }, + { + "epoch": 0.9515612411326613, + "grad_norm": 0.15525901317596436, + "learning_rate": 2.6938070987604824e-05, + "loss": 1.1063, + "step": 6053 + }, + { + "epoch": 0.9517184460295938, + "grad_norm": 0.13165932893753052, + "learning_rate": 2.6931911122255344e-05, + "loss": 1.1612, + "step": 6054 + }, + { + "epoch": 0.9518756509265264, + "grad_norm": 0.13331614434719086, + "learning_rate": 2.692575113891208e-05, + "loss": 1.1504, + "step": 6055 + }, + { + "epoch": 0.9520328558234589, + "grad_norm": 0.1331959068775177, + "learning_rate": 2.6919591037951265e-05, + "loss": 0.9773, + "step": 6056 + }, + { + "epoch": 0.9521900607203915, + "grad_norm": 0.1333232969045639, + "learning_rate": 2.691343081974913e-05, + "loss": 0.9842, + "step": 6057 + }, + { + "epoch": 0.952347265617324, + "grad_norm": 0.13365936279296875, + "learning_rate": 2.690727048468193e-05, + "loss": 1.0764, + "step": 6058 + }, + { + "epoch": 0.9525044705142566, + "grad_norm": 0.13716116547584534, + "learning_rate": 2.69011100331259e-05, + "loss": 1.0894, + "step": 6059 + }, + { + "epoch": 0.9526616754111891, + "grad_norm": 0.15759888291358948, + "learning_rate": 2.6894949465457296e-05, + "loss": 1.039, + "step": 6060 + }, + { + "epoch": 0.9528188803081216, + "grad_norm": 0.13740909099578857, + "learning_rate": 2.6888788782052394e-05, + "loss": 0.9639, + "step": 6061 + }, + { + "epoch": 0.9529760852050542, + "grad_norm": 0.14021463692188263, + "learning_rate": 2.6882627983287463e-05, + "loss": 1.0953, + "step": 6062 + }, + { + "epoch": 0.9531332901019867, + "grad_norm": 0.12414146214723587, + "learning_rate": 2.6876467069538775e-05, + "loss": 1.0279, + "step": 6063 + }, + { + "epoch": 0.9532904949989193, + "grad_norm": 0.14076709747314453, + "learning_rate": 2.687030604118262e-05, + "loss": 1.1196, + "step": 6064 + }, + { + "epoch": 0.9534476998958518, + "grad_norm": 0.14148470759391785, + "learning_rate": 2.6864144898595285e-05, + "loss": 1.0793, + "step": 6065 + }, + { + "epoch": 0.9536049047927843, + "grad_norm": 0.15464699268341064, + "learning_rate": 2.6857983642153068e-05, + "loss": 1.1615, + "step": 6066 + }, + { + "epoch": 0.9537621096897169, + "grad_norm": 0.13368132710456848, + "learning_rate": 2.6851822272232292e-05, + "loss": 1.1298, + "step": 6067 + }, + { + "epoch": 0.9539193145866494, + "grad_norm": 0.1264435052871704, + "learning_rate": 2.6845660789209244e-05, + "loss": 1.1182, + "step": 6068 + }, + { + "epoch": 0.954076519483582, + "grad_norm": 0.12793458998203278, + "learning_rate": 2.6839499193460262e-05, + "loss": 1.143, + "step": 6069 + }, + { + "epoch": 0.9542337243805145, + "grad_norm": 0.1353050023317337, + "learning_rate": 2.683333748536167e-05, + "loss": 1.1312, + "step": 6070 + }, + { + "epoch": 0.954390929277447, + "grad_norm": 0.12441742420196533, + "learning_rate": 2.6827175665289795e-05, + "loss": 1.0556, + "step": 6071 + }, + { + "epoch": 0.9545481341743796, + "grad_norm": 0.1288517415523529, + "learning_rate": 2.682101373362099e-05, + "loss": 1.0787, + "step": 6072 + }, + { + "epoch": 0.954705339071312, + "grad_norm": 0.13948756456375122, + "learning_rate": 2.681485169073159e-05, + "loss": 1.0722, + "step": 6073 + }, + { + "epoch": 0.9548625439682447, + "grad_norm": 0.15211845934391022, + "learning_rate": 2.680868953699796e-05, + "loss": 1.087, + "step": 6074 + }, + { + "epoch": 0.9550197488651772, + "grad_norm": 0.13332803547382355, + "learning_rate": 2.680252727279644e-05, + "loss": 1.1758, + "step": 6075 + }, + { + "epoch": 0.9551769537621096, + "grad_norm": 0.12998460233211517, + "learning_rate": 2.6796364898503427e-05, + "loss": 1.1045, + "step": 6076 + }, + { + "epoch": 0.9553341586590423, + "grad_norm": 0.13156652450561523, + "learning_rate": 2.6790202414495273e-05, + "loss": 1.0624, + "step": 6077 + }, + { + "epoch": 0.9554913635559747, + "grad_norm": 0.16905687749385834, + "learning_rate": 2.6784039821148365e-05, + "loss": 1.0451, + "step": 6078 + }, + { + "epoch": 0.9556485684529074, + "grad_norm": 0.13479562103748322, + "learning_rate": 2.6777877118839106e-05, + "loss": 1.047, + "step": 6079 + }, + { + "epoch": 0.9558057733498398, + "grad_norm": 0.1421484351158142, + "learning_rate": 2.6771714307943857e-05, + "loss": 1.0943, + "step": 6080 + }, + { + "epoch": 0.9558057733498398, + "eval_loss": 1.0893487930297852, + "eval_runtime": 2316.0117, + "eval_samples_per_second": 3.997, + "eval_steps_per_second": 1.999, + "step": 6080 + }, + { + "epoch": 0.9559629782467723, + "grad_norm": 0.1265874207019806, + "learning_rate": 2.6765551388839056e-05, + "loss": 1.0724, + "step": 6081 + }, + { + "epoch": 0.956120183143705, + "grad_norm": 0.13201916217803955, + "learning_rate": 2.675938836190108e-05, + "loss": 1.104, + "step": 6082 + }, + { + "epoch": 0.9562773880406374, + "grad_norm": 0.1400064080953598, + "learning_rate": 2.6753225227506367e-05, + "loss": 0.9727, + "step": 6083 + }, + { + "epoch": 0.95643459293757, + "grad_norm": 0.15624095499515533, + "learning_rate": 2.674706198603133e-05, + "loss": 1.0167, + "step": 6084 + }, + { + "epoch": 0.9565917978345025, + "grad_norm": 0.13529807329177856, + "learning_rate": 2.6740898637852385e-05, + "loss": 1.1256, + "step": 6085 + }, + { + "epoch": 0.956749002731435, + "grad_norm": 0.13604608178138733, + "learning_rate": 2.6734735183345983e-05, + "loss": 1.1264, + "step": 6086 + }, + { + "epoch": 0.9569062076283676, + "grad_norm": 0.1391611248254776, + "learning_rate": 2.672857162288855e-05, + "loss": 1.1428, + "step": 6087 + }, + { + "epoch": 0.9570634125253001, + "grad_norm": 0.1305060237646103, + "learning_rate": 2.6722407956856545e-05, + "loss": 1.1461, + "step": 6088 + }, + { + "epoch": 0.9572206174222327, + "grad_norm": 0.14163479208946228, + "learning_rate": 2.671624418562641e-05, + "loss": 1.069, + "step": 6089 + }, + { + "epoch": 0.9573778223191652, + "grad_norm": 0.14188191294670105, + "learning_rate": 2.671008030957461e-05, + "loss": 1.1133, + "step": 6090 + }, + { + "epoch": 0.9575350272160977, + "grad_norm": 0.13564598560333252, + "learning_rate": 2.670391632907761e-05, + "loss": 1.1491, + "step": 6091 + }, + { + "epoch": 0.9576922321130303, + "grad_norm": 0.13552945852279663, + "learning_rate": 2.6697752244511885e-05, + "loss": 1.1435, + "step": 6092 + }, + { + "epoch": 0.9578494370099628, + "grad_norm": 0.12939490377902985, + "learning_rate": 2.669158805625392e-05, + "loss": 1.0372, + "step": 6093 + }, + { + "epoch": 0.9580066419068954, + "grad_norm": 0.1298532634973526, + "learning_rate": 2.668542376468018e-05, + "loss": 1.0962, + "step": 6094 + }, + { + "epoch": 0.9581638468038279, + "grad_norm": 0.13761554658412933, + "learning_rate": 2.6679259370167176e-05, + "loss": 1.0445, + "step": 6095 + }, + { + "epoch": 0.9583210517007604, + "grad_norm": 0.1348113864660263, + "learning_rate": 2.6673094873091398e-05, + "loss": 1.1789, + "step": 6096 + }, + { + "epoch": 0.958478256597693, + "grad_norm": 0.12150844186544418, + "learning_rate": 2.666693027382935e-05, + "loss": 0.9774, + "step": 6097 + }, + { + "epoch": 0.9586354614946255, + "grad_norm": 0.13621921837329865, + "learning_rate": 2.6660765572757545e-05, + "loss": 1.2028, + "step": 6098 + }, + { + "epoch": 0.9587926663915581, + "grad_norm": 0.13343468308448792, + "learning_rate": 2.665460077025249e-05, + "loss": 1.1118, + "step": 6099 + }, + { + "epoch": 0.9589498712884906, + "grad_norm": 0.14203141629695892, + "learning_rate": 2.6648435866690728e-05, + "loss": 1.2192, + "step": 6100 + }, + { + "epoch": 0.9591070761854231, + "grad_norm": 0.1324109435081482, + "learning_rate": 2.6642270862448766e-05, + "loss": 1.175, + "step": 6101 + }, + { + "epoch": 0.9592642810823557, + "grad_norm": 0.13290967047214508, + "learning_rate": 2.6636105757903158e-05, + "loss": 1.0874, + "step": 6102 + }, + { + "epoch": 0.9594214859792882, + "grad_norm": 0.13357378542423248, + "learning_rate": 2.6629940553430426e-05, + "loss": 1.1817, + "step": 6103 + }, + { + "epoch": 0.9595786908762208, + "grad_norm": 0.1280289590358734, + "learning_rate": 2.662377524940713e-05, + "loss": 1.0133, + "step": 6104 + }, + { + "epoch": 0.9597358957731533, + "grad_norm": 0.15631495416164398, + "learning_rate": 2.6617609846209816e-05, + "loss": 0.9574, + "step": 6105 + }, + { + "epoch": 0.9598931006700858, + "grad_norm": 0.1374405324459076, + "learning_rate": 2.6611444344215048e-05, + "loss": 1.086, + "step": 6106 + }, + { + "epoch": 0.9600503055670184, + "grad_norm": 0.12703076004981995, + "learning_rate": 2.6605278743799395e-05, + "loss": 1.1476, + "step": 6107 + }, + { + "epoch": 0.9602075104639509, + "grad_norm": 0.15830785036087036, + "learning_rate": 2.6599113045339424e-05, + "loss": 1.0131, + "step": 6108 + }, + { + "epoch": 0.9603647153608835, + "grad_norm": 0.12500980496406555, + "learning_rate": 2.659294724921171e-05, + "loss": 1.0345, + "step": 6109 + }, + { + "epoch": 0.960521920257816, + "grad_norm": 0.12072091549634933, + "learning_rate": 2.6586781355792834e-05, + "loss": 1.0841, + "step": 6110 + }, + { + "epoch": 0.9606791251547486, + "grad_norm": 0.13469047844409943, + "learning_rate": 2.6580615365459394e-05, + "loss": 1.125, + "step": 6111 + }, + { + "epoch": 0.9608363300516811, + "grad_norm": 0.13239020109176636, + "learning_rate": 2.6574449278587986e-05, + "loss": 0.986, + "step": 6112 + }, + { + "epoch": 0.9609935349486136, + "grad_norm": 0.13411234319210052, + "learning_rate": 2.65682830955552e-05, + "loss": 1.202, + "step": 6113 + }, + { + "epoch": 0.9611507398455462, + "grad_norm": 0.12794964015483856, + "learning_rate": 2.6562116816737653e-05, + "loss": 1.1079, + "step": 6114 + }, + { + "epoch": 0.9613079447424787, + "grad_norm": 0.14135703444480896, + "learning_rate": 2.655595044251195e-05, + "loss": 1.0801, + "step": 6115 + }, + { + "epoch": 0.9614651496394113, + "grad_norm": 0.15455910563468933, + "learning_rate": 2.6549783973254722e-05, + "loss": 1.0957, + "step": 6116 + }, + { + "epoch": 0.9616223545363438, + "grad_norm": 0.12595167756080627, + "learning_rate": 2.654361740934258e-05, + "loss": 1.0999, + "step": 6117 + }, + { + "epoch": 0.9617795594332763, + "grad_norm": 0.14508840441703796, + "learning_rate": 2.653745075115216e-05, + "loss": 1.2061, + "step": 6118 + }, + { + "epoch": 0.9619367643302089, + "grad_norm": 0.13701015710830688, + "learning_rate": 2.6531283999060096e-05, + "loss": 1.1047, + "step": 6119 + }, + { + "epoch": 0.9620939692271414, + "grad_norm": 0.15574191510677338, + "learning_rate": 2.6525117153443035e-05, + "loss": 1.124, + "step": 6120 + }, + { + "epoch": 0.962251174124074, + "grad_norm": 0.13183417916297913, + "learning_rate": 2.6518950214677625e-05, + "loss": 1.0123, + "step": 6121 + }, + { + "epoch": 0.9624083790210065, + "grad_norm": 0.1495794802904129, + "learning_rate": 2.6512783183140512e-05, + "loss": 1.0191, + "step": 6122 + }, + { + "epoch": 0.962565583917939, + "grad_norm": 0.13691960275173187, + "learning_rate": 2.650661605920835e-05, + "loss": 1.0099, + "step": 6123 + }, + { + "epoch": 0.9627227888148716, + "grad_norm": 0.12653347849845886, + "learning_rate": 2.650044884325782e-05, + "loss": 1.0371, + "step": 6124 + }, + { + "epoch": 0.9628799937118041, + "grad_norm": 0.1424267292022705, + "learning_rate": 2.6494281535665577e-05, + "loss": 1.1305, + "step": 6125 + }, + { + "epoch": 0.9630371986087367, + "grad_norm": 0.1382642388343811, + "learning_rate": 2.6488114136808313e-05, + "loss": 1.0557, + "step": 6126 + }, + { + "epoch": 0.9631944035056692, + "grad_norm": 0.13813577592372894, + "learning_rate": 2.648194664706269e-05, + "loss": 1.2143, + "step": 6127 + }, + { + "epoch": 0.9633516084026017, + "grad_norm": 0.11540228128433228, + "learning_rate": 2.647577906680541e-05, + "loss": 0.9599, + "step": 6128 + }, + { + "epoch": 0.9635088132995343, + "grad_norm": 0.13012580573558807, + "learning_rate": 2.6469611396413152e-05, + "loss": 1.096, + "step": 6129 + }, + { + "epoch": 0.9636660181964668, + "grad_norm": 0.13335809111595154, + "learning_rate": 2.646344363626263e-05, + "loss": 0.9971, + "step": 6130 + }, + { + "epoch": 0.9638232230933994, + "grad_norm": 0.13520598411560059, + "learning_rate": 2.6457275786730534e-05, + "loss": 1.0272, + "step": 6131 + }, + { + "epoch": 0.9639804279903319, + "grad_norm": 0.14121781289577484, + "learning_rate": 2.645110784819358e-05, + "loss": 1.2217, + "step": 6132 + }, + { + "epoch": 0.9641376328872644, + "grad_norm": 0.1341332197189331, + "learning_rate": 2.644493982102848e-05, + "loss": 1.0377, + "step": 6133 + }, + { + "epoch": 0.964294837784197, + "grad_norm": 0.12677514553070068, + "learning_rate": 2.643877170561195e-05, + "loss": 1.0939, + "step": 6134 + }, + { + "epoch": 0.9644520426811295, + "grad_norm": 0.12420421838760376, + "learning_rate": 2.643260350232072e-05, + "loss": 1.0513, + "step": 6135 + }, + { + "epoch": 0.9646092475780621, + "grad_norm": 0.12889213860034943, + "learning_rate": 2.642643521153152e-05, + "loss": 1.0382, + "step": 6136 + }, + { + "epoch": 0.9647664524749946, + "grad_norm": 0.12273343652486801, + "learning_rate": 2.6420266833621087e-05, + "loss": 1.0017, + "step": 6137 + }, + { + "epoch": 0.9649236573719271, + "grad_norm": 0.15092088282108307, + "learning_rate": 2.6414098368966156e-05, + "loss": 1.1538, + "step": 6138 + }, + { + "epoch": 0.9650808622688597, + "grad_norm": 0.13954736292362213, + "learning_rate": 2.6407929817943472e-05, + "loss": 1.1457, + "step": 6139 + }, + { + "epoch": 0.9652380671657922, + "grad_norm": 0.13667570054531097, + "learning_rate": 2.6401761180929797e-05, + "loss": 1.0489, + "step": 6140 + }, + { + "epoch": 0.9653952720627248, + "grad_norm": 0.1319783627986908, + "learning_rate": 2.639559245830188e-05, + "loss": 1.013, + "step": 6141 + }, + { + "epoch": 0.9655524769596573, + "grad_norm": 0.1445406973361969, + "learning_rate": 2.6389423650436497e-05, + "loss": 1.1909, + "step": 6142 + }, + { + "epoch": 0.9657096818565898, + "grad_norm": 0.1487741619348526, + "learning_rate": 2.638325475771039e-05, + "loss": 1.0472, + "step": 6143 + }, + { + "epoch": 0.9658668867535224, + "grad_norm": 0.13818944990634918, + "learning_rate": 2.637708578050035e-05, + "loss": 1.0829, + "step": 6144 + }, + { + "epoch": 0.9660240916504549, + "grad_norm": 0.13556529581546783, + "learning_rate": 2.637091671918315e-05, + "loss": 1.0579, + "step": 6145 + }, + { + "epoch": 0.9661812965473875, + "grad_norm": 0.12764258682727814, + "learning_rate": 2.6364747574135568e-05, + "loss": 1.0945, + "step": 6146 + }, + { + "epoch": 0.96633850144432, + "grad_norm": 0.12845858931541443, + "learning_rate": 2.6358578345734402e-05, + "loss": 1.0288, + "step": 6147 + }, + { + "epoch": 0.9664957063412525, + "grad_norm": 0.12101628631353378, + "learning_rate": 2.635240903435644e-05, + "loss": 1.0437, + "step": 6148 + }, + { + "epoch": 0.9666529112381851, + "grad_norm": 0.12999002635478973, + "learning_rate": 2.634623964037848e-05, + "loss": 1.1647, + "step": 6149 + }, + { + "epoch": 0.9668101161351176, + "grad_norm": 0.12494503706693649, + "learning_rate": 2.6340070164177323e-05, + "loss": 1.1271, + "step": 6150 + }, + { + "epoch": 0.9669673210320502, + "grad_norm": 0.12134553492069244, + "learning_rate": 2.633390060612978e-05, + "loss": 1.1516, + "step": 6151 + }, + { + "epoch": 0.9671245259289827, + "grad_norm": 0.12905550003051758, + "learning_rate": 2.6327730966612662e-05, + "loss": 1.1469, + "step": 6152 + }, + { + "epoch": 0.9672817308259152, + "grad_norm": 0.12773872911930084, + "learning_rate": 2.6321561246002792e-05, + "loss": 1.205, + "step": 6153 + }, + { + "epoch": 0.9674389357228478, + "grad_norm": 0.13463151454925537, + "learning_rate": 2.6315391444676984e-05, + "loss": 1.0364, + "step": 6154 + }, + { + "epoch": 0.9675961406197803, + "grad_norm": 0.1401897668838501, + "learning_rate": 2.630922156301207e-05, + "loss": 1.1124, + "step": 6155 + }, + { + "epoch": 0.9677533455167129, + "grad_norm": 0.12664048373699188, + "learning_rate": 2.6303051601384893e-05, + "loss": 1.0784, + "step": 6156 + }, + { + "epoch": 0.9679105504136454, + "grad_norm": 0.12859436869621277, + "learning_rate": 2.6296881560172277e-05, + "loss": 1.0471, + "step": 6157 + }, + { + "epoch": 0.9680677553105779, + "grad_norm": 0.13436628878116608, + "learning_rate": 2.6290711439751065e-05, + "loss": 0.9743, + "step": 6158 + }, + { + "epoch": 0.9682249602075105, + "grad_norm": 0.13251890242099762, + "learning_rate": 2.6284541240498113e-05, + "loss": 1.0582, + "step": 6159 + }, + { + "epoch": 0.968382165104443, + "grad_norm": 0.13687990605831146, + "learning_rate": 2.6278370962790266e-05, + "loss": 1.1794, + "step": 6160 + }, + { + "epoch": 0.9685393700013756, + "grad_norm": 0.15140476822853088, + "learning_rate": 2.6272200607004395e-05, + "loss": 1.1011, + "step": 6161 + }, + { + "epoch": 0.9686965748983081, + "grad_norm": 0.1268806904554367, + "learning_rate": 2.6266030173517343e-05, + "loss": 1.1222, + "step": 6162 + }, + { + "epoch": 0.9688537797952407, + "grad_norm": 0.14067350327968597, + "learning_rate": 2.6259859662705988e-05, + "loss": 1.0984, + "step": 6163 + }, + { + "epoch": 0.9690109846921732, + "grad_norm": 0.12795695662498474, + "learning_rate": 2.6253689074947195e-05, + "loss": 1.0568, + "step": 6164 + }, + { + "epoch": 0.9691681895891057, + "grad_norm": 0.1411520540714264, + "learning_rate": 2.624751841061785e-05, + "loss": 1.1682, + "step": 6165 + }, + { + "epoch": 0.9693253944860383, + "grad_norm": 0.1330268234014511, + "learning_rate": 2.624134767009482e-05, + "loss": 1.112, + "step": 6166 + }, + { + "epoch": 0.9694825993829708, + "grad_norm": 0.1503315418958664, + "learning_rate": 2.6235176853754994e-05, + "loss": 1.1601, + "step": 6167 + }, + { + "epoch": 0.9696398042799034, + "grad_norm": 0.1329040825366974, + "learning_rate": 2.6229005961975274e-05, + "loss": 1.1247, + "step": 6168 + }, + { + "epoch": 0.9697970091768359, + "grad_norm": 0.14206020534038544, + "learning_rate": 2.622283499513254e-05, + "loss": 1.1044, + "step": 6169 + }, + { + "epoch": 0.9699542140737684, + "grad_norm": 0.1384289562702179, + "learning_rate": 2.6216663953603703e-05, + "loss": 1.1516, + "step": 6170 + }, + { + "epoch": 0.970111418970701, + "grad_norm": 0.12899836897850037, + "learning_rate": 2.621049283776566e-05, + "loss": 1.1035, + "step": 6171 + }, + { + "epoch": 0.9702686238676335, + "grad_norm": 0.1486945003271103, + "learning_rate": 2.6204321647995316e-05, + "loss": 1.0819, + "step": 6172 + }, + { + "epoch": 0.9704258287645661, + "grad_norm": 0.1350465714931488, + "learning_rate": 2.6198150384669584e-05, + "loss": 1.1197, + "step": 6173 + }, + { + "epoch": 0.9705830336614986, + "grad_norm": 0.13862740993499756, + "learning_rate": 2.6191979048165395e-05, + "loss": 1.1544, + "step": 6174 + }, + { + "epoch": 0.9707402385584311, + "grad_norm": 0.17539523541927338, + "learning_rate": 2.618580763885966e-05, + "loss": 1.0008, + "step": 6175 + }, + { + "epoch": 0.9708974434553637, + "grad_norm": 0.13120503723621368, + "learning_rate": 2.6179636157129302e-05, + "loss": 1.1877, + "step": 6176 + }, + { + "epoch": 0.9710546483522962, + "grad_norm": 0.12969760596752167, + "learning_rate": 2.617346460335126e-05, + "loss": 1.0353, + "step": 6177 + }, + { + "epoch": 0.9712118532492288, + "grad_norm": 0.13179469108581543, + "learning_rate": 2.6167292977902452e-05, + "loss": 1.1506, + "step": 6178 + }, + { + "epoch": 0.9713690581461613, + "grad_norm": 0.1329708695411682, + "learning_rate": 2.6161121281159844e-05, + "loss": 1.1509, + "step": 6179 + }, + { + "epoch": 0.9715262630430938, + "grad_norm": 0.15588557720184326, + "learning_rate": 2.6154949513500358e-05, + "loss": 1.1712, + "step": 6180 + }, + { + "epoch": 0.9716834679400264, + "grad_norm": 0.1486482322216034, + "learning_rate": 2.6148777675300957e-05, + "loss": 1.08, + "step": 6181 + }, + { + "epoch": 0.9718406728369589, + "grad_norm": 0.15243008732795715, + "learning_rate": 2.6142605766938582e-05, + "loss": 1.1083, + "step": 6182 + }, + { + "epoch": 0.9719978777338915, + "grad_norm": 0.14067894220352173, + "learning_rate": 2.6136433788790192e-05, + "loss": 1.1021, + "step": 6183 + }, + { + "epoch": 0.972155082630824, + "grad_norm": 0.133790522813797, + "learning_rate": 2.6130261741232755e-05, + "loss": 1.0794, + "step": 6184 + }, + { + "epoch": 0.9723122875277564, + "grad_norm": 0.12939126789569855, + "learning_rate": 2.6124089624643227e-05, + "loss": 1.1018, + "step": 6185 + }, + { + "epoch": 0.972469492424689, + "grad_norm": 0.12914714217185974, + "learning_rate": 2.6117917439398588e-05, + "loss": 1.1129, + "step": 6186 + }, + { + "epoch": 0.9726266973216215, + "grad_norm": 0.13804006576538086, + "learning_rate": 2.6111745185875803e-05, + "loss": 1.1041, + "step": 6187 + }, + { + "epoch": 0.9727839022185542, + "grad_norm": 0.1260584443807602, + "learning_rate": 2.6105572864451844e-05, + "loss": 1.1208, + "step": 6188 + }, + { + "epoch": 0.9729411071154866, + "grad_norm": 0.1362789422273636, + "learning_rate": 2.609940047550371e-05, + "loss": 1.062, + "step": 6189 + }, + { + "epoch": 0.9730983120124191, + "grad_norm": 0.12965382635593414, + "learning_rate": 2.6093228019408374e-05, + "loss": 1.1375, + "step": 6190 + }, + { + "epoch": 0.9732555169093517, + "grad_norm": 0.14326868951320648, + "learning_rate": 2.6087055496542833e-05, + "loss": 1.1179, + "step": 6191 + }, + { + "epoch": 0.9734127218062842, + "grad_norm": 0.13889260590076447, + "learning_rate": 2.6080882907284075e-05, + "loss": 1.1059, + "step": 6192 + }, + { + "epoch": 0.9735699267032168, + "grad_norm": 0.12914693355560303, + "learning_rate": 2.60747102520091e-05, + "loss": 1.1255, + "step": 6193 + }, + { + "epoch": 0.9737271316001493, + "grad_norm": 0.126310333609581, + "learning_rate": 2.6068537531094917e-05, + "loss": 1.1256, + "step": 6194 + }, + { + "epoch": 0.9738843364970818, + "grad_norm": 0.1418408900499344, + "learning_rate": 2.6062364744918522e-05, + "loss": 1.1711, + "step": 6195 + }, + { + "epoch": 0.9740415413940144, + "grad_norm": 0.12994380295276642, + "learning_rate": 2.6056191893856934e-05, + "loss": 1.0924, + "step": 6196 + }, + { + "epoch": 0.9741987462909469, + "grad_norm": 0.1267334669828415, + "learning_rate": 2.605001897828716e-05, + "loss": 1.1808, + "step": 6197 + }, + { + "epoch": 0.9743559511878795, + "grad_norm": 0.1365547627210617, + "learning_rate": 2.6043845998586226e-05, + "loss": 1.0379, + "step": 6198 + }, + { + "epoch": 0.974513156084812, + "grad_norm": 0.1332072913646698, + "learning_rate": 2.603767295513115e-05, + "loss": 1.0663, + "step": 6199 + }, + { + "epoch": 0.9746703609817445, + "grad_norm": 0.13396549224853516, + "learning_rate": 2.6031499848298957e-05, + "loss": 1.0936, + "step": 6200 + }, + { + "epoch": 0.9748275658786771, + "grad_norm": 0.3274918794631958, + "learning_rate": 2.6025326678466676e-05, + "loss": 1.0217, + "step": 6201 + }, + { + "epoch": 0.9749847707756096, + "grad_norm": 0.12621642649173737, + "learning_rate": 2.601915344601134e-05, + "loss": 1.0786, + "step": 6202 + }, + { + "epoch": 0.9751419756725422, + "grad_norm": 0.13528160750865936, + "learning_rate": 2.6012980151309997e-05, + "loss": 1.1678, + "step": 6203 + }, + { + "epoch": 0.9752991805694747, + "grad_norm": 0.12717285752296448, + "learning_rate": 2.6006806794739674e-05, + "loss": 1.0711, + "step": 6204 + }, + { + "epoch": 0.9754563854664072, + "grad_norm": 0.1379251629114151, + "learning_rate": 2.600063337667743e-05, + "loss": 1.0256, + "step": 6205 + }, + { + "epoch": 0.9756135903633398, + "grad_norm": 0.12964457273483276, + "learning_rate": 2.59944598975003e-05, + "loss": 1.2223, + "step": 6206 + }, + { + "epoch": 0.9757707952602723, + "grad_norm": 0.13275854289531708, + "learning_rate": 2.5988286357585338e-05, + "loss": 1.1318, + "step": 6207 + }, + { + "epoch": 0.9759280001572049, + "grad_norm": 0.13796468079090118, + "learning_rate": 2.5982112757309613e-05, + "loss": 1.0826, + "step": 6208 + }, + { + "epoch": 0.9760852050541374, + "grad_norm": 0.15402348339557648, + "learning_rate": 2.597593909705018e-05, + "loss": 1.0671, + "step": 6209 + }, + { + "epoch": 0.9762424099510699, + "grad_norm": 0.1303425133228302, + "learning_rate": 2.5969765377184098e-05, + "loss": 1.0822, + "step": 6210 + }, + { + "epoch": 0.9763996148480025, + "grad_norm": 0.1238548681139946, + "learning_rate": 2.596359159808843e-05, + "loss": 1.0675, + "step": 6211 + }, + { + "epoch": 0.976556819744935, + "grad_norm": 0.15095461905002594, + "learning_rate": 2.595741776014027e-05, + "loss": 0.9866, + "step": 6212 + }, + { + "epoch": 0.9767140246418676, + "grad_norm": 0.12814933061599731, + "learning_rate": 2.5951243863716658e-05, + "loss": 1.1411, + "step": 6213 + }, + { + "epoch": 0.9768712295388001, + "grad_norm": 0.1266186386346817, + "learning_rate": 2.5945069909194698e-05, + "loss": 0.9509, + "step": 6214 + }, + { + "epoch": 0.9770284344357326, + "grad_norm": 0.12861023843288422, + "learning_rate": 2.5938895896951458e-05, + "loss": 1.0975, + "step": 6215 + }, + { + "epoch": 0.9771856393326652, + "grad_norm": 0.1364997923374176, + "learning_rate": 2.5932721827364037e-05, + "loss": 1.1704, + "step": 6216 + }, + { + "epoch": 0.9773428442295977, + "grad_norm": 0.1218840628862381, + "learning_rate": 2.5926547700809512e-05, + "loss": 1.1246, + "step": 6217 + }, + { + "epoch": 0.9775000491265303, + "grad_norm": 0.1375397890806198, + "learning_rate": 2.5920373517664975e-05, + "loss": 1.1121, + "step": 6218 + }, + { + "epoch": 0.9776572540234628, + "grad_norm": 0.14667972922325134, + "learning_rate": 2.5914199278307532e-05, + "loss": 1.1275, + "step": 6219 + }, + { + "epoch": 0.9778144589203954, + "grad_norm": 0.1273236721754074, + "learning_rate": 2.590802498311427e-05, + "loss": 1.0646, + "step": 6220 + }, + { + "epoch": 0.9779716638173279, + "grad_norm": 0.12383287400007248, + "learning_rate": 2.59018506324623e-05, + "loss": 0.9424, + "step": 6221 + }, + { + "epoch": 0.9781288687142604, + "grad_norm": 0.12053514271974564, + "learning_rate": 2.5895676226728722e-05, + "loss": 1.0176, + "step": 6222 + }, + { + "epoch": 0.978286073611193, + "grad_norm": 0.13440188765525818, + "learning_rate": 2.5889501766290646e-05, + "loss": 1.1624, + "step": 6223 + }, + { + "epoch": 0.9784432785081255, + "grad_norm": 0.13317684829235077, + "learning_rate": 2.5883327251525198e-05, + "loss": 1.0566, + "step": 6224 + }, + { + "epoch": 0.9786004834050581, + "grad_norm": 0.13120582699775696, + "learning_rate": 2.587715268280947e-05, + "loss": 1.1637, + "step": 6225 + }, + { + "epoch": 0.9787576883019906, + "grad_norm": 0.1328977346420288, + "learning_rate": 2.5870978060520606e-05, + "loss": 1.0726, + "step": 6226 + }, + { + "epoch": 0.9789148931989231, + "grad_norm": 0.1394965499639511, + "learning_rate": 2.5864803385035697e-05, + "loss": 0.9967, + "step": 6227 + }, + { + "epoch": 0.9790720980958557, + "grad_norm": 0.13868212699890137, + "learning_rate": 2.5858628656731905e-05, + "loss": 1.1235, + "step": 6228 + }, + { + "epoch": 0.9792293029927882, + "grad_norm": 0.1316707730293274, + "learning_rate": 2.585245387598633e-05, + "loss": 1.0445, + "step": 6229 + }, + { + "epoch": 0.9793865078897208, + "grad_norm": 0.1559041291475296, + "learning_rate": 2.5846279043176125e-05, + "loss": 1.1346, + "step": 6230 + }, + { + "epoch": 0.9795437127866533, + "grad_norm": 0.14253021776676178, + "learning_rate": 2.5840104158678413e-05, + "loss": 1.0781, + "step": 6231 + }, + { + "epoch": 0.9797009176835858, + "grad_norm": 0.1312159150838852, + "learning_rate": 2.583392922287033e-05, + "loss": 1.1647, + "step": 6232 + }, + { + "epoch": 0.9798581225805184, + "grad_norm": 0.13869550824165344, + "learning_rate": 2.5827754236129033e-05, + "loss": 1.1461, + "step": 6233 + }, + { + "epoch": 0.9800153274774509, + "grad_norm": 0.14042840898036957, + "learning_rate": 2.5821579198831646e-05, + "loss": 1.1134, + "step": 6234 + }, + { + "epoch": 0.9801725323743835, + "grad_norm": 0.13856734335422516, + "learning_rate": 2.581540411135533e-05, + "loss": 1.1263, + "step": 6235 + }, + { + "epoch": 0.980329737271316, + "grad_norm": 0.13499343395233154, + "learning_rate": 2.5809228974077237e-05, + "loss": 1.0546, + "step": 6236 + }, + { + "epoch": 0.9804869421682485, + "grad_norm": 0.12439657002687454, + "learning_rate": 2.5803053787374508e-05, + "loss": 1.0863, + "step": 6237 + }, + { + "epoch": 0.9806441470651811, + "grad_norm": 0.13374559581279755, + "learning_rate": 2.5796878551624325e-05, + "loss": 1.0763, + "step": 6238 + }, + { + "epoch": 0.9808013519621136, + "grad_norm": 0.13930253684520721, + "learning_rate": 2.579070326720382e-05, + "loss": 1.1024, + "step": 6239 + }, + { + "epoch": 0.9809585568590462, + "grad_norm": 0.12608858942985535, + "learning_rate": 2.5784527934490177e-05, + "loss": 0.9953, + "step": 6240 + }, + { + "epoch": 0.9809585568590462, + "eval_loss": 1.0886449813842773, + "eval_runtime": 2315.6407, + "eval_samples_per_second": 3.998, + "eval_steps_per_second": 1.999, + "step": 6240 + }, + { + "epoch": 0.9811157617559787, + "grad_norm": 0.12502624094486237, + "learning_rate": 2.5778352553860545e-05, + "loss": 1.0484, + "step": 6241 + }, + { + "epoch": 0.9812729666529112, + "grad_norm": 0.126753032207489, + "learning_rate": 2.5772177125692104e-05, + "loss": 1.1274, + "step": 6242 + }, + { + "epoch": 0.9814301715498438, + "grad_norm": 0.1299557089805603, + "learning_rate": 2.5766001650362014e-05, + "loss": 1.1236, + "step": 6243 + }, + { + "epoch": 0.9815873764467763, + "grad_norm": 0.16220217943191528, + "learning_rate": 2.5759826128247468e-05, + "loss": 1.2838, + "step": 6244 + }, + { + "epoch": 0.9817445813437089, + "grad_norm": 0.13452132046222687, + "learning_rate": 2.5753650559725633e-05, + "loss": 1.2423, + "step": 6245 + }, + { + "epoch": 0.9819017862406414, + "grad_norm": 0.12851646542549133, + "learning_rate": 2.5747474945173684e-05, + "loss": 1.1797, + "step": 6246 + }, + { + "epoch": 0.9820589911375739, + "grad_norm": 0.129379004240036, + "learning_rate": 2.5741299284968807e-05, + "loss": 1.0399, + "step": 6247 + }, + { + "epoch": 0.9822161960345065, + "grad_norm": 0.1322186142206192, + "learning_rate": 2.5735123579488195e-05, + "loss": 1.1011, + "step": 6248 + }, + { + "epoch": 0.982373400931439, + "grad_norm": 0.13123802840709686, + "learning_rate": 2.5728947829109028e-05, + "loss": 1.0747, + "step": 6249 + }, + { + "epoch": 0.9825306058283716, + "grad_norm": 0.13443827629089355, + "learning_rate": 2.572277203420851e-05, + "loss": 1.0444, + "step": 6250 + }, + { + "epoch": 0.9826878107253041, + "grad_norm": 0.13353443145751953, + "learning_rate": 2.5716596195163817e-05, + "loss": 1.2098, + "step": 6251 + }, + { + "epoch": 0.9828450156222366, + "grad_norm": 0.12316974252462387, + "learning_rate": 2.5710420312352156e-05, + "loss": 0.9612, + "step": 6252 + }, + { + "epoch": 0.9830022205191692, + "grad_norm": 0.14744386076927185, + "learning_rate": 2.5704244386150727e-05, + "loss": 1.1537, + "step": 6253 + }, + { + "epoch": 0.9831594254161017, + "grad_norm": 0.12601707875728607, + "learning_rate": 2.569806841693673e-05, + "loss": 1.0192, + "step": 6254 + }, + { + "epoch": 0.9833166303130343, + "grad_norm": 0.13158632814884186, + "learning_rate": 2.5691892405087376e-05, + "loss": 1.0729, + "step": 6255 + }, + { + "epoch": 0.9834738352099668, + "grad_norm": 0.13453510403633118, + "learning_rate": 2.568571635097986e-05, + "loss": 1.0895, + "step": 6256 + }, + { + "epoch": 0.9836310401068993, + "grad_norm": 0.1353486329317093, + "learning_rate": 2.5679540254991398e-05, + "loss": 1.1086, + "step": 6257 + }, + { + "epoch": 0.9837882450038319, + "grad_norm": 0.13683579862117767, + "learning_rate": 2.56733641174992e-05, + "loss": 1.1337, + "step": 6258 + }, + { + "epoch": 0.9839454499007644, + "grad_norm": 0.14560097455978394, + "learning_rate": 2.5667187938880492e-05, + "loss": 1.1472, + "step": 6259 + }, + { + "epoch": 0.984102654797697, + "grad_norm": 0.13739261031150818, + "learning_rate": 2.566101171951248e-05, + "loss": 1.0105, + "step": 6260 + }, + { + "epoch": 0.9842598596946295, + "grad_norm": 0.13820849359035492, + "learning_rate": 2.5654835459772387e-05, + "loss": 1.1956, + "step": 6261 + }, + { + "epoch": 0.984417064591562, + "grad_norm": 0.1351996809244156, + "learning_rate": 2.564865916003743e-05, + "loss": 0.9823, + "step": 6262 + }, + { + "epoch": 0.9845742694884946, + "grad_norm": 0.12587571144104004, + "learning_rate": 2.564248282068485e-05, + "loss": 1.1213, + "step": 6263 + }, + { + "epoch": 0.9847314743854271, + "grad_norm": 0.13515618443489075, + "learning_rate": 2.5636306442091852e-05, + "loss": 1.1614, + "step": 6264 + }, + { + "epoch": 0.9848886792823597, + "grad_norm": 0.13904693722724915, + "learning_rate": 2.5630130024635685e-05, + "loss": 1.1153, + "step": 6265 + }, + { + "epoch": 0.9850458841792922, + "grad_norm": 0.1402917504310608, + "learning_rate": 2.5623953568693576e-05, + "loss": 1.0931, + "step": 6266 + }, + { + "epoch": 0.9852030890762247, + "grad_norm": 0.12222890555858612, + "learning_rate": 2.5617777074642747e-05, + "loss": 1.1137, + "step": 6267 + }, + { + "epoch": 0.9853602939731573, + "grad_norm": 0.13428887724876404, + "learning_rate": 2.5611600542860452e-05, + "loss": 1.0492, + "step": 6268 + }, + { + "epoch": 0.9855174988700898, + "grad_norm": 0.13254012167453766, + "learning_rate": 2.560542397372392e-05, + "loss": 1.098, + "step": 6269 + }, + { + "epoch": 0.9856747037670224, + "grad_norm": 0.13445062935352325, + "learning_rate": 2.5599247367610397e-05, + "loss": 1.1914, + "step": 6270 + }, + { + "epoch": 0.9858319086639549, + "grad_norm": 0.12535728514194489, + "learning_rate": 2.5593070724897127e-05, + "loss": 0.9577, + "step": 6271 + }, + { + "epoch": 0.9859891135608875, + "grad_norm": 0.14537984132766724, + "learning_rate": 2.5586894045961347e-05, + "loss": 1.0938, + "step": 6272 + }, + { + "epoch": 0.98614631845782, + "grad_norm": 0.13581863045692444, + "learning_rate": 2.558071733118032e-05, + "loss": 1.1955, + "step": 6273 + }, + { + "epoch": 0.9863035233547525, + "grad_norm": 0.1212281882762909, + "learning_rate": 2.5574540580931282e-05, + "loss": 1.0618, + "step": 6274 + }, + { + "epoch": 0.9864607282516851, + "grad_norm": 0.12363947182893753, + "learning_rate": 2.5568363795591494e-05, + "loss": 1.0225, + "step": 6275 + }, + { + "epoch": 0.9866179331486176, + "grad_norm": 0.1227293461561203, + "learning_rate": 2.55621869755382e-05, + "loss": 1.1876, + "step": 6276 + }, + { + "epoch": 0.9867751380455502, + "grad_norm": 0.14107313752174377, + "learning_rate": 2.5556010121148678e-05, + "loss": 1.1927, + "step": 6277 + }, + { + "epoch": 0.9869323429424827, + "grad_norm": 0.15813635289669037, + "learning_rate": 2.5549833232800162e-05, + "loss": 1.1385, + "step": 6278 + }, + { + "epoch": 0.9870895478394152, + "grad_norm": 0.12730398774147034, + "learning_rate": 2.5543656310869928e-05, + "loss": 1.1213, + "step": 6279 + }, + { + "epoch": 0.9872467527363478, + "grad_norm": 0.1417589783668518, + "learning_rate": 2.5537479355735244e-05, + "loss": 1.0666, + "step": 6280 + }, + { + "epoch": 0.9874039576332803, + "grad_norm": 0.13554830849170685, + "learning_rate": 2.5531302367773347e-05, + "loss": 1.0842, + "step": 6281 + }, + { + "epoch": 0.9875611625302129, + "grad_norm": 0.1291741281747818, + "learning_rate": 2.552512534736154e-05, + "loss": 1.1595, + "step": 6282 + }, + { + "epoch": 0.9877183674271454, + "grad_norm": 0.1397232562303543, + "learning_rate": 2.5518948294877067e-05, + "loss": 1.1331, + "step": 6283 + }, + { + "epoch": 0.9878755723240779, + "grad_norm": 0.12582436203956604, + "learning_rate": 2.5512771210697208e-05, + "loss": 1.0512, + "step": 6284 + }, + { + "epoch": 0.9880327772210105, + "grad_norm": 0.1307750940322876, + "learning_rate": 2.5506594095199237e-05, + "loss": 1.0236, + "step": 6285 + }, + { + "epoch": 0.988189982117943, + "grad_norm": 0.13207784295082092, + "learning_rate": 2.5500416948760413e-05, + "loss": 1.135, + "step": 6286 + }, + { + "epoch": 0.9883471870148756, + "grad_norm": 0.13650716841220856, + "learning_rate": 2.5494239771758045e-05, + "loss": 1.0816, + "step": 6287 + }, + { + "epoch": 0.9885043919118081, + "grad_norm": 0.1528988480567932, + "learning_rate": 2.548806256456938e-05, + "loss": 0.999, + "step": 6288 + }, + { + "epoch": 0.9886615968087406, + "grad_norm": 0.16303037106990814, + "learning_rate": 2.548188532757172e-05, + "loss": 1.0918, + "step": 6289 + }, + { + "epoch": 0.9888188017056732, + "grad_norm": 0.14084094762802124, + "learning_rate": 2.5475708061142327e-05, + "loss": 1.1722, + "step": 6290 + }, + { + "epoch": 0.9889760066026057, + "grad_norm": 0.12933306396007538, + "learning_rate": 2.54695307656585e-05, + "loss": 1.0084, + "step": 6291 + }, + { + "epoch": 0.9891332114995383, + "grad_norm": 0.12259813398122787, + "learning_rate": 2.546335344149752e-05, + "loss": 1.1068, + "step": 6292 + }, + { + "epoch": 0.9892904163964708, + "grad_norm": 0.15874385833740234, + "learning_rate": 2.5457176089036673e-05, + "loss": 1.2221, + "step": 6293 + }, + { + "epoch": 0.9894476212934032, + "grad_norm": 0.1365712434053421, + "learning_rate": 2.5450998708653258e-05, + "loss": 1.0651, + "step": 6294 + }, + { + "epoch": 0.9896048261903359, + "grad_norm": 0.14056332409381866, + "learning_rate": 2.5444821300724548e-05, + "loss": 0.9759, + "step": 6295 + }, + { + "epoch": 0.9897620310872683, + "grad_norm": 0.13075505197048187, + "learning_rate": 2.543864386562785e-05, + "loss": 1.197, + "step": 6296 + }, + { + "epoch": 0.989919235984201, + "grad_norm": 0.14680463075637817, + "learning_rate": 2.543246640374045e-05, + "loss": 1.1073, + "step": 6297 + }, + { + "epoch": 0.9900764408811334, + "grad_norm": 0.1336512565612793, + "learning_rate": 2.542628891543965e-05, + "loss": 1.0275, + "step": 6298 + }, + { + "epoch": 0.9902336457780659, + "grad_norm": 0.13211551308631897, + "learning_rate": 2.5420111401102746e-05, + "loss": 1.1159, + "step": 6299 + }, + { + "epoch": 0.9903908506749985, + "grad_norm": 0.12867555022239685, + "learning_rate": 2.5413933861107038e-05, + "loss": 1.1325, + "step": 6300 + }, + { + "epoch": 0.990548055571931, + "grad_norm": 0.13708464801311493, + "learning_rate": 2.5407756295829822e-05, + "loss": 1.158, + "step": 6301 + }, + { + "epoch": 0.9907052604688636, + "grad_norm": 0.11703987419605255, + "learning_rate": 2.54015787056484e-05, + "loss": 0.9593, + "step": 6302 + }, + { + "epoch": 0.9908624653657961, + "grad_norm": 0.1257048100233078, + "learning_rate": 2.5395401090940087e-05, + "loss": 1.1157, + "step": 6303 + }, + { + "epoch": 0.9910196702627286, + "grad_norm": 0.14393088221549988, + "learning_rate": 2.5389223452082172e-05, + "loss": 1.1057, + "step": 6304 + }, + { + "epoch": 0.9911768751596612, + "grad_norm": 0.1277211755514145, + "learning_rate": 2.538304578945197e-05, + "loss": 1.0921, + "step": 6305 + }, + { + "epoch": 0.9913340800565937, + "grad_norm": 0.13631990551948547, + "learning_rate": 2.537686810342679e-05, + "loss": 1.0043, + "step": 6306 + }, + { + "epoch": 0.9914912849535263, + "grad_norm": 0.12600469589233398, + "learning_rate": 2.5370690394383938e-05, + "loss": 1.0116, + "step": 6307 + }, + { + "epoch": 0.9916484898504588, + "grad_norm": 0.14064371585845947, + "learning_rate": 2.5364512662700733e-05, + "loss": 1.1529, + "step": 6308 + }, + { + "epoch": 0.9918056947473913, + "grad_norm": 0.13178332149982452, + "learning_rate": 2.5358334908754477e-05, + "loss": 1.0628, + "step": 6309 + }, + { + "epoch": 0.9919628996443239, + "grad_norm": 0.12952658534049988, + "learning_rate": 2.5352157132922495e-05, + "loss": 1.12, + "step": 6310 + }, + { + "epoch": 0.9921201045412564, + "grad_norm": 0.15064801275730133, + "learning_rate": 2.534597933558208e-05, + "loss": 1.082, + "step": 6311 + }, + { + "epoch": 0.992277309438189, + "grad_norm": 0.12884730100631714, + "learning_rate": 2.533980151711058e-05, + "loss": 1.1727, + "step": 6312 + }, + { + "epoch": 0.9924345143351215, + "grad_norm": 0.1387583464384079, + "learning_rate": 2.5333623677885286e-05, + "loss": 1.0762, + "step": 6313 + }, + { + "epoch": 0.992591719232054, + "grad_norm": 0.15405724942684174, + "learning_rate": 2.5327445818283528e-05, + "loss": 0.9823, + "step": 6314 + }, + { + "epoch": 0.9927489241289866, + "grad_norm": 0.13114172220230103, + "learning_rate": 2.532126793868263e-05, + "loss": 1.0457, + "step": 6315 + }, + { + "epoch": 0.9929061290259191, + "grad_norm": 0.1413452923297882, + "learning_rate": 2.5315090039459897e-05, + "loss": 1.0372, + "step": 6316 + }, + { + "epoch": 0.9930633339228517, + "grad_norm": 0.131413534283638, + "learning_rate": 2.5308912120992677e-05, + "loss": 1.1606, + "step": 6317 + }, + { + "epoch": 0.9932205388197842, + "grad_norm": 0.1430748999118805, + "learning_rate": 2.530273418365828e-05, + "loss": 1.019, + "step": 6318 + }, + { + "epoch": 0.9933777437167167, + "grad_norm": 0.14210323989391327, + "learning_rate": 2.529655622783402e-05, + "loss": 0.9616, + "step": 6319 + }, + { + "epoch": 0.9935349486136493, + "grad_norm": 0.16489598155021667, + "learning_rate": 2.5290378253897246e-05, + "loss": 1.0651, + "step": 6320 + }, + { + "epoch": 0.9936921535105818, + "grad_norm": 0.13444717228412628, + "learning_rate": 2.528420026222526e-05, + "loss": 1.2318, + "step": 6321 + }, + { + "epoch": 0.9938493584075144, + "grad_norm": 0.13124394416809082, + "learning_rate": 2.5278022253195415e-05, + "loss": 0.9969, + "step": 6322 + }, + { + "epoch": 0.9940065633044469, + "grad_norm": 0.14871492981910706, + "learning_rate": 2.5271844227185025e-05, + "loss": 1.0518, + "step": 6323 + }, + { + "epoch": 0.9941637682013795, + "grad_norm": 0.1398722231388092, + "learning_rate": 2.5265666184571425e-05, + "loss": 1.0509, + "step": 6324 + }, + { + "epoch": 0.994320973098312, + "grad_norm": 0.13985921442508698, + "learning_rate": 2.5259488125731946e-05, + "loss": 1.1401, + "step": 6325 + }, + { + "epoch": 0.9944781779952445, + "grad_norm": 0.12822510302066803, + "learning_rate": 2.525331005104391e-05, + "loss": 1.0589, + "step": 6326 + }, + { + "epoch": 0.9946353828921771, + "grad_norm": 0.14868903160095215, + "learning_rate": 2.5247131960884667e-05, + "loss": 1.1305, + "step": 6327 + }, + { + "epoch": 0.9947925877891096, + "grad_norm": 0.13582776486873627, + "learning_rate": 2.5240953855631544e-05, + "loss": 1.1533, + "step": 6328 + }, + { + "epoch": 0.9949497926860422, + "grad_norm": 0.12167494744062424, + "learning_rate": 2.523477573566188e-05, + "loss": 1.1137, + "step": 6329 + }, + { + "epoch": 0.9951069975829747, + "grad_norm": 0.13908728957176208, + "learning_rate": 2.5228597601353e-05, + "loss": 1.0966, + "step": 6330 + }, + { + "epoch": 0.9952642024799072, + "grad_norm": 0.13457591831684113, + "learning_rate": 2.5222419453082253e-05, + "loss": 1.1333, + "step": 6331 + }, + { + "epoch": 0.9954214073768398, + "grad_norm": 0.13862866163253784, + "learning_rate": 2.5216241291226966e-05, + "loss": 1.0788, + "step": 6332 + }, + { + "epoch": 0.9955786122737723, + "grad_norm": 0.12104427069425583, + "learning_rate": 2.521006311616449e-05, + "loss": 1.0803, + "step": 6333 + }, + { + "epoch": 0.9957358171707049, + "grad_norm": 0.12326116114854813, + "learning_rate": 2.520388492827216e-05, + "loss": 1.0303, + "step": 6334 + }, + { + "epoch": 0.9958930220676374, + "grad_norm": 0.1255730241537094, + "learning_rate": 2.5197706727927305e-05, + "loss": 1.0595, + "step": 6335 + }, + { + "epoch": 0.9960502269645699, + "grad_norm": 0.1284172683954239, + "learning_rate": 2.5191528515507285e-05, + "loss": 1.0375, + "step": 6336 + }, + { + "epoch": 0.9962074318615025, + "grad_norm": 0.14409960806369781, + "learning_rate": 2.5185350291389425e-05, + "loss": 1.0852, + "step": 6337 + }, + { + "epoch": 0.996364636758435, + "grad_norm": 0.13724562525749207, + "learning_rate": 2.5179172055951084e-05, + "loss": 1.2068, + "step": 6338 + }, + { + "epoch": 0.9965218416553676, + "grad_norm": 0.1279664784669876, + "learning_rate": 2.5172993809569583e-05, + "loss": 0.9881, + "step": 6339 + }, + { + "epoch": 0.9966790465523001, + "grad_norm": 0.13202717900276184, + "learning_rate": 2.5166815552622284e-05, + "loss": 1.1256, + "step": 6340 + }, + { + "epoch": 0.9968362514492326, + "grad_norm": 0.1382056623697281, + "learning_rate": 2.5160637285486528e-05, + "loss": 1.1352, + "step": 6341 + }, + { + "epoch": 0.9969934563461652, + "grad_norm": 0.13173623383045197, + "learning_rate": 2.5154459008539656e-05, + "loss": 1.0508, + "step": 6342 + }, + { + "epoch": 0.9971506612430977, + "grad_norm": 0.13691706955432892, + "learning_rate": 2.5148280722159016e-05, + "loss": 1.2331, + "step": 6343 + }, + { + "epoch": 0.9973078661400303, + "grad_norm": 0.13203303515911102, + "learning_rate": 2.5142102426721954e-05, + "loss": 1.1185, + "step": 6344 + }, + { + "epoch": 0.9974650710369628, + "grad_norm": 0.14125902950763702, + "learning_rate": 2.5135924122605813e-05, + "loss": 1.0026, + "step": 6345 + }, + { + "epoch": 0.9976222759338953, + "grad_norm": 0.14264841377735138, + "learning_rate": 2.5129745810187943e-05, + "loss": 0.9871, + "step": 6346 + }, + { + "epoch": 0.9977794808308279, + "grad_norm": 0.1278458684682846, + "learning_rate": 2.5123567489845697e-05, + "loss": 1.0998, + "step": 6347 + }, + { + "epoch": 0.9979366857277604, + "grad_norm": 0.13270807266235352, + "learning_rate": 2.5117389161956417e-05, + "loss": 0.9923, + "step": 6348 + }, + { + "epoch": 0.998093890624693, + "grad_norm": 0.1347019523382187, + "learning_rate": 2.511121082689745e-05, + "loss": 1.0918, + "step": 6349 + }, + { + "epoch": 0.9982510955216255, + "grad_norm": 0.1361355185508728, + "learning_rate": 2.5105032485046155e-05, + "loss": 1.0562, + "step": 6350 + }, + { + "epoch": 0.998408300418558, + "grad_norm": 0.15076801180839539, + "learning_rate": 2.5098854136779875e-05, + "loss": 1.0956, + "step": 6351 + }, + { + "epoch": 0.9985655053154906, + "grad_norm": 0.142885223031044, + "learning_rate": 2.509267578247596e-05, + "loss": 1.0504, + "step": 6352 + }, + { + "epoch": 0.9987227102124231, + "grad_norm": 0.13812819123268127, + "learning_rate": 2.5086497422511763e-05, + "loss": 1.0593, + "step": 6353 + }, + { + "epoch": 0.9988799151093557, + "grad_norm": 0.1277756243944168, + "learning_rate": 2.5080319057264624e-05, + "loss": 1.0753, + "step": 6354 + }, + { + "epoch": 0.9990371200062882, + "grad_norm": 0.16406641900539398, + "learning_rate": 2.5074140687111908e-05, + "loss": 1.0903, + "step": 6355 + }, + { + "epoch": 0.9991943249032207, + "grad_norm": 0.15688486397266388, + "learning_rate": 2.506796231243096e-05, + "loss": 1.1403, + "step": 6356 + }, + { + "epoch": 0.9993515298001533, + "grad_norm": 0.13128222525119781, + "learning_rate": 2.5061783933599136e-05, + "loss": 1.0178, + "step": 6357 + }, + { + "epoch": 0.9995087346970858, + "grad_norm": 0.1294260174036026, + "learning_rate": 2.5055605550993782e-05, + "loss": 1.0899, + "step": 6358 + }, + { + "epoch": 0.9996659395940184, + "grad_norm": 0.1612711399793625, + "learning_rate": 2.5049427164992252e-05, + "loss": 1.0954, + "step": 6359 + }, + { + "epoch": 0.9998231444909509, + "grad_norm": 0.14568625390529633, + "learning_rate": 2.5043248775971896e-05, + "loss": 1.1663, + "step": 6360 + }, + { + "epoch": 0.9999803493878834, + "grad_norm": 0.13954225182533264, + "learning_rate": 2.503707038431007e-05, + "loss": 1.0526, + "step": 6361 + }, + { + "epoch": 1.0001572048969325, + "grad_norm": 0.1299106329679489, + "learning_rate": 2.5030891990384132e-05, + "loss": 1.9274, + "step": 6362 + }, + { + "epoch": 1.000314409793865, + "grad_norm": 0.13823172450065613, + "learning_rate": 2.5024713594571426e-05, + "loss": 1.0489, + "step": 6363 + }, + { + "epoch": 1.0004716146907977, + "grad_norm": 0.12603749334812164, + "learning_rate": 2.501853519724931e-05, + "loss": 1.1243, + "step": 6364 + }, + { + "epoch": 1.0006288195877302, + "grad_norm": 0.16069631278514862, + "learning_rate": 2.5012356798795123e-05, + "loss": 1.1332, + "step": 6365 + }, + { + "epoch": 1.0007860244846627, + "grad_norm": 0.1280946582555771, + "learning_rate": 2.5006178399586246e-05, + "loss": 1.0832, + "step": 6366 + }, + { + "epoch": 1.0009432293815952, + "grad_norm": 0.14590582251548767, + "learning_rate": 2.5e-05, + "loss": 1.1259, + "step": 6367 + }, + { + "epoch": 1.0011004342785277, + "grad_norm": 0.13885480165481567, + "learning_rate": 2.4993821600413764e-05, + "loss": 1.053, + "step": 6368 + }, + { + "epoch": 1.0012576391754604, + "grad_norm": 0.1317761093378067, + "learning_rate": 2.4987643201204883e-05, + "loss": 1.0102, + "step": 6369 + }, + { + "epoch": 1.001414844072393, + "grad_norm": 0.13858036696910858, + "learning_rate": 2.4981464802750698e-05, + "loss": 1.1517, + "step": 6370 + }, + { + "epoch": 1.0015720489693254, + "grad_norm": 0.13943566381931305, + "learning_rate": 2.497528640542859e-05, + "loss": 1.1237, + "step": 6371 + }, + { + "epoch": 1.0017292538662579, + "grad_norm": 0.12917844951152802, + "learning_rate": 2.4969108009615874e-05, + "loss": 1.0486, + "step": 6372 + }, + { + "epoch": 1.0018864587631904, + "grad_norm": 0.12163997441530228, + "learning_rate": 2.496292961568993e-05, + "loss": 1.0612, + "step": 6373 + }, + { + "epoch": 1.002043663660123, + "grad_norm": 0.1303284466266632, + "learning_rate": 2.495675122402811e-05, + "loss": 1.0513, + "step": 6374 + }, + { + "epoch": 1.0022008685570556, + "grad_norm": 0.1599988043308258, + "learning_rate": 2.4950572835007757e-05, + "loss": 1.1369, + "step": 6375 + }, + { + "epoch": 1.002358073453988, + "grad_norm": 0.12788580358028412, + "learning_rate": 2.494439444900623e-05, + "loss": 1.0901, + "step": 6376 + }, + { + "epoch": 1.0025152783509206, + "grad_norm": 0.13581426441669464, + "learning_rate": 2.493821606640087e-05, + "loss": 1.0147, + "step": 6377 + }, + { + "epoch": 1.002672483247853, + "grad_norm": 0.13627904653549194, + "learning_rate": 2.493203768756904e-05, + "loss": 1.0719, + "step": 6378 + }, + { + "epoch": 1.0028296881447858, + "grad_norm": 0.14597761631011963, + "learning_rate": 2.4925859312888098e-05, + "loss": 1.089, + "step": 6379 + }, + { + "epoch": 1.0029868930417183, + "grad_norm": 0.1353975385427475, + "learning_rate": 2.4919680942735378e-05, + "loss": 1.1104, + "step": 6380 + }, + { + "epoch": 1.0031440979386508, + "grad_norm": 0.13635997474193573, + "learning_rate": 2.4913502577488253e-05, + "loss": 1.1243, + "step": 6381 + }, + { + "epoch": 1.0033013028355833, + "grad_norm": 0.13782460987567902, + "learning_rate": 2.4907324217524044e-05, + "loss": 1.2011, + "step": 6382 + }, + { + "epoch": 1.0034585077325158, + "grad_norm": 0.1340285837650299, + "learning_rate": 2.4901145863220127e-05, + "loss": 1.1395, + "step": 6383 + }, + { + "epoch": 1.0036157126294485, + "grad_norm": 0.14155049622058868, + "learning_rate": 2.4894967514953847e-05, + "loss": 1.1667, + "step": 6384 + }, + { + "epoch": 1.003772917526381, + "grad_norm": 0.16872969269752502, + "learning_rate": 2.488878917310255e-05, + "loss": 1.0246, + "step": 6385 + }, + { + "epoch": 1.0039301224233135, + "grad_norm": 0.13815303146839142, + "learning_rate": 2.4882610838043592e-05, + "loss": 1.1142, + "step": 6386 + }, + { + "epoch": 1.004087327320246, + "grad_norm": 0.1526249796152115, + "learning_rate": 2.487643251015431e-05, + "loss": 1.0075, + "step": 6387 + }, + { + "epoch": 1.0042445322171785, + "grad_norm": 0.12049448490142822, + "learning_rate": 2.4870254189812056e-05, + "loss": 0.9185, + "step": 6388 + }, + { + "epoch": 1.0044017371141112, + "grad_norm": 0.1362065076828003, + "learning_rate": 2.4864075877394192e-05, + "loss": 0.9745, + "step": 6389 + }, + { + "epoch": 1.0045589420110437, + "grad_norm": 0.1315588355064392, + "learning_rate": 2.4857897573278055e-05, + "loss": 1.0904, + "step": 6390 + }, + { + "epoch": 1.0047161469079762, + "grad_norm": 0.15180738270282745, + "learning_rate": 2.4851719277840996e-05, + "loss": 1.1136, + "step": 6391 + }, + { + "epoch": 1.0048733518049087, + "grad_norm": 0.12382183969020844, + "learning_rate": 2.4845540991460353e-05, + "loss": 1.0592, + "step": 6392 + }, + { + "epoch": 1.0050305567018412, + "grad_norm": 0.14550195634365082, + "learning_rate": 2.483936271451347e-05, + "loss": 1.0294, + "step": 6393 + }, + { + "epoch": 1.0051877615987739, + "grad_norm": 0.1435728818178177, + "learning_rate": 2.4833184447377718e-05, + "loss": 1.0657, + "step": 6394 + }, + { + "epoch": 1.0053449664957064, + "grad_norm": 0.1408451497554779, + "learning_rate": 2.4827006190430423e-05, + "loss": 1.0373, + "step": 6395 + }, + { + "epoch": 1.0055021713926389, + "grad_norm": 0.14456969499588013, + "learning_rate": 2.482082794404893e-05, + "loss": 1.1237, + "step": 6396 + }, + { + "epoch": 1.0056593762895714, + "grad_norm": 0.13339631259441376, + "learning_rate": 2.481464970861058e-05, + "loss": 1.0776, + "step": 6397 + }, + { + "epoch": 1.0058165811865039, + "grad_norm": 0.11940862983465195, + "learning_rate": 2.4808471484492717e-05, + "loss": 1.0656, + "step": 6398 + }, + { + "epoch": 1.0059737860834366, + "grad_norm": 0.13193495571613312, + "learning_rate": 2.48022932720727e-05, + "loss": 0.9743, + "step": 6399 + }, + { + "epoch": 1.006130990980369, + "grad_norm": 0.13147178292274475, + "learning_rate": 2.4796115071727847e-05, + "loss": 1.0621, + "step": 6400 + }, + { + "epoch": 1.006130990980369, + "eval_loss": 1.0882635116577148, + "eval_runtime": 2349.8687, + "eval_samples_per_second": 3.94, + "eval_steps_per_second": 1.97, + "step": 6400 + }, + { + "epoch": 1.0062881958773016, + "grad_norm": 0.13858306407928467, + "learning_rate": 2.4789936883835516e-05, + "loss": 1.0142, + "step": 6401 + }, + { + "epoch": 1.006445400774234, + "grad_norm": 0.13647626340389252, + "learning_rate": 2.478375870877304e-05, + "loss": 1.035, + "step": 6402 + }, + { + "epoch": 1.0066026056711666, + "grad_norm": 0.14345145225524902, + "learning_rate": 2.4777580546917753e-05, + "loss": 1.0996, + "step": 6403 + }, + { + "epoch": 1.0067598105680993, + "grad_norm": 0.15048737823963165, + "learning_rate": 2.477140239864701e-05, + "loss": 1.122, + "step": 6404 + }, + { + "epoch": 1.0069170154650318, + "grad_norm": 0.13660579919815063, + "learning_rate": 2.4765224264338126e-05, + "loss": 1.1144, + "step": 6405 + }, + { + "epoch": 1.0070742203619643, + "grad_norm": 0.1633283495903015, + "learning_rate": 2.4759046144368462e-05, + "loss": 1.0682, + "step": 6406 + }, + { + "epoch": 1.0072314252588968, + "grad_norm": 0.13291175663471222, + "learning_rate": 2.475286803911534e-05, + "loss": 1.0505, + "step": 6407 + }, + { + "epoch": 1.0073886301558292, + "grad_norm": 0.13030974566936493, + "learning_rate": 2.474668994895609e-05, + "loss": 0.9586, + "step": 6408 + }, + { + "epoch": 1.007545835052762, + "grad_norm": 0.16185326874256134, + "learning_rate": 2.474051187426807e-05, + "loss": 1.1532, + "step": 6409 + }, + { + "epoch": 1.0077030399496945, + "grad_norm": 0.134599506855011, + "learning_rate": 2.4734333815428578e-05, + "loss": 1.1307, + "step": 6410 + }, + { + "epoch": 1.007860244846627, + "grad_norm": 0.14329268038272858, + "learning_rate": 2.4728155772814987e-05, + "loss": 1.0548, + "step": 6411 + }, + { + "epoch": 1.0080174497435594, + "grad_norm": 0.16063834726810455, + "learning_rate": 2.472197774680459e-05, + "loss": 1.0179, + "step": 6412 + }, + { + "epoch": 1.008174654640492, + "grad_norm": 0.1387328952550888, + "learning_rate": 2.471579973777474e-05, + "loss": 0.9968, + "step": 6413 + }, + { + "epoch": 1.0083318595374247, + "grad_norm": 0.1397387534379959, + "learning_rate": 2.4709621746102767e-05, + "loss": 1.074, + "step": 6414 + }, + { + "epoch": 1.0084890644343572, + "grad_norm": 0.13462291657924652, + "learning_rate": 2.470344377216598e-05, + "loss": 1.0553, + "step": 6415 + }, + { + "epoch": 1.0086462693312896, + "grad_norm": 0.12612217664718628, + "learning_rate": 2.4697265816341734e-05, + "loss": 1.0954, + "step": 6416 + }, + { + "epoch": 1.0088034742282221, + "grad_norm": 0.13030223548412323, + "learning_rate": 2.4691087879007326e-05, + "loss": 1.0118, + "step": 6417 + }, + { + "epoch": 1.0089606791251549, + "grad_norm": 0.13945329189300537, + "learning_rate": 2.4684909960540098e-05, + "loss": 1.0743, + "step": 6418 + }, + { + "epoch": 1.0091178840220874, + "grad_norm": 0.1253206878900528, + "learning_rate": 2.4678732061317378e-05, + "loss": 1.126, + "step": 6419 + }, + { + "epoch": 1.0092750889190198, + "grad_norm": 0.13366207480430603, + "learning_rate": 2.4672554181716474e-05, + "loss": 1.0279, + "step": 6420 + }, + { + "epoch": 1.0094322938159523, + "grad_norm": 0.1361444741487503, + "learning_rate": 2.4666376322114727e-05, + "loss": 1.0714, + "step": 6421 + }, + { + "epoch": 1.0095894987128848, + "grad_norm": 0.1402117908000946, + "learning_rate": 2.4660198482889427e-05, + "loss": 1.0793, + "step": 6422 + }, + { + "epoch": 1.0097467036098176, + "grad_norm": 0.14054907858371735, + "learning_rate": 2.465402066441792e-05, + "loss": 1.1108, + "step": 6423 + }, + { + "epoch": 1.00990390850675, + "grad_norm": 0.14116577804088593, + "learning_rate": 2.4647842867077517e-05, + "loss": 1.1291, + "step": 6424 + }, + { + "epoch": 1.0100611134036825, + "grad_norm": 0.1428392082452774, + "learning_rate": 2.464166509124553e-05, + "loss": 1.2933, + "step": 6425 + }, + { + "epoch": 1.010218318300615, + "grad_norm": 0.14450033009052277, + "learning_rate": 2.4635487337299276e-05, + "loss": 1.1368, + "step": 6426 + }, + { + "epoch": 1.0103755231975475, + "grad_norm": 0.1361183524131775, + "learning_rate": 2.4629309605616065e-05, + "loss": 0.9797, + "step": 6427 + }, + { + "epoch": 1.0105327280944802, + "grad_norm": 0.14418378472328186, + "learning_rate": 2.4623131896573214e-05, + "loss": 1.1191, + "step": 6428 + }, + { + "epoch": 1.0106899329914127, + "grad_norm": 0.14843346178531647, + "learning_rate": 2.4616954210548036e-05, + "loss": 0.9986, + "step": 6429 + }, + { + "epoch": 1.0108471378883452, + "grad_norm": 0.12695366144180298, + "learning_rate": 2.4610776547917834e-05, + "loss": 0.8308, + "step": 6430 + }, + { + "epoch": 1.0110043427852777, + "grad_norm": 0.14178785681724548, + "learning_rate": 2.460459890905992e-05, + "loss": 1.1103, + "step": 6431 + }, + { + "epoch": 1.0111615476822102, + "grad_norm": 0.12391351908445358, + "learning_rate": 2.4598421294351605e-05, + "loss": 1.0589, + "step": 6432 + }, + { + "epoch": 1.011318752579143, + "grad_norm": 0.15103794634342194, + "learning_rate": 2.4592243704170177e-05, + "loss": 1.0231, + "step": 6433 + }, + { + "epoch": 1.0114759574760754, + "grad_norm": 0.1457628309726715, + "learning_rate": 2.4586066138892975e-05, + "loss": 1.1587, + "step": 6434 + }, + { + "epoch": 1.011633162373008, + "grad_norm": 0.13931329548358917, + "learning_rate": 2.4579888598897256e-05, + "loss": 1.1835, + "step": 6435 + }, + { + "epoch": 1.0117903672699404, + "grad_norm": 0.13331428170204163, + "learning_rate": 2.457371108456035e-05, + "loss": 1.0825, + "step": 6436 + }, + { + "epoch": 1.011947572166873, + "grad_norm": 0.15705661475658417, + "learning_rate": 2.4567533596259555e-05, + "loss": 1.2163, + "step": 6437 + }, + { + "epoch": 1.0121047770638056, + "grad_norm": 0.138403058052063, + "learning_rate": 2.4561356134372156e-05, + "loss": 1.1735, + "step": 6438 + }, + { + "epoch": 1.0122619819607381, + "grad_norm": 0.1449640989303589, + "learning_rate": 2.455517869927546e-05, + "loss": 1.2208, + "step": 6439 + }, + { + "epoch": 1.0124191868576706, + "grad_norm": 0.14121830463409424, + "learning_rate": 2.4549001291346748e-05, + "loss": 0.9478, + "step": 6440 + }, + { + "epoch": 1.0125763917546031, + "grad_norm": 0.14475440979003906, + "learning_rate": 2.4542823910963326e-05, + "loss": 0.8971, + "step": 6441 + }, + { + "epoch": 1.0127335966515356, + "grad_norm": 0.14058662950992584, + "learning_rate": 2.4536646558502484e-05, + "loss": 1.1625, + "step": 6442 + }, + { + "epoch": 1.0128908015484683, + "grad_norm": 0.15417198836803436, + "learning_rate": 2.4530469234341504e-05, + "loss": 1.1301, + "step": 6443 + }, + { + "epoch": 1.0130480064454008, + "grad_norm": 0.1284470111131668, + "learning_rate": 2.4524291938857682e-05, + "loss": 1.0453, + "step": 6444 + }, + { + "epoch": 1.0132052113423333, + "grad_norm": 0.16337668895721436, + "learning_rate": 2.4518114672428286e-05, + "loss": 1.1067, + "step": 6445 + }, + { + "epoch": 1.0133624162392658, + "grad_norm": 0.14603199064731598, + "learning_rate": 2.4511937435430618e-05, + "loss": 1.0693, + "step": 6446 + }, + { + "epoch": 1.0135196211361983, + "grad_norm": 0.13317929208278656, + "learning_rate": 2.450576022824196e-05, + "loss": 1.0609, + "step": 6447 + }, + { + "epoch": 1.013676826033131, + "grad_norm": 0.15966759622097015, + "learning_rate": 2.4499583051239583e-05, + "loss": 0.929, + "step": 6448 + }, + { + "epoch": 1.0138340309300635, + "grad_norm": 0.13287387788295746, + "learning_rate": 2.4493405904800772e-05, + "loss": 1.1264, + "step": 6449 + }, + { + "epoch": 1.013991235826996, + "grad_norm": 0.13444417715072632, + "learning_rate": 2.4487228789302798e-05, + "loss": 1.1118, + "step": 6450 + }, + { + "epoch": 1.0141484407239285, + "grad_norm": 0.14054074883460999, + "learning_rate": 2.4481051705122936e-05, + "loss": 1.0325, + "step": 6451 + }, + { + "epoch": 1.014305645620861, + "grad_norm": 0.13343651592731476, + "learning_rate": 2.4474874652638466e-05, + "loss": 1.0846, + "step": 6452 + }, + { + "epoch": 1.0144628505177937, + "grad_norm": 0.1339552253484726, + "learning_rate": 2.446869763222665e-05, + "loss": 1.0696, + "step": 6453 + }, + { + "epoch": 1.0146200554147262, + "grad_norm": 0.15116937458515167, + "learning_rate": 2.446252064426477e-05, + "loss": 1.0111, + "step": 6454 + }, + { + "epoch": 1.0147772603116587, + "grad_norm": 0.12791171669960022, + "learning_rate": 2.4456343689130078e-05, + "loss": 1.1446, + "step": 6455 + }, + { + "epoch": 1.0149344652085912, + "grad_norm": 0.13993287086486816, + "learning_rate": 2.4450166767199837e-05, + "loss": 1.0458, + "step": 6456 + }, + { + "epoch": 1.0150916701055237, + "grad_norm": 0.13733619451522827, + "learning_rate": 2.4443989878851328e-05, + "loss": 1.131, + "step": 6457 + }, + { + "epoch": 1.0152488750024564, + "grad_norm": 0.15433922410011292, + "learning_rate": 2.4437813024461797e-05, + "loss": 1.1019, + "step": 6458 + }, + { + "epoch": 1.015406079899389, + "grad_norm": 0.13969793915748596, + "learning_rate": 2.4431636204408515e-05, + "loss": 1.0439, + "step": 6459 + }, + { + "epoch": 1.0155632847963214, + "grad_norm": 0.15085668861865997, + "learning_rate": 2.4425459419068724e-05, + "loss": 1.0518, + "step": 6460 + }, + { + "epoch": 1.015720489693254, + "grad_norm": 0.13246352970600128, + "learning_rate": 2.441928266881968e-05, + "loss": 1.0987, + "step": 6461 + }, + { + "epoch": 1.0158776945901864, + "grad_norm": 0.13948623836040497, + "learning_rate": 2.441310595403866e-05, + "loss": 1.1742, + "step": 6462 + }, + { + "epoch": 1.0160348994871191, + "grad_norm": 0.1362038552761078, + "learning_rate": 2.440692927510288e-05, + "loss": 1.2179, + "step": 6463 + }, + { + "epoch": 1.0161921043840516, + "grad_norm": 0.1325484812259674, + "learning_rate": 2.440075263238961e-05, + "loss": 1.1831, + "step": 6464 + }, + { + "epoch": 1.016349309280984, + "grad_norm": 0.1380319744348526, + "learning_rate": 2.4394576026276083e-05, + "loss": 1.0784, + "step": 6465 + }, + { + "epoch": 1.0165065141779166, + "grad_norm": 0.13605383038520813, + "learning_rate": 2.438839945713955e-05, + "loss": 1.0525, + "step": 6466 + }, + { + "epoch": 1.016663719074849, + "grad_norm": 0.12745130062103271, + "learning_rate": 2.4382222925357263e-05, + "loss": 1.0636, + "step": 6467 + }, + { + "epoch": 1.0168209239717818, + "grad_norm": 0.13201391696929932, + "learning_rate": 2.4376046431306433e-05, + "loss": 1.0135, + "step": 6468 + }, + { + "epoch": 1.0169781288687143, + "grad_norm": 0.1400802582502365, + "learning_rate": 2.4369869975364324e-05, + "loss": 1.1093, + "step": 6469 + }, + { + "epoch": 1.0171353337656468, + "grad_norm": 0.14345529675483704, + "learning_rate": 2.436369355790815e-05, + "loss": 1.0652, + "step": 6470 + }, + { + "epoch": 1.0172925386625793, + "grad_norm": 0.14295189082622528, + "learning_rate": 2.435751717931516e-05, + "loss": 1.1475, + "step": 6471 + }, + { + "epoch": 1.0174497435595118, + "grad_norm": 0.13964888453483582, + "learning_rate": 2.4351340839962578e-05, + "loss": 1.1328, + "step": 6472 + }, + { + "epoch": 1.0176069484564445, + "grad_norm": 0.14445465803146362, + "learning_rate": 2.4345164540227616e-05, + "loss": 1.1664, + "step": 6473 + }, + { + "epoch": 1.017764153353377, + "grad_norm": 0.13211970031261444, + "learning_rate": 2.4338988280487536e-05, + "loss": 0.9766, + "step": 6474 + }, + { + "epoch": 1.0179213582503095, + "grad_norm": 0.1310061812400818, + "learning_rate": 2.4332812061119514e-05, + "loss": 1.0741, + "step": 6475 + }, + { + "epoch": 1.018078563147242, + "grad_norm": 0.1573285013437271, + "learning_rate": 2.43266358825008e-05, + "loss": 0.9688, + "step": 6476 + }, + { + "epoch": 1.0182357680441745, + "grad_norm": 0.14303627610206604, + "learning_rate": 2.4320459745008608e-05, + "loss": 1.0668, + "step": 6477 + }, + { + "epoch": 1.0183929729411072, + "grad_norm": 0.14365479350090027, + "learning_rate": 2.4314283649020146e-05, + "loss": 1.1021, + "step": 6478 + }, + { + "epoch": 1.0185501778380397, + "grad_norm": 0.12707237899303436, + "learning_rate": 2.430810759491264e-05, + "loss": 1.0715, + "step": 6479 + }, + { + "epoch": 1.0187073827349722, + "grad_norm": 0.15213000774383545, + "learning_rate": 2.430193158306327e-05, + "loss": 1.088, + "step": 6480 + }, + { + "epoch": 1.0188645876319047, + "grad_norm": 0.12318674474954605, + "learning_rate": 2.4295755613849272e-05, + "loss": 0.9919, + "step": 6481 + }, + { + "epoch": 1.0190217925288372, + "grad_norm": 0.14013998210430145, + "learning_rate": 2.428957968764785e-05, + "loss": 1.1628, + "step": 6482 + }, + { + "epoch": 1.01917899742577, + "grad_norm": 0.12830761075019836, + "learning_rate": 2.428340380483619e-05, + "loss": 1.1338, + "step": 6483 + }, + { + "epoch": 1.0193362023227024, + "grad_norm": 0.16488949954509735, + "learning_rate": 2.4277227965791502e-05, + "loss": 1.2237, + "step": 6484 + }, + { + "epoch": 1.0194934072196349, + "grad_norm": 0.15099136531352997, + "learning_rate": 2.4271052170890975e-05, + "loss": 1.0959, + "step": 6485 + }, + { + "epoch": 1.0196506121165674, + "grad_norm": 0.15304993093013763, + "learning_rate": 2.4264876420511807e-05, + "loss": 1.071, + "step": 6486 + }, + { + "epoch": 1.0198078170134999, + "grad_norm": 0.147121399641037, + "learning_rate": 2.4258700715031196e-05, + "loss": 1.0002, + "step": 6487 + }, + { + "epoch": 1.0199650219104326, + "grad_norm": 0.1368657350540161, + "learning_rate": 2.4252525054826322e-05, + "loss": 1.1563, + "step": 6488 + }, + { + "epoch": 1.020122226807365, + "grad_norm": 0.13219110667705536, + "learning_rate": 2.424634944027438e-05, + "loss": 1.0881, + "step": 6489 + }, + { + "epoch": 1.0202794317042976, + "grad_norm": 0.12664826214313507, + "learning_rate": 2.4240173871752538e-05, + "loss": 1.0571, + "step": 6490 + }, + { + "epoch": 1.02043663660123, + "grad_norm": 0.13178735971450806, + "learning_rate": 2.4233998349637982e-05, + "loss": 1.174, + "step": 6491 + }, + { + "epoch": 1.0205938414981626, + "grad_norm": 0.13245108723640442, + "learning_rate": 2.4227822874307905e-05, + "loss": 1.0059, + "step": 6492 + }, + { + "epoch": 1.0207510463950953, + "grad_norm": 0.14155948162078857, + "learning_rate": 2.422164744613946e-05, + "loss": 1.1411, + "step": 6493 + }, + { + "epoch": 1.0209082512920278, + "grad_norm": 0.133402019739151, + "learning_rate": 2.4215472065509835e-05, + "loss": 1.0712, + "step": 6494 + }, + { + "epoch": 1.0210654561889603, + "grad_norm": 0.1284060925245285, + "learning_rate": 2.4209296732796186e-05, + "loss": 1.0701, + "step": 6495 + }, + { + "epoch": 1.0212226610858928, + "grad_norm": 0.1364603191614151, + "learning_rate": 2.4203121448375677e-05, + "loss": 1.1074, + "step": 6496 + }, + { + "epoch": 1.0213798659828253, + "grad_norm": 0.15322890877723694, + "learning_rate": 2.4196946212625495e-05, + "loss": 1.1142, + "step": 6497 + }, + { + "epoch": 1.021537070879758, + "grad_norm": 0.15149898827075958, + "learning_rate": 2.4190771025922765e-05, + "loss": 1.2059, + "step": 6498 + }, + { + "epoch": 1.0216942757766905, + "grad_norm": 0.15193890035152435, + "learning_rate": 2.4184595888644676e-05, + "loss": 1.0939, + "step": 6499 + }, + { + "epoch": 1.021851480673623, + "grad_norm": 0.13252975046634674, + "learning_rate": 2.417842080116836e-05, + "loss": 1.0236, + "step": 6500 + }, + { + "epoch": 1.0220086855705555, + "grad_norm": 0.13443732261657715, + "learning_rate": 2.4172245763870977e-05, + "loss": 1.0545, + "step": 6501 + }, + { + "epoch": 1.022165890467488, + "grad_norm": 0.13161426782608032, + "learning_rate": 2.416607077712968e-05, + "loss": 1.0364, + "step": 6502 + }, + { + "epoch": 1.0223230953644207, + "grad_norm": 0.1371748447418213, + "learning_rate": 2.4159895841321592e-05, + "loss": 1.0448, + "step": 6503 + }, + { + "epoch": 1.0224803002613532, + "grad_norm": 0.13697615265846252, + "learning_rate": 2.4153720956823884e-05, + "loss": 1.2092, + "step": 6504 + }, + { + "epoch": 1.0226375051582857, + "grad_norm": 0.1284060925245285, + "learning_rate": 2.414754612401367e-05, + "loss": 1.1229, + "step": 6505 + }, + { + "epoch": 1.0227947100552182, + "grad_norm": 0.1457095444202423, + "learning_rate": 2.41413713432681e-05, + "loss": 1.1019, + "step": 6506 + }, + { + "epoch": 1.0229519149521507, + "grad_norm": 0.1317131519317627, + "learning_rate": 2.4135196614964305e-05, + "loss": 1.1085, + "step": 6507 + }, + { + "epoch": 1.0231091198490834, + "grad_norm": 0.15324752032756805, + "learning_rate": 2.41290219394794e-05, + "loss": 1.0719, + "step": 6508 + }, + { + "epoch": 1.0232663247460159, + "grad_norm": 0.14084196090698242, + "learning_rate": 2.412284731719054e-05, + "loss": 1.0143, + "step": 6509 + }, + { + "epoch": 1.0234235296429484, + "grad_norm": 0.14040584862232208, + "learning_rate": 2.4116672748474808e-05, + "loss": 1.1363, + "step": 6510 + }, + { + "epoch": 1.0235807345398809, + "grad_norm": 0.1352028101682663, + "learning_rate": 2.4110498233709353e-05, + "loss": 1.0848, + "step": 6511 + }, + { + "epoch": 1.0237379394368133, + "grad_norm": 0.13853280246257782, + "learning_rate": 2.4104323773271284e-05, + "loss": 1.0919, + "step": 6512 + }, + { + "epoch": 1.023895144333746, + "grad_norm": 0.14174368977546692, + "learning_rate": 2.4098149367537706e-05, + "loss": 1.1235, + "step": 6513 + }, + { + "epoch": 1.0240523492306786, + "grad_norm": 0.13649876415729523, + "learning_rate": 2.409197501688574e-05, + "loss": 1.0375, + "step": 6514 + }, + { + "epoch": 1.024209554127611, + "grad_norm": 0.1531284749507904, + "learning_rate": 2.4085800721692473e-05, + "loss": 1.0019, + "step": 6515 + }, + { + "epoch": 1.0243667590245435, + "grad_norm": 0.12809935212135315, + "learning_rate": 2.4079626482335028e-05, + "loss": 1.0562, + "step": 6516 + }, + { + "epoch": 1.0245239639214763, + "grad_norm": 0.1316598802804947, + "learning_rate": 2.4073452299190497e-05, + "loss": 1.0424, + "step": 6517 + }, + { + "epoch": 1.0246811688184088, + "grad_norm": 0.12484854459762573, + "learning_rate": 2.406727817263597e-05, + "loss": 0.9395, + "step": 6518 + }, + { + "epoch": 1.0248383737153413, + "grad_norm": 0.139391228556633, + "learning_rate": 2.406110410304855e-05, + "loss": 1.0713, + "step": 6519 + }, + { + "epoch": 1.0249955786122737, + "grad_norm": 0.13923563063144684, + "learning_rate": 2.4054930090805308e-05, + "loss": 1.0547, + "step": 6520 + }, + { + "epoch": 1.0251527835092062, + "grad_norm": 0.14140965044498444, + "learning_rate": 2.4048756136283345e-05, + "loss": 1.016, + "step": 6521 + }, + { + "epoch": 1.0253099884061387, + "grad_norm": 0.15269608795642853, + "learning_rate": 2.4042582239859744e-05, + "loss": 1.0897, + "step": 6522 + }, + { + "epoch": 1.0254671933030715, + "grad_norm": 0.1279529184103012, + "learning_rate": 2.403640840191157e-05, + "loss": 0.9957, + "step": 6523 + }, + { + "epoch": 1.025624398200004, + "grad_norm": 0.1656380295753479, + "learning_rate": 2.4030234622815915e-05, + "loss": 1.0772, + "step": 6524 + }, + { + "epoch": 1.0257816030969364, + "grad_norm": 0.17439262568950653, + "learning_rate": 2.402406090294983e-05, + "loss": 1.0627, + "step": 6525 + }, + { + "epoch": 1.025938807993869, + "grad_norm": 0.13184446096420288, + "learning_rate": 2.4017887242690383e-05, + "loss": 1.1178, + "step": 6526 + }, + { + "epoch": 1.0260960128908017, + "grad_norm": 0.1401938945055008, + "learning_rate": 2.4011713642414664e-05, + "loss": 1.0651, + "step": 6527 + }, + { + "epoch": 1.0262532177877342, + "grad_norm": 0.13300791382789612, + "learning_rate": 2.4005540102499706e-05, + "loss": 1.0455, + "step": 6528 + }, + { + "epoch": 1.0264104226846666, + "grad_norm": 0.15082721412181854, + "learning_rate": 2.3999366623322584e-05, + "loss": 1.0668, + "step": 6529 + }, + { + "epoch": 1.0265676275815991, + "grad_norm": 0.12946711480617523, + "learning_rate": 2.3993193205260332e-05, + "loss": 1.084, + "step": 6530 + }, + { + "epoch": 1.0267248324785316, + "grad_norm": 0.1557915359735489, + "learning_rate": 2.3987019848690006e-05, + "loss": 0.9574, + "step": 6531 + }, + { + "epoch": 1.0268820373754644, + "grad_norm": 0.14675593376159668, + "learning_rate": 2.3980846553988663e-05, + "loss": 1.2071, + "step": 6532 + }, + { + "epoch": 1.0270392422723968, + "grad_norm": 0.1399666965007782, + "learning_rate": 2.397467332153333e-05, + "loss": 1.0803, + "step": 6533 + }, + { + "epoch": 1.0271964471693293, + "grad_norm": 0.1544557511806488, + "learning_rate": 2.3968500151701052e-05, + "loss": 1.0974, + "step": 6534 + }, + { + "epoch": 1.0273536520662618, + "grad_norm": 0.1377180516719818, + "learning_rate": 2.396232704486886e-05, + "loss": 1.0661, + "step": 6535 + }, + { + "epoch": 1.0275108569631943, + "grad_norm": 0.13871777057647705, + "learning_rate": 2.3956154001413776e-05, + "loss": 1.1283, + "step": 6536 + }, + { + "epoch": 1.027668061860127, + "grad_norm": 0.14054739475250244, + "learning_rate": 2.3949981021712847e-05, + "loss": 0.9715, + "step": 6537 + }, + { + "epoch": 1.0278252667570595, + "grad_norm": 0.1351758986711502, + "learning_rate": 2.394380810614307e-05, + "loss": 1.1713, + "step": 6538 + }, + { + "epoch": 1.027982471653992, + "grad_norm": 0.14041534066200256, + "learning_rate": 2.3937635255081487e-05, + "loss": 1.118, + "step": 6539 + }, + { + "epoch": 1.0281396765509245, + "grad_norm": 0.15355032682418823, + "learning_rate": 2.3931462468905092e-05, + "loss": 1.0319, + "step": 6540 + }, + { + "epoch": 1.028296881447857, + "grad_norm": 0.1415993571281433, + "learning_rate": 2.3925289747990902e-05, + "loss": 1.0679, + "step": 6541 + }, + { + "epoch": 1.0284540863447897, + "grad_norm": 0.15271086990833282, + "learning_rate": 2.3919117092715938e-05, + "loss": 1.0838, + "step": 6542 + }, + { + "epoch": 1.0286112912417222, + "grad_norm": 0.12374086678028107, + "learning_rate": 2.3912944503457173e-05, + "loss": 1.0098, + "step": 6543 + }, + { + "epoch": 1.0287684961386547, + "grad_norm": 0.14749404788017273, + "learning_rate": 2.390677198059164e-05, + "loss": 1.0522, + "step": 6544 + }, + { + "epoch": 1.0289257010355872, + "grad_norm": 0.14672315120697021, + "learning_rate": 2.39005995244963e-05, + "loss": 1.1425, + "step": 6545 + }, + { + "epoch": 1.0290829059325197, + "grad_norm": 0.14794443547725677, + "learning_rate": 2.389442713554816e-05, + "loss": 0.9882, + "step": 6546 + }, + { + "epoch": 1.0292401108294524, + "grad_norm": 0.19367676973342896, + "learning_rate": 2.388825481412421e-05, + "loss": 1.0241, + "step": 6547 + }, + { + "epoch": 1.029397315726385, + "grad_norm": 0.13971471786499023, + "learning_rate": 2.388208256060142e-05, + "loss": 1.0916, + "step": 6548 + }, + { + "epoch": 1.0295545206233174, + "grad_norm": 0.134723961353302, + "learning_rate": 2.3875910375356783e-05, + "loss": 1.1148, + "step": 6549 + }, + { + "epoch": 1.02971172552025, + "grad_norm": 0.14524322748184204, + "learning_rate": 2.3869738258767248e-05, + "loss": 1.0904, + "step": 6550 + }, + { + "epoch": 1.0298689304171824, + "grad_norm": 0.14155533909797668, + "learning_rate": 2.386356621120981e-05, + "loss": 1.059, + "step": 6551 + }, + { + "epoch": 1.0300261353141151, + "grad_norm": 0.14879059791564941, + "learning_rate": 2.3857394233061424e-05, + "loss": 1.0893, + "step": 6552 + }, + { + "epoch": 1.0301833402110476, + "grad_norm": 0.1372535526752472, + "learning_rate": 2.3851222324699052e-05, + "loss": 1.0988, + "step": 6553 + }, + { + "epoch": 1.0303405451079801, + "grad_norm": 0.12561412155628204, + "learning_rate": 2.384505048649964e-05, + "loss": 1.0437, + "step": 6554 + }, + { + "epoch": 1.0304977500049126, + "grad_norm": 0.13449938595294952, + "learning_rate": 2.383887871884016e-05, + "loss": 1.1098, + "step": 6555 + }, + { + "epoch": 1.030654954901845, + "grad_norm": 0.14272966980934143, + "learning_rate": 2.3832707022097547e-05, + "loss": 1.0568, + "step": 6556 + }, + { + "epoch": 1.0308121597987778, + "grad_norm": 0.14943712949752808, + "learning_rate": 2.3826535396648753e-05, + "loss": 1.0375, + "step": 6557 + }, + { + "epoch": 1.0309693646957103, + "grad_norm": 0.14257186651229858, + "learning_rate": 2.3820363842870704e-05, + "loss": 1.0666, + "step": 6558 + }, + { + "epoch": 1.0311265695926428, + "grad_norm": 0.1672075390815735, + "learning_rate": 2.3814192361140342e-05, + "loss": 1.0933, + "step": 6559 + }, + { + "epoch": 1.0312837744895753, + "grad_norm": 0.13151812553405762, + "learning_rate": 2.3808020951834614e-05, + "loss": 1.0801, + "step": 6560 + }, + { + "epoch": 1.0312837744895753, + "eval_loss": 1.0874505043029785, + "eval_runtime": 2322.7762, + "eval_samples_per_second": 3.986, + "eval_steps_per_second": 1.993, + "step": 6560 + }, + { + "epoch": 1.0314409793865078, + "grad_norm": 0.12800061702728271, + "learning_rate": 2.3801849615330412e-05, + "loss": 1.1397, + "step": 6561 + }, + { + "epoch": 1.0315981842834405, + "grad_norm": 0.13340391218662262, + "learning_rate": 2.379567835200469e-05, + "loss": 1.1216, + "step": 6562 + }, + { + "epoch": 1.031755389180373, + "grad_norm": 0.14047126471996307, + "learning_rate": 2.3789507162234347e-05, + "loss": 1.1148, + "step": 6563 + }, + { + "epoch": 1.0319125940773055, + "grad_norm": 0.14181764423847198, + "learning_rate": 2.3783336046396302e-05, + "loss": 1.1003, + "step": 6564 + }, + { + "epoch": 1.032069798974238, + "grad_norm": 0.14892888069152832, + "learning_rate": 2.3777165004867466e-05, + "loss": 1.0541, + "step": 6565 + }, + { + "epoch": 1.0322270038711705, + "grad_norm": 0.14764860272407532, + "learning_rate": 2.3770994038024725e-05, + "loss": 0.9247, + "step": 6566 + }, + { + "epoch": 1.0323842087681032, + "grad_norm": 0.14466546475887299, + "learning_rate": 2.3764823146245008e-05, + "loss": 1.1699, + "step": 6567 + }, + { + "epoch": 1.0325414136650357, + "grad_norm": 0.17898455262184143, + "learning_rate": 2.3758652329905186e-05, + "loss": 1.0758, + "step": 6568 + }, + { + "epoch": 1.0326986185619682, + "grad_norm": 0.13833412528038025, + "learning_rate": 2.375248158938216e-05, + "loss": 1.0654, + "step": 6569 + }, + { + "epoch": 1.0328558234589007, + "grad_norm": 0.13184380531311035, + "learning_rate": 2.374631092505281e-05, + "loss": 1.0344, + "step": 6570 + }, + { + "epoch": 1.0330130283558332, + "grad_norm": 0.12904293835163116, + "learning_rate": 2.3740140337294014e-05, + "loss": 1.0803, + "step": 6571 + }, + { + "epoch": 1.033170233252766, + "grad_norm": 0.13617415726184845, + "learning_rate": 2.3733969826482666e-05, + "loss": 1.1581, + "step": 6572 + }, + { + "epoch": 1.0333274381496984, + "grad_norm": 0.13786554336547852, + "learning_rate": 2.3727799392995607e-05, + "loss": 1.1613, + "step": 6573 + }, + { + "epoch": 1.033484643046631, + "grad_norm": 0.14242956042289734, + "learning_rate": 2.372162903720973e-05, + "loss": 1.1469, + "step": 6574 + }, + { + "epoch": 1.0336418479435634, + "grad_norm": 0.1505378782749176, + "learning_rate": 2.3715458759501893e-05, + "loss": 1.0686, + "step": 6575 + }, + { + "epoch": 1.033799052840496, + "grad_norm": 0.13413214683532715, + "learning_rate": 2.370928856024894e-05, + "loss": 1.1497, + "step": 6576 + }, + { + "epoch": 1.0339562577374286, + "grad_norm": 0.15098947286605835, + "learning_rate": 2.3703118439827736e-05, + "loss": 1.1479, + "step": 6577 + }, + { + "epoch": 1.034113462634361, + "grad_norm": 0.1412951797246933, + "learning_rate": 2.3696948398615112e-05, + "loss": 1.06, + "step": 6578 + }, + { + "epoch": 1.0342706675312936, + "grad_norm": 0.13955259323120117, + "learning_rate": 2.369077843698793e-05, + "loss": 0.9353, + "step": 6579 + }, + { + "epoch": 1.034427872428226, + "grad_norm": 0.1352538764476776, + "learning_rate": 2.3684608555323022e-05, + "loss": 1.1048, + "step": 6580 + }, + { + "epoch": 1.0345850773251586, + "grad_norm": 0.1281067281961441, + "learning_rate": 2.3678438753997217e-05, + "loss": 1.0011, + "step": 6581 + }, + { + "epoch": 1.0347422822220913, + "grad_norm": 0.1379912793636322, + "learning_rate": 2.3672269033387347e-05, + "loss": 1.067, + "step": 6582 + }, + { + "epoch": 1.0348994871190238, + "grad_norm": 0.1383332759141922, + "learning_rate": 2.3666099393870223e-05, + "loss": 1.1522, + "step": 6583 + }, + { + "epoch": 1.0350566920159563, + "grad_norm": 0.14555424451828003, + "learning_rate": 2.365992983582268e-05, + "loss": 1.0956, + "step": 6584 + }, + { + "epoch": 1.0352138969128888, + "grad_norm": 0.14239095151424408, + "learning_rate": 2.3653760359621527e-05, + "loss": 0.9666, + "step": 6585 + }, + { + "epoch": 1.0353711018098213, + "grad_norm": 0.13024353981018066, + "learning_rate": 2.3647590965643563e-05, + "loss": 1.1228, + "step": 6586 + }, + { + "epoch": 1.035528306706754, + "grad_norm": 0.13997390866279602, + "learning_rate": 2.3641421654265604e-05, + "loss": 0.8697, + "step": 6587 + }, + { + "epoch": 1.0356855116036865, + "grad_norm": 0.15388011932373047, + "learning_rate": 2.3635252425864438e-05, + "loss": 1.1431, + "step": 6588 + }, + { + "epoch": 1.035842716500619, + "grad_norm": 0.13432490825653076, + "learning_rate": 2.3629083280816854e-05, + "loss": 1.0263, + "step": 6589 + }, + { + "epoch": 1.0359999213975515, + "grad_norm": 0.13459350168704987, + "learning_rate": 2.362291421949966e-05, + "loss": 1.0741, + "step": 6590 + }, + { + "epoch": 1.036157126294484, + "grad_norm": 0.14230036735534668, + "learning_rate": 2.3616745242289617e-05, + "loss": 1.1533, + "step": 6591 + }, + { + "epoch": 1.0363143311914167, + "grad_norm": 0.1396442949771881, + "learning_rate": 2.361057634956352e-05, + "loss": 1.0272, + "step": 6592 + }, + { + "epoch": 1.0364715360883492, + "grad_norm": 0.15559406578540802, + "learning_rate": 2.3604407541698122e-05, + "loss": 1.087, + "step": 6593 + }, + { + "epoch": 1.0366287409852817, + "grad_norm": 0.13708201050758362, + "learning_rate": 2.3598238819070202e-05, + "loss": 1.0741, + "step": 6594 + }, + { + "epoch": 1.0367859458822142, + "grad_norm": 0.1490902155637741, + "learning_rate": 2.359207018205653e-05, + "loss": 1.2703, + "step": 6595 + }, + { + "epoch": 1.0369431507791467, + "grad_norm": 0.15177972614765167, + "learning_rate": 2.3585901631033854e-05, + "loss": 1.2111, + "step": 6596 + }, + { + "epoch": 1.0371003556760794, + "grad_norm": 0.1324014663696289, + "learning_rate": 2.3579733166378925e-05, + "loss": 1.0493, + "step": 6597 + }, + { + "epoch": 1.0372575605730119, + "grad_norm": 0.14288055896759033, + "learning_rate": 2.357356478846849e-05, + "loss": 1.1638, + "step": 6598 + }, + { + "epoch": 1.0374147654699444, + "grad_norm": 0.14222581684589386, + "learning_rate": 2.356739649767928e-05, + "loss": 1.116, + "step": 6599 + }, + { + "epoch": 1.0375719703668769, + "grad_norm": 0.13752423226833344, + "learning_rate": 2.356122829438806e-05, + "loss": 1.0332, + "step": 6600 + }, + { + "epoch": 1.0377291752638094, + "grad_norm": 0.14417120814323425, + "learning_rate": 2.3555060178971524e-05, + "loss": 1.1577, + "step": 6601 + }, + { + "epoch": 1.037886380160742, + "grad_norm": 0.16917288303375244, + "learning_rate": 2.354889215180643e-05, + "loss": 1.1147, + "step": 6602 + }, + { + "epoch": 1.0380435850576746, + "grad_norm": 0.12715454399585724, + "learning_rate": 2.3542724213269472e-05, + "loss": 1.149, + "step": 6603 + }, + { + "epoch": 1.038200789954607, + "grad_norm": 0.14440487325191498, + "learning_rate": 2.3536556363737375e-05, + "loss": 1.1086, + "step": 6604 + }, + { + "epoch": 1.0383579948515396, + "grad_norm": 0.13209013640880585, + "learning_rate": 2.3530388603586854e-05, + "loss": 1.0373, + "step": 6605 + }, + { + "epoch": 1.038515199748472, + "grad_norm": 0.14722275733947754, + "learning_rate": 2.3524220933194594e-05, + "loss": 1.182, + "step": 6606 + }, + { + "epoch": 1.0386724046454048, + "grad_norm": 0.12757335603237152, + "learning_rate": 2.3518053352937324e-05, + "loss": 1.051, + "step": 6607 + }, + { + "epoch": 1.0388296095423373, + "grad_norm": 0.15868781507015228, + "learning_rate": 2.3511885863191696e-05, + "loss": 0.9525, + "step": 6608 + }, + { + "epoch": 1.0389868144392698, + "grad_norm": 0.12984570860862732, + "learning_rate": 2.3505718464334426e-05, + "loss": 1.0681, + "step": 6609 + }, + { + "epoch": 1.0391440193362023, + "grad_norm": 0.13670901954174042, + "learning_rate": 2.349955115674219e-05, + "loss": 1.0987, + "step": 6610 + }, + { + "epoch": 1.0393012242331348, + "grad_norm": 0.13572148978710175, + "learning_rate": 2.3493383940791656e-05, + "loss": 1.1916, + "step": 6611 + }, + { + "epoch": 1.0394584291300675, + "grad_norm": 0.1378479152917862, + "learning_rate": 2.3487216816859504e-05, + "loss": 1.1251, + "step": 6612 + }, + { + "epoch": 1.039615634027, + "grad_norm": 0.13000816106796265, + "learning_rate": 2.348104978532238e-05, + "loss": 1.1511, + "step": 6613 + }, + { + "epoch": 1.0397728389239325, + "grad_norm": 0.13067586719989777, + "learning_rate": 2.3474882846556964e-05, + "loss": 0.9957, + "step": 6614 + }, + { + "epoch": 1.039930043820865, + "grad_norm": 0.149440199136734, + "learning_rate": 2.3468716000939907e-05, + "loss": 1.0988, + "step": 6615 + }, + { + "epoch": 1.0400872487177975, + "grad_norm": 0.13088715076446533, + "learning_rate": 2.3462549248847846e-05, + "loss": 1.0889, + "step": 6616 + }, + { + "epoch": 1.0402444536147302, + "grad_norm": 0.137752965092659, + "learning_rate": 2.345638259065743e-05, + "loss": 1.1514, + "step": 6617 + }, + { + "epoch": 1.0404016585116627, + "grad_norm": 0.13600778579711914, + "learning_rate": 2.345021602674528e-05, + "loss": 1.0494, + "step": 6618 + }, + { + "epoch": 1.0405588634085952, + "grad_norm": 0.1397845596075058, + "learning_rate": 2.3444049557488048e-05, + "loss": 1.0998, + "step": 6619 + }, + { + "epoch": 1.0407160683055277, + "grad_norm": 0.12723177671432495, + "learning_rate": 2.3437883183262353e-05, + "loss": 0.9676, + "step": 6620 + }, + { + "epoch": 1.0408732732024601, + "grad_norm": 0.13335742056369781, + "learning_rate": 2.34317169044448e-05, + "loss": 1.0878, + "step": 6621 + }, + { + "epoch": 1.0410304780993929, + "grad_norm": 0.1352199912071228, + "learning_rate": 2.3425550721412023e-05, + "loss": 1.0795, + "step": 6622 + }, + { + "epoch": 1.0411876829963254, + "grad_norm": 0.14530329406261444, + "learning_rate": 2.3419384634540612e-05, + "loss": 1.1439, + "step": 6623 + }, + { + "epoch": 1.0413448878932579, + "grad_norm": 0.13339565694332123, + "learning_rate": 2.3413218644207165e-05, + "loss": 1.05, + "step": 6624 + }, + { + "epoch": 1.0415020927901903, + "grad_norm": 0.14778709411621094, + "learning_rate": 2.34070527507883e-05, + "loss": 1.1343, + "step": 6625 + }, + { + "epoch": 1.041659297687123, + "grad_norm": 0.1390741467475891, + "learning_rate": 2.3400886954660582e-05, + "loss": 1.0687, + "step": 6626 + }, + { + "epoch": 1.0418165025840556, + "grad_norm": 0.14989504218101501, + "learning_rate": 2.3394721256200615e-05, + "loss": 1.185, + "step": 6627 + }, + { + "epoch": 1.041973707480988, + "grad_norm": 0.151128888130188, + "learning_rate": 2.3388555655784958e-05, + "loss": 1.0161, + "step": 6628 + }, + { + "epoch": 1.0421309123779205, + "grad_norm": 0.1283392757177353, + "learning_rate": 2.3382390153790186e-05, + "loss": 1.084, + "step": 6629 + }, + { + "epoch": 1.042288117274853, + "grad_norm": 0.1323283612728119, + "learning_rate": 2.337622475059288e-05, + "loss": 1.0872, + "step": 6630 + }, + { + "epoch": 1.0424453221717855, + "grad_norm": 0.1192610114812851, + "learning_rate": 2.337005944656958e-05, + "loss": 0.9583, + "step": 6631 + }, + { + "epoch": 1.0426025270687183, + "grad_norm": 0.1214812770485878, + "learning_rate": 2.3363894242096855e-05, + "loss": 1.0549, + "step": 6632 + }, + { + "epoch": 1.0427597319656507, + "grad_norm": 0.13652664422988892, + "learning_rate": 2.3357729137551236e-05, + "loss": 1.0338, + "step": 6633 + }, + { + "epoch": 1.0429169368625832, + "grad_norm": 0.12820550799369812, + "learning_rate": 2.3351564133309274e-05, + "loss": 1.051, + "step": 6634 + }, + { + "epoch": 1.0430741417595157, + "grad_norm": 0.15153834223747253, + "learning_rate": 2.3345399229747515e-05, + "loss": 1.0832, + "step": 6635 + }, + { + "epoch": 1.0432313466564485, + "grad_norm": 0.15335457026958466, + "learning_rate": 2.333923442724246e-05, + "loss": 1.0397, + "step": 6636 + }, + { + "epoch": 1.043388551553381, + "grad_norm": 0.12528929114341736, + "learning_rate": 2.3333069726170657e-05, + "loss": 0.9939, + "step": 6637 + }, + { + "epoch": 1.0435457564503134, + "grad_norm": 0.1348038911819458, + "learning_rate": 2.332690512690861e-05, + "loss": 1.0895, + "step": 6638 + }, + { + "epoch": 1.043702961347246, + "grad_norm": 0.1262330412864685, + "learning_rate": 2.332074062983283e-05, + "loss": 1.0959, + "step": 6639 + }, + { + "epoch": 1.0438601662441784, + "grad_norm": 0.1513056755065918, + "learning_rate": 2.331457623531983e-05, + "loss": 1.2109, + "step": 6640 + }, + { + "epoch": 1.0440173711411112, + "grad_norm": 0.12648868560791016, + "learning_rate": 2.330841194374609e-05, + "loss": 0.9869, + "step": 6641 + }, + { + "epoch": 1.0441745760380436, + "grad_norm": 0.1470041275024414, + "learning_rate": 2.330224775548812e-05, + "loss": 1.1068, + "step": 6642 + }, + { + "epoch": 1.0443317809349761, + "grad_norm": 0.14008654654026031, + "learning_rate": 2.3296083670922396e-05, + "loss": 1.029, + "step": 6643 + }, + { + "epoch": 1.0444889858319086, + "grad_norm": 0.1475469022989273, + "learning_rate": 2.3289919690425396e-05, + "loss": 1.0889, + "step": 6644 + }, + { + "epoch": 1.0446461907288411, + "grad_norm": 0.14843840897083282, + "learning_rate": 2.3283755814373602e-05, + "loss": 1.0337, + "step": 6645 + }, + { + "epoch": 1.0448033956257738, + "grad_norm": 0.1407221108675003, + "learning_rate": 2.327759204314346e-05, + "loss": 1.2032, + "step": 6646 + }, + { + "epoch": 1.0449606005227063, + "grad_norm": 0.15893158316612244, + "learning_rate": 2.327142837711146e-05, + "loss": 1.1338, + "step": 6647 + }, + { + "epoch": 1.0451178054196388, + "grad_norm": 0.13186657428741455, + "learning_rate": 2.3265264816654026e-05, + "loss": 1.0376, + "step": 6648 + }, + { + "epoch": 1.0452750103165713, + "grad_norm": 0.1285805106163025, + "learning_rate": 2.3259101362147617e-05, + "loss": 0.9454, + "step": 6649 + }, + { + "epoch": 1.0454322152135038, + "grad_norm": 0.15267476439476013, + "learning_rate": 2.325293801396868e-05, + "loss": 1.1629, + "step": 6650 + }, + { + "epoch": 1.0455894201104365, + "grad_norm": 0.14108550548553467, + "learning_rate": 2.3246774772493636e-05, + "loss": 1.1764, + "step": 6651 + }, + { + "epoch": 1.045746625007369, + "grad_norm": 0.13639004528522491, + "learning_rate": 2.3240611638098924e-05, + "loss": 1.1107, + "step": 6652 + }, + { + "epoch": 1.0459038299043015, + "grad_norm": 0.14737004041671753, + "learning_rate": 2.323444861116095e-05, + "loss": 1.0584, + "step": 6653 + }, + { + "epoch": 1.046061034801234, + "grad_norm": 0.1460399180650711, + "learning_rate": 2.3228285692056138e-05, + "loss": 1.0588, + "step": 6654 + }, + { + "epoch": 1.0462182396981665, + "grad_norm": 0.13852059841156006, + "learning_rate": 2.3222122881160907e-05, + "loss": 1.1782, + "step": 6655 + }, + { + "epoch": 1.0463754445950992, + "grad_norm": 0.1420789659023285, + "learning_rate": 2.3215960178851637e-05, + "loss": 1.1909, + "step": 6656 + }, + { + "epoch": 1.0465326494920317, + "grad_norm": 0.1295926421880722, + "learning_rate": 2.3209797585504736e-05, + "loss": 0.9921, + "step": 6657 + }, + { + "epoch": 1.0466898543889642, + "grad_norm": 0.135441854596138, + "learning_rate": 2.320363510149658e-05, + "loss": 1.1333, + "step": 6658 + }, + { + "epoch": 1.0468470592858967, + "grad_norm": 0.12456726282835007, + "learning_rate": 2.3197472727203558e-05, + "loss": 1.0216, + "step": 6659 + }, + { + "epoch": 1.0470042641828292, + "grad_norm": 0.13905774056911469, + "learning_rate": 2.3191310463002053e-05, + "loss": 1.062, + "step": 6660 + }, + { + "epoch": 1.047161469079762, + "grad_norm": 0.13988234102725983, + "learning_rate": 2.3185148309268416e-05, + "loss": 1.1052, + "step": 6661 + }, + { + "epoch": 1.0473186739766944, + "grad_norm": 0.1325724869966507, + "learning_rate": 2.3178986266379022e-05, + "loss": 1.1315, + "step": 6662 + }, + { + "epoch": 1.047475878873627, + "grad_norm": 0.1671644151210785, + "learning_rate": 2.317282433471021e-05, + "loss": 0.9749, + "step": 6663 + }, + { + "epoch": 1.0476330837705594, + "grad_norm": 0.15419074892997742, + "learning_rate": 2.3166662514638332e-05, + "loss": 0.9919, + "step": 6664 + }, + { + "epoch": 1.047790288667492, + "grad_norm": 0.14497032761573792, + "learning_rate": 2.316050080653974e-05, + "loss": 1.0888, + "step": 6665 + }, + { + "epoch": 1.0479474935644246, + "grad_norm": 0.15157732367515564, + "learning_rate": 2.315433921079076e-05, + "loss": 1.1411, + "step": 6666 + }, + { + "epoch": 1.0481046984613571, + "grad_norm": 0.15685704350471497, + "learning_rate": 2.3148177727767723e-05, + "loss": 1.0979, + "step": 6667 + }, + { + "epoch": 1.0482619033582896, + "grad_norm": 0.13416093587875366, + "learning_rate": 2.3142016357846938e-05, + "loss": 1.1193, + "step": 6668 + }, + { + "epoch": 1.048419108255222, + "grad_norm": 0.1444331407546997, + "learning_rate": 2.3135855101404718e-05, + "loss": 1.269, + "step": 6669 + }, + { + "epoch": 1.0485763131521546, + "grad_norm": 0.15641018748283386, + "learning_rate": 2.3129693958817394e-05, + "loss": 1.0193, + "step": 6670 + }, + { + "epoch": 1.0487335180490873, + "grad_norm": 0.13697724044322968, + "learning_rate": 2.312353293046123e-05, + "loss": 1.0507, + "step": 6671 + }, + { + "epoch": 1.0488907229460198, + "grad_norm": 0.13118332624435425, + "learning_rate": 2.3117372016712546e-05, + "loss": 0.9716, + "step": 6672 + }, + { + "epoch": 1.0490479278429523, + "grad_norm": 0.13458602130413055, + "learning_rate": 2.3111211217947608e-05, + "loss": 0.9716, + "step": 6673 + }, + { + "epoch": 1.0492051327398848, + "grad_norm": 0.13774584233760834, + "learning_rate": 2.3105050534542706e-05, + "loss": 0.9323, + "step": 6674 + }, + { + "epoch": 1.0493623376368173, + "grad_norm": 0.1374964714050293, + "learning_rate": 2.3098889966874113e-05, + "loss": 1.1443, + "step": 6675 + }, + { + "epoch": 1.04951954253375, + "grad_norm": 0.12411943823099136, + "learning_rate": 2.3092729515318077e-05, + "loss": 0.9678, + "step": 6676 + }, + { + "epoch": 1.0496767474306825, + "grad_norm": 0.1425098031759262, + "learning_rate": 2.308656918025087e-05, + "loss": 1.1489, + "step": 6677 + }, + { + "epoch": 1.049833952327615, + "grad_norm": 0.1463724970817566, + "learning_rate": 2.308040896204874e-05, + "loss": 1.1247, + "step": 6678 + }, + { + "epoch": 1.0499911572245475, + "grad_norm": 0.1474163979291916, + "learning_rate": 2.3074248861087926e-05, + "loss": 1.0334, + "step": 6679 + }, + { + "epoch": 1.05014836212148, + "grad_norm": 0.1415747106075287, + "learning_rate": 2.306808887774467e-05, + "loss": 1.0549, + "step": 6680 + }, + { + "epoch": 1.0503055670184127, + "grad_norm": 0.14319616556167603, + "learning_rate": 2.306192901239518e-05, + "loss": 1.1104, + "step": 6681 + }, + { + "epoch": 1.0504627719153452, + "grad_norm": 0.1388777792453766, + "learning_rate": 2.30557692654157e-05, + "loss": 1.0659, + "step": 6682 + }, + { + "epoch": 1.0506199768122777, + "grad_norm": 0.14447036385536194, + "learning_rate": 2.3049609637182444e-05, + "loss": 1.1091, + "step": 6683 + }, + { + "epoch": 1.0507771817092102, + "grad_norm": 0.1469668298959732, + "learning_rate": 2.30434501280716e-05, + "loss": 1.1594, + "step": 6684 + }, + { + "epoch": 1.0509343866061427, + "grad_norm": 0.13659493625164032, + "learning_rate": 2.3037290738459386e-05, + "loss": 1.1755, + "step": 6685 + }, + { + "epoch": 1.0510915915030754, + "grad_norm": 0.14157113432884216, + "learning_rate": 2.303113146872198e-05, + "loss": 1.1112, + "step": 6686 + }, + { + "epoch": 1.051248796400008, + "grad_norm": 0.14882853627204895, + "learning_rate": 2.3024972319235565e-05, + "loss": 1.1403, + "step": 6687 + }, + { + "epoch": 1.0514060012969404, + "grad_norm": 0.14302197098731995, + "learning_rate": 2.301881329037634e-05, + "loss": 1.0715, + "step": 6688 + }, + { + "epoch": 1.051563206193873, + "grad_norm": 0.1544528305530548, + "learning_rate": 2.3012654382520457e-05, + "loss": 1.1278, + "step": 6689 + }, + { + "epoch": 1.0517204110908054, + "grad_norm": 0.154702827334404, + "learning_rate": 2.300649559604409e-05, + "loss": 1.0315, + "step": 6690 + }, + { + "epoch": 1.051877615987738, + "grad_norm": 0.12326081097126007, + "learning_rate": 2.3000336931323377e-05, + "loss": 0.8942, + "step": 6691 + }, + { + "epoch": 1.0520348208846706, + "grad_norm": 0.14815938472747803, + "learning_rate": 2.2994178388734472e-05, + "loss": 1.0832, + "step": 6692 + }, + { + "epoch": 1.052192025781603, + "grad_norm": 0.14106889069080353, + "learning_rate": 2.2988019968653532e-05, + "loss": 0.9192, + "step": 6693 + }, + { + "epoch": 1.0523492306785356, + "grad_norm": 0.13490770757198334, + "learning_rate": 2.298186167145667e-05, + "loss": 1.0387, + "step": 6694 + }, + { + "epoch": 1.052506435575468, + "grad_norm": 0.13335709273815155, + "learning_rate": 2.2975703497520024e-05, + "loss": 1.0908, + "step": 6695 + }, + { + "epoch": 1.0526636404724008, + "grad_norm": 0.13451732695102692, + "learning_rate": 2.2969545447219702e-05, + "loss": 1.1668, + "step": 6696 + }, + { + "epoch": 1.0528208453693333, + "grad_norm": 0.13412080705165863, + "learning_rate": 2.296338752093181e-05, + "loss": 1.0084, + "step": 6697 + }, + { + "epoch": 1.0529780502662658, + "grad_norm": 0.15511669218540192, + "learning_rate": 2.295722971903248e-05, + "loss": 1.0088, + "step": 6698 + }, + { + "epoch": 1.0531352551631983, + "grad_norm": 0.145992711186409, + "learning_rate": 2.295107204189777e-05, + "loss": 1.123, + "step": 6699 + }, + { + "epoch": 1.0532924600601308, + "grad_norm": 0.16947920620441437, + "learning_rate": 2.294491448990379e-05, + "loss": 1.1299, + "step": 6700 + }, + { + "epoch": 1.0534496649570635, + "grad_norm": 0.15201131999492645, + "learning_rate": 2.2938757063426617e-05, + "loss": 1.1251, + "step": 6701 + }, + { + "epoch": 1.053606869853996, + "grad_norm": 0.1356029212474823, + "learning_rate": 2.2932599762842313e-05, + "loss": 1.0911, + "step": 6702 + }, + { + "epoch": 1.0537640747509285, + "grad_norm": 0.13767895102500916, + "learning_rate": 2.2926442588526958e-05, + "loss": 1.1906, + "step": 6703 + }, + { + "epoch": 1.053921279647861, + "grad_norm": 0.13769546151161194, + "learning_rate": 2.292028554085659e-05, + "loss": 1.0888, + "step": 6704 + }, + { + "epoch": 1.0540784845447935, + "grad_norm": 0.14323018491268158, + "learning_rate": 2.2914128620207276e-05, + "loss": 1.1373, + "step": 6705 + }, + { + "epoch": 1.0542356894417262, + "grad_norm": 0.14844290912151337, + "learning_rate": 2.2907971826955047e-05, + "loss": 1.0637, + "step": 6706 + }, + { + "epoch": 1.0543928943386587, + "grad_norm": 0.1331752985715866, + "learning_rate": 2.2901815161475938e-05, + "loss": 0.9926, + "step": 6707 + }, + { + "epoch": 1.0545500992355912, + "grad_norm": 0.1645750105381012, + "learning_rate": 2.289565862414598e-05, + "loss": 1.0232, + "step": 6708 + }, + { + "epoch": 1.0547073041325237, + "grad_norm": 0.1634742021560669, + "learning_rate": 2.2889502215341176e-05, + "loss": 1.2135, + "step": 6709 + }, + { + "epoch": 1.0548645090294562, + "grad_norm": 0.14262425899505615, + "learning_rate": 2.2883345935437567e-05, + "loss": 1.1945, + "step": 6710 + }, + { + "epoch": 1.0550217139263889, + "grad_norm": 0.1507275104522705, + "learning_rate": 2.2877189784811115e-05, + "loss": 1.1128, + "step": 6711 + }, + { + "epoch": 1.0551789188233214, + "grad_norm": 0.1388731449842453, + "learning_rate": 2.287103376383784e-05, + "loss": 1.0582, + "step": 6712 + }, + { + "epoch": 1.0553361237202539, + "grad_norm": 0.14195913076400757, + "learning_rate": 2.286487787289373e-05, + "loss": 1.137, + "step": 6713 + }, + { + "epoch": 1.0554933286171864, + "grad_norm": 0.13401064276695251, + "learning_rate": 2.285872211235475e-05, + "loss": 0.974, + "step": 6714 + }, + { + "epoch": 1.0556505335141189, + "grad_norm": 0.1448763906955719, + "learning_rate": 2.285256648259688e-05, + "loss": 1.0874, + "step": 6715 + }, + { + "epoch": 1.0558077384110516, + "grad_norm": 0.15706899762153625, + "learning_rate": 2.284641098399607e-05, + "loss": 1.0474, + "step": 6716 + }, + { + "epoch": 1.055964943307984, + "grad_norm": 0.14132793247699738, + "learning_rate": 2.2840255616928284e-05, + "loss": 1.091, + "step": 6717 + }, + { + "epoch": 1.0561221482049166, + "grad_norm": 0.12726064026355743, + "learning_rate": 2.2834100381769478e-05, + "loss": 1.024, + "step": 6718 + }, + { + "epoch": 1.056279353101849, + "grad_norm": 0.1518988013267517, + "learning_rate": 2.2827945278895574e-05, + "loss": 1.108, + "step": 6719 + }, + { + "epoch": 1.0564365579987816, + "grad_norm": 0.18123194575309753, + "learning_rate": 2.282179030868251e-05, + "loss": 1.0971, + "step": 6720 + }, + { + "epoch": 1.0564365579987816, + "eval_loss": 1.087044596672058, + "eval_runtime": 2334.8212, + "eval_samples_per_second": 3.965, + "eval_steps_per_second": 1.983, + "step": 6720 + }, + { + "epoch": 1.0565937628957143, + "grad_norm": 0.1630663126707077, + "learning_rate": 2.2815635471506204e-05, + "loss": 1.0739, + "step": 6721 + }, + { + "epoch": 1.0567509677926468, + "grad_norm": 0.13507089018821716, + "learning_rate": 2.2809480767742567e-05, + "loss": 1.1072, + "step": 6722 + }, + { + "epoch": 1.0569081726895793, + "grad_norm": 0.12876969575881958, + "learning_rate": 2.2803326197767523e-05, + "loss": 1.0147, + "step": 6723 + }, + { + "epoch": 1.0570653775865118, + "grad_norm": 0.12850850820541382, + "learning_rate": 2.2797171761956948e-05, + "loss": 1.0165, + "step": 6724 + }, + { + "epoch": 1.0572225824834445, + "grad_norm": 0.14576077461242676, + "learning_rate": 2.2791017460686748e-05, + "loss": 0.8959, + "step": 6725 + }, + { + "epoch": 1.057379787380377, + "grad_norm": 0.12488464266061783, + "learning_rate": 2.278486329433279e-05, + "loss": 0.9799, + "step": 6726 + }, + { + "epoch": 1.0575369922773095, + "grad_norm": 0.13620580732822418, + "learning_rate": 2.277870926327095e-05, + "loss": 1.0876, + "step": 6727 + }, + { + "epoch": 1.057694197174242, + "grad_norm": 0.13856784999370575, + "learning_rate": 2.277255536787711e-05, + "loss": 1.1778, + "step": 6728 + }, + { + "epoch": 1.0578514020711745, + "grad_norm": 0.13168473541736603, + "learning_rate": 2.2766401608527103e-05, + "loss": 1.1217, + "step": 6729 + }, + { + "epoch": 1.058008606968107, + "grad_norm": 0.15721744298934937, + "learning_rate": 2.2760247985596795e-05, + "loss": 1.115, + "step": 6730 + }, + { + "epoch": 1.0581658118650397, + "grad_norm": 0.15150834619998932, + "learning_rate": 2.275409449946201e-05, + "loss": 1.0603, + "step": 6731 + }, + { + "epoch": 1.0583230167619722, + "grad_norm": 0.14315399527549744, + "learning_rate": 2.2747941150498586e-05, + "loss": 1.0445, + "step": 6732 + }, + { + "epoch": 1.0584802216589047, + "grad_norm": 0.1379554122686386, + "learning_rate": 2.2741787939082363e-05, + "loss": 1.0584, + "step": 6733 + }, + { + "epoch": 1.0586374265558371, + "grad_norm": 0.14963991940021515, + "learning_rate": 2.2735634865589124e-05, + "loss": 1.1498, + "step": 6734 + }, + { + "epoch": 1.0587946314527699, + "grad_norm": 0.15091224014759064, + "learning_rate": 2.2729481930394706e-05, + "loss": 1.1318, + "step": 6735 + }, + { + "epoch": 1.0589518363497024, + "grad_norm": 0.13579067587852478, + "learning_rate": 2.2723329133874885e-05, + "loss": 1.1634, + "step": 6736 + }, + { + "epoch": 1.0591090412466349, + "grad_norm": 0.12674826383590698, + "learning_rate": 2.271717647640546e-05, + "loss": 1.0849, + "step": 6737 + }, + { + "epoch": 1.0592662461435673, + "grad_norm": 0.1494082361459732, + "learning_rate": 2.2711023958362214e-05, + "loss": 1.0274, + "step": 6738 + }, + { + "epoch": 1.0594234510404998, + "grad_norm": 0.1412169337272644, + "learning_rate": 2.27048715801209e-05, + "loss": 1.1924, + "step": 6739 + }, + { + "epoch": 1.0595806559374326, + "grad_norm": 0.19145633280277252, + "learning_rate": 2.2698719342057317e-05, + "loss": 1.053, + "step": 6740 + }, + { + "epoch": 1.059737860834365, + "grad_norm": 0.13808368146419525, + "learning_rate": 2.269256724454719e-05, + "loss": 1.0508, + "step": 6741 + }, + { + "epoch": 1.0598950657312975, + "grad_norm": 0.13900628685951233, + "learning_rate": 2.2686415287966277e-05, + "loss": 1.0552, + "step": 6742 + }, + { + "epoch": 1.06005227062823, + "grad_norm": 0.14872439205646515, + "learning_rate": 2.2680263472690323e-05, + "loss": 0.9311, + "step": 6743 + }, + { + "epoch": 1.0602094755251625, + "grad_norm": 0.1297401636838913, + "learning_rate": 2.2674111799095037e-05, + "loss": 1.0227, + "step": 6744 + }, + { + "epoch": 1.0603666804220953, + "grad_norm": 0.13636404275894165, + "learning_rate": 2.2667960267556174e-05, + "loss": 1.1171, + "step": 6745 + }, + { + "epoch": 1.0605238853190277, + "grad_norm": 0.14247004687786102, + "learning_rate": 2.26618088784494e-05, + "loss": 1.0096, + "step": 6746 + }, + { + "epoch": 1.0606810902159602, + "grad_norm": 0.12349604070186615, + "learning_rate": 2.265565763215046e-05, + "loss": 1.053, + "step": 6747 + }, + { + "epoch": 1.0608382951128927, + "grad_norm": 0.14035508036613464, + "learning_rate": 2.264950652903503e-05, + "loss": 0.95, + "step": 6748 + }, + { + "epoch": 1.0609955000098252, + "grad_norm": 0.15664082765579224, + "learning_rate": 2.2643355569478795e-05, + "loss": 1.0302, + "step": 6749 + }, + { + "epoch": 1.061152704906758, + "grad_norm": 0.141912579536438, + "learning_rate": 2.263720475385744e-05, + "loss": 1.077, + "step": 6750 + }, + { + "epoch": 1.0613099098036904, + "grad_norm": 0.16194646060466766, + "learning_rate": 2.263105408254662e-05, + "loss": 1.0582, + "step": 6751 + }, + { + "epoch": 1.061467114700623, + "grad_norm": 0.14592602849006653, + "learning_rate": 2.2624903555922007e-05, + "loss": 1.0609, + "step": 6752 + }, + { + "epoch": 1.0616243195975554, + "grad_norm": 0.13689959049224854, + "learning_rate": 2.2618753174359255e-05, + "loss": 1.1108, + "step": 6753 + }, + { + "epoch": 1.061781524494488, + "grad_norm": 0.13154283165931702, + "learning_rate": 2.261260293823399e-05, + "loss": 1.1297, + "step": 6754 + }, + { + "epoch": 1.0619387293914206, + "grad_norm": 0.1285742223262787, + "learning_rate": 2.260645284792186e-05, + "loss": 1.0514, + "step": 6755 + }, + { + "epoch": 1.0620959342883531, + "grad_norm": 0.1531996876001358, + "learning_rate": 2.2600302903798476e-05, + "loss": 1.1613, + "step": 6756 + }, + { + "epoch": 1.0622531391852856, + "grad_norm": 0.1419435739517212, + "learning_rate": 2.2594153106239463e-05, + "loss": 1.0379, + "step": 6757 + }, + { + "epoch": 1.0624103440822181, + "grad_norm": 0.1443438082933426, + "learning_rate": 2.258800345562043e-05, + "loss": 1.0889, + "step": 6758 + }, + { + "epoch": 1.0625675489791506, + "grad_norm": 0.14136548340320587, + "learning_rate": 2.2581853952316967e-05, + "loss": 1.1505, + "step": 6759 + }, + { + "epoch": 1.0627247538760833, + "grad_norm": 0.1268949955701828, + "learning_rate": 2.2575704596704668e-05, + "loss": 0.9982, + "step": 6760 + }, + { + "epoch": 1.0628819587730158, + "grad_norm": 0.14405110478401184, + "learning_rate": 2.25695553891591e-05, + "loss": 1.0295, + "step": 6761 + }, + { + "epoch": 1.0630391636699483, + "grad_norm": 0.1511630266904831, + "learning_rate": 2.256340633005584e-05, + "loss": 0.9983, + "step": 6762 + }, + { + "epoch": 1.0631963685668808, + "grad_norm": 0.14506085216999054, + "learning_rate": 2.2557257419770458e-05, + "loss": 1.055, + "step": 6763 + }, + { + "epoch": 1.0633535734638133, + "grad_norm": 0.1391686648130417, + "learning_rate": 2.2551108658678495e-05, + "loss": 1.121, + "step": 6764 + }, + { + "epoch": 1.063510778360746, + "grad_norm": 0.1339191496372223, + "learning_rate": 2.2544960047155502e-05, + "loss": 1.1775, + "step": 6765 + }, + { + "epoch": 1.0636679832576785, + "grad_norm": 0.14864668250083923, + "learning_rate": 2.253881158557701e-05, + "loss": 0.9386, + "step": 6766 + }, + { + "epoch": 1.063825188154611, + "grad_norm": 0.13405652344226837, + "learning_rate": 2.2532663274318526e-05, + "loss": 1.1496, + "step": 6767 + }, + { + "epoch": 1.0639823930515435, + "grad_norm": 0.14411379396915436, + "learning_rate": 2.2526515113755596e-05, + "loss": 1.0826, + "step": 6768 + }, + { + "epoch": 1.064139597948476, + "grad_norm": 0.15150001645088196, + "learning_rate": 2.2520367104263713e-05, + "loss": 1.1077, + "step": 6769 + }, + { + "epoch": 1.0642968028454087, + "grad_norm": 0.13799571990966797, + "learning_rate": 2.251421924621837e-05, + "loss": 1.0698, + "step": 6770 + }, + { + "epoch": 1.0644540077423412, + "grad_norm": 0.13309843838214874, + "learning_rate": 2.2508071539995057e-05, + "loss": 1.0503, + "step": 6771 + }, + { + "epoch": 1.0646112126392737, + "grad_norm": 0.14039920270442963, + "learning_rate": 2.2501923985969246e-05, + "loss": 1.0301, + "step": 6772 + }, + { + "epoch": 1.0647684175362062, + "grad_norm": 0.13240568339824677, + "learning_rate": 2.2495776584516427e-05, + "loss": 1.0827, + "step": 6773 + }, + { + "epoch": 1.0649256224331387, + "grad_norm": 0.13759469985961914, + "learning_rate": 2.2489629336012036e-05, + "loss": 1.1336, + "step": 6774 + }, + { + "epoch": 1.0650828273300714, + "grad_norm": 0.14600278437137604, + "learning_rate": 2.248348224083154e-05, + "loss": 1.0351, + "step": 6775 + }, + { + "epoch": 1.065240032227004, + "grad_norm": 0.13929995894432068, + "learning_rate": 2.2477335299350373e-05, + "loss": 1.0386, + "step": 6776 + }, + { + "epoch": 1.0653972371239364, + "grad_norm": 0.1409510374069214, + "learning_rate": 2.2471188511943962e-05, + "loss": 1.1176, + "step": 6777 + }, + { + "epoch": 1.065554442020869, + "grad_norm": 0.14629358053207397, + "learning_rate": 2.2465041878987744e-05, + "loss": 1.1161, + "step": 6778 + }, + { + "epoch": 1.0657116469178014, + "grad_norm": 0.14169493317604065, + "learning_rate": 2.245889540085711e-05, + "loss": 1.0896, + "step": 6779 + }, + { + "epoch": 1.0658688518147341, + "grad_norm": 0.12641575932502747, + "learning_rate": 2.24527490779275e-05, + "loss": 1.0494, + "step": 6780 + }, + { + "epoch": 1.0660260567116666, + "grad_norm": 0.1412162482738495, + "learning_rate": 2.244660291057426e-05, + "loss": 1.1802, + "step": 6781 + }, + { + "epoch": 1.066183261608599, + "grad_norm": 0.14101557433605194, + "learning_rate": 2.244045689917281e-05, + "loss": 0.9996, + "step": 6782 + }, + { + "epoch": 1.0663404665055316, + "grad_norm": 0.1462903767824173, + "learning_rate": 2.2434311044098517e-05, + "loss": 1.1684, + "step": 6783 + }, + { + "epoch": 1.066497671402464, + "grad_norm": 0.139430433511734, + "learning_rate": 2.2428165345726737e-05, + "loss": 1.1364, + "step": 6784 + }, + { + "epoch": 1.0666548762993968, + "grad_norm": 0.1315891444683075, + "learning_rate": 2.242201980443284e-05, + "loss": 1.0521, + "step": 6785 + }, + { + "epoch": 1.0668120811963293, + "grad_norm": 0.14229103922843933, + "learning_rate": 2.2415874420592152e-05, + "loss": 1.1944, + "step": 6786 + }, + { + "epoch": 1.0669692860932618, + "grad_norm": 0.12886013090610504, + "learning_rate": 2.240972919458003e-05, + "loss": 1.1011, + "step": 6787 + }, + { + "epoch": 1.0671264909901943, + "grad_norm": 0.14366596937179565, + "learning_rate": 2.24035841267718e-05, + "loss": 1.0005, + "step": 6788 + }, + { + "epoch": 1.0672836958871268, + "grad_norm": 0.14405860006809235, + "learning_rate": 2.2397439217542764e-05, + "loss": 1.0905, + "step": 6789 + }, + { + "epoch": 1.0674409007840595, + "grad_norm": 0.14416463673114777, + "learning_rate": 2.2391294467268247e-05, + "loss": 1.1238, + "step": 6790 + }, + { + "epoch": 1.067598105680992, + "grad_norm": 0.1444624662399292, + "learning_rate": 2.2385149876323524e-05, + "loss": 1.0767, + "step": 6791 + }, + { + "epoch": 1.0677553105779245, + "grad_norm": 0.13008983433246613, + "learning_rate": 2.2379005445083905e-05, + "loss": 1.0778, + "step": 6792 + }, + { + "epoch": 1.067912515474857, + "grad_norm": 0.13642984628677368, + "learning_rate": 2.2372861173924667e-05, + "loss": 1.0644, + "step": 6793 + }, + { + "epoch": 1.0680697203717895, + "grad_norm": 0.14667023718357086, + "learning_rate": 2.2366717063221066e-05, + "loss": 1.1531, + "step": 6794 + }, + { + "epoch": 1.0682269252687222, + "grad_norm": 0.14247217774391174, + "learning_rate": 2.2360573113348367e-05, + "loss": 1.0845, + "step": 6795 + }, + { + "epoch": 1.0683841301656547, + "grad_norm": 0.13757647573947906, + "learning_rate": 2.235442932468183e-05, + "loss": 1.1035, + "step": 6796 + }, + { + "epoch": 1.0685413350625872, + "grad_norm": 0.1382339894771576, + "learning_rate": 2.234828569759667e-05, + "loss": 1.0584, + "step": 6797 + }, + { + "epoch": 1.0686985399595197, + "grad_norm": 0.13505852222442627, + "learning_rate": 2.2342142232468144e-05, + "loss": 1.0028, + "step": 6798 + }, + { + "epoch": 1.0688557448564522, + "grad_norm": 0.1295074075460434, + "learning_rate": 2.233599892967145e-05, + "loss": 0.9671, + "step": 6799 + }, + { + "epoch": 1.069012949753385, + "grad_norm": 0.13525182008743286, + "learning_rate": 2.2329855789581804e-05, + "loss": 1.1011, + "step": 6800 + }, + { + "epoch": 1.0691701546503174, + "grad_norm": 0.14352592825889587, + "learning_rate": 2.2323712812574414e-05, + "loss": 1.0695, + "step": 6801 + }, + { + "epoch": 1.06932735954725, + "grad_norm": 0.15070697665214539, + "learning_rate": 2.2317569999024454e-05, + "loss": 1.048, + "step": 6802 + }, + { + "epoch": 1.0694845644441824, + "grad_norm": 0.1340702325105667, + "learning_rate": 2.231142734930712e-05, + "loss": 0.9993, + "step": 6803 + }, + { + "epoch": 1.0696417693411149, + "grad_norm": 0.13574174046516418, + "learning_rate": 2.230528486379757e-05, + "loss": 1.0236, + "step": 6804 + }, + { + "epoch": 1.0697989742380476, + "grad_norm": 0.1502939909696579, + "learning_rate": 2.229914254287097e-05, + "loss": 1.0536, + "step": 6805 + }, + { + "epoch": 1.06995617913498, + "grad_norm": 0.13239002227783203, + "learning_rate": 2.229300038690247e-05, + "loss": 1.0485, + "step": 6806 + }, + { + "epoch": 1.0701133840319126, + "grad_norm": 0.17655174434185028, + "learning_rate": 2.22868583962672e-05, + "loss": 1.0505, + "step": 6807 + }, + { + "epoch": 1.070270588928845, + "grad_norm": 0.13982750475406647, + "learning_rate": 2.2280716571340307e-05, + "loss": 1.0458, + "step": 6808 + }, + { + "epoch": 1.0704277938257776, + "grad_norm": 0.1651543527841568, + "learning_rate": 2.227457491249689e-05, + "loss": 0.9935, + "step": 6809 + }, + { + "epoch": 1.0705849987227103, + "grad_norm": 0.14850039780139923, + "learning_rate": 2.2268433420112067e-05, + "loss": 1.0423, + "step": 6810 + }, + { + "epoch": 1.0707422036196428, + "grad_norm": 0.13318435847759247, + "learning_rate": 2.2262292094560942e-05, + "loss": 1.0891, + "step": 6811 + }, + { + "epoch": 1.0708994085165753, + "grad_norm": 0.13821984827518463, + "learning_rate": 2.22561509362186e-05, + "loss": 1.1213, + "step": 6812 + }, + { + "epoch": 1.0710566134135078, + "grad_norm": 0.1341027319431305, + "learning_rate": 2.2250009945460124e-05, + "loss": 1.025, + "step": 6813 + }, + { + "epoch": 1.0712138183104403, + "grad_norm": 0.15405303239822388, + "learning_rate": 2.2243869122660563e-05, + "loss": 1.045, + "step": 6814 + }, + { + "epoch": 1.071371023207373, + "grad_norm": 0.13471081852912903, + "learning_rate": 2.2237728468194997e-05, + "loss": 1.0401, + "step": 6815 + }, + { + "epoch": 1.0715282281043055, + "grad_norm": 0.14097860455513, + "learning_rate": 2.2231587982438472e-05, + "loss": 1.0659, + "step": 6816 + }, + { + "epoch": 1.071685433001238, + "grad_norm": 0.13970810174942017, + "learning_rate": 2.2225447665766014e-05, + "loss": 0.9956, + "step": 6817 + }, + { + "epoch": 1.0718426378981705, + "grad_norm": 0.14492157101631165, + "learning_rate": 2.221930751855266e-05, + "loss": 1.1047, + "step": 6818 + }, + { + "epoch": 1.071999842795103, + "grad_norm": 0.14329084753990173, + "learning_rate": 2.2213167541173415e-05, + "loss": 1.0751, + "step": 6819 + }, + { + "epoch": 1.0721570476920357, + "grad_norm": 0.14667004346847534, + "learning_rate": 2.22070277340033e-05, + "loss": 1.0854, + "step": 6820 + }, + { + "epoch": 1.0723142525889682, + "grad_norm": 0.1488797664642334, + "learning_rate": 2.2200888097417307e-05, + "loss": 1.1847, + "step": 6821 + }, + { + "epoch": 1.0724714574859007, + "grad_norm": 0.1442245990037918, + "learning_rate": 2.2194748631790414e-05, + "loss": 1.2055, + "step": 6822 + }, + { + "epoch": 1.0726286623828332, + "grad_norm": 0.13449786603450775, + "learning_rate": 2.2188609337497606e-05, + "loss": 1.0678, + "step": 6823 + }, + { + "epoch": 1.0727858672797659, + "grad_norm": 0.1424616277217865, + "learning_rate": 2.218247021491384e-05, + "loss": 1.0412, + "step": 6824 + }, + { + "epoch": 1.0729430721766984, + "grad_norm": 0.13674286007881165, + "learning_rate": 2.2176331264414065e-05, + "loss": 1.1239, + "step": 6825 + }, + { + "epoch": 1.0731002770736309, + "grad_norm": 0.1447739601135254, + "learning_rate": 2.2170192486373245e-05, + "loss": 1.0112, + "step": 6826 + }, + { + "epoch": 1.0732574819705634, + "grad_norm": 0.13803522288799286, + "learning_rate": 2.2164053881166296e-05, + "loss": 1.1649, + "step": 6827 + }, + { + "epoch": 1.0734146868674959, + "grad_norm": 0.12798753380775452, + "learning_rate": 2.2157915449168153e-05, + "loss": 1.0262, + "step": 6828 + }, + { + "epoch": 1.0735718917644284, + "grad_norm": 0.14248555898666382, + "learning_rate": 2.215177719075371e-05, + "loss": 1.039, + "step": 6829 + }, + { + "epoch": 1.073729096661361, + "grad_norm": 0.15249311923980713, + "learning_rate": 2.214563910629788e-05, + "loss": 1.0889, + "step": 6830 + }, + { + "epoch": 1.0738863015582936, + "grad_norm": 0.15592628717422485, + "learning_rate": 2.213950119617556e-05, + "loss": 1.0341, + "step": 6831 + }, + { + "epoch": 1.074043506455226, + "grad_norm": 0.14249135553836823, + "learning_rate": 2.2133363460761623e-05, + "loss": 1.0556, + "step": 6832 + }, + { + "epoch": 1.0742007113521586, + "grad_norm": 0.1403677612543106, + "learning_rate": 2.212722590043094e-05, + "loss": 1.0743, + "step": 6833 + }, + { + "epoch": 1.0743579162490913, + "grad_norm": 0.1338101625442505, + "learning_rate": 2.2121088515558365e-05, + "loss": 1.0899, + "step": 6834 + }, + { + "epoch": 1.0745151211460238, + "grad_norm": 0.1278059035539627, + "learning_rate": 2.2114951306518744e-05, + "loss": 1.0199, + "step": 6835 + }, + { + "epoch": 1.0746723260429563, + "grad_norm": 0.15362823009490967, + "learning_rate": 2.2108814273686936e-05, + "loss": 1.1255, + "step": 6836 + }, + { + "epoch": 1.0748295309398888, + "grad_norm": 0.15459021925926208, + "learning_rate": 2.2102677417437736e-05, + "loss": 1.1482, + "step": 6837 + }, + { + "epoch": 1.0749867358368212, + "grad_norm": 0.14825507998466492, + "learning_rate": 2.2096540738145988e-05, + "loss": 1.0236, + "step": 6838 + }, + { + "epoch": 1.0751439407337537, + "grad_norm": 0.1396886259317398, + "learning_rate": 2.2090404236186476e-05, + "loss": 1.1493, + "step": 6839 + }, + { + "epoch": 1.0753011456306865, + "grad_norm": 0.13699349761009216, + "learning_rate": 2.2084267911934005e-05, + "loss": 0.973, + "step": 6840 + }, + { + "epoch": 1.075458350527619, + "grad_norm": 0.13489682972431183, + "learning_rate": 2.2078131765763364e-05, + "loss": 0.9826, + "step": 6841 + }, + { + "epoch": 1.0756155554245515, + "grad_norm": 0.13808147609233856, + "learning_rate": 2.2071995798049304e-05, + "loss": 1.0349, + "step": 6842 + }, + { + "epoch": 1.075772760321484, + "grad_norm": 0.14268989861011505, + "learning_rate": 2.2065860009166616e-05, + "loss": 1.1636, + "step": 6843 + }, + { + "epoch": 1.0759299652184167, + "grad_norm": 0.13867351412773132, + "learning_rate": 2.2059724399490022e-05, + "loss": 1.0266, + "step": 6844 + }, + { + "epoch": 1.0760871701153492, + "grad_norm": 0.15861843526363373, + "learning_rate": 2.205358896939428e-05, + "loss": 1.0394, + "step": 6845 + }, + { + "epoch": 1.0762443750122817, + "grad_norm": 0.14459887146949768, + "learning_rate": 2.2047453719254117e-05, + "loss": 1.1932, + "step": 6846 + }, + { + "epoch": 1.0764015799092141, + "grad_norm": 0.14023332297801971, + "learning_rate": 2.204131864944424e-05, + "loss": 1.1001, + "step": 6847 + }, + { + "epoch": 1.0765587848061466, + "grad_norm": 0.13127627968788147, + "learning_rate": 2.203518376033937e-05, + "loss": 1.0673, + "step": 6848 + }, + { + "epoch": 1.0767159897030791, + "grad_norm": 0.1361897885799408, + "learning_rate": 2.2029049052314184e-05, + "loss": 1.0656, + "step": 6849 + }, + { + "epoch": 1.0768731946000119, + "grad_norm": 0.14018553495407104, + "learning_rate": 2.2022914525743386e-05, + "loss": 1.165, + "step": 6850 + }, + { + "epoch": 1.0770303994969443, + "grad_norm": 0.14729741215705872, + "learning_rate": 2.2016780181001645e-05, + "loss": 1.0938, + "step": 6851 + }, + { + "epoch": 1.0771876043938768, + "grad_norm": 0.1401669681072235, + "learning_rate": 2.2010646018463616e-05, + "loss": 1.0663, + "step": 6852 + }, + { + "epoch": 1.0773448092908093, + "grad_norm": 0.12580382823944092, + "learning_rate": 2.2004512038503958e-05, + "loss": 1.125, + "step": 6853 + }, + { + "epoch": 1.077502014187742, + "grad_norm": 0.12055519968271255, + "learning_rate": 2.19983782414973e-05, + "loss": 0.9754, + "step": 6854 + }, + { + "epoch": 1.0776592190846745, + "grad_norm": 0.14853142201900482, + "learning_rate": 2.199224462781828e-05, + "loss": 1.0807, + "step": 6855 + }, + { + "epoch": 1.077816423981607, + "grad_norm": 0.14001984894275665, + "learning_rate": 2.1986111197841522e-05, + "loss": 1.1005, + "step": 6856 + }, + { + "epoch": 1.0779736288785395, + "grad_norm": 0.13452467322349548, + "learning_rate": 2.197997795194162e-05, + "loss": 1.0576, + "step": 6857 + }, + { + "epoch": 1.078130833775472, + "grad_norm": 0.13335581123828888, + "learning_rate": 2.1973844890493178e-05, + "loss": 1.0619, + "step": 6858 + }, + { + "epoch": 1.0782880386724047, + "grad_norm": 0.13634103536605835, + "learning_rate": 2.196771201387077e-05, + "loss": 1.1118, + "step": 6859 + }, + { + "epoch": 1.0784452435693372, + "grad_norm": 0.1420438140630722, + "learning_rate": 2.1961579322448972e-05, + "loss": 1.1147, + "step": 6860 + }, + { + "epoch": 1.0786024484662697, + "grad_norm": 0.12722203135490417, + "learning_rate": 2.195544681660236e-05, + "loss": 1.0638, + "step": 6861 + }, + { + "epoch": 1.0787596533632022, + "grad_norm": 0.14435343444347382, + "learning_rate": 2.1949314496705465e-05, + "loss": 1.0018, + "step": 6862 + }, + { + "epoch": 1.0789168582601347, + "grad_norm": 0.13326679170131683, + "learning_rate": 2.1943182363132838e-05, + "loss": 0.9709, + "step": 6863 + }, + { + "epoch": 1.0790740631570674, + "grad_norm": 0.13346648216247559, + "learning_rate": 2.1937050416259e-05, + "loss": 1.0575, + "step": 6864 + }, + { + "epoch": 1.079231268054, + "grad_norm": 0.1331537961959839, + "learning_rate": 2.193091865645846e-05, + "loss": 1.1521, + "step": 6865 + }, + { + "epoch": 1.0793884729509324, + "grad_norm": 0.17918452620506287, + "learning_rate": 2.1924787084105745e-05, + "loss": 1.0128, + "step": 6866 + }, + { + "epoch": 1.079545677847865, + "grad_norm": 0.13416318595409393, + "learning_rate": 2.1918655699575327e-05, + "loss": 1.1397, + "step": 6867 + }, + { + "epoch": 1.0797028827447974, + "grad_norm": 0.13478371500968933, + "learning_rate": 2.19125245032417e-05, + "loss": 1.092, + "step": 6868 + }, + { + "epoch": 1.0798600876417301, + "grad_norm": 0.12421022355556488, + "learning_rate": 2.1906393495479327e-05, + "loss": 0.9125, + "step": 6869 + }, + { + "epoch": 1.0800172925386626, + "grad_norm": 0.15842193365097046, + "learning_rate": 2.1900262676662665e-05, + "loss": 1.0202, + "step": 6870 + }, + { + "epoch": 1.0801744974355951, + "grad_norm": 0.13692085444927216, + "learning_rate": 2.1894132047166178e-05, + "loss": 1.1075, + "step": 6871 + }, + { + "epoch": 1.0803317023325276, + "grad_norm": 0.13673561811447144, + "learning_rate": 2.188800160736427e-05, + "loss": 1.0685, + "step": 6872 + }, + { + "epoch": 1.0804889072294601, + "grad_norm": 0.1461687833070755, + "learning_rate": 2.1881871357631393e-05, + "loss": 1.1709, + "step": 6873 + }, + { + "epoch": 1.0806461121263928, + "grad_norm": 0.13897156715393066, + "learning_rate": 2.187574129834195e-05, + "loss": 1.0552, + "step": 6874 + }, + { + "epoch": 1.0808033170233253, + "grad_norm": 0.1381140649318695, + "learning_rate": 2.186961142987034e-05, + "loss": 1.1009, + "step": 6875 + }, + { + "epoch": 1.0809605219202578, + "grad_norm": 0.13417258858680725, + "learning_rate": 2.186348175259096e-05, + "loss": 1.0632, + "step": 6876 + }, + { + "epoch": 1.0811177268171903, + "grad_norm": 0.1352785974740982, + "learning_rate": 2.1857352266878168e-05, + "loss": 1.0842, + "step": 6877 + }, + { + "epoch": 1.0812749317141228, + "grad_norm": 0.1412501037120819, + "learning_rate": 2.1851222973106353e-05, + "loss": 1.1078, + "step": 6878 + }, + { + "epoch": 1.0814321366110555, + "grad_norm": 0.1538170874118805, + "learning_rate": 2.1845093871649852e-05, + "loss": 1.1498, + "step": 6879 + }, + { + "epoch": 1.081589341507988, + "grad_norm": 0.14308083057403564, + "learning_rate": 2.183896496288302e-05, + "loss": 1.1414, + "step": 6880 + }, + { + "epoch": 1.081589341507988, + "eval_loss": 1.0860660076141357, + "eval_runtime": 2370.7265, + "eval_samples_per_second": 3.905, + "eval_steps_per_second": 1.953, + "step": 6880 + } + ], + "logging_steps": 1, + "max_steps": 12722, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 160, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.5242621126253216e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}