{ "best_metric": 0.3534523546695709, "best_model_checkpoint": "../results/with_lora_mistral_4bit_enbedding_trainable_unfiltered_dataset_inst_frame/checkpoint-2150", "epoch": 0.9982763778293756, "eval_steps": 10, "global_step": 2190, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 23.42947006225586, "learning_rate": 9.995440036479709e-05, "loss": 0.7291, "step": 1 }, { "epoch": 0.0, "grad_norm": 8.535998344421387, "learning_rate": 9.990880072959417e-05, "loss": 0.6489, "step": 2 }, { "epoch": 0.0, "grad_norm": 5.2448015213012695, "learning_rate": 9.986320109439125e-05, "loss": 0.5896, "step": 3 }, { "epoch": 0.0, "grad_norm": 6.437242031097412, "learning_rate": 9.981760145918833e-05, "loss": 0.5657, "step": 4 }, { "epoch": 0.0, "grad_norm": 5.361367225646973, "learning_rate": 9.977200182398541e-05, "loss": 0.5484, "step": 5 }, { "epoch": 0.0, "grad_norm": 5.016327857971191, "learning_rate": 9.97264021887825e-05, "loss": 0.5382, "step": 6 }, { "epoch": 0.0, "grad_norm": 26.851099014282227, "learning_rate": 9.968080255357958e-05, "loss": 0.5216, "step": 7 }, { "epoch": 0.0, "grad_norm": 6.51596736907959, "learning_rate": 9.963520291837666e-05, "loss": 0.51, "step": 8 }, { "epoch": 0.0, "grad_norm": 4.276856422424316, "learning_rate": 9.958960328317374e-05, "loss": 0.4838, "step": 9 }, { "epoch": 0.0, "grad_norm": 3.621899366378784, "learning_rate": 9.954400364797082e-05, "loss": 0.5401, "step": 10 }, { "epoch": 0.0, "eval_loss": 0.49539637565612793, "eval_runtime": 19.4376, "eval_samples_per_second": 1.441, "eval_steps_per_second": 0.36, "step": 10 }, { "epoch": 0.01, "grad_norm": 3.671776294708252, "learning_rate": 9.94984040127679e-05, "loss": 0.4841, "step": 11 }, { "epoch": 0.01, "grad_norm": 3.1968910694122314, "learning_rate": 9.945280437756497e-05, "loss": 0.454, "step": 12 }, { "epoch": 0.01, "grad_norm": 2.9277799129486084, "learning_rate": 9.940720474236207e-05, "loss": 0.481, "step": 13 }, { "epoch": 0.01, "grad_norm": 1.9978115558624268, "learning_rate": 9.936160510715915e-05, "loss": 0.4935, "step": 14 }, { "epoch": 0.01, "grad_norm": 1.7962915897369385, "learning_rate": 9.931600547195623e-05, "loss": 0.4444, "step": 15 }, { "epoch": 0.01, "grad_norm": 1.8223544359207153, "learning_rate": 9.927040583675331e-05, "loss": 0.4606, "step": 16 }, { "epoch": 0.01, "grad_norm": 2.258604049682617, "learning_rate": 9.92248062015504e-05, "loss": 0.4592, "step": 17 }, { "epoch": 0.01, "grad_norm": 1.8389122486114502, "learning_rate": 9.917920656634748e-05, "loss": 0.4398, "step": 18 }, { "epoch": 0.01, "grad_norm": 1.8433682918548584, "learning_rate": 9.913360693114456e-05, "loss": 0.4411, "step": 19 }, { "epoch": 0.01, "grad_norm": 1.7910923957824707, "learning_rate": 9.908800729594163e-05, "loss": 0.4324, "step": 20 }, { "epoch": 0.01, "eval_loss": 0.4595804512500763, "eval_runtime": 19.8191, "eval_samples_per_second": 1.413, "eval_steps_per_second": 0.353, "step": 20 }, { "epoch": 0.01, "grad_norm": 1.8877395391464233, "learning_rate": 9.904240766073872e-05, "loss": 0.4136, "step": 21 }, { "epoch": 0.01, "grad_norm": 11.494500160217285, "learning_rate": 9.89968080255358e-05, "loss": 0.4511, "step": 22 }, { "epoch": 0.01, "grad_norm": 2.1572749614715576, "learning_rate": 9.895120839033288e-05, "loss": 0.4759, "step": 23 }, { "epoch": 0.01, "grad_norm": 2.031266212463379, "learning_rate": 9.890560875512997e-05, "loss": 0.4447, "step": 24 }, { "epoch": 0.01, "grad_norm": 1.8705304861068726, "learning_rate": 9.886000911992705e-05, "loss": 0.4589, "step": 25 }, { "epoch": 0.01, "grad_norm": 1.6991623640060425, "learning_rate": 9.881440948472413e-05, "loss": 0.4208, "step": 26 }, { "epoch": 0.01, "grad_norm": 1.8854478597640991, "learning_rate": 9.87688098495212e-05, "loss": 0.43, "step": 27 }, { "epoch": 0.01, "grad_norm": 1.826858639717102, "learning_rate": 9.872321021431828e-05, "loss": 0.4558, "step": 28 }, { "epoch": 0.01, "grad_norm": 1.6870789527893066, "learning_rate": 9.867761057911538e-05, "loss": 0.4169, "step": 29 }, { "epoch": 0.01, "grad_norm": 1.4985276460647583, "learning_rate": 9.863201094391246e-05, "loss": 0.4187, "step": 30 }, { "epoch": 0.01, "eval_loss": 0.44730886816978455, "eval_runtime": 19.0823, "eval_samples_per_second": 1.467, "eval_steps_per_second": 0.367, "step": 30 }, { "epoch": 0.01, "grad_norm": 1.6747127771377563, "learning_rate": 9.858641130870954e-05, "loss": 0.4232, "step": 31 }, { "epoch": 0.01, "grad_norm": 1.5604517459869385, "learning_rate": 9.854081167350662e-05, "loss": 0.4272, "step": 32 }, { "epoch": 0.02, "grad_norm": 1.685736060142517, "learning_rate": 9.84952120383037e-05, "loss": 0.3962, "step": 33 }, { "epoch": 0.02, "grad_norm": 1.6975411176681519, "learning_rate": 9.844961240310078e-05, "loss": 0.4327, "step": 34 }, { "epoch": 0.02, "grad_norm": 1.6804540157318115, "learning_rate": 9.840401276789785e-05, "loss": 0.4347, "step": 35 }, { "epoch": 0.02, "grad_norm": 1.6468958854675293, "learning_rate": 9.835841313269493e-05, "loss": 0.4327, "step": 36 }, { "epoch": 0.02, "grad_norm": 1.5674083232879639, "learning_rate": 9.831281349749203e-05, "loss": 0.4195, "step": 37 }, { "epoch": 0.02, "grad_norm": 1.5349141359329224, "learning_rate": 9.826721386228911e-05, "loss": 0.4214, "step": 38 }, { "epoch": 0.02, "grad_norm": 2.2588911056518555, "learning_rate": 9.822161422708619e-05, "loss": 0.4422, "step": 39 }, { "epoch": 0.02, "grad_norm": 1.5731521844863892, "learning_rate": 9.817601459188327e-05, "loss": 0.4356, "step": 40 }, { "epoch": 0.02, "eval_loss": 0.43755975365638733, "eval_runtime": 18.8787, "eval_samples_per_second": 1.483, "eval_steps_per_second": 0.371, "step": 40 }, { "epoch": 0.02, "grad_norm": 1.470995306968689, "learning_rate": 9.813041495668036e-05, "loss": 0.4014, "step": 41 }, { "epoch": 0.02, "grad_norm": 1.5055944919586182, "learning_rate": 9.808481532147742e-05, "loss": 0.4307, "step": 42 }, { "epoch": 0.02, "grad_norm": 1.5349737405776978, "learning_rate": 9.80392156862745e-05, "loss": 0.4342, "step": 43 }, { "epoch": 0.02, "grad_norm": 1.4112433195114136, "learning_rate": 9.799361605107159e-05, "loss": 0.4055, "step": 44 }, { "epoch": 0.02, "grad_norm": 1.3752671480178833, "learning_rate": 9.794801641586868e-05, "loss": 0.3995, "step": 45 }, { "epoch": 0.02, "grad_norm": 1.56571626663208, "learning_rate": 9.790241678066576e-05, "loss": 0.4315, "step": 46 }, { "epoch": 0.02, "grad_norm": 1.5266872644424438, "learning_rate": 9.785681714546285e-05, "loss": 0.4232, "step": 47 }, { "epoch": 0.02, "grad_norm": 1.6925004720687866, "learning_rate": 9.781121751025993e-05, "loss": 0.4668, "step": 48 }, { "epoch": 0.02, "grad_norm": 1.5088814496994019, "learning_rate": 9.7765617875057e-05, "loss": 0.4033, "step": 49 }, { "epoch": 0.02, "grad_norm": 1.4756637811660767, "learning_rate": 9.772001823985408e-05, "loss": 0.4059, "step": 50 }, { "epoch": 0.02, "eval_loss": 0.432329922914505, "eval_runtime": 19.0193, "eval_samples_per_second": 1.472, "eval_steps_per_second": 0.368, "step": 50 }, { "epoch": 0.02, "grad_norm": 1.544489860534668, "learning_rate": 9.767441860465116e-05, "loss": 0.4314, "step": 51 }, { "epoch": 0.02, "grad_norm": 1.4627840518951416, "learning_rate": 9.762881896944824e-05, "loss": 0.3999, "step": 52 }, { "epoch": 0.02, "grad_norm": 1.546804666519165, "learning_rate": 9.758321933424534e-05, "loss": 0.413, "step": 53 }, { "epoch": 0.02, "grad_norm": 1.4067586660385132, "learning_rate": 9.753761969904242e-05, "loss": 0.4112, "step": 54 }, { "epoch": 0.03, "grad_norm": 1.5578185319900513, "learning_rate": 9.74920200638395e-05, "loss": 0.4222, "step": 55 }, { "epoch": 0.03, "grad_norm": 1.398961067199707, "learning_rate": 9.744642042863658e-05, "loss": 0.3935, "step": 56 }, { "epoch": 0.03, "grad_norm": 1.5989190340042114, "learning_rate": 9.740082079343365e-05, "loss": 0.4436, "step": 57 }, { "epoch": 0.03, "grad_norm": 1.493788242340088, "learning_rate": 9.735522115823073e-05, "loss": 0.4161, "step": 58 }, { "epoch": 0.03, "grad_norm": 1.4532933235168457, "learning_rate": 9.730962152302781e-05, "loss": 0.4021, "step": 59 }, { "epoch": 0.03, "grad_norm": 1.5490344762802124, "learning_rate": 9.72640218878249e-05, "loss": 0.4049, "step": 60 }, { "epoch": 0.03, "eval_loss": 0.4251527786254883, "eval_runtime": 19.1976, "eval_samples_per_second": 1.459, "eval_steps_per_second": 0.365, "step": 60 }, { "epoch": 0.03, "grad_norm": 1.670199990272522, "learning_rate": 9.721842225262199e-05, "loss": 0.4259, "step": 61 }, { "epoch": 0.03, "grad_norm": 1.3972994089126587, "learning_rate": 9.717282261741907e-05, "loss": 0.3767, "step": 62 }, { "epoch": 0.03, "grad_norm": 1.40248703956604, "learning_rate": 9.712722298221615e-05, "loss": 0.4021, "step": 63 }, { "epoch": 0.03, "grad_norm": 1.4790185689926147, "learning_rate": 9.708162334701322e-05, "loss": 0.4194, "step": 64 }, { "epoch": 0.03, "grad_norm": 1.4340730905532837, "learning_rate": 9.70360237118103e-05, "loss": 0.4256, "step": 65 }, { "epoch": 0.03, "grad_norm": 1.3620188236236572, "learning_rate": 9.699042407660739e-05, "loss": 0.4111, "step": 66 }, { "epoch": 0.03, "grad_norm": 1.3097904920578003, "learning_rate": 9.694482444140447e-05, "loss": 0.391, "step": 67 }, { "epoch": 0.03, "grad_norm": 1.4144586324691772, "learning_rate": 9.689922480620155e-05, "loss": 0.3683, "step": 68 }, { "epoch": 0.03, "grad_norm": 1.2774299383163452, "learning_rate": 9.685362517099864e-05, "loss": 0.386, "step": 69 }, { "epoch": 0.03, "grad_norm": 1.3778272867202759, "learning_rate": 9.680802553579573e-05, "loss": 0.3938, "step": 70 }, { "epoch": 0.03, "eval_loss": 0.4205349385738373, "eval_runtime": 18.779, "eval_samples_per_second": 1.491, "eval_steps_per_second": 0.373, "step": 70 }, { "epoch": 0.03, "grad_norm": 1.4748997688293457, "learning_rate": 9.676242590059281e-05, "loss": 0.3927, "step": 71 }, { "epoch": 0.03, "grad_norm": 1.3778220415115356, "learning_rate": 9.671682626538988e-05, "loss": 0.4209, "step": 72 }, { "epoch": 0.03, "grad_norm": 1.4680858850479126, "learning_rate": 9.667122663018696e-05, "loss": 0.4046, "step": 73 }, { "epoch": 0.03, "grad_norm": 1.4174176454544067, "learning_rate": 9.662562699498404e-05, "loss": 0.4025, "step": 74 }, { "epoch": 0.03, "grad_norm": 1.4409643411636353, "learning_rate": 9.658002735978112e-05, "loss": 0.4103, "step": 75 }, { "epoch": 0.03, "grad_norm": 1.3730573654174805, "learning_rate": 9.65344277245782e-05, "loss": 0.4223, "step": 76 }, { "epoch": 0.04, "grad_norm": 1.3434337377548218, "learning_rate": 9.64888280893753e-05, "loss": 0.3866, "step": 77 }, { "epoch": 0.04, "grad_norm": 1.3390862941741943, "learning_rate": 9.644322845417238e-05, "loss": 0.4057, "step": 78 }, { "epoch": 0.04, "grad_norm": 1.264007568359375, "learning_rate": 9.639762881896945e-05, "loss": 0.3819, "step": 79 }, { "epoch": 0.04, "grad_norm": 1.3711590766906738, "learning_rate": 9.635202918376653e-05, "loss": 0.4152, "step": 80 }, { "epoch": 0.04, "eval_loss": 0.41787847876548767, "eval_runtime": 18.9002, "eval_samples_per_second": 1.481, "eval_steps_per_second": 0.37, "step": 80 }, { "epoch": 0.04, "grad_norm": 1.4036868810653687, "learning_rate": 9.630642954856361e-05, "loss": 0.4079, "step": 81 }, { "epoch": 0.04, "grad_norm": 1.2858104705810547, "learning_rate": 9.62608299133607e-05, "loss": 0.3716, "step": 82 }, { "epoch": 0.04, "grad_norm": 1.4125823974609375, "learning_rate": 9.621523027815778e-05, "loss": 0.4203, "step": 83 }, { "epoch": 0.04, "grad_norm": 1.2835861444473267, "learning_rate": 9.616963064295486e-05, "loss": 0.3851, "step": 84 }, { "epoch": 0.04, "grad_norm": 1.3441009521484375, "learning_rate": 9.612403100775195e-05, "loss": 0.4046, "step": 85 }, { "epoch": 0.04, "grad_norm": 1.256738543510437, "learning_rate": 9.607843137254903e-05, "loss": 0.3782, "step": 86 }, { "epoch": 0.04, "grad_norm": 1.3569210767745972, "learning_rate": 9.60328317373461e-05, "loss": 0.4137, "step": 87 }, { "epoch": 0.04, "grad_norm": 1.302693247795105, "learning_rate": 9.598723210214318e-05, "loss": 0.3853, "step": 88 }, { "epoch": 0.04, "grad_norm": 1.3464365005493164, "learning_rate": 9.594163246694027e-05, "loss": 0.3692, "step": 89 }, { "epoch": 0.04, "grad_norm": 1.302085041999817, "learning_rate": 9.589603283173735e-05, "loss": 0.386, "step": 90 }, { "epoch": 0.04, "eval_loss": 0.41579389572143555, "eval_runtime": 19.4361, "eval_samples_per_second": 1.441, "eval_steps_per_second": 0.36, "step": 90 }, { "epoch": 0.04, "grad_norm": 1.8792297840118408, "learning_rate": 9.585043319653443e-05, "loss": 0.3841, "step": 91 }, { "epoch": 0.04, "grad_norm": 1.4461652040481567, "learning_rate": 9.580483356133151e-05, "loss": 0.3957, "step": 92 }, { "epoch": 0.04, "grad_norm": 1.4995123147964478, "learning_rate": 9.57592339261286e-05, "loss": 0.4188, "step": 93 }, { "epoch": 0.04, "grad_norm": 10.198469161987305, "learning_rate": 9.571363429092567e-05, "loss": 0.4228, "step": 94 }, { "epoch": 0.04, "grad_norm": 1.4933909177780151, "learning_rate": 9.566803465572276e-05, "loss": 0.3749, "step": 95 }, { "epoch": 0.04, "grad_norm": 1.293959379196167, "learning_rate": 9.562243502051984e-05, "loss": 0.3909, "step": 96 }, { "epoch": 0.04, "grad_norm": 1.3375085592269897, "learning_rate": 9.557683538531692e-05, "loss": 0.3833, "step": 97 }, { "epoch": 0.04, "grad_norm": 1.3052716255187988, "learning_rate": 9.5531235750114e-05, "loss": 0.386, "step": 98 }, { "epoch": 0.05, "grad_norm": 1.3119175434112549, "learning_rate": 9.548563611491108e-05, "loss": 0.3993, "step": 99 }, { "epoch": 0.05, "grad_norm": 1.4674808979034424, "learning_rate": 9.544003647970816e-05, "loss": 0.4081, "step": 100 }, { "epoch": 0.05, "eval_loss": 0.4140634536743164, "eval_runtime": 19.7343, "eval_samples_per_second": 1.419, "eval_steps_per_second": 0.355, "step": 100 }, { "epoch": 0.05, "grad_norm": 1.4924694299697876, "learning_rate": 9.539443684450525e-05, "loss": 0.3729, "step": 101 }, { "epoch": 0.05, "grad_norm": 1.3195910453796387, "learning_rate": 9.534883720930233e-05, "loss": 0.3832, "step": 102 }, { "epoch": 0.05, "grad_norm": 1.3233399391174316, "learning_rate": 9.530323757409941e-05, "loss": 0.4088, "step": 103 }, { "epoch": 0.05, "grad_norm": 1.3528556823730469, "learning_rate": 9.525763793889649e-05, "loss": 0.3832, "step": 104 }, { "epoch": 0.05, "grad_norm": 1.332626223564148, "learning_rate": 9.521203830369357e-05, "loss": 0.3973, "step": 105 }, { "epoch": 0.05, "grad_norm": 1.3452651500701904, "learning_rate": 9.516643866849065e-05, "loss": 0.3782, "step": 106 }, { "epoch": 0.05, "grad_norm": 1.2375059127807617, "learning_rate": 9.512083903328774e-05, "loss": 0.3967, "step": 107 }, { "epoch": 0.05, "grad_norm": 1.2086222171783447, "learning_rate": 9.507523939808482e-05, "loss": 0.395, "step": 108 }, { "epoch": 0.05, "grad_norm": 1.216988444328308, "learning_rate": 9.50296397628819e-05, "loss": 0.3826, "step": 109 }, { "epoch": 0.05, "grad_norm": 1.1735084056854248, "learning_rate": 9.498404012767898e-05, "loss": 0.3796, "step": 110 }, { "epoch": 0.05, "eval_loss": 0.41126012802124023, "eval_runtime": 19.0256, "eval_samples_per_second": 1.472, "eval_steps_per_second": 0.368, "step": 110 }, { "epoch": 0.05, "grad_norm": 1.322290062904358, "learning_rate": 9.493844049247606e-05, "loss": 0.3912, "step": 111 }, { "epoch": 0.05, "grad_norm": 1.2272303104400635, "learning_rate": 9.489284085727315e-05, "loss": 0.3841, "step": 112 }, { "epoch": 0.05, "grad_norm": 1.2432588338851929, "learning_rate": 9.484724122207023e-05, "loss": 0.3912, "step": 113 }, { "epoch": 0.05, "grad_norm": 1.2858808040618896, "learning_rate": 9.480164158686731e-05, "loss": 0.4006, "step": 114 }, { "epoch": 0.05, "grad_norm": 1.2801638841629028, "learning_rate": 9.475604195166439e-05, "loss": 0.4052, "step": 115 }, { "epoch": 0.05, "grad_norm": 1.1603589057922363, "learning_rate": 9.471044231646147e-05, "loss": 0.3861, "step": 116 }, { "epoch": 0.05, "grad_norm": 1.261997938156128, "learning_rate": 9.466484268125855e-05, "loss": 0.4054, "step": 117 }, { "epoch": 0.05, "grad_norm": 1.3250972032546997, "learning_rate": 9.461924304605564e-05, "loss": 0.3916, "step": 118 }, { "epoch": 0.05, "grad_norm": 1.2297006845474243, "learning_rate": 9.457364341085272e-05, "loss": 0.3852, "step": 119 }, { "epoch": 0.05, "grad_norm": 1.1711465120315552, "learning_rate": 9.45280437756498e-05, "loss": 0.3783, "step": 120 }, { "epoch": 0.05, "eval_loss": 0.4085245132446289, "eval_runtime": 18.864, "eval_samples_per_second": 1.484, "eval_steps_per_second": 0.371, "step": 120 }, { "epoch": 0.06, "grad_norm": 1.2444721460342407, "learning_rate": 9.448244414044688e-05, "loss": 0.4089, "step": 121 }, { "epoch": 0.06, "grad_norm": 1.3666821718215942, "learning_rate": 9.443684450524396e-05, "loss": 0.4065, "step": 122 }, { "epoch": 0.06, "grad_norm": 1.2235740423202515, "learning_rate": 9.439124487004104e-05, "loss": 0.3789, "step": 123 }, { "epoch": 0.06, "grad_norm": 1.2389872074127197, "learning_rate": 9.434564523483813e-05, "loss": 0.4049, "step": 124 }, { "epoch": 0.06, "grad_norm": 1.2753331661224365, "learning_rate": 9.430004559963521e-05, "loss": 0.3976, "step": 125 }, { "epoch": 0.06, "grad_norm": 1.1875836849212646, "learning_rate": 9.425444596443229e-05, "loss": 0.3831, "step": 126 }, { "epoch": 0.06, "grad_norm": 1.170867681503296, "learning_rate": 9.420884632922937e-05, "loss": 0.3842, "step": 127 }, { "epoch": 0.06, "grad_norm": 1.2708452939987183, "learning_rate": 9.416324669402645e-05, "loss": 0.3748, "step": 128 }, { "epoch": 0.06, "grad_norm": 1.1985563039779663, "learning_rate": 9.411764705882353e-05, "loss": 0.4001, "step": 129 }, { "epoch": 0.06, "grad_norm": 1.2549450397491455, "learning_rate": 9.407204742362062e-05, "loss": 0.3953, "step": 130 }, { "epoch": 0.06, "eval_loss": 0.40715333819389343, "eval_runtime": 18.7672, "eval_samples_per_second": 1.492, "eval_steps_per_second": 0.373, "step": 130 }, { "epoch": 0.06, "grad_norm": 1.2259796857833862, "learning_rate": 9.40264477884177e-05, "loss": 0.3888, "step": 131 }, { "epoch": 0.06, "grad_norm": 1.1683098077774048, "learning_rate": 9.398084815321478e-05, "loss": 0.3897, "step": 132 }, { "epoch": 0.06, "grad_norm": 1.1220805644989014, "learning_rate": 9.393524851801186e-05, "loss": 0.3676, "step": 133 }, { "epoch": 0.06, "grad_norm": 1.2481117248535156, "learning_rate": 9.388964888280894e-05, "loss": 0.4106, "step": 134 }, { "epoch": 0.06, "grad_norm": 1.314931869506836, "learning_rate": 9.384404924760603e-05, "loss": 0.4077, "step": 135 }, { "epoch": 0.06, "grad_norm": 1.1738797426223755, "learning_rate": 9.379844961240311e-05, "loss": 0.3988, "step": 136 }, { "epoch": 0.06, "grad_norm": 1.1521250009536743, "learning_rate": 9.375284997720019e-05, "loss": 0.3818, "step": 137 }, { "epoch": 0.06, "grad_norm": 1.29606294631958, "learning_rate": 9.370725034199727e-05, "loss": 0.4129, "step": 138 }, { "epoch": 0.06, "grad_norm": 1.3134340047836304, "learning_rate": 9.366165070679435e-05, "loss": 0.4004, "step": 139 }, { "epoch": 0.06, "grad_norm": 1.2621879577636719, "learning_rate": 9.361605107159142e-05, "loss": 0.4077, "step": 140 }, { "epoch": 0.06, "eval_loss": 0.4061693847179413, "eval_runtime": 19.4395, "eval_samples_per_second": 1.44, "eval_steps_per_second": 0.36, "step": 140 }, { "epoch": 0.06, "grad_norm": 1.2790113687515259, "learning_rate": 9.357045143638852e-05, "loss": 0.4147, "step": 141 }, { "epoch": 0.06, "grad_norm": 1.3079200983047485, "learning_rate": 9.35248518011856e-05, "loss": 0.4044, "step": 142 }, { "epoch": 0.07, "grad_norm": 1.2306909561157227, "learning_rate": 9.347925216598268e-05, "loss": 0.3801, "step": 143 }, { "epoch": 0.07, "grad_norm": 1.2091785669326782, "learning_rate": 9.343365253077976e-05, "loss": 0.3558, "step": 144 }, { "epoch": 0.07, "grad_norm": 1.235843300819397, "learning_rate": 9.338805289557684e-05, "loss": 0.3874, "step": 145 }, { "epoch": 0.07, "grad_norm": 1.20920991897583, "learning_rate": 9.334245326037392e-05, "loss": 0.3818, "step": 146 }, { "epoch": 0.07, "grad_norm": 1.2847074270248413, "learning_rate": 9.329685362517099e-05, "loss": 0.3772, "step": 147 }, { "epoch": 0.07, "grad_norm": 1.2185736894607544, "learning_rate": 9.325125398996807e-05, "loss": 0.3921, "step": 148 }, { "epoch": 0.07, "grad_norm": 1.2444103956222534, "learning_rate": 9.320565435476517e-05, "loss": 0.3914, "step": 149 }, { "epoch": 0.07, "grad_norm": 1.283208966255188, "learning_rate": 9.316005471956225e-05, "loss": 0.3768, "step": 150 }, { "epoch": 0.07, "eval_loss": 0.40466779470443726, "eval_runtime": 18.8032, "eval_samples_per_second": 1.489, "eval_steps_per_second": 0.372, "step": 150 }, { "epoch": 0.07, "grad_norm": 1.2506765127182007, "learning_rate": 9.311445508435933e-05, "loss": 0.3859, "step": 151 }, { "epoch": 0.07, "grad_norm": 1.2482283115386963, "learning_rate": 9.306885544915641e-05, "loss": 0.3991, "step": 152 }, { "epoch": 0.07, "grad_norm": 1.1240688562393188, "learning_rate": 9.30232558139535e-05, "loss": 0.3621, "step": 153 }, { "epoch": 0.07, "grad_norm": 1.1728860139846802, "learning_rate": 9.297765617875058e-05, "loss": 0.3806, "step": 154 }, { "epoch": 0.07, "grad_norm": 1.137986183166504, "learning_rate": 9.293205654354765e-05, "loss": 0.3889, "step": 155 }, { "epoch": 0.07, "grad_norm": 1.132051944732666, "learning_rate": 9.288645690834473e-05, "loss": 0.3763, "step": 156 }, { "epoch": 0.07, "grad_norm": 1.2157707214355469, "learning_rate": 9.284085727314182e-05, "loss": 0.373, "step": 157 }, { "epoch": 0.07, "grad_norm": 1.2097870111465454, "learning_rate": 9.27952576379389e-05, "loss": 0.4121, "step": 158 }, { "epoch": 0.07, "grad_norm": 1.1059770584106445, "learning_rate": 9.274965800273599e-05, "loss": 0.3716, "step": 159 }, { "epoch": 0.07, "grad_norm": 1.170626163482666, "learning_rate": 9.270405836753307e-05, "loss": 0.3649, "step": 160 }, { "epoch": 0.07, "eval_loss": 0.4046716094017029, "eval_runtime": 18.9699, "eval_samples_per_second": 1.476, "eval_steps_per_second": 0.369, "step": 160 }, { "epoch": 0.07, "grad_norm": 1.1942059993743896, "learning_rate": 9.265845873233015e-05, "loss": 0.3768, "step": 161 }, { "epoch": 0.07, "grad_norm": 1.2125589847564697, "learning_rate": 9.261285909712722e-05, "loss": 0.3719, "step": 162 }, { "epoch": 0.07, "grad_norm": 1.1660679578781128, "learning_rate": 9.25672594619243e-05, "loss": 0.3882, "step": 163 }, { "epoch": 0.07, "grad_norm": 1.0966999530792236, "learning_rate": 9.252165982672138e-05, "loss": 0.3588, "step": 164 }, { "epoch": 0.08, "grad_norm": 1.197269320487976, "learning_rate": 9.247606019151848e-05, "loss": 0.3769, "step": 165 }, { "epoch": 0.08, "grad_norm": 1.2217698097229004, "learning_rate": 9.243046055631556e-05, "loss": 0.3682, "step": 166 }, { "epoch": 0.08, "grad_norm": 1.2168915271759033, "learning_rate": 9.238486092111264e-05, "loss": 0.3894, "step": 167 }, { "epoch": 0.08, "grad_norm": 1.2309722900390625, "learning_rate": 9.233926128590972e-05, "loss": 0.3976, "step": 168 }, { "epoch": 0.08, "grad_norm": 1.2164597511291504, "learning_rate": 9.22936616507068e-05, "loss": 0.3682, "step": 169 }, { "epoch": 0.08, "grad_norm": 1.264969825744629, "learning_rate": 9.224806201550387e-05, "loss": 0.4121, "step": 170 }, { "epoch": 0.08, "eval_loss": 0.4015327990055084, "eval_runtime": 19.3928, "eval_samples_per_second": 1.444, "eval_steps_per_second": 0.361, "step": 170 }, { "epoch": 0.08, "grad_norm": 1.1746195554733276, "learning_rate": 9.220246238030095e-05, "loss": 0.3945, "step": 171 }, { "epoch": 0.08, "grad_norm": 1.1495100259780884, "learning_rate": 9.215686274509804e-05, "loss": 0.3887, "step": 172 }, { "epoch": 0.08, "grad_norm": 1.2439409494400024, "learning_rate": 9.211126310989513e-05, "loss": 0.3995, "step": 173 }, { "epoch": 0.08, "grad_norm": 1.1578717231750488, "learning_rate": 9.206566347469221e-05, "loss": 0.3954, "step": 174 }, { "epoch": 0.08, "grad_norm": 1.2770475149154663, "learning_rate": 9.20200638394893e-05, "loss": 0.3994, "step": 175 }, { "epoch": 0.08, "grad_norm": 1.2108159065246582, "learning_rate": 9.197446420428638e-05, "loss": 0.3764, "step": 176 }, { "epoch": 0.08, "grad_norm": 1.2297917604446411, "learning_rate": 9.192886456908344e-05, "loss": 0.3837, "step": 177 }, { "epoch": 0.08, "grad_norm": 1.1047996282577515, "learning_rate": 9.188326493388053e-05, "loss": 0.3781, "step": 178 }, { "epoch": 0.08, "grad_norm": 1.2975915670394897, "learning_rate": 9.183766529867761e-05, "loss": 0.385, "step": 179 }, { "epoch": 0.08, "grad_norm": 1.1674829721450806, "learning_rate": 9.179206566347469e-05, "loss": 0.4053, "step": 180 }, { "epoch": 0.08, "eval_loss": 0.4007401764392853, "eval_runtime": 19.4644, "eval_samples_per_second": 1.439, "eval_steps_per_second": 0.36, "step": 180 }, { "epoch": 0.08, "grad_norm": 1.2072957754135132, "learning_rate": 9.174646602827178e-05, "loss": 0.3883, "step": 181 }, { "epoch": 0.08, "grad_norm": 1.1251564025878906, "learning_rate": 9.170086639306887e-05, "loss": 0.3667, "step": 182 }, { "epoch": 0.08, "grad_norm": 1.0530849695205688, "learning_rate": 9.165526675786595e-05, "loss": 0.3647, "step": 183 }, { "epoch": 0.08, "grad_norm": 1.154868721961975, "learning_rate": 9.160966712266302e-05, "loss": 0.4002, "step": 184 }, { "epoch": 0.08, "grad_norm": 1.1559878587722778, "learning_rate": 9.15640674874601e-05, "loss": 0.3937, "step": 185 }, { "epoch": 0.08, "grad_norm": 1.164669156074524, "learning_rate": 9.151846785225718e-05, "loss": 0.4066, "step": 186 }, { "epoch": 0.09, "grad_norm": 1.1499162912368774, "learning_rate": 9.147286821705426e-05, "loss": 0.3792, "step": 187 }, { "epoch": 0.09, "grad_norm": 1.1299781799316406, "learning_rate": 9.142726858185134e-05, "loss": 0.3667, "step": 188 }, { "epoch": 0.09, "grad_norm": 1.328697919845581, "learning_rate": 9.138166894664844e-05, "loss": 0.3775, "step": 189 }, { "epoch": 0.09, "grad_norm": 1.1182619333267212, "learning_rate": 9.133606931144552e-05, "loss": 0.3755, "step": 190 }, { "epoch": 0.09, "eval_loss": 0.3991912305355072, "eval_runtime": 18.7544, "eval_samples_per_second": 1.493, "eval_steps_per_second": 0.373, "step": 190 }, { "epoch": 0.09, "grad_norm": 1.1765880584716797, "learning_rate": 9.12904696762426e-05, "loss": 0.3958, "step": 191 }, { "epoch": 0.09, "grad_norm": 1.1418038606643677, "learning_rate": 9.124487004103967e-05, "loss": 0.3893, "step": 192 }, { "epoch": 0.09, "grad_norm": 1.1207756996154785, "learning_rate": 9.119927040583675e-05, "loss": 0.4006, "step": 193 }, { "epoch": 0.09, "grad_norm": 1.1601992845535278, "learning_rate": 9.115367077063383e-05, "loss": 0.3779, "step": 194 }, { "epoch": 0.09, "grad_norm": 1.0996065139770508, "learning_rate": 9.110807113543092e-05, "loss": 0.362, "step": 195 }, { "epoch": 0.09, "grad_norm": 1.0413964986801147, "learning_rate": 9.1062471500228e-05, "loss": 0.3533, "step": 196 }, { "epoch": 0.09, "grad_norm": 1.1514556407928467, "learning_rate": 9.101687186502509e-05, "loss": 0.3753, "step": 197 }, { "epoch": 0.09, "grad_norm": 1.1734694242477417, "learning_rate": 9.097127222982217e-05, "loss": 0.3829, "step": 198 }, { "epoch": 0.09, "grad_norm": 1.1342203617095947, "learning_rate": 9.092567259461924e-05, "loss": 0.38, "step": 199 }, { "epoch": 0.09, "grad_norm": 1.170672059059143, "learning_rate": 9.088007295941632e-05, "loss": 0.3813, "step": 200 }, { "epoch": 0.09, "eval_loss": 0.39865168929100037, "eval_runtime": 18.8251, "eval_samples_per_second": 1.487, "eval_steps_per_second": 0.372, "step": 200 }, { "epoch": 0.09, "grad_norm": 1.1320968866348267, "learning_rate": 9.08344733242134e-05, "loss": 0.381, "step": 201 }, { "epoch": 0.09, "grad_norm": 1.1122519969940186, "learning_rate": 9.078887368901049e-05, "loss": 0.3912, "step": 202 }, { "epoch": 0.09, "grad_norm": 1.2247223854064941, "learning_rate": 9.074327405380757e-05, "loss": 0.4028, "step": 203 }, { "epoch": 0.09, "grad_norm": 1.1854742765426636, "learning_rate": 9.069767441860465e-05, "loss": 0.3753, "step": 204 }, { "epoch": 0.09, "grad_norm": 1.129613995552063, "learning_rate": 9.065207478340175e-05, "loss": 0.39, "step": 205 }, { "epoch": 0.09, "grad_norm": 1.180238127708435, "learning_rate": 9.060647514819883e-05, "loss": 0.4166, "step": 206 }, { "epoch": 0.09, "grad_norm": 0.9991777539253235, "learning_rate": 9.05608755129959e-05, "loss": 0.369, "step": 207 }, { "epoch": 0.09, "grad_norm": 1.0713785886764526, "learning_rate": 9.051527587779298e-05, "loss": 0.3608, "step": 208 }, { "epoch": 0.1, "grad_norm": 1.111507773399353, "learning_rate": 9.046967624259006e-05, "loss": 0.3802, "step": 209 }, { "epoch": 0.1, "grad_norm": 1.14186429977417, "learning_rate": 9.042407660738714e-05, "loss": 0.3667, "step": 210 }, { "epoch": 0.1, "eval_loss": 0.3970547318458557, "eval_runtime": 19.182, "eval_samples_per_second": 1.46, "eval_steps_per_second": 0.365, "step": 210 }, { "epoch": 0.1, "grad_norm": 1.1212234497070312, "learning_rate": 9.037847697218422e-05, "loss": 0.3737, "step": 211 }, { "epoch": 0.1, "grad_norm": 1.112081527709961, "learning_rate": 9.03328773369813e-05, "loss": 0.3718, "step": 212 }, { "epoch": 0.1, "grad_norm": 1.0366638898849487, "learning_rate": 9.02872777017784e-05, "loss": 0.3623, "step": 213 }, { "epoch": 0.1, "grad_norm": 1.0450507402420044, "learning_rate": 9.024167806657547e-05, "loss": 0.3719, "step": 214 }, { "epoch": 0.1, "grad_norm": 1.1613171100616455, "learning_rate": 9.019607843137255e-05, "loss": 0.3959, "step": 215 }, { "epoch": 0.1, "grad_norm": 1.1329877376556396, "learning_rate": 9.015047879616963e-05, "loss": 0.3851, "step": 216 }, { "epoch": 0.1, "grad_norm": 1.3032368421554565, "learning_rate": 9.010487916096671e-05, "loss": 0.3833, "step": 217 }, { "epoch": 0.1, "grad_norm": 1.1342045068740845, "learning_rate": 9.00592795257638e-05, "loss": 0.3653, "step": 218 }, { "epoch": 0.1, "grad_norm": 1.0300016403198242, "learning_rate": 9.001367989056088e-05, "loss": 0.371, "step": 219 }, { "epoch": 0.1, "grad_norm": 1.0597660541534424, "learning_rate": 8.996808025535796e-05, "loss": 0.371, "step": 220 }, { "epoch": 0.1, "eval_loss": 0.39607396721839905, "eval_runtime": 18.8558, "eval_samples_per_second": 1.485, "eval_steps_per_second": 0.371, "step": 220 }, { "epoch": 0.1, "grad_norm": 1.113075613975525, "learning_rate": 8.992248062015505e-05, "loss": 0.3695, "step": 221 }, { "epoch": 0.1, "grad_norm": 1.0930582284927368, "learning_rate": 8.987688098495212e-05, "loss": 0.373, "step": 222 }, { "epoch": 0.1, "grad_norm": 1.0523693561553955, "learning_rate": 8.98312813497492e-05, "loss": 0.3584, "step": 223 }, { "epoch": 0.1, "grad_norm": 1.1256333589553833, "learning_rate": 8.978568171454629e-05, "loss": 0.3839, "step": 224 }, { "epoch": 0.1, "grad_norm": 1.156557559967041, "learning_rate": 8.974008207934337e-05, "loss": 0.3876, "step": 225 }, { "epoch": 0.1, "grad_norm": 1.1051405668258667, "learning_rate": 8.969448244414045e-05, "loss": 0.4027, "step": 226 }, { "epoch": 0.1, "grad_norm": 1.0528311729431152, "learning_rate": 8.964888280893753e-05, "loss": 0.366, "step": 227 }, { "epoch": 0.1, "grad_norm": 1.0805798768997192, "learning_rate": 8.960328317373461e-05, "loss": 0.3661, "step": 228 }, { "epoch": 0.1, "grad_norm": 1.1037845611572266, "learning_rate": 8.95576835385317e-05, "loss": 0.3711, "step": 229 }, { "epoch": 0.1, "grad_norm": 1.0873870849609375, "learning_rate": 8.951208390332878e-05, "loss": 0.3886, "step": 230 }, { "epoch": 0.1, "eval_loss": 0.3947284519672394, "eval_runtime": 18.9717, "eval_samples_per_second": 1.476, "eval_steps_per_second": 0.369, "step": 230 }, { "epoch": 0.11, "grad_norm": 1.1278759241104126, "learning_rate": 8.946648426812586e-05, "loss": 0.3588, "step": 231 }, { "epoch": 0.11, "grad_norm": 1.205957293510437, "learning_rate": 8.942088463292294e-05, "loss": 0.4015, "step": 232 }, { "epoch": 0.11, "grad_norm": 1.081870436668396, "learning_rate": 8.937528499772002e-05, "loss": 0.3853, "step": 233 }, { "epoch": 0.11, "grad_norm": 1.0868654251098633, "learning_rate": 8.93296853625171e-05, "loss": 0.3882, "step": 234 }, { "epoch": 0.11, "grad_norm": 1.1404011249542236, "learning_rate": 8.928408572731418e-05, "loss": 0.3577, "step": 235 }, { "epoch": 0.11, "grad_norm": 1.016044020652771, "learning_rate": 8.923848609211127e-05, "loss": 0.3784, "step": 236 }, { "epoch": 0.11, "grad_norm": 1.0132030248641968, "learning_rate": 8.919288645690835e-05, "loss": 0.3562, "step": 237 }, { "epoch": 0.11, "grad_norm": 1.0909206867218018, "learning_rate": 8.914728682170543e-05, "loss": 0.3824, "step": 238 }, { "epoch": 0.11, "grad_norm": 1.0702511072158813, "learning_rate": 8.910168718650251e-05, "loss": 0.371, "step": 239 }, { "epoch": 0.11, "grad_norm": 1.102399468421936, "learning_rate": 8.90560875512996e-05, "loss": 0.3938, "step": 240 }, { "epoch": 0.11, "eval_loss": 0.39401116967201233, "eval_runtime": 18.9873, "eval_samples_per_second": 1.475, "eval_steps_per_second": 0.369, "step": 240 }, { "epoch": 0.11, "grad_norm": 1.0651742219924927, "learning_rate": 8.901048791609668e-05, "loss": 0.378, "step": 241 }, { "epoch": 0.11, "grad_norm": 1.0487183332443237, "learning_rate": 8.896488828089376e-05, "loss": 0.3832, "step": 242 }, { "epoch": 0.11, "grad_norm": 1.0751785039901733, "learning_rate": 8.891928864569084e-05, "loss": 0.3783, "step": 243 }, { "epoch": 0.11, "grad_norm": 1.0894086360931396, "learning_rate": 8.887368901048792e-05, "loss": 0.3673, "step": 244 }, { "epoch": 0.11, "grad_norm": 1.0687947273254395, "learning_rate": 8.8828089375285e-05, "loss": 0.3596, "step": 245 }, { "epoch": 0.11, "grad_norm": 1.075567603111267, "learning_rate": 8.878248974008208e-05, "loss": 0.39, "step": 246 }, { "epoch": 0.11, "grad_norm": 1.0721067190170288, "learning_rate": 8.873689010487917e-05, "loss": 0.3724, "step": 247 }, { "epoch": 0.11, "grad_norm": 1.0904428958892822, "learning_rate": 8.869129046967625e-05, "loss": 0.3687, "step": 248 }, { "epoch": 0.11, "grad_norm": 1.124393105506897, "learning_rate": 8.864569083447333e-05, "loss": 0.3614, "step": 249 }, { "epoch": 0.11, "grad_norm": 1.0123660564422607, "learning_rate": 8.860009119927041e-05, "loss": 0.3818, "step": 250 }, { "epoch": 0.11, "eval_loss": 0.3932059407234192, "eval_runtime": 19.8198, "eval_samples_per_second": 1.413, "eval_steps_per_second": 0.353, "step": 250 }, { "epoch": 0.11, "grad_norm": 1.1686595678329468, "learning_rate": 8.855449156406749e-05, "loss": 0.3657, "step": 251 }, { "epoch": 0.11, "grad_norm": 1.1555325984954834, "learning_rate": 8.850889192886457e-05, "loss": 0.3953, "step": 252 }, { "epoch": 0.12, "grad_norm": 1.0188342332839966, "learning_rate": 8.846329229366166e-05, "loss": 0.3731, "step": 253 }, { "epoch": 0.12, "grad_norm": 1.0264111757278442, "learning_rate": 8.841769265845874e-05, "loss": 0.353, "step": 254 }, { "epoch": 0.12, "grad_norm": 1.1515635251998901, "learning_rate": 8.837209302325582e-05, "loss": 0.3982, "step": 255 }, { "epoch": 0.12, "grad_norm": 1.0572092533111572, "learning_rate": 8.83264933880529e-05, "loss": 0.4006, "step": 256 }, { "epoch": 0.12, "grad_norm": 1.058567762374878, "learning_rate": 8.828089375284998e-05, "loss": 0.3816, "step": 257 }, { "epoch": 0.12, "grad_norm": 1.0707305669784546, "learning_rate": 8.823529411764706e-05, "loss": 0.3614, "step": 258 }, { "epoch": 0.12, "grad_norm": 1.0715553760528564, "learning_rate": 8.818969448244415e-05, "loss": 0.3882, "step": 259 }, { "epoch": 0.12, "grad_norm": 1.0845972299575806, "learning_rate": 8.814409484724121e-05, "loss": 0.3736, "step": 260 }, { "epoch": 0.12, "eval_loss": 0.39302879571914673, "eval_runtime": 19.4673, "eval_samples_per_second": 1.438, "eval_steps_per_second": 0.36, "step": 260 }, { "epoch": 0.12, "grad_norm": 1.091538906097412, "learning_rate": 8.809849521203831e-05, "loss": 0.3735, "step": 261 }, { "epoch": 0.12, "grad_norm": 1.012810468673706, "learning_rate": 8.805289557683539e-05, "loss": 0.3676, "step": 262 }, { "epoch": 0.12, "grad_norm": 1.1814091205596924, "learning_rate": 8.800729594163247e-05, "loss": 0.3964, "step": 263 }, { "epoch": 0.12, "grad_norm": 1.0467474460601807, "learning_rate": 8.796169630642956e-05, "loss": 0.3705, "step": 264 }, { "epoch": 0.12, "grad_norm": 1.0370585918426514, "learning_rate": 8.791609667122664e-05, "loss": 0.3557, "step": 265 }, { "epoch": 0.12, "grad_norm": 1.0208948850631714, "learning_rate": 8.787049703602372e-05, "loss": 0.3821, "step": 266 }, { "epoch": 0.12, "grad_norm": 0.9990605711936951, "learning_rate": 8.78248974008208e-05, "loss": 0.3762, "step": 267 }, { "epoch": 0.12, "grad_norm": 1.1138283014297485, "learning_rate": 8.777929776561787e-05, "loss": 0.3575, "step": 268 }, { "epoch": 0.12, "grad_norm": 1.1316741704940796, "learning_rate": 8.773369813041496e-05, "loss": 0.3896, "step": 269 }, { "epoch": 0.12, "grad_norm": 1.0367543697357178, "learning_rate": 8.768809849521205e-05, "loss": 0.3756, "step": 270 }, { "epoch": 0.12, "eval_loss": 0.39074134826660156, "eval_runtime": 19.5875, "eval_samples_per_second": 1.429, "eval_steps_per_second": 0.357, "step": 270 }, { "epoch": 0.12, "grad_norm": 0.9777958989143372, "learning_rate": 8.764249886000913e-05, "loss": 0.3613, "step": 271 }, { "epoch": 0.12, "grad_norm": 1.0536458492279053, "learning_rate": 8.759689922480621e-05, "loss": 0.3808, "step": 272 }, { "epoch": 0.12, "grad_norm": 1.064129114151001, "learning_rate": 8.755129958960329e-05, "loss": 0.3738, "step": 273 }, { "epoch": 0.12, "grad_norm": 0.9518744349479675, "learning_rate": 8.750569995440037e-05, "loss": 0.3498, "step": 274 }, { "epoch": 0.13, "grad_norm": 1.0231995582580566, "learning_rate": 8.746010031919744e-05, "loss": 0.378, "step": 275 }, { "epoch": 0.13, "grad_norm": 1.0383845567703247, "learning_rate": 8.741450068399452e-05, "loss": 0.3832, "step": 276 }, { "epoch": 0.13, "grad_norm": 1.1850907802581787, "learning_rate": 8.736890104879162e-05, "loss": 0.3782, "step": 277 }, { "epoch": 0.13, "grad_norm": 1.0028883218765259, "learning_rate": 8.73233014135887e-05, "loss": 0.3641, "step": 278 }, { "epoch": 0.13, "grad_norm": 0.9896926879882812, "learning_rate": 8.727770177838578e-05, "loss": 0.3549, "step": 279 }, { "epoch": 0.13, "grad_norm": 1.1111470460891724, "learning_rate": 8.723210214318286e-05, "loss": 0.3687, "step": 280 }, { "epoch": 0.13, "eval_loss": 0.3898495137691498, "eval_runtime": 19.1295, "eval_samples_per_second": 1.464, "eval_steps_per_second": 0.366, "step": 280 }, { "epoch": 0.13, "grad_norm": 1.0218709707260132, "learning_rate": 8.718650250797994e-05, "loss": 0.3719, "step": 281 }, { "epoch": 0.13, "grad_norm": 1.0067551136016846, "learning_rate": 8.714090287277701e-05, "loss": 0.3632, "step": 282 }, { "epoch": 0.13, "grad_norm": 1.0330657958984375, "learning_rate": 8.70953032375741e-05, "loss": 0.3534, "step": 283 }, { "epoch": 0.13, "grad_norm": 1.0615925788879395, "learning_rate": 8.704970360237118e-05, "loss": 0.3747, "step": 284 }, { "epoch": 0.13, "grad_norm": 1.0701462030410767, "learning_rate": 8.700410396716827e-05, "loss": 0.3588, "step": 285 }, { "epoch": 0.13, "grad_norm": 1.0732547044754028, "learning_rate": 8.695850433196535e-05, "loss": 0.3847, "step": 286 }, { "epoch": 0.13, "grad_norm": 1.0118660926818848, "learning_rate": 8.691290469676243e-05, "loss": 0.3491, "step": 287 }, { "epoch": 0.13, "grad_norm": 1.0180116891860962, "learning_rate": 8.686730506155952e-05, "loss": 0.3697, "step": 288 }, { "epoch": 0.13, "grad_norm": 1.028004765510559, "learning_rate": 8.68217054263566e-05, "loss": 0.3809, "step": 289 }, { "epoch": 0.13, "grad_norm": 1.1547340154647827, "learning_rate": 8.677610579115367e-05, "loss": 0.3813, "step": 290 }, { "epoch": 0.13, "eval_loss": 0.38859543204307556, "eval_runtime": 19.3351, "eval_samples_per_second": 1.448, "eval_steps_per_second": 0.362, "step": 290 }, { "epoch": 0.13, "grad_norm": 10.94005298614502, "learning_rate": 8.673050615595075e-05, "loss": 0.3674, "step": 291 }, { "epoch": 0.13, "grad_norm": 1.228397011756897, "learning_rate": 8.668490652074783e-05, "loss": 0.3836, "step": 292 }, { "epoch": 0.13, "grad_norm": 0.9820699691772461, "learning_rate": 8.663930688554493e-05, "loss": 0.3574, "step": 293 }, { "epoch": 0.13, "grad_norm": 1.0105698108673096, "learning_rate": 8.659370725034201e-05, "loss": 0.362, "step": 294 }, { "epoch": 0.13, "grad_norm": 0.9972749352455139, "learning_rate": 8.654810761513909e-05, "loss": 0.3559, "step": 295 }, { "epoch": 0.13, "grad_norm": 1.0240153074264526, "learning_rate": 8.650250797993617e-05, "loss": 0.373, "step": 296 }, { "epoch": 0.14, "grad_norm": 1.0577991008758545, "learning_rate": 8.645690834473324e-05, "loss": 0.387, "step": 297 }, { "epoch": 0.14, "grad_norm": 1.04681396484375, "learning_rate": 8.641130870953032e-05, "loss": 0.3481, "step": 298 }, { "epoch": 0.14, "grad_norm": 1.0815809965133667, "learning_rate": 8.63657090743274e-05, "loss": 0.3767, "step": 299 }, { "epoch": 0.14, "grad_norm": 1.0435343980789185, "learning_rate": 8.632010943912448e-05, "loss": 0.3668, "step": 300 }, { "epoch": 0.14, "eval_loss": 0.3885766565799713, "eval_runtime": 19.117, "eval_samples_per_second": 1.465, "eval_steps_per_second": 0.366, "step": 300 }, { "epoch": 0.14, "grad_norm": 1.044992208480835, "learning_rate": 8.627450980392158e-05, "loss": 0.3648, "step": 301 }, { "epoch": 0.14, "grad_norm": 0.9807784557342529, "learning_rate": 8.622891016871866e-05, "loss": 0.3553, "step": 302 }, { "epoch": 0.14, "grad_norm": 1.0685855150222778, "learning_rate": 8.618331053351574e-05, "loss": 0.3731, "step": 303 }, { "epoch": 0.14, "grad_norm": 1.0137559175491333, "learning_rate": 8.613771089831282e-05, "loss": 0.3841, "step": 304 }, { "epoch": 0.14, "grad_norm": 0.9751632213592529, "learning_rate": 8.609211126310989e-05, "loss": 0.3757, "step": 305 }, { "epoch": 0.14, "grad_norm": 1.0728063583374023, "learning_rate": 8.604651162790697e-05, "loss": 0.385, "step": 306 }, { "epoch": 0.14, "grad_norm": 1.0910289287567139, "learning_rate": 8.600091199270406e-05, "loss": 0.368, "step": 307 }, { "epoch": 0.14, "grad_norm": 1.0189052820205688, "learning_rate": 8.595531235750114e-05, "loss": 0.3743, "step": 308 }, { "epoch": 0.14, "grad_norm": 0.9784758687019348, "learning_rate": 8.590971272229823e-05, "loss": 0.3637, "step": 309 }, { "epoch": 0.14, "grad_norm": 1.1712713241577148, "learning_rate": 8.586411308709531e-05, "loss": 0.4026, "step": 310 }, { "epoch": 0.14, "eval_loss": 0.38862475752830505, "eval_runtime": 18.9756, "eval_samples_per_second": 1.476, "eval_steps_per_second": 0.369, "step": 310 }, { "epoch": 0.14, "grad_norm": 1.027456283569336, "learning_rate": 8.58185134518924e-05, "loss": 0.3521, "step": 311 }, { "epoch": 0.14, "grad_norm": 0.9766998291015625, "learning_rate": 8.577291381668946e-05, "loss": 0.3633, "step": 312 }, { "epoch": 0.14, "grad_norm": 0.9665610790252686, "learning_rate": 8.572731418148655e-05, "loss": 0.3779, "step": 313 }, { "epoch": 0.14, "grad_norm": 1.0428779125213623, "learning_rate": 8.568171454628363e-05, "loss": 0.3782, "step": 314 }, { "epoch": 0.14, "grad_norm": 0.9691622257232666, "learning_rate": 8.563611491108071e-05, "loss": 0.3478, "step": 315 }, { "epoch": 0.14, "grad_norm": 0.9197782874107361, "learning_rate": 8.559051527587779e-05, "loss": 0.3399, "step": 316 }, { "epoch": 0.14, "grad_norm": 1.1256742477416992, "learning_rate": 8.554491564067489e-05, "loss": 0.3791, "step": 317 }, { "epoch": 0.14, "grad_norm": 1.0777599811553955, "learning_rate": 8.549931600547197e-05, "loss": 0.3742, "step": 318 }, { "epoch": 0.15, "grad_norm": 1.09661865234375, "learning_rate": 8.545371637026904e-05, "loss": 0.3706, "step": 319 }, { "epoch": 0.15, "grad_norm": 1.0405218601226807, "learning_rate": 8.540811673506612e-05, "loss": 0.3704, "step": 320 }, { "epoch": 0.15, "eval_loss": 0.387399822473526, "eval_runtime": 18.6057, "eval_samples_per_second": 1.505, "eval_steps_per_second": 0.376, "step": 320 }, { "epoch": 0.15, "grad_norm": 0.988017737865448, "learning_rate": 8.53625170998632e-05, "loss": 0.3436, "step": 321 }, { "epoch": 0.15, "grad_norm": 1.0084255933761597, "learning_rate": 8.531691746466028e-05, "loss": 0.3846, "step": 322 }, { "epoch": 0.15, "grad_norm": 1.024824619293213, "learning_rate": 8.527131782945736e-05, "loss": 0.3915, "step": 323 }, { "epoch": 0.15, "grad_norm": 1.0249780416488647, "learning_rate": 8.522571819425445e-05, "loss": 0.3589, "step": 324 }, { "epoch": 0.15, "grad_norm": 1.0125277042388916, "learning_rate": 8.518011855905154e-05, "loss": 0.3761, "step": 325 }, { "epoch": 0.15, "grad_norm": 0.9836946725845337, "learning_rate": 8.513451892384862e-05, "loss": 0.3621, "step": 326 }, { "epoch": 0.15, "grad_norm": 1.0279687643051147, "learning_rate": 8.508891928864569e-05, "loss": 0.3823, "step": 327 }, { "epoch": 0.15, "grad_norm": 1.0053000450134277, "learning_rate": 8.504331965344277e-05, "loss": 0.3882, "step": 328 }, { "epoch": 0.15, "grad_norm": 0.9956850409507751, "learning_rate": 8.499772001823985e-05, "loss": 0.3537, "step": 329 }, { "epoch": 0.15, "grad_norm": 0.9967787265777588, "learning_rate": 8.495212038303694e-05, "loss": 0.3472, "step": 330 }, { "epoch": 0.15, "eval_loss": 0.3873635232448578, "eval_runtime": 20.0228, "eval_samples_per_second": 1.398, "eval_steps_per_second": 0.35, "step": 330 }, { "epoch": 0.15, "grad_norm": 1.0272730588912964, "learning_rate": 8.490652074783402e-05, "loss": 0.3758, "step": 331 }, { "epoch": 0.15, "grad_norm": 1.004315972328186, "learning_rate": 8.48609211126311e-05, "loss": 0.372, "step": 332 }, { "epoch": 0.15, "grad_norm": 0.9904900193214417, "learning_rate": 8.48153214774282e-05, "loss": 0.3648, "step": 333 }, { "epoch": 0.15, "grad_norm": 0.9831294417381287, "learning_rate": 8.476972184222526e-05, "loss": 0.3534, "step": 334 }, { "epoch": 0.15, "grad_norm": 0.9594573974609375, "learning_rate": 8.472412220702234e-05, "loss": 0.3547, "step": 335 }, { "epoch": 0.15, "grad_norm": 1.0190843343734741, "learning_rate": 8.467852257181943e-05, "loss": 0.3562, "step": 336 }, { "epoch": 0.15, "grad_norm": 1.0010571479797363, "learning_rate": 8.463292293661651e-05, "loss": 0.3777, "step": 337 }, { "epoch": 0.15, "grad_norm": 0.9827868342399597, "learning_rate": 8.458732330141359e-05, "loss": 0.3554, "step": 338 }, { "epoch": 0.15, "grad_norm": 0.943844199180603, "learning_rate": 8.454172366621067e-05, "loss": 0.3556, "step": 339 }, { "epoch": 0.15, "grad_norm": 1.0718072652816772, "learning_rate": 8.449612403100775e-05, "loss": 0.3625, "step": 340 }, { "epoch": 0.15, "eval_loss": 0.38646969199180603, "eval_runtime": 18.7738, "eval_samples_per_second": 1.491, "eval_steps_per_second": 0.373, "step": 340 }, { "epoch": 0.16, "grad_norm": 1.038783073425293, "learning_rate": 8.445052439580485e-05, "loss": 0.3892, "step": 341 }, { "epoch": 0.16, "grad_norm": 0.996299147605896, "learning_rate": 8.440492476060192e-05, "loss": 0.3731, "step": 342 }, { "epoch": 0.16, "grad_norm": 1.0760611295700073, "learning_rate": 8.4359325125399e-05, "loss": 0.3403, "step": 343 }, { "epoch": 0.16, "grad_norm": 1.0220232009887695, "learning_rate": 8.431372549019608e-05, "loss": 0.3702, "step": 344 }, { "epoch": 0.16, "grad_norm": 0.9995104670524597, "learning_rate": 8.426812585499316e-05, "loss": 0.3697, "step": 345 }, { "epoch": 0.16, "grad_norm": 0.9776452779769897, "learning_rate": 8.422252621979024e-05, "loss": 0.3518, "step": 346 }, { "epoch": 0.16, "grad_norm": 1.042927861213684, "learning_rate": 8.417692658458733e-05, "loss": 0.3703, "step": 347 }, { "epoch": 0.16, "grad_norm": 1.0032014846801758, "learning_rate": 8.413132694938441e-05, "loss": 0.3666, "step": 348 }, { "epoch": 0.16, "grad_norm": 0.9905934929847717, "learning_rate": 8.408572731418149e-05, "loss": 0.345, "step": 349 }, { "epoch": 0.16, "grad_norm": 0.9962011575698853, "learning_rate": 8.404012767897857e-05, "loss": 0.3679, "step": 350 }, { "epoch": 0.16, "eval_loss": 0.38614824414253235, "eval_runtime": 19.1496, "eval_samples_per_second": 1.462, "eval_steps_per_second": 0.366, "step": 350 }, { "epoch": 0.16, "grad_norm": 1.0067214965820312, "learning_rate": 8.399452804377565e-05, "loss": 0.368, "step": 351 }, { "epoch": 0.16, "grad_norm": 1.0402851104736328, "learning_rate": 8.394892840857273e-05, "loss": 0.3707, "step": 352 }, { "epoch": 0.16, "grad_norm": 0.924884021282196, "learning_rate": 8.390332877336982e-05, "loss": 0.3584, "step": 353 }, { "epoch": 0.16, "grad_norm": 0.9956393837928772, "learning_rate": 8.38577291381669e-05, "loss": 0.3608, "step": 354 }, { "epoch": 0.16, "grad_norm": 1.0277490615844727, "learning_rate": 8.381212950296398e-05, "loss": 0.379, "step": 355 }, { "epoch": 0.16, "grad_norm": 1.0546191930770874, "learning_rate": 8.376652986776106e-05, "loss": 0.3868, "step": 356 }, { "epoch": 0.16, "grad_norm": 0.9730943441390991, "learning_rate": 8.372093023255814e-05, "loss": 0.3681, "step": 357 }, { "epoch": 0.16, "grad_norm": 0.995608925819397, "learning_rate": 8.367533059735522e-05, "loss": 0.3728, "step": 358 }, { "epoch": 0.16, "grad_norm": 1.039383888244629, "learning_rate": 8.36297309621523e-05, "loss": 0.3815, "step": 359 }, { "epoch": 0.16, "grad_norm": 0.9661096930503845, "learning_rate": 8.358413132694939e-05, "loss": 0.3392, "step": 360 }, { "epoch": 0.16, "eval_loss": 0.3855496048927307, "eval_runtime": 19.2455, "eval_samples_per_second": 1.455, "eval_steps_per_second": 0.364, "step": 360 }, { "epoch": 0.16, "grad_norm": 0.9818366169929504, "learning_rate": 8.353853169174647e-05, "loss": 0.3898, "step": 361 }, { "epoch": 0.17, "grad_norm": 1.140006184577942, "learning_rate": 8.349293205654355e-05, "loss": 0.3799, "step": 362 }, { "epoch": 0.17, "grad_norm": 0.9910751581192017, "learning_rate": 8.344733242134063e-05, "loss": 0.3561, "step": 363 }, { "epoch": 0.17, "grad_norm": 0.9846599102020264, "learning_rate": 8.340173278613771e-05, "loss": 0.3751, "step": 364 }, { "epoch": 0.17, "grad_norm": 0.9890525341033936, "learning_rate": 8.33561331509348e-05, "loss": 0.3717, "step": 365 }, { "epoch": 0.17, "grad_norm": 1.0564824342727661, "learning_rate": 8.331053351573188e-05, "loss": 0.3361, "step": 366 }, { "epoch": 0.17, "grad_norm": 0.9904609322547913, "learning_rate": 8.326493388052896e-05, "loss": 0.382, "step": 367 }, { "epoch": 0.17, "grad_norm": 0.9419021606445312, "learning_rate": 8.321933424532604e-05, "loss": 0.3426, "step": 368 }, { "epoch": 0.17, "grad_norm": 0.976959764957428, "learning_rate": 8.317373461012312e-05, "loss": 0.3562, "step": 369 }, { "epoch": 0.17, "grad_norm": 0.9872338175773621, "learning_rate": 8.31281349749202e-05, "loss": 0.3417, "step": 370 }, { "epoch": 0.17, "eval_loss": 0.3853757679462433, "eval_runtime": 19.3296, "eval_samples_per_second": 1.449, "eval_steps_per_second": 0.362, "step": 370 }, { "epoch": 0.17, "grad_norm": 0.9730595350265503, "learning_rate": 8.308253533971729e-05, "loss": 0.3597, "step": 371 }, { "epoch": 0.17, "grad_norm": 0.9857218861579895, "learning_rate": 8.303693570451437e-05, "loss": 0.3632, "step": 372 }, { "epoch": 0.17, "grad_norm": 0.9228557348251343, "learning_rate": 8.299133606931145e-05, "loss": 0.3417, "step": 373 }, { "epoch": 0.17, "grad_norm": 1.03836989402771, "learning_rate": 8.294573643410853e-05, "loss": 0.3601, "step": 374 }, { "epoch": 0.17, "grad_norm": 0.973513662815094, "learning_rate": 8.290013679890561e-05, "loss": 0.3598, "step": 375 }, { "epoch": 0.17, "grad_norm": 1.0164240598678589, "learning_rate": 8.28545371637027e-05, "loss": 0.3645, "step": 376 }, { "epoch": 0.17, "grad_norm": 0.9688085317611694, "learning_rate": 8.280893752849978e-05, "loss": 0.3579, "step": 377 }, { "epoch": 0.17, "grad_norm": 1.1452713012695312, "learning_rate": 8.276333789329686e-05, "loss": 0.3879, "step": 378 }, { "epoch": 0.17, "grad_norm": 0.9936511516571045, "learning_rate": 8.271773825809394e-05, "loss": 0.3838, "step": 379 }, { "epoch": 0.17, "grad_norm": 0.9094933867454529, "learning_rate": 8.267213862289101e-05, "loss": 0.3332, "step": 380 }, { "epoch": 0.17, "eval_loss": 0.383484810590744, "eval_runtime": 19.0505, "eval_samples_per_second": 1.47, "eval_steps_per_second": 0.367, "step": 380 }, { "epoch": 0.17, "grad_norm": 0.9642332196235657, "learning_rate": 8.26265389876881e-05, "loss": 0.3659, "step": 381 }, { "epoch": 0.17, "grad_norm": 0.9800812005996704, "learning_rate": 8.258093935248519e-05, "loss": 0.3882, "step": 382 }, { "epoch": 0.17, "grad_norm": 0.9797675013542175, "learning_rate": 8.253533971728227e-05, "loss": 0.3792, "step": 383 }, { "epoch": 0.18, "grad_norm": 1.0148242712020874, "learning_rate": 8.248974008207935e-05, "loss": 0.3604, "step": 384 }, { "epoch": 0.18, "grad_norm": 0.9496625661849976, "learning_rate": 8.244414044687643e-05, "loss": 0.3636, "step": 385 }, { "epoch": 0.18, "grad_norm": 0.9917979836463928, "learning_rate": 8.239854081167351e-05, "loss": 0.351, "step": 386 }, { "epoch": 0.18, "grad_norm": 0.9986299276351929, "learning_rate": 8.23529411764706e-05, "loss": 0.386, "step": 387 }, { "epoch": 0.18, "grad_norm": 0.9700773358345032, "learning_rate": 8.230734154126766e-05, "loss": 0.3597, "step": 388 }, { "epoch": 0.18, "grad_norm": 1.0540348291397095, "learning_rate": 8.226174190606476e-05, "loss": 0.3744, "step": 389 }, { "epoch": 0.18, "grad_norm": 1.018379807472229, "learning_rate": 8.221614227086184e-05, "loss": 0.3872, "step": 390 }, { "epoch": 0.18, "eval_loss": 0.3828833997249603, "eval_runtime": 18.5979, "eval_samples_per_second": 1.506, "eval_steps_per_second": 0.376, "step": 390 }, { "epoch": 0.18, "grad_norm": 0.9792240262031555, "learning_rate": 8.217054263565892e-05, "loss": 0.3993, "step": 391 }, { "epoch": 0.18, "grad_norm": 0.9694705605506897, "learning_rate": 8.2124943000456e-05, "loss": 0.3567, "step": 392 }, { "epoch": 0.18, "grad_norm": 1.0545449256896973, "learning_rate": 8.207934336525308e-05, "loss": 0.375, "step": 393 }, { "epoch": 0.18, "grad_norm": 0.995250940322876, "learning_rate": 8.203374373005017e-05, "loss": 0.3596, "step": 394 }, { "epoch": 0.18, "grad_norm": 0.993552029132843, "learning_rate": 8.198814409484723e-05, "loss": 0.3555, "step": 395 }, { "epoch": 0.18, "grad_norm": 0.9589958786964417, "learning_rate": 8.194254445964432e-05, "loss": 0.3699, "step": 396 }, { "epoch": 0.18, "grad_norm": 0.9881184101104736, "learning_rate": 8.189694482444141e-05, "loss": 0.36, "step": 397 }, { "epoch": 0.18, "grad_norm": 0.9230943918228149, "learning_rate": 8.18513451892385e-05, "loss": 0.3635, "step": 398 }, { "epoch": 0.18, "grad_norm": 0.9407085180282593, "learning_rate": 8.180574555403558e-05, "loss": 0.3821, "step": 399 }, { "epoch": 0.18, "grad_norm": 0.9004390239715576, "learning_rate": 8.176014591883266e-05, "loss": 0.3671, "step": 400 }, { "epoch": 0.18, "eval_loss": 0.3836938738822937, "eval_runtime": 19.9469, "eval_samples_per_second": 1.404, "eval_steps_per_second": 0.351, "step": 400 }, { "epoch": 0.18, "grad_norm": 0.9622935056686401, "learning_rate": 8.171454628362974e-05, "loss": 0.3451, "step": 401 }, { "epoch": 0.18, "grad_norm": 0.9424606561660767, "learning_rate": 8.166894664842682e-05, "loss": 0.3635, "step": 402 }, { "epoch": 0.18, "grad_norm": 0.9327376484870911, "learning_rate": 8.162334701322389e-05, "loss": 0.3632, "step": 403 }, { "epoch": 0.18, "grad_norm": 1.0619184970855713, "learning_rate": 8.157774737802097e-05, "loss": 0.3886, "step": 404 }, { "epoch": 0.18, "grad_norm": 0.9540975689888, "learning_rate": 8.153214774281807e-05, "loss": 0.3503, "step": 405 }, { "epoch": 0.19, "grad_norm": 0.895407497882843, "learning_rate": 8.148654810761515e-05, "loss": 0.3503, "step": 406 }, { "epoch": 0.19, "grad_norm": 0.9608978033065796, "learning_rate": 8.144094847241223e-05, "loss": 0.3598, "step": 407 }, { "epoch": 0.19, "grad_norm": 0.9120776057243347, "learning_rate": 8.139534883720931e-05, "loss": 0.343, "step": 408 }, { "epoch": 0.19, "grad_norm": 1.028601884841919, "learning_rate": 8.134974920200639e-05, "loss": 0.3827, "step": 409 }, { "epoch": 0.19, "grad_norm": 0.9658374786376953, "learning_rate": 8.130414956680346e-05, "loss": 0.3595, "step": 410 }, { "epoch": 0.19, "eval_loss": 0.38228359818458557, "eval_runtime": 18.8049, "eval_samples_per_second": 1.489, "eval_steps_per_second": 0.372, "step": 410 }, { "epoch": 0.19, "grad_norm": 0.9547862410545349, "learning_rate": 8.125854993160054e-05, "loss": 0.378, "step": 411 }, { "epoch": 0.19, "grad_norm": 0.9772255420684814, "learning_rate": 8.121295029639762e-05, "loss": 0.3727, "step": 412 }, { "epoch": 0.19, "grad_norm": 0.9499548673629761, "learning_rate": 8.116735066119472e-05, "loss": 0.366, "step": 413 }, { "epoch": 0.19, "grad_norm": 0.9336138963699341, "learning_rate": 8.11217510259918e-05, "loss": 0.3429, "step": 414 }, { "epoch": 0.19, "grad_norm": 0.9108003377914429, "learning_rate": 8.107615139078888e-05, "loss": 0.3384, "step": 415 }, { "epoch": 0.19, "grad_norm": 0.9173622131347656, "learning_rate": 8.103055175558596e-05, "loss": 0.3577, "step": 416 }, { "epoch": 0.19, "grad_norm": 0.9635920524597168, "learning_rate": 8.098495212038303e-05, "loss": 0.3695, "step": 417 }, { "epoch": 0.19, "grad_norm": 0.9141045808792114, "learning_rate": 8.093935248518011e-05, "loss": 0.3538, "step": 418 }, { "epoch": 0.19, "grad_norm": 0.990811824798584, "learning_rate": 8.08937528499772e-05, "loss": 0.3604, "step": 419 }, { "epoch": 0.19, "grad_norm": 0.9809904098510742, "learning_rate": 8.084815321477428e-05, "loss": 0.345, "step": 420 }, { "epoch": 0.19, "eval_loss": 0.38126489520072937, "eval_runtime": 19.1235, "eval_samples_per_second": 1.464, "eval_steps_per_second": 0.366, "step": 420 }, { "epoch": 0.19, "grad_norm": 0.9208155870437622, "learning_rate": 8.080255357957137e-05, "loss": 0.3682, "step": 421 }, { "epoch": 0.19, "grad_norm": 1.1504791975021362, "learning_rate": 8.075695394436846e-05, "loss": 0.362, "step": 422 }, { "epoch": 0.19, "grad_norm": 0.9805225133895874, "learning_rate": 8.071135430916554e-05, "loss": 0.3646, "step": 423 }, { "epoch": 0.19, "grad_norm": 1.0297439098358154, "learning_rate": 8.066575467396262e-05, "loss": 0.383, "step": 424 }, { "epoch": 0.19, "grad_norm": 0.9397987127304077, "learning_rate": 8.062015503875969e-05, "loss": 0.3735, "step": 425 }, { "epoch": 0.19, "grad_norm": 0.9980968832969666, "learning_rate": 8.057455540355677e-05, "loss": 0.3658, "step": 426 }, { "epoch": 0.19, "grad_norm": 1.0988585948944092, "learning_rate": 8.052895576835385e-05, "loss": 0.3752, "step": 427 }, { "epoch": 0.2, "grad_norm": 0.9558773040771484, "learning_rate": 8.048335613315093e-05, "loss": 0.3613, "step": 428 }, { "epoch": 0.2, "grad_norm": 0.9506344199180603, "learning_rate": 8.043775649794803e-05, "loss": 0.3397, "step": 429 }, { "epoch": 0.2, "grad_norm": 0.9391772150993347, "learning_rate": 8.039215686274511e-05, "loss": 0.3618, "step": 430 }, { "epoch": 0.2, "eval_loss": 0.3807036578655243, "eval_runtime": 18.5019, "eval_samples_per_second": 1.513, "eval_steps_per_second": 0.378, "step": 430 }, { "epoch": 0.2, "grad_norm": 0.9951292872428894, "learning_rate": 8.034655722754219e-05, "loss": 0.3525, "step": 431 }, { "epoch": 0.2, "grad_norm": 1.022258996963501, "learning_rate": 8.030095759233926e-05, "loss": 0.3954, "step": 432 }, { "epoch": 0.2, "grad_norm": 0.9829654693603516, "learning_rate": 8.025535795713634e-05, "loss": 0.359, "step": 433 }, { "epoch": 0.2, "grad_norm": 0.9890940189361572, "learning_rate": 8.020975832193342e-05, "loss": 0.3812, "step": 434 }, { "epoch": 0.2, "grad_norm": 0.91367107629776, "learning_rate": 8.01641586867305e-05, "loss": 0.361, "step": 435 }, { "epoch": 0.2, "grad_norm": 0.9756280779838562, "learning_rate": 8.011855905152759e-05, "loss": 0.3588, "step": 436 }, { "epoch": 0.2, "grad_norm": 1.0047883987426758, "learning_rate": 8.007295941632468e-05, "loss": 0.3769, "step": 437 }, { "epoch": 0.2, "grad_norm": 0.9199777841567993, "learning_rate": 8.002735978112176e-05, "loss": 0.3645, "step": 438 }, { "epoch": 0.2, "grad_norm": 0.9628649950027466, "learning_rate": 7.998176014591884e-05, "loss": 0.3631, "step": 439 }, { "epoch": 0.2, "grad_norm": 0.888653039932251, "learning_rate": 7.993616051071591e-05, "loss": 0.3424, "step": 440 }, { "epoch": 0.2, "eval_loss": 0.3802674114704132, "eval_runtime": 20.1434, "eval_samples_per_second": 1.39, "eval_steps_per_second": 0.348, "step": 440 }, { "epoch": 0.2, "grad_norm": 0.9911215305328369, "learning_rate": 7.9890560875513e-05, "loss": 0.3562, "step": 441 }, { "epoch": 0.2, "grad_norm": 0.9457067251205444, "learning_rate": 7.984496124031008e-05, "loss": 0.3718, "step": 442 }, { "epoch": 0.2, "grad_norm": 0.9514966011047363, "learning_rate": 7.979936160510716e-05, "loss": 0.358, "step": 443 }, { "epoch": 0.2, "grad_norm": 0.949561357498169, "learning_rate": 7.975376196990424e-05, "loss": 0.3769, "step": 444 }, { "epoch": 0.2, "grad_norm": 0.9274330735206604, "learning_rate": 7.970816233470134e-05, "loss": 0.3638, "step": 445 }, { "epoch": 0.2, "grad_norm": 0.9469530582427979, "learning_rate": 7.966256269949842e-05, "loss": 0.3661, "step": 446 }, { "epoch": 0.2, "grad_norm": 0.9310869574546814, "learning_rate": 7.961696306429548e-05, "loss": 0.353, "step": 447 }, { "epoch": 0.2, "grad_norm": 0.961445152759552, "learning_rate": 7.957136342909257e-05, "loss": 0.3463, "step": 448 }, { "epoch": 0.2, "grad_norm": 0.9427098035812378, "learning_rate": 7.952576379388965e-05, "loss": 0.3545, "step": 449 }, { "epoch": 0.21, "grad_norm": 0.8861632943153381, "learning_rate": 7.948016415868673e-05, "loss": 0.3308, "step": 450 }, { "epoch": 0.21, "eval_loss": 0.37948015332221985, "eval_runtime": 19.2271, "eval_samples_per_second": 1.456, "eval_steps_per_second": 0.364, "step": 450 }, { "epoch": 0.21, "grad_norm": 0.9233769178390503, "learning_rate": 7.943456452348381e-05, "loss": 0.3498, "step": 451 }, { "epoch": 0.21, "grad_norm": 0.9102416634559631, "learning_rate": 7.93889648882809e-05, "loss": 0.3504, "step": 452 }, { "epoch": 0.21, "grad_norm": 0.9856906533241272, "learning_rate": 7.934336525307799e-05, "loss": 0.3707, "step": 453 }, { "epoch": 0.21, "grad_norm": 0.9343120455741882, "learning_rate": 7.929776561787506e-05, "loss": 0.341, "step": 454 }, { "epoch": 0.21, "grad_norm": 1.006124496459961, "learning_rate": 7.925216598267214e-05, "loss": 0.364, "step": 455 }, { "epoch": 0.21, "grad_norm": 0.8969525694847107, "learning_rate": 7.920656634746922e-05, "loss": 0.3473, "step": 456 }, { "epoch": 0.21, "grad_norm": 0.978102445602417, "learning_rate": 7.91609667122663e-05, "loss": 0.3927, "step": 457 }, { "epoch": 0.21, "grad_norm": 1.060112714767456, "learning_rate": 7.911536707706338e-05, "loss": 0.3989, "step": 458 }, { "epoch": 0.21, "grad_norm": 0.9657124280929565, "learning_rate": 7.906976744186047e-05, "loss": 0.3926, "step": 459 }, { "epoch": 0.21, "grad_norm": 0.9384371638298035, "learning_rate": 7.902416780665755e-05, "loss": 0.3768, "step": 460 }, { "epoch": 0.21, "eval_loss": 0.37937769293785095, "eval_runtime": 18.5987, "eval_samples_per_second": 1.505, "eval_steps_per_second": 0.376, "step": 460 }, { "epoch": 0.21, "grad_norm": 0.953213632106781, "learning_rate": 7.897856817145464e-05, "loss": 0.3659, "step": 461 }, { "epoch": 0.21, "grad_norm": 0.8762606382369995, "learning_rate": 7.893296853625171e-05, "loss": 0.3213, "step": 462 }, { "epoch": 0.21, "grad_norm": 0.9968709945678711, "learning_rate": 7.888736890104879e-05, "loss": 0.3897, "step": 463 }, { "epoch": 0.21, "grad_norm": 0.9149399399757385, "learning_rate": 7.884176926584587e-05, "loss": 0.3633, "step": 464 }, { "epoch": 0.21, "grad_norm": 0.9021903276443481, "learning_rate": 7.879616963064296e-05, "loss": 0.3586, "step": 465 }, { "epoch": 0.21, "grad_norm": 0.8712608814239502, "learning_rate": 7.875056999544004e-05, "loss": 0.3425, "step": 466 }, { "epoch": 0.21, "grad_norm": 0.9100139141082764, "learning_rate": 7.870497036023712e-05, "loss": 0.3693, "step": 467 }, { "epoch": 0.21, "grad_norm": 0.9456149339675903, "learning_rate": 7.86593707250342e-05, "loss": 0.3745, "step": 468 }, { "epoch": 0.21, "grad_norm": 0.8988533616065979, "learning_rate": 7.861377108983128e-05, "loss": 0.3434, "step": 469 }, { "epoch": 0.21, "grad_norm": 0.8404950499534607, "learning_rate": 7.856817145462836e-05, "loss": 0.343, "step": 470 }, { "epoch": 0.21, "eval_loss": 0.37977489829063416, "eval_runtime": 19.6011, "eval_samples_per_second": 1.428, "eval_steps_per_second": 0.357, "step": 470 }, { "epoch": 0.21, "grad_norm": 0.9720378518104553, "learning_rate": 7.852257181942545e-05, "loss": 0.3644, "step": 471 }, { "epoch": 0.22, "grad_norm": 1.0149002075195312, "learning_rate": 7.847697218422253e-05, "loss": 0.3705, "step": 472 }, { "epoch": 0.22, "grad_norm": 0.9858806133270264, "learning_rate": 7.843137254901961e-05, "loss": 0.3941, "step": 473 }, { "epoch": 0.22, "grad_norm": 0.9632289409637451, "learning_rate": 7.838577291381669e-05, "loss": 0.3725, "step": 474 }, { "epoch": 0.22, "grad_norm": 0.9068118333816528, "learning_rate": 7.834017327861377e-05, "loss": 0.3415, "step": 475 }, { "epoch": 0.22, "grad_norm": 0.9591249823570251, "learning_rate": 7.829457364341086e-05, "loss": 0.3561, "step": 476 }, { "epoch": 0.22, "grad_norm": 0.9359074234962463, "learning_rate": 7.824897400820794e-05, "loss": 0.3541, "step": 477 }, { "epoch": 0.22, "grad_norm": 0.9554955959320068, "learning_rate": 7.820337437300502e-05, "loss": 0.3755, "step": 478 }, { "epoch": 0.22, "grad_norm": 1.0127731561660767, "learning_rate": 7.81577747378021e-05, "loss": 0.3832, "step": 479 }, { "epoch": 0.22, "grad_norm": 0.9365389943122864, "learning_rate": 7.811217510259918e-05, "loss": 0.3706, "step": 480 }, { "epoch": 0.22, "eval_loss": 0.379183828830719, "eval_runtime": 18.8228, "eval_samples_per_second": 1.488, "eval_steps_per_second": 0.372, "step": 480 }, { "epoch": 0.22, "grad_norm": 0.9115477204322815, "learning_rate": 7.806657546739626e-05, "loss": 0.3645, "step": 481 }, { "epoch": 0.22, "grad_norm": 0.8891696929931641, "learning_rate": 7.802097583219335e-05, "loss": 0.3554, "step": 482 }, { "epoch": 0.22, "grad_norm": 0.8747963905334473, "learning_rate": 7.797537619699043e-05, "loss": 0.3399, "step": 483 }, { "epoch": 0.22, "grad_norm": 0.8941763043403625, "learning_rate": 7.792977656178751e-05, "loss": 0.3535, "step": 484 }, { "epoch": 0.22, "grad_norm": 0.9057623147964478, "learning_rate": 7.788417692658459e-05, "loss": 0.365, "step": 485 }, { "epoch": 0.22, "grad_norm": 0.9394456148147583, "learning_rate": 7.783857729138167e-05, "loss": 0.3496, "step": 486 }, { "epoch": 0.22, "grad_norm": 1.0419434309005737, "learning_rate": 7.779297765617875e-05, "loss": 0.3576, "step": 487 }, { "epoch": 0.22, "grad_norm": 0.9206737279891968, "learning_rate": 7.774737802097584e-05, "loss": 0.3609, "step": 488 }, { "epoch": 0.22, "grad_norm": 0.9505032300949097, "learning_rate": 7.770177838577292e-05, "loss": 0.3806, "step": 489 }, { "epoch": 0.22, "grad_norm": 0.9697739481925964, "learning_rate": 7.765617875057e-05, "loss": 0.369, "step": 490 }, { "epoch": 0.22, "eval_loss": 0.37793412804603577, "eval_runtime": 18.9214, "eval_samples_per_second": 1.48, "eval_steps_per_second": 0.37, "step": 490 }, { "epoch": 0.22, "grad_norm": 0.8664564490318298, "learning_rate": 7.761057911536708e-05, "loss": 0.3509, "step": 491 }, { "epoch": 0.22, "grad_norm": 0.8611375093460083, "learning_rate": 7.756497948016416e-05, "loss": 0.3294, "step": 492 }, { "epoch": 0.22, "grad_norm": 0.9530375003814697, "learning_rate": 7.751937984496124e-05, "loss": 0.3566, "step": 493 }, { "epoch": 0.23, "grad_norm": 0.9074346423149109, "learning_rate": 7.747378020975833e-05, "loss": 0.3635, "step": 494 }, { "epoch": 0.23, "grad_norm": 0.8907040953636169, "learning_rate": 7.742818057455541e-05, "loss": 0.364, "step": 495 }, { "epoch": 0.23, "grad_norm": 0.9278532266616821, "learning_rate": 7.738258093935249e-05, "loss": 0.3563, "step": 496 }, { "epoch": 0.23, "grad_norm": 0.956122875213623, "learning_rate": 7.733698130414957e-05, "loss": 0.3599, "step": 497 }, { "epoch": 0.23, "grad_norm": 0.9482458233833313, "learning_rate": 7.729138166894665e-05, "loss": 0.3626, "step": 498 }, { "epoch": 0.23, "grad_norm": 1.004625678062439, "learning_rate": 7.724578203374374e-05, "loss": 0.3884, "step": 499 }, { "epoch": 0.23, "grad_norm": 0.9188464283943176, "learning_rate": 7.720018239854082e-05, "loss": 0.3553, "step": 500 }, { "epoch": 0.23, "eval_loss": 0.37794917821884155, "eval_runtime": 18.631, "eval_samples_per_second": 1.503, "eval_steps_per_second": 0.376, "step": 500 }, { "epoch": 0.23, "grad_norm": 0.8780580759048462, "learning_rate": 7.71545827633379e-05, "loss": 0.3404, "step": 501 }, { "epoch": 0.23, "grad_norm": 0.8963892459869385, "learning_rate": 7.710898312813498e-05, "loss": 0.3406, "step": 502 }, { "epoch": 0.23, "grad_norm": 1.0074235200881958, "learning_rate": 7.706338349293206e-05, "loss": 0.3795, "step": 503 }, { "epoch": 0.23, "grad_norm": 0.9363308548927307, "learning_rate": 7.701778385772914e-05, "loss": 0.3524, "step": 504 }, { "epoch": 0.23, "grad_norm": 0.854862630367279, "learning_rate": 7.697218422252623e-05, "loss": 0.3351, "step": 505 }, { "epoch": 0.23, "grad_norm": 0.9411963224411011, "learning_rate": 7.692658458732331e-05, "loss": 0.365, "step": 506 }, { "epoch": 0.23, "grad_norm": 0.9519062042236328, "learning_rate": 7.688098495212039e-05, "loss": 0.3606, "step": 507 }, { "epoch": 0.23, "grad_norm": 0.936029314994812, "learning_rate": 7.683538531691746e-05, "loss": 0.3773, "step": 508 }, { "epoch": 0.23, "grad_norm": 0.8771556615829468, "learning_rate": 7.678978568171455e-05, "loss": 0.3492, "step": 509 }, { "epoch": 0.23, "grad_norm": 0.878050684928894, "learning_rate": 7.674418604651163e-05, "loss": 0.3503, "step": 510 }, { "epoch": 0.23, "eval_loss": 0.3781495690345764, "eval_runtime": 19.9333, "eval_samples_per_second": 1.405, "eval_steps_per_second": 0.351, "step": 510 }, { "epoch": 0.23, "grad_norm": 0.9384998083114624, "learning_rate": 7.669858641130872e-05, "loss": 0.3682, "step": 511 }, { "epoch": 0.23, "grad_norm": 0.923982560634613, "learning_rate": 7.66529867761058e-05, "loss": 0.3583, "step": 512 }, { "epoch": 0.23, "grad_norm": 0.8898745775222778, "learning_rate": 7.660738714090288e-05, "loss": 0.3767, "step": 513 }, { "epoch": 0.23, "grad_norm": 0.9237766861915588, "learning_rate": 7.656178750569996e-05, "loss": 0.369, "step": 514 }, { "epoch": 0.23, "grad_norm": 1.0031055212020874, "learning_rate": 7.651618787049703e-05, "loss": 0.3662, "step": 515 }, { "epoch": 0.24, "grad_norm": 0.9065768122673035, "learning_rate": 7.647058823529411e-05, "loss": 0.3557, "step": 516 }, { "epoch": 0.24, "grad_norm": 0.9564458131790161, "learning_rate": 7.64249886000912e-05, "loss": 0.3621, "step": 517 }, { "epoch": 0.24, "grad_norm": 0.9868760704994202, "learning_rate": 7.637938896488829e-05, "loss": 0.3492, "step": 518 }, { "epoch": 0.24, "grad_norm": 1.0082308053970337, "learning_rate": 7.633378932968537e-05, "loss": 0.3833, "step": 519 }, { "epoch": 0.24, "grad_norm": 0.903857409954071, "learning_rate": 7.628818969448245e-05, "loss": 0.3633, "step": 520 }, { "epoch": 0.24, "eval_loss": 0.3774409294128418, "eval_runtime": 19.0237, "eval_samples_per_second": 1.472, "eval_steps_per_second": 0.368, "step": 520 }, { "epoch": 0.24, "grad_norm": 0.8547798991203308, "learning_rate": 7.624259005927953e-05, "loss": 0.3611, "step": 521 }, { "epoch": 0.24, "grad_norm": 0.9237222075462341, "learning_rate": 7.619699042407661e-05, "loss": 0.3652, "step": 522 }, { "epoch": 0.24, "grad_norm": 0.8383327722549438, "learning_rate": 7.615139078887368e-05, "loss": 0.3571, "step": 523 }, { "epoch": 0.24, "grad_norm": 0.994269073009491, "learning_rate": 7.610579115367076e-05, "loss": 0.37, "step": 524 }, { "epoch": 0.24, "grad_norm": 0.872664749622345, "learning_rate": 7.606019151846786e-05, "loss": 0.362, "step": 525 }, { "epoch": 0.24, "grad_norm": 0.9679466485977173, "learning_rate": 7.601459188326494e-05, "loss": 0.3581, "step": 526 }, { "epoch": 0.24, "grad_norm": 0.9206968545913696, "learning_rate": 7.596899224806202e-05, "loss": 0.3525, "step": 527 }, { "epoch": 0.24, "grad_norm": 0.915579617023468, "learning_rate": 7.59233926128591e-05, "loss": 0.346, "step": 528 }, { "epoch": 0.24, "grad_norm": 0.942768394947052, "learning_rate": 7.587779297765619e-05, "loss": 0.3708, "step": 529 }, { "epoch": 0.24, "grad_norm": 0.948949933052063, "learning_rate": 7.583219334245326e-05, "loss": 0.342, "step": 530 }, { "epoch": 0.24, "eval_loss": 0.37704339623451233, "eval_runtime": 19.4572, "eval_samples_per_second": 1.439, "eval_steps_per_second": 0.36, "step": 530 }, { "epoch": 0.24, "grad_norm": 0.8952234983444214, "learning_rate": 7.578659370725034e-05, "loss": 0.3672, "step": 531 }, { "epoch": 0.24, "grad_norm": 0.9527508616447449, "learning_rate": 7.574099407204742e-05, "loss": 0.3652, "step": 532 }, { "epoch": 0.24, "grad_norm": 0.9585651755332947, "learning_rate": 7.569539443684451e-05, "loss": 0.3725, "step": 533 }, { "epoch": 0.24, "grad_norm": 0.8955938816070557, "learning_rate": 7.56497948016416e-05, "loss": 0.3429, "step": 534 }, { "epoch": 0.24, "grad_norm": 0.9243647456169128, "learning_rate": 7.560419516643868e-05, "loss": 0.3494, "step": 535 }, { "epoch": 0.24, "grad_norm": 0.914239227771759, "learning_rate": 7.555859553123576e-05, "loss": 0.3741, "step": 536 }, { "epoch": 0.24, "grad_norm": 0.8691146373748779, "learning_rate": 7.551299589603284e-05, "loss": 0.3526, "step": 537 }, { "epoch": 0.25, "grad_norm": 0.9309206604957581, "learning_rate": 7.546739626082991e-05, "loss": 0.3747, "step": 538 }, { "epoch": 0.25, "grad_norm": 0.8218508362770081, "learning_rate": 7.542179662562699e-05, "loss": 0.3519, "step": 539 }, { "epoch": 0.25, "grad_norm": 1.1142666339874268, "learning_rate": 7.537619699042407e-05, "loss": 0.3545, "step": 540 }, { "epoch": 0.25, "eval_loss": 0.3766672909259796, "eval_runtime": 18.6678, "eval_samples_per_second": 1.5, "eval_steps_per_second": 0.375, "step": 540 }, { "epoch": 0.25, "grad_norm": 0.9692361354827881, "learning_rate": 7.533059735522117e-05, "loss": 0.3895, "step": 541 }, { "epoch": 0.25, "grad_norm": 0.9041081070899963, "learning_rate": 7.528499772001825e-05, "loss": 0.3583, "step": 542 }, { "epoch": 0.25, "grad_norm": 0.9008587002754211, "learning_rate": 7.523939808481533e-05, "loss": 0.3498, "step": 543 }, { "epoch": 0.25, "grad_norm": 1.0266526937484741, "learning_rate": 7.519379844961241e-05, "loss": 0.3612, "step": 544 }, { "epoch": 0.25, "grad_norm": 1.0442652702331543, "learning_rate": 7.514819881440948e-05, "loss": 0.3806, "step": 545 }, { "epoch": 0.25, "grad_norm": 0.8736815452575684, "learning_rate": 7.510259917920656e-05, "loss": 0.3163, "step": 546 }, { "epoch": 0.25, "grad_norm": 0.8623315691947937, "learning_rate": 7.505699954400364e-05, "loss": 0.3701, "step": 547 }, { "epoch": 0.25, "grad_norm": 0.9528596997261047, "learning_rate": 7.501139990880073e-05, "loss": 0.3426, "step": 548 }, { "epoch": 0.25, "grad_norm": 0.8725919127464294, "learning_rate": 7.496580027359782e-05, "loss": 0.3552, "step": 549 }, { "epoch": 0.25, "grad_norm": 0.9121567010879517, "learning_rate": 7.49202006383949e-05, "loss": 0.3616, "step": 550 }, { "epoch": 0.25, "eval_loss": 0.37652093172073364, "eval_runtime": 19.7078, "eval_samples_per_second": 1.421, "eval_steps_per_second": 0.355, "step": 550 }, { "epoch": 0.25, "grad_norm": 0.916845440864563, "learning_rate": 7.487460100319199e-05, "loss": 0.3626, "step": 551 }, { "epoch": 0.25, "grad_norm": 0.8767221570014954, "learning_rate": 7.482900136798905e-05, "loss": 0.361, "step": 552 }, { "epoch": 0.25, "grad_norm": 0.8678212761878967, "learning_rate": 7.478340173278614e-05, "loss": 0.3508, "step": 553 }, { "epoch": 0.25, "grad_norm": 0.9074848294258118, "learning_rate": 7.473780209758322e-05, "loss": 0.3555, "step": 554 }, { "epoch": 0.25, "grad_norm": 0.9501338005065918, "learning_rate": 7.46922024623803e-05, "loss": 0.3662, "step": 555 }, { "epoch": 0.25, "grad_norm": 0.9243327379226685, "learning_rate": 7.464660282717738e-05, "loss": 0.3767, "step": 556 }, { "epoch": 0.25, "grad_norm": 0.9555984139442444, "learning_rate": 7.460100319197448e-05, "loss": 0.3614, "step": 557 }, { "epoch": 0.25, "grad_norm": 0.9904035329818726, "learning_rate": 7.455540355677156e-05, "loss": 0.343, "step": 558 }, { "epoch": 0.25, "grad_norm": 0.9084532260894775, "learning_rate": 7.450980392156864e-05, "loss": 0.3734, "step": 559 }, { "epoch": 0.26, "grad_norm": 0.9660534858703613, "learning_rate": 7.446420428636571e-05, "loss": 0.3912, "step": 560 }, { "epoch": 0.26, "eval_loss": 0.37544146180152893, "eval_runtime": 20.1661, "eval_samples_per_second": 1.388, "eval_steps_per_second": 0.347, "step": 560 }, { "epoch": 0.26, "grad_norm": 0.8810905814170837, "learning_rate": 7.441860465116279e-05, "loss": 0.3565, "step": 561 }, { "epoch": 0.26, "grad_norm": 0.9330177307128906, "learning_rate": 7.437300501595987e-05, "loss": 0.3797, "step": 562 }, { "epoch": 0.26, "grad_norm": 0.9678060412406921, "learning_rate": 7.432740538075695e-05, "loss": 0.3512, "step": 563 }, { "epoch": 0.26, "grad_norm": 0.919135332107544, "learning_rate": 7.428180574555403e-05, "loss": 0.3739, "step": 564 }, { "epoch": 0.26, "grad_norm": 0.9367424845695496, "learning_rate": 7.423620611035113e-05, "loss": 0.3912, "step": 565 }, { "epoch": 0.26, "grad_norm": 0.8687885999679565, "learning_rate": 7.419060647514821e-05, "loss": 0.3517, "step": 566 }, { "epoch": 0.26, "grad_norm": 0.9798654913902283, "learning_rate": 7.414500683994528e-05, "loss": 0.3538, "step": 567 }, { "epoch": 0.26, "grad_norm": 0.9142093658447266, "learning_rate": 7.409940720474236e-05, "loss": 0.3614, "step": 568 }, { "epoch": 0.26, "grad_norm": 0.9957043528556824, "learning_rate": 7.405380756953944e-05, "loss": 0.3635, "step": 569 }, { "epoch": 0.26, "grad_norm": 0.9421626329421997, "learning_rate": 7.400820793433652e-05, "loss": 0.374, "step": 570 }, { "epoch": 0.26, "eval_loss": 0.37424519658088684, "eval_runtime": 19.7385, "eval_samples_per_second": 1.419, "eval_steps_per_second": 0.355, "step": 570 }, { "epoch": 0.26, "grad_norm": 0.9157000780105591, "learning_rate": 7.39626082991336e-05, "loss": 0.3695, "step": 571 }, { "epoch": 0.26, "grad_norm": 0.9190824031829834, "learning_rate": 7.391700866393069e-05, "loss": 0.3661, "step": 572 }, { "epoch": 0.26, "grad_norm": 0.8940838575363159, "learning_rate": 7.387140902872778e-05, "loss": 0.3399, "step": 573 }, { "epoch": 0.26, "grad_norm": 0.9431096911430359, "learning_rate": 7.382580939352486e-05, "loss": 0.378, "step": 574 }, { "epoch": 0.26, "grad_norm": 0.9267783761024475, "learning_rate": 7.378020975832193e-05, "loss": 0.3752, "step": 575 }, { "epoch": 0.26, "grad_norm": 0.895793616771698, "learning_rate": 7.373461012311901e-05, "loss": 0.3676, "step": 576 }, { "epoch": 0.26, "grad_norm": 0.9508955478668213, "learning_rate": 7.36890104879161e-05, "loss": 0.3461, "step": 577 }, { "epoch": 0.26, "grad_norm": 0.9015037417411804, "learning_rate": 7.364341085271318e-05, "loss": 0.3689, "step": 578 }, { "epoch": 0.26, "grad_norm": 0.9231910109519958, "learning_rate": 7.359781121751026e-05, "loss": 0.3495, "step": 579 }, { "epoch": 0.26, "grad_norm": 0.902377188205719, "learning_rate": 7.355221158230734e-05, "loss": 0.3802, "step": 580 }, { "epoch": 0.26, "eval_loss": 0.37441423535346985, "eval_runtime": 22.0055, "eval_samples_per_second": 1.272, "eval_steps_per_second": 0.318, "step": 580 }, { "epoch": 0.26, "grad_norm": 0.859978973865509, "learning_rate": 7.350661194710444e-05, "loss": 0.3719, "step": 581 }, { "epoch": 0.27, "grad_norm": 0.9164068102836609, "learning_rate": 7.34610123119015e-05, "loss": 0.3728, "step": 582 }, { "epoch": 0.27, "grad_norm": 0.9238768219947815, "learning_rate": 7.341541267669859e-05, "loss": 0.3574, "step": 583 }, { "epoch": 0.27, "grad_norm": 0.8287404775619507, "learning_rate": 7.336981304149567e-05, "loss": 0.3387, "step": 584 }, { "epoch": 0.27, "grad_norm": 0.8957213163375854, "learning_rate": 7.332421340629275e-05, "loss": 0.3327, "step": 585 }, { "epoch": 0.27, "grad_norm": 0.9304167628288269, "learning_rate": 7.327861377108983e-05, "loss": 0.371, "step": 586 }, { "epoch": 0.27, "grad_norm": 0.8944963216781616, "learning_rate": 7.323301413588691e-05, "loss": 0.3484, "step": 587 }, { "epoch": 0.27, "grad_norm": 0.9239458441734314, "learning_rate": 7.3187414500684e-05, "loss": 0.4004, "step": 588 }, { "epoch": 0.27, "grad_norm": 0.8646803498268127, "learning_rate": 7.314181486548109e-05, "loss": 0.3641, "step": 589 }, { "epoch": 0.27, "grad_norm": 0.8503308296203613, "learning_rate": 7.309621523027816e-05, "loss": 0.3579, "step": 590 }, { "epoch": 0.27, "eval_loss": 0.37435466051101685, "eval_runtime": 20.8667, "eval_samples_per_second": 1.342, "eval_steps_per_second": 0.335, "step": 590 }, { "epoch": 0.27, "grad_norm": 1.0227876901626587, "learning_rate": 7.305061559507524e-05, "loss": 0.3305, "step": 591 }, { "epoch": 0.27, "grad_norm": 0.8426918387413025, "learning_rate": 7.300501595987232e-05, "loss": 0.3278, "step": 592 }, { "epoch": 0.27, "grad_norm": 0.9567776918411255, "learning_rate": 7.29594163246694e-05, "loss": 0.3607, "step": 593 }, { "epoch": 0.27, "grad_norm": 0.8379486203193665, "learning_rate": 7.291381668946649e-05, "loss": 0.3262, "step": 594 }, { "epoch": 0.27, "grad_norm": 1.170776605606079, "learning_rate": 7.286821705426357e-05, "loss": 0.3802, "step": 595 }, { "epoch": 0.27, "grad_norm": 0.9064086079597473, "learning_rate": 7.282261741906065e-05, "loss": 0.3625, "step": 596 }, { "epoch": 0.27, "grad_norm": 0.89022296667099, "learning_rate": 7.277701778385773e-05, "loss": 0.3563, "step": 597 }, { "epoch": 0.27, "grad_norm": 0.9683102369308472, "learning_rate": 7.273141814865481e-05, "loss": 0.3595, "step": 598 }, { "epoch": 0.27, "grad_norm": 0.9998959898948669, "learning_rate": 7.26858185134519e-05, "loss": 0.3735, "step": 599 }, { "epoch": 0.27, "grad_norm": 0.8802072405815125, "learning_rate": 7.264021887824898e-05, "loss": 0.3479, "step": 600 }, { "epoch": 0.27, "eval_loss": 0.37338584661483765, "eval_runtime": 21.634, "eval_samples_per_second": 1.294, "eval_steps_per_second": 0.324, "step": 600 }, { "epoch": 0.27, "grad_norm": 0.8717195987701416, "learning_rate": 7.259461924304606e-05, "loss": 0.3522, "step": 601 }, { "epoch": 0.27, "grad_norm": 0.9232763648033142, "learning_rate": 7.254901960784314e-05, "loss": 0.3492, "step": 602 }, { "epoch": 0.27, "grad_norm": 0.9206188321113586, "learning_rate": 7.250341997264022e-05, "loss": 0.3572, "step": 603 }, { "epoch": 0.28, "grad_norm": 0.9413841962814331, "learning_rate": 7.24578203374373e-05, "loss": 0.3645, "step": 604 }, { "epoch": 0.28, "grad_norm": 0.9230554699897766, "learning_rate": 7.241222070223439e-05, "loss": 0.3594, "step": 605 }, { "epoch": 0.28, "grad_norm": 0.9277369379997253, "learning_rate": 7.236662106703147e-05, "loss": 0.3661, "step": 606 }, { "epoch": 0.28, "grad_norm": 0.8822442889213562, "learning_rate": 7.232102143182855e-05, "loss": 0.3533, "step": 607 }, { "epoch": 0.28, "grad_norm": 0.8277364373207092, "learning_rate": 7.227542179662563e-05, "loss": 0.3329, "step": 608 }, { "epoch": 0.28, "grad_norm": 0.8693709969520569, "learning_rate": 7.222982216142271e-05, "loss": 0.3421, "step": 609 }, { "epoch": 0.28, "grad_norm": 0.9155862331390381, "learning_rate": 7.21842225262198e-05, "loss": 0.3749, "step": 610 }, { "epoch": 0.28, "eval_loss": 0.37279894948005676, "eval_runtime": 30.1926, "eval_samples_per_second": 0.927, "eval_steps_per_second": 0.232, "step": 610 }, { "epoch": 0.28, "grad_norm": 0.9259756803512573, "learning_rate": 7.213862289101688e-05, "loss": 0.3748, "step": 611 }, { "epoch": 0.28, "grad_norm": 0.9580633044242859, "learning_rate": 7.209302325581396e-05, "loss": 0.3528, "step": 612 }, { "epoch": 0.28, "grad_norm": 0.8764567971229553, "learning_rate": 7.204742362061104e-05, "loss": 0.3562, "step": 613 }, { "epoch": 0.28, "grad_norm": 0.8910514116287231, "learning_rate": 7.200182398540812e-05, "loss": 0.3507, "step": 614 }, { "epoch": 0.28, "grad_norm": 1.0229401588439941, "learning_rate": 7.19562243502052e-05, "loss": 0.3837, "step": 615 }, { "epoch": 0.28, "grad_norm": 0.8673608899116516, "learning_rate": 7.191062471500228e-05, "loss": 0.3548, "step": 616 }, { "epoch": 0.28, "grad_norm": 0.8642386794090271, "learning_rate": 7.186502507979937e-05, "loss": 0.3409, "step": 617 }, { "epoch": 0.28, "grad_norm": 0.9027450680732727, "learning_rate": 7.181942544459645e-05, "loss": 0.3605, "step": 618 }, { "epoch": 0.28, "grad_norm": 0.950926661491394, "learning_rate": 7.177382580939353e-05, "loss": 0.3833, "step": 619 }, { "epoch": 0.28, "grad_norm": 0.910386323928833, "learning_rate": 7.172822617419061e-05, "loss": 0.3639, "step": 620 }, { "epoch": 0.28, "eval_loss": 0.37247234582901, "eval_runtime": 29.593, "eval_samples_per_second": 0.946, "eval_steps_per_second": 0.237, "step": 620 }, { "epoch": 0.28, "grad_norm": 0.9070430397987366, "learning_rate": 7.168262653898769e-05, "loss": 0.3685, "step": 621 }, { "epoch": 0.28, "grad_norm": 0.9218928813934326, "learning_rate": 7.163702690378477e-05, "loss": 0.3657, "step": 622 }, { "epoch": 0.28, "grad_norm": 0.87645423412323, "learning_rate": 7.159142726858186e-05, "loss": 0.3557, "step": 623 }, { "epoch": 0.28, "grad_norm": 0.9534425139427185, "learning_rate": 7.154582763337894e-05, "loss": 0.3471, "step": 624 }, { "epoch": 0.28, "grad_norm": 0.8581913709640503, "learning_rate": 7.150022799817602e-05, "loss": 0.358, "step": 625 }, { "epoch": 0.29, "grad_norm": 0.8720976114273071, "learning_rate": 7.14546283629731e-05, "loss": 0.3455, "step": 626 }, { "epoch": 0.29, "grad_norm": 0.8933624625205994, "learning_rate": 7.140902872777018e-05, "loss": 0.3512, "step": 627 }, { "epoch": 0.29, "grad_norm": 0.8910596966743469, "learning_rate": 7.136342909256725e-05, "loss": 0.368, "step": 628 }, { "epoch": 0.29, "grad_norm": 0.865628719329834, "learning_rate": 7.131782945736435e-05, "loss": 0.3528, "step": 629 }, { "epoch": 0.29, "grad_norm": 0.905426025390625, "learning_rate": 7.127222982216143e-05, "loss": 0.374, "step": 630 }, { "epoch": 0.29, "eval_loss": 0.37291833758354187, "eval_runtime": 22.3333, "eval_samples_per_second": 1.254, "eval_steps_per_second": 0.313, "step": 630 }, { "epoch": 0.29, "grad_norm": 0.8680101633071899, "learning_rate": 7.122663018695851e-05, "loss": 0.3755, "step": 631 }, { "epoch": 0.29, "grad_norm": 0.866011917591095, "learning_rate": 7.118103055175559e-05, "loss": 0.3504, "step": 632 }, { "epoch": 0.29, "grad_norm": 0.8784955143928528, "learning_rate": 7.113543091655267e-05, "loss": 0.361, "step": 633 }, { "epoch": 0.29, "grad_norm": 0.8499343991279602, "learning_rate": 7.108983128134976e-05, "loss": 0.3534, "step": 634 }, { "epoch": 0.29, "grad_norm": 0.9045304656028748, "learning_rate": 7.104423164614684e-05, "loss": 0.3685, "step": 635 }, { "epoch": 0.29, "grad_norm": 1.0086599588394165, "learning_rate": 7.09986320109439e-05, "loss": 0.3772, "step": 636 }, { "epoch": 0.29, "grad_norm": 0.8259167075157166, "learning_rate": 7.0953032375741e-05, "loss": 0.325, "step": 637 }, { "epoch": 0.29, "grad_norm": 0.8794663548469543, "learning_rate": 7.090743274053808e-05, "loss": 0.3311, "step": 638 }, { "epoch": 0.29, "grad_norm": 0.8893126249313354, "learning_rate": 7.086183310533516e-05, "loss": 0.3505, "step": 639 }, { "epoch": 0.29, "grad_norm": 0.9002717137336731, "learning_rate": 7.081623347013225e-05, "loss": 0.3497, "step": 640 }, { "epoch": 0.29, "eval_loss": 0.3717420995235443, "eval_runtime": 18.0961, "eval_samples_per_second": 1.547, "eval_steps_per_second": 0.387, "step": 640 }, { "epoch": 0.29, "grad_norm": 0.8322933912277222, "learning_rate": 7.077063383492933e-05, "loss": 0.3451, "step": 641 }, { "epoch": 0.29, "grad_norm": 0.8885860443115234, "learning_rate": 7.072503419972641e-05, "loss": 0.3646, "step": 642 }, { "epoch": 0.29, "grad_norm": 0.8684788346290588, "learning_rate": 7.067943456452348e-05, "loss": 0.3568, "step": 643 }, { "epoch": 0.29, "grad_norm": 0.853153645992279, "learning_rate": 7.063383492932056e-05, "loss": 0.3571, "step": 644 }, { "epoch": 0.29, "grad_norm": 0.8872026205062866, "learning_rate": 7.058823529411765e-05, "loss": 0.3763, "step": 645 }, { "epoch": 0.29, "grad_norm": 0.8363185524940491, "learning_rate": 7.054263565891474e-05, "loss": 0.3589, "step": 646 }, { "epoch": 0.29, "grad_norm": 0.8505784869194031, "learning_rate": 7.049703602371182e-05, "loss": 0.3627, "step": 647 }, { "epoch": 0.3, "grad_norm": 0.8950116038322449, "learning_rate": 7.04514363885089e-05, "loss": 0.3634, "step": 648 }, { "epoch": 0.3, "grad_norm": 0.876524031162262, "learning_rate": 7.040583675330598e-05, "loss": 0.3601, "step": 649 }, { "epoch": 0.3, "grad_norm": 0.8355085849761963, "learning_rate": 7.036023711810305e-05, "loss": 0.3641, "step": 650 }, { "epoch": 0.3, "eval_loss": 0.3709128201007843, "eval_runtime": 21.856, "eval_samples_per_second": 1.281, "eval_steps_per_second": 0.32, "step": 650 }, { "epoch": 0.3, "grad_norm": 0.8563446998596191, "learning_rate": 7.031463748290013e-05, "loss": 0.3314, "step": 651 }, { "epoch": 0.3, "grad_norm": 0.893064022064209, "learning_rate": 7.026903784769721e-05, "loss": 0.3606, "step": 652 }, { "epoch": 0.3, "grad_norm": 0.8532779812812805, "learning_rate": 7.022343821249431e-05, "loss": 0.3432, "step": 653 }, { "epoch": 0.3, "grad_norm": 0.8648200035095215, "learning_rate": 7.017783857729139e-05, "loss": 0.342, "step": 654 }, { "epoch": 0.3, "grad_norm": 0.9161383509635925, "learning_rate": 7.013223894208847e-05, "loss": 0.3798, "step": 655 }, { "epoch": 0.3, "grad_norm": 0.9029960632324219, "learning_rate": 7.008663930688555e-05, "loss": 0.3594, "step": 656 }, { "epoch": 0.3, "grad_norm": 0.838525652885437, "learning_rate": 7.004103967168264e-05, "loss": 0.3565, "step": 657 }, { "epoch": 0.3, "grad_norm": 0.9098020792007446, "learning_rate": 6.99954400364797e-05, "loss": 0.3595, "step": 658 }, { "epoch": 0.3, "grad_norm": 0.915035605430603, "learning_rate": 6.994984040127679e-05, "loss": 0.3582, "step": 659 }, { "epoch": 0.3, "grad_norm": 0.9485026001930237, "learning_rate": 6.990424076607387e-05, "loss": 0.3709, "step": 660 }, { "epoch": 0.3, "eval_loss": 0.37105894088745117, "eval_runtime": 22.2753, "eval_samples_per_second": 1.257, "eval_steps_per_second": 0.314, "step": 660 }, { "epoch": 0.3, "grad_norm": 0.929305374622345, "learning_rate": 6.985864113087096e-05, "loss": 0.3812, "step": 661 }, { "epoch": 0.3, "grad_norm": 0.8890742659568787, "learning_rate": 6.981304149566804e-05, "loss": 0.3588, "step": 662 }, { "epoch": 0.3, "grad_norm": 0.9145094156265259, "learning_rate": 6.976744186046513e-05, "loss": 0.383, "step": 663 }, { "epoch": 0.3, "grad_norm": 0.8319873809814453, "learning_rate": 6.972184222526221e-05, "loss": 0.3429, "step": 664 }, { "epoch": 0.3, "grad_norm": 0.8460900187492371, "learning_rate": 6.967624259005928e-05, "loss": 0.3371, "step": 665 }, { "epoch": 0.3, "grad_norm": 0.8295953869819641, "learning_rate": 6.963064295485636e-05, "loss": 0.3497, "step": 666 }, { "epoch": 0.3, "grad_norm": 0.8634557723999023, "learning_rate": 6.958504331965344e-05, "loss": 0.377, "step": 667 }, { "epoch": 0.3, "grad_norm": 0.8128460049629211, "learning_rate": 6.953944368445052e-05, "loss": 0.3359, "step": 668 }, { "epoch": 0.3, "grad_norm": 0.913492739200592, "learning_rate": 6.949384404924762e-05, "loss": 0.3632, "step": 669 }, { "epoch": 0.31, "grad_norm": 0.8793885111808777, "learning_rate": 6.94482444140447e-05, "loss": 0.3407, "step": 670 }, { "epoch": 0.31, "eval_loss": 0.37183091044425964, "eval_runtime": 20.3451, "eval_samples_per_second": 1.376, "eval_steps_per_second": 0.344, "step": 670 }, { "epoch": 0.31, "grad_norm": 0.8706486225128174, "learning_rate": 6.940264477884178e-05, "loss": 0.3715, "step": 671 }, { "epoch": 0.31, "grad_norm": 0.910187304019928, "learning_rate": 6.935704514363886e-05, "loss": 0.3639, "step": 672 }, { "epoch": 0.31, "grad_norm": 0.8096619844436646, "learning_rate": 6.931144550843593e-05, "loss": 0.3388, "step": 673 }, { "epoch": 0.31, "grad_norm": 0.8766554594039917, "learning_rate": 6.926584587323301e-05, "loss": 0.346, "step": 674 }, { "epoch": 0.31, "grad_norm": 1.1496071815490723, "learning_rate": 6.922024623803009e-05, "loss": 0.3507, "step": 675 }, { "epoch": 0.31, "grad_norm": 0.9036510586738586, "learning_rate": 6.917464660282717e-05, "loss": 0.3485, "step": 676 }, { "epoch": 0.31, "grad_norm": 0.9343310594558716, "learning_rate": 6.912904696762427e-05, "loss": 0.3609, "step": 677 }, { "epoch": 0.31, "grad_norm": 0.8673393726348877, "learning_rate": 6.908344733242135e-05, "loss": 0.3626, "step": 678 }, { "epoch": 0.31, "grad_norm": 0.9198659062385559, "learning_rate": 6.903784769721843e-05, "loss": 0.3712, "step": 679 }, { "epoch": 0.31, "grad_norm": 0.9207894802093506, "learning_rate": 6.89922480620155e-05, "loss": 0.3507, "step": 680 }, { "epoch": 0.31, "eval_loss": 0.37167611718177795, "eval_runtime": 22.1884, "eval_samples_per_second": 1.262, "eval_steps_per_second": 0.315, "step": 680 }, { "epoch": 0.31, "grad_norm": 0.847029447555542, "learning_rate": 6.894664842681258e-05, "loss": 0.3429, "step": 681 }, { "epoch": 0.31, "grad_norm": 0.9102590680122375, "learning_rate": 6.890104879160966e-05, "loss": 0.3748, "step": 682 }, { "epoch": 0.31, "grad_norm": 0.9330032467842102, "learning_rate": 6.885544915640675e-05, "loss": 0.362, "step": 683 }, { "epoch": 0.31, "grad_norm": 0.8520646691322327, "learning_rate": 6.880984952120383e-05, "loss": 0.3574, "step": 684 }, { "epoch": 0.31, "grad_norm": 0.868168294429779, "learning_rate": 6.876424988600092e-05, "loss": 0.3597, "step": 685 }, { "epoch": 0.31, "grad_norm": 0.8473092913627625, "learning_rate": 6.8718650250798e-05, "loss": 0.3375, "step": 686 }, { "epoch": 0.31, "grad_norm": 0.9174085259437561, "learning_rate": 6.867305061559507e-05, "loss": 0.3822, "step": 687 }, { "epoch": 0.31, "grad_norm": 0.8579756617546082, "learning_rate": 6.862745098039216e-05, "loss": 0.3487, "step": 688 }, { "epoch": 0.31, "grad_norm": 0.8969879150390625, "learning_rate": 6.858185134518924e-05, "loss": 0.3635, "step": 689 }, { "epoch": 0.31, "grad_norm": 0.868897020816803, "learning_rate": 6.853625170998632e-05, "loss": 0.3437, "step": 690 }, { "epoch": 0.31, "eval_loss": 0.3700122535228729, "eval_runtime": 21.7146, "eval_samples_per_second": 1.289, "eval_steps_per_second": 0.322, "step": 690 }, { "epoch": 0.31, "grad_norm": 0.8899202942848206, "learning_rate": 6.84906520747834e-05, "loss": 0.3423, "step": 691 }, { "epoch": 0.32, "grad_norm": 0.772549033164978, "learning_rate": 6.844505243958048e-05, "loss": 0.3315, "step": 692 }, { "epoch": 0.32, "grad_norm": 0.8518693447113037, "learning_rate": 6.839945280437758e-05, "loss": 0.3371, "step": 693 }, { "epoch": 0.32, "grad_norm": 0.8638544678688049, "learning_rate": 6.835385316917466e-05, "loss": 0.3413, "step": 694 }, { "epoch": 0.32, "grad_norm": 0.8001556396484375, "learning_rate": 6.830825353397173e-05, "loss": 0.3266, "step": 695 }, { "epoch": 0.32, "grad_norm": 0.8450673222541809, "learning_rate": 6.826265389876881e-05, "loss": 0.3658, "step": 696 }, { "epoch": 0.32, "grad_norm": 0.8260478973388672, "learning_rate": 6.821705426356589e-05, "loss": 0.3426, "step": 697 }, { "epoch": 0.32, "grad_norm": 0.7822526097297668, "learning_rate": 6.817145462836297e-05, "loss": 0.3371, "step": 698 }, { "epoch": 0.32, "grad_norm": 0.901171088218689, "learning_rate": 6.812585499316005e-05, "loss": 0.3842, "step": 699 }, { "epoch": 0.32, "grad_norm": 0.9115702509880066, "learning_rate": 6.808025535795714e-05, "loss": 0.3551, "step": 700 }, { "epoch": 0.32, "eval_loss": 0.36962977051734924, "eval_runtime": 22.9971, "eval_samples_per_second": 1.218, "eval_steps_per_second": 0.304, "step": 700 }, { "epoch": 0.32, "grad_norm": 0.8465586304664612, "learning_rate": 6.803465572275423e-05, "loss": 0.3361, "step": 701 }, { "epoch": 0.32, "grad_norm": 0.8594992756843567, "learning_rate": 6.79890560875513e-05, "loss": 0.3409, "step": 702 }, { "epoch": 0.32, "grad_norm": 0.85016930103302, "learning_rate": 6.794345645234838e-05, "loss": 0.3411, "step": 703 }, { "epoch": 0.32, "grad_norm": 0.9131604433059692, "learning_rate": 6.789785681714546e-05, "loss": 0.3684, "step": 704 }, { "epoch": 0.32, "grad_norm": 0.9225114583969116, "learning_rate": 6.785225718194254e-05, "loss": 0.3436, "step": 705 }, { "epoch": 0.32, "grad_norm": 0.8579698801040649, "learning_rate": 6.780665754673963e-05, "loss": 0.3453, "step": 706 }, { "epoch": 0.32, "grad_norm": 0.8255881071090698, "learning_rate": 6.776105791153671e-05, "loss": 0.3213, "step": 707 }, { "epoch": 0.32, "grad_norm": 0.8486250042915344, "learning_rate": 6.771545827633379e-05, "loss": 0.3361, "step": 708 }, { "epoch": 0.32, "grad_norm": 0.867413341999054, "learning_rate": 6.766985864113089e-05, "loss": 0.3645, "step": 709 }, { "epoch": 0.32, "grad_norm": 0.8573243021965027, "learning_rate": 6.762425900592795e-05, "loss": 0.354, "step": 710 }, { "epoch": 0.32, "eval_loss": 0.36976662278175354, "eval_runtime": 22.6059, "eval_samples_per_second": 1.239, "eval_steps_per_second": 0.31, "step": 710 }, { "epoch": 0.32, "grad_norm": 0.8835360407829285, "learning_rate": 6.757865937072504e-05, "loss": 0.3484, "step": 711 }, { "epoch": 0.32, "grad_norm": 0.865594208240509, "learning_rate": 6.753305973552212e-05, "loss": 0.354, "step": 712 }, { "epoch": 0.33, "grad_norm": 0.8187012672424316, "learning_rate": 6.74874601003192e-05, "loss": 0.3331, "step": 713 }, { "epoch": 0.33, "grad_norm": 0.9948188662528992, "learning_rate": 6.744186046511628e-05, "loss": 0.3837, "step": 714 }, { "epoch": 0.33, "grad_norm": 0.8897073864936829, "learning_rate": 6.739626082991336e-05, "loss": 0.3541, "step": 715 }, { "epoch": 0.33, "grad_norm": 0.8769757747650146, "learning_rate": 6.735066119471044e-05, "loss": 0.3362, "step": 716 }, { "epoch": 0.33, "grad_norm": 0.8944910168647766, "learning_rate": 6.730506155950753e-05, "loss": 0.3493, "step": 717 }, { "epoch": 0.33, "grad_norm": 0.8475896120071411, "learning_rate": 6.725946192430461e-05, "loss": 0.3651, "step": 718 }, { "epoch": 0.33, "grad_norm": 0.8484331369400024, "learning_rate": 6.721386228910169e-05, "loss": 0.341, "step": 719 }, { "epoch": 0.33, "grad_norm": 0.8814172148704529, "learning_rate": 6.716826265389877e-05, "loss": 0.3624, "step": 720 }, { "epoch": 0.33, "eval_loss": 0.3701036870479584, "eval_runtime": 21.9379, "eval_samples_per_second": 1.276, "eval_steps_per_second": 0.319, "step": 720 }, { "epoch": 0.33, "grad_norm": 0.8305844664573669, "learning_rate": 6.712266301869585e-05, "loss": 0.3502, "step": 721 }, { "epoch": 0.33, "grad_norm": 0.884107232093811, "learning_rate": 6.707706338349293e-05, "loss": 0.3599, "step": 722 }, { "epoch": 0.33, "grad_norm": 0.8543264269828796, "learning_rate": 6.703146374829002e-05, "loss": 0.3569, "step": 723 }, { "epoch": 0.33, "grad_norm": 0.8755825161933899, "learning_rate": 6.69858641130871e-05, "loss": 0.3599, "step": 724 }, { "epoch": 0.33, "grad_norm": 0.8918110132217407, "learning_rate": 6.694026447788418e-05, "loss": 0.3488, "step": 725 }, { "epoch": 0.33, "grad_norm": 0.8799797296524048, "learning_rate": 6.689466484268126e-05, "loss": 0.3512, "step": 726 }, { "epoch": 0.33, "grad_norm": 0.8555727601051331, "learning_rate": 6.684906520747834e-05, "loss": 0.3485, "step": 727 }, { "epoch": 0.33, "grad_norm": 0.8324811458587646, "learning_rate": 6.680346557227542e-05, "loss": 0.3426, "step": 728 }, { "epoch": 0.33, "grad_norm": 0.8397484421730042, "learning_rate": 6.67578659370725e-05, "loss": 0.3452, "step": 729 }, { "epoch": 0.33, "grad_norm": 0.8771885633468628, "learning_rate": 6.671226630186959e-05, "loss": 0.3416, "step": 730 }, { "epoch": 0.33, "eval_loss": 0.3702766001224518, "eval_runtime": 21.4551, "eval_samples_per_second": 1.305, "eval_steps_per_second": 0.326, "step": 730 }, { "epoch": 0.33, "grad_norm": 0.8133786916732788, "learning_rate": 6.666666666666667e-05, "loss": 0.3377, "step": 731 }, { "epoch": 0.33, "grad_norm": 0.8744335174560547, "learning_rate": 6.662106703146375e-05, "loss": 0.3581, "step": 732 }, { "epoch": 0.33, "grad_norm": 0.8324525952339172, "learning_rate": 6.657546739626083e-05, "loss": 0.3432, "step": 733 }, { "epoch": 0.33, "grad_norm": 0.8790922164916992, "learning_rate": 6.652986776105792e-05, "loss": 0.3563, "step": 734 }, { "epoch": 0.34, "grad_norm": 0.855384886264801, "learning_rate": 6.6484268125855e-05, "loss": 0.3543, "step": 735 }, { "epoch": 0.34, "grad_norm": 0.8443394303321838, "learning_rate": 6.643866849065208e-05, "loss": 0.3453, "step": 736 }, { "epoch": 0.34, "grad_norm": 0.8796120882034302, "learning_rate": 6.639306885544916e-05, "loss": 0.3468, "step": 737 }, { "epoch": 0.34, "grad_norm": 0.8887386918067932, "learning_rate": 6.634746922024624e-05, "loss": 0.3559, "step": 738 }, { "epoch": 0.34, "grad_norm": 0.8751109838485718, "learning_rate": 6.630186958504332e-05, "loss": 0.3398, "step": 739 }, { "epoch": 0.34, "grad_norm": 0.8693111538887024, "learning_rate": 6.62562699498404e-05, "loss": 0.3519, "step": 740 }, { "epoch": 0.34, "eval_loss": 0.3703303039073944, "eval_runtime": 21.0686, "eval_samples_per_second": 1.329, "eval_steps_per_second": 0.332, "step": 740 }, { "epoch": 0.34, "grad_norm": 0.8620005249977112, "learning_rate": 6.621067031463749e-05, "loss": 0.359, "step": 741 }, { "epoch": 0.34, "grad_norm": 0.8814571499824524, "learning_rate": 6.616507067943457e-05, "loss": 0.3532, "step": 742 }, { "epoch": 0.34, "grad_norm": 0.8492552042007446, "learning_rate": 6.611947104423165e-05, "loss": 0.3379, "step": 743 }, { "epoch": 0.34, "grad_norm": 0.8217783570289612, "learning_rate": 6.607387140902873e-05, "loss": 0.343, "step": 744 }, { "epoch": 0.34, "grad_norm": 0.8890987634658813, "learning_rate": 6.602827177382581e-05, "loss": 0.369, "step": 745 }, { "epoch": 0.34, "grad_norm": 0.8515855073928833, "learning_rate": 6.59826721386229e-05, "loss": 0.3502, "step": 746 }, { "epoch": 0.34, "grad_norm": 0.8381286263465881, "learning_rate": 6.593707250341998e-05, "loss": 0.3419, "step": 747 }, { "epoch": 0.34, "grad_norm": 0.7878593802452087, "learning_rate": 6.589147286821705e-05, "loss": 0.3262, "step": 748 }, { "epoch": 0.34, "grad_norm": 0.8711159229278564, "learning_rate": 6.584587323301414e-05, "loss": 0.3616, "step": 749 }, { "epoch": 0.34, "grad_norm": 0.8434100151062012, "learning_rate": 6.580027359781122e-05, "loss": 0.3465, "step": 750 }, { "epoch": 0.34, "eval_loss": 0.3702242076396942, "eval_runtime": 21.1864, "eval_samples_per_second": 1.322, "eval_steps_per_second": 0.33, "step": 750 }, { "epoch": 0.34, "grad_norm": 0.8792661428451538, "learning_rate": 6.57546739626083e-05, "loss": 0.3704, "step": 751 }, { "epoch": 0.34, "grad_norm": 0.8422108292579651, "learning_rate": 6.570907432740539e-05, "loss": 0.3516, "step": 752 }, { "epoch": 0.34, "grad_norm": 0.9015281796455383, "learning_rate": 6.566347469220247e-05, "loss": 0.3602, "step": 753 }, { "epoch": 0.34, "grad_norm": 0.8497557640075684, "learning_rate": 6.561787505699955e-05, "loss": 0.3507, "step": 754 }, { "epoch": 0.34, "grad_norm": 0.8488445281982422, "learning_rate": 6.557227542179663e-05, "loss": 0.333, "step": 755 }, { "epoch": 0.34, "grad_norm": 0.8524776101112366, "learning_rate": 6.55266757865937e-05, "loss": 0.363, "step": 756 }, { "epoch": 0.35, "grad_norm": 0.879468560218811, "learning_rate": 6.54810761513908e-05, "loss": 0.3769, "step": 757 }, { "epoch": 0.35, "grad_norm": 0.844282865524292, "learning_rate": 6.543547651618788e-05, "loss": 0.3697, "step": 758 }, { "epoch": 0.35, "grad_norm": 0.8251934051513672, "learning_rate": 6.538987688098496e-05, "loss": 0.3377, "step": 759 }, { "epoch": 0.35, "grad_norm": 0.9126609563827515, "learning_rate": 6.534427724578204e-05, "loss": 0.3696, "step": 760 }, { "epoch": 0.35, "eval_loss": 0.3702351450920105, "eval_runtime": 21.0169, "eval_samples_per_second": 1.332, "eval_steps_per_second": 0.333, "step": 760 }, { "epoch": 0.35, "grad_norm": 0.823424756526947, "learning_rate": 6.529867761057912e-05, "loss": 0.3535, "step": 761 }, { "epoch": 0.35, "grad_norm": 0.8669154644012451, "learning_rate": 6.52530779753762e-05, "loss": 0.3451, "step": 762 }, { "epoch": 0.35, "grad_norm": 0.9502606391906738, "learning_rate": 6.520747834017327e-05, "loss": 0.3967, "step": 763 }, { "epoch": 0.35, "grad_norm": 0.8474224209785461, "learning_rate": 6.516187870497035e-05, "loss": 0.357, "step": 764 }, { "epoch": 0.35, "grad_norm": 0.8229814767837524, "learning_rate": 6.511627906976745e-05, "loss": 0.3513, "step": 765 }, { "epoch": 0.35, "grad_norm": 0.8437790274620056, "learning_rate": 6.507067943456453e-05, "loss": 0.3412, "step": 766 }, { "epoch": 0.35, "grad_norm": 0.8239248991012573, "learning_rate": 6.502507979936161e-05, "loss": 0.3358, "step": 767 }, { "epoch": 0.35, "grad_norm": 0.8583698868751526, "learning_rate": 6.49794801641587e-05, "loss": 0.3408, "step": 768 }, { "epoch": 0.35, "grad_norm": 0.7928481101989746, "learning_rate": 6.493388052895578e-05, "loss": 0.3325, "step": 769 }, { "epoch": 0.35, "grad_norm": 0.9882442951202393, "learning_rate": 6.488828089375286e-05, "loss": 0.3517, "step": 770 }, { "epoch": 0.35, "eval_loss": 0.3702905774116516, "eval_runtime": 21.183, "eval_samples_per_second": 1.322, "eval_steps_per_second": 0.33, "step": 770 }, { "epoch": 0.35, "grad_norm": 0.9171540141105652, "learning_rate": 6.484268125854993e-05, "loss": 0.3614, "step": 771 }, { "epoch": 0.35, "grad_norm": 0.8405658602714539, "learning_rate": 6.479708162334701e-05, "loss": 0.3538, "step": 772 }, { "epoch": 0.35, "grad_norm": 0.8744710683822632, "learning_rate": 6.47514819881441e-05, "loss": 0.3498, "step": 773 }, { "epoch": 0.35, "grad_norm": 0.8798863291740417, "learning_rate": 6.470588235294118e-05, "loss": 0.3943, "step": 774 }, { "epoch": 0.35, "grad_norm": 0.8916918039321899, "learning_rate": 6.466028271773827e-05, "loss": 0.3822, "step": 775 }, { "epoch": 0.35, "grad_norm": 0.8482193946838379, "learning_rate": 6.461468308253535e-05, "loss": 0.3465, "step": 776 }, { "epoch": 0.35, "grad_norm": 0.8396366238594055, "learning_rate": 6.456908344733243e-05, "loss": 0.3554, "step": 777 }, { "epoch": 0.35, "grad_norm": 0.8911833167076111, "learning_rate": 6.45234838121295e-05, "loss": 0.3487, "step": 778 }, { "epoch": 0.36, "grad_norm": 0.879220187664032, "learning_rate": 6.447788417692658e-05, "loss": 0.3585, "step": 779 }, { "epoch": 0.36, "grad_norm": 0.8869410157203674, "learning_rate": 6.443228454172366e-05, "loss": 0.3497, "step": 780 }, { "epoch": 0.36, "eval_loss": 0.369541198015213, "eval_runtime": 21.1307, "eval_samples_per_second": 1.325, "eval_steps_per_second": 0.331, "step": 780 }, { "epoch": 0.36, "grad_norm": 0.8716014623641968, "learning_rate": 6.438668490652076e-05, "loss": 0.3771, "step": 781 }, { "epoch": 0.36, "grad_norm": 0.891190767288208, "learning_rate": 6.434108527131784e-05, "loss": 0.3718, "step": 782 }, { "epoch": 0.36, "grad_norm": 0.8753977417945862, "learning_rate": 6.429548563611492e-05, "loss": 0.3421, "step": 783 }, { "epoch": 0.36, "grad_norm": 0.8322952389717102, "learning_rate": 6.4249886000912e-05, "loss": 0.3447, "step": 784 }, { "epoch": 0.36, "grad_norm": 0.8560271859169006, "learning_rate": 6.420428636570907e-05, "loss": 0.3584, "step": 785 }, { "epoch": 0.36, "grad_norm": 0.8641065359115601, "learning_rate": 6.415868673050615e-05, "loss": 0.3356, "step": 786 }, { "epoch": 0.36, "grad_norm": 0.8236862421035767, "learning_rate": 6.411308709530323e-05, "loss": 0.3394, "step": 787 }, { "epoch": 0.36, "grad_norm": 0.8534393906593323, "learning_rate": 6.406748746010032e-05, "loss": 0.3623, "step": 788 }, { "epoch": 0.36, "grad_norm": 0.7958694100379944, "learning_rate": 6.402188782489741e-05, "loss": 0.3541, "step": 789 }, { "epoch": 0.36, "grad_norm": 0.8262916803359985, "learning_rate": 6.397628818969449e-05, "loss": 0.3317, "step": 790 }, { "epoch": 0.36, "eval_loss": 0.36867284774780273, "eval_runtime": 21.0978, "eval_samples_per_second": 1.327, "eval_steps_per_second": 0.332, "step": 790 }, { "epoch": 0.36, "grad_norm": 0.8441379070281982, "learning_rate": 6.393068855449157e-05, "loss": 0.3496, "step": 791 }, { "epoch": 0.36, "grad_norm": 0.8390480279922485, "learning_rate": 6.388508891928866e-05, "loss": 0.3432, "step": 792 }, { "epoch": 0.36, "grad_norm": 0.9012981653213501, "learning_rate": 6.383948928408572e-05, "loss": 0.3722, "step": 793 }, { "epoch": 0.36, "grad_norm": 0.8114961981773376, "learning_rate": 6.37938896488828e-05, "loss": 0.3315, "step": 794 }, { "epoch": 0.36, "grad_norm": 0.8510379791259766, "learning_rate": 6.374829001367989e-05, "loss": 0.3415, "step": 795 }, { "epoch": 0.36, "grad_norm": 0.8974173069000244, "learning_rate": 6.370269037847697e-05, "loss": 0.3659, "step": 796 }, { "epoch": 0.36, "grad_norm": 0.8199520111083984, "learning_rate": 6.365709074327406e-05, "loss": 0.3491, "step": 797 }, { "epoch": 0.36, "grad_norm": 1.0816798210144043, "learning_rate": 6.361149110807115e-05, "loss": 0.3439, "step": 798 }, { "epoch": 0.36, "grad_norm": 0.8767193555831909, "learning_rate": 6.356589147286823e-05, "loss": 0.3471, "step": 799 }, { "epoch": 0.36, "grad_norm": 0.8754706978797913, "learning_rate": 6.35202918376653e-05, "loss": 0.3699, "step": 800 }, { "epoch": 0.36, "eval_loss": 0.36849379539489746, "eval_runtime": 21.2125, "eval_samples_per_second": 1.32, "eval_steps_per_second": 0.33, "step": 800 }, { "epoch": 0.37, "grad_norm": 0.8332600593566895, "learning_rate": 6.347469220246238e-05, "loss": 0.3322, "step": 801 }, { "epoch": 0.37, "grad_norm": 0.9050359129905701, "learning_rate": 6.342909256725946e-05, "loss": 0.3424, "step": 802 }, { "epoch": 0.37, "grad_norm": 0.9001706838607788, "learning_rate": 6.338349293205654e-05, "loss": 0.3548, "step": 803 }, { "epoch": 0.37, "grad_norm": 0.8502199053764343, "learning_rate": 6.333789329685362e-05, "loss": 0.3367, "step": 804 }, { "epoch": 0.37, "grad_norm": 0.8597239851951599, "learning_rate": 6.329229366165072e-05, "loss": 0.3588, "step": 805 }, { "epoch": 0.37, "grad_norm": 0.8567646741867065, "learning_rate": 6.32466940264478e-05, "loss": 0.3924, "step": 806 }, { "epoch": 0.37, "grad_norm": 0.8374874591827393, "learning_rate": 6.320109439124488e-05, "loss": 0.3451, "step": 807 }, { "epoch": 0.37, "grad_norm": 0.8712239861488342, "learning_rate": 6.315549475604195e-05, "loss": 0.3567, "step": 808 }, { "epoch": 0.37, "grad_norm": 0.8838446140289307, "learning_rate": 6.310989512083903e-05, "loss": 0.3714, "step": 809 }, { "epoch": 0.37, "grad_norm": 0.8462404012680054, "learning_rate": 6.306429548563611e-05, "loss": 0.3504, "step": 810 }, { "epoch": 0.37, "eval_loss": 0.369089812040329, "eval_runtime": 21.3098, "eval_samples_per_second": 1.314, "eval_steps_per_second": 0.328, "step": 810 }, { "epoch": 0.37, "grad_norm": 0.8680486679077148, "learning_rate": 6.30186958504332e-05, "loss": 0.353, "step": 811 }, { "epoch": 0.37, "grad_norm": 0.854362964630127, "learning_rate": 6.297309621523028e-05, "loss": 0.3382, "step": 812 }, { "epoch": 0.37, "grad_norm": 0.8963435888290405, "learning_rate": 6.292749658002737e-05, "loss": 0.3514, "step": 813 }, { "epoch": 0.37, "grad_norm": 0.8164211511611938, "learning_rate": 6.288189694482445e-05, "loss": 0.3281, "step": 814 }, { "epoch": 0.37, "grad_norm": 0.8216964602470398, "learning_rate": 6.283629730962152e-05, "loss": 0.36, "step": 815 }, { "epoch": 0.37, "grad_norm": 0.8878861665725708, "learning_rate": 6.27906976744186e-05, "loss": 0.3826, "step": 816 }, { "epoch": 0.37, "grad_norm": 0.795782208442688, "learning_rate": 6.274509803921569e-05, "loss": 0.3358, "step": 817 }, { "epoch": 0.37, "grad_norm": 0.9114422798156738, "learning_rate": 6.269949840401277e-05, "loss": 0.3521, "step": 818 }, { "epoch": 0.37, "grad_norm": 0.8703455924987793, "learning_rate": 6.265389876880985e-05, "loss": 0.3368, "step": 819 }, { "epoch": 0.37, "grad_norm": 0.8820777535438538, "learning_rate": 6.260829913360693e-05, "loss": 0.3485, "step": 820 }, { "epoch": 0.37, "eval_loss": 0.3687816262245178, "eval_runtime": 21.1501, "eval_samples_per_second": 1.324, "eval_steps_per_second": 0.331, "step": 820 }, { "epoch": 0.37, "grad_norm": 0.8445407748222351, "learning_rate": 6.256269949840403e-05, "loss": 0.3288, "step": 821 }, { "epoch": 0.37, "grad_norm": 0.8472913503646851, "learning_rate": 6.25170998632011e-05, "loss": 0.3427, "step": 822 }, { "epoch": 0.38, "grad_norm": 0.911773681640625, "learning_rate": 6.247150022799818e-05, "loss": 0.3571, "step": 823 }, { "epoch": 0.38, "grad_norm": 0.889894962310791, "learning_rate": 6.242590059279526e-05, "loss": 0.3629, "step": 824 }, { "epoch": 0.38, "grad_norm": 0.8566384315490723, "learning_rate": 6.238030095759234e-05, "loss": 0.3582, "step": 825 }, { "epoch": 0.38, "grad_norm": 0.8500192165374756, "learning_rate": 6.233470132238942e-05, "loss": 0.3375, "step": 826 }, { "epoch": 0.38, "grad_norm": 0.8549200892448425, "learning_rate": 6.22891016871865e-05, "loss": 0.3487, "step": 827 }, { "epoch": 0.38, "grad_norm": 0.8865132927894592, "learning_rate": 6.224350205198358e-05, "loss": 0.3625, "step": 828 }, { "epoch": 0.38, "grad_norm": 0.898220956325531, "learning_rate": 6.219790241678068e-05, "loss": 0.3657, "step": 829 }, { "epoch": 0.38, "grad_norm": 0.8199746012687683, "learning_rate": 6.215230278157775e-05, "loss": 0.3239, "step": 830 }, { "epoch": 0.38, "eval_loss": 0.36817699670791626, "eval_runtime": 21.3091, "eval_samples_per_second": 1.314, "eval_steps_per_second": 0.328, "step": 830 }, { "epoch": 0.38, "grad_norm": 0.8350242376327515, "learning_rate": 6.210670314637483e-05, "loss": 0.3443, "step": 831 }, { "epoch": 0.38, "grad_norm": 0.8211762309074402, "learning_rate": 6.206110351117191e-05, "loss": 0.3478, "step": 832 }, { "epoch": 0.38, "grad_norm": 0.8550103306770325, "learning_rate": 6.201550387596899e-05, "loss": 0.362, "step": 833 }, { "epoch": 0.38, "grad_norm": 0.840576708316803, "learning_rate": 6.196990424076607e-05, "loss": 0.3629, "step": 834 }, { "epoch": 0.38, "grad_norm": 0.8360455632209778, "learning_rate": 6.192430460556316e-05, "loss": 0.3462, "step": 835 }, { "epoch": 0.38, "grad_norm": 0.8622804284095764, "learning_rate": 6.187870497036024e-05, "loss": 0.3582, "step": 836 }, { "epoch": 0.38, "grad_norm": 0.8452496528625488, "learning_rate": 6.183310533515732e-05, "loss": 0.351, "step": 837 }, { "epoch": 0.38, "grad_norm": 0.7803338766098022, "learning_rate": 6.17875056999544e-05, "loss": 0.3237, "step": 838 }, { "epoch": 0.38, "grad_norm": 0.879328727722168, "learning_rate": 6.174190606475148e-05, "loss": 0.3666, "step": 839 }, { "epoch": 0.38, "grad_norm": 0.8226670622825623, "learning_rate": 6.169630642954857e-05, "loss": 0.3342, "step": 840 }, { "epoch": 0.38, "eval_loss": 0.36737722158432007, "eval_runtime": 21.1249, "eval_samples_per_second": 1.325, "eval_steps_per_second": 0.331, "step": 840 }, { "epoch": 0.38, "grad_norm": 0.912907600402832, "learning_rate": 6.165070679434565e-05, "loss": 0.3701, "step": 841 }, { "epoch": 0.38, "grad_norm": 0.8975276947021484, "learning_rate": 6.160510715914273e-05, "loss": 0.3395, "step": 842 }, { "epoch": 0.38, "grad_norm": 0.8709104657173157, "learning_rate": 6.155950752393981e-05, "loss": 0.3352, "step": 843 }, { "epoch": 0.38, "grad_norm": 0.8861703872680664, "learning_rate": 6.151390788873689e-05, "loss": 0.3757, "step": 844 }, { "epoch": 0.39, "grad_norm": 0.8633649349212646, "learning_rate": 6.146830825353397e-05, "loss": 0.3439, "step": 845 }, { "epoch": 0.39, "grad_norm": 0.8183202743530273, "learning_rate": 6.142270861833106e-05, "loss": 0.3525, "step": 846 }, { "epoch": 0.39, "grad_norm": 0.9185418486595154, "learning_rate": 6.137710898312814e-05, "loss": 0.3833, "step": 847 }, { "epoch": 0.39, "grad_norm": 0.8235242962837219, "learning_rate": 6.133150934792522e-05, "loss": 0.3469, "step": 848 }, { "epoch": 0.39, "grad_norm": 0.8366368412971497, "learning_rate": 6.12859097127223e-05, "loss": 0.3419, "step": 849 }, { "epoch": 0.39, "grad_norm": 0.834038496017456, "learning_rate": 6.124031007751938e-05, "loss": 0.3404, "step": 850 }, { "epoch": 0.39, "eval_loss": 0.3677498698234558, "eval_runtime": 21.1975, "eval_samples_per_second": 1.321, "eval_steps_per_second": 0.33, "step": 850 }, { "epoch": 0.39, "grad_norm": 0.8480759859085083, "learning_rate": 6.119471044231646e-05, "loss": 0.3562, "step": 851 }, { "epoch": 0.39, "grad_norm": 0.8846428990364075, "learning_rate": 6.114911080711355e-05, "loss": 0.3646, "step": 852 }, { "epoch": 0.39, "grad_norm": 0.8484827876091003, "learning_rate": 6.110351117191063e-05, "loss": 0.334, "step": 853 }, { "epoch": 0.39, "grad_norm": 0.8655832409858704, "learning_rate": 6.105791153670771e-05, "loss": 0.3497, "step": 854 }, { "epoch": 0.39, "grad_norm": 0.9084945321083069, "learning_rate": 6.101231190150479e-05, "loss": 0.3698, "step": 855 }, { "epoch": 0.39, "grad_norm": 0.8178445100784302, "learning_rate": 6.096671226630187e-05, "loss": 0.333, "step": 856 }, { "epoch": 0.39, "grad_norm": 0.8397005796432495, "learning_rate": 6.0921112631098955e-05, "loss": 0.3407, "step": 857 }, { "epoch": 0.39, "grad_norm": 0.8644444942474365, "learning_rate": 6.0875512995896036e-05, "loss": 0.3502, "step": 858 }, { "epoch": 0.39, "grad_norm": 0.8999423384666443, "learning_rate": 6.082991336069311e-05, "loss": 0.3529, "step": 859 }, { "epoch": 0.39, "grad_norm": 0.8924650549888611, "learning_rate": 6.078431372549019e-05, "loss": 0.3616, "step": 860 }, { "epoch": 0.39, "eval_loss": 0.3670370280742645, "eval_runtime": 21.1232, "eval_samples_per_second": 1.326, "eval_steps_per_second": 0.331, "step": 860 }, { "epoch": 0.39, "grad_norm": 0.8417632579803467, "learning_rate": 6.073871409028729e-05, "loss": 0.3394, "step": 861 }, { "epoch": 0.39, "grad_norm": 0.8309604525566101, "learning_rate": 6.069311445508436e-05, "loss": 0.3387, "step": 862 }, { "epoch": 0.39, "grad_norm": 0.8310514688491821, "learning_rate": 6.0647514819881445e-05, "loss": 0.3473, "step": 863 }, { "epoch": 0.39, "grad_norm": 0.8718897104263306, "learning_rate": 6.060191518467853e-05, "loss": 0.3725, "step": 864 }, { "epoch": 0.39, "grad_norm": 0.9132295250892639, "learning_rate": 6.055631554947561e-05, "loss": 0.3639, "step": 865 }, { "epoch": 0.39, "grad_norm": 0.8623649477958679, "learning_rate": 6.0510715914272683e-05, "loss": 0.3434, "step": 866 }, { "epoch": 0.4, "grad_norm": 0.8534384369850159, "learning_rate": 6.0465116279069765e-05, "loss": 0.3499, "step": 867 }, { "epoch": 0.4, "grad_norm": 0.8440150022506714, "learning_rate": 6.041951664386685e-05, "loss": 0.3534, "step": 868 }, { "epoch": 0.4, "grad_norm": 0.8526747822761536, "learning_rate": 6.0373917008663935e-05, "loss": 0.3524, "step": 869 }, { "epoch": 0.4, "grad_norm": 0.8256170153617859, "learning_rate": 6.032831737346102e-05, "loss": 0.3438, "step": 870 }, { "epoch": 0.4, "eval_loss": 0.3662932515144348, "eval_runtime": 21.3199, "eval_samples_per_second": 1.313, "eval_steps_per_second": 0.328, "step": 870 }, { "epoch": 0.4, "grad_norm": 0.8334060311317444, "learning_rate": 6.02827177382581e-05, "loss": 0.357, "step": 871 }, { "epoch": 0.4, "grad_norm": 0.8603335618972778, "learning_rate": 6.023711810305518e-05, "loss": 0.3469, "step": 872 }, { "epoch": 0.4, "grad_norm": 0.7767968773841858, "learning_rate": 6.019151846785226e-05, "loss": 0.3091, "step": 873 }, { "epoch": 0.4, "grad_norm": 0.8094191551208496, "learning_rate": 6.014591883264934e-05, "loss": 0.3479, "step": 874 }, { "epoch": 0.4, "grad_norm": 0.819817066192627, "learning_rate": 6.010031919744642e-05, "loss": 0.3508, "step": 875 }, { "epoch": 0.4, "grad_norm": 0.841957151889801, "learning_rate": 6.00547195622435e-05, "loss": 0.3398, "step": 876 }, { "epoch": 0.4, "grad_norm": 0.86558997631073, "learning_rate": 6.000911992704059e-05, "loss": 0.3334, "step": 877 }, { "epoch": 0.4, "grad_norm": 0.8331573605537415, "learning_rate": 5.996352029183767e-05, "loss": 0.3667, "step": 878 }, { "epoch": 0.4, "grad_norm": 0.8304200768470764, "learning_rate": 5.991792065663475e-05, "loss": 0.3468, "step": 879 }, { "epoch": 0.4, "grad_norm": 0.8669402599334717, "learning_rate": 5.9872321021431834e-05, "loss": 0.3498, "step": 880 }, { "epoch": 0.4, "eval_loss": 0.36670106649398804, "eval_runtime": 21.1982, "eval_samples_per_second": 1.321, "eval_steps_per_second": 0.33, "step": 880 }, { "epoch": 0.4, "grad_norm": 0.8461664915084839, "learning_rate": 5.982672138622891e-05, "loss": 0.3358, "step": 881 }, { "epoch": 0.4, "grad_norm": 0.8536818623542786, "learning_rate": 5.978112175102599e-05, "loss": 0.3671, "step": 882 }, { "epoch": 0.4, "grad_norm": 0.8542598485946655, "learning_rate": 5.973552211582307e-05, "loss": 0.351, "step": 883 }, { "epoch": 0.4, "grad_norm": 0.84647136926651, "learning_rate": 5.9689922480620155e-05, "loss": 0.3653, "step": 884 }, { "epoch": 0.4, "grad_norm": 0.8366497159004211, "learning_rate": 5.964432284541724e-05, "loss": 0.3389, "step": 885 }, { "epoch": 0.4, "grad_norm": 0.8828349709510803, "learning_rate": 5.9598723210214325e-05, "loss": 0.3747, "step": 886 }, { "epoch": 0.4, "grad_norm": 0.8899936079978943, "learning_rate": 5.9553123575011407e-05, "loss": 0.361, "step": 887 }, { "epoch": 0.4, "grad_norm": 0.8368967771530151, "learning_rate": 5.950752393980849e-05, "loss": 0.3381, "step": 888 }, { "epoch": 0.41, "grad_norm": 0.8967677354812622, "learning_rate": 5.946192430460556e-05, "loss": 0.3591, "step": 889 }, { "epoch": 0.41, "grad_norm": 0.7917211055755615, "learning_rate": 5.9416324669402645e-05, "loss": 0.3389, "step": 890 }, { "epoch": 0.41, "eval_loss": 0.3663075268268585, "eval_runtime": 21.5147, "eval_samples_per_second": 1.301, "eval_steps_per_second": 0.325, "step": 890 }, { "epoch": 0.41, "grad_norm": 0.8735987544059753, "learning_rate": 5.937072503419973e-05, "loss": 0.3529, "step": 891 }, { "epoch": 0.41, "grad_norm": 0.8549131155014038, "learning_rate": 5.932512539899681e-05, "loss": 0.3788, "step": 892 }, { "epoch": 0.41, "grad_norm": 0.836391270160675, "learning_rate": 5.92795257637939e-05, "loss": 0.341, "step": 893 }, { "epoch": 0.41, "grad_norm": 0.8679954409599304, "learning_rate": 5.923392612859098e-05, "loss": 0.3677, "step": 894 }, { "epoch": 0.41, "grad_norm": 0.8982836604118347, "learning_rate": 5.918832649338806e-05, "loss": 0.3697, "step": 895 }, { "epoch": 0.41, "grad_norm": 0.8966107964515686, "learning_rate": 5.9142726858185135e-05, "loss": 0.3539, "step": 896 }, { "epoch": 0.41, "grad_norm": 0.8517934083938599, "learning_rate": 5.909712722298222e-05, "loss": 0.3364, "step": 897 }, { "epoch": 0.41, "grad_norm": 0.8622394800186157, "learning_rate": 5.90515275877793e-05, "loss": 0.3373, "step": 898 }, { "epoch": 0.41, "grad_norm": 0.8497902750968933, "learning_rate": 5.900592795257638e-05, "loss": 0.3402, "step": 899 }, { "epoch": 0.41, "grad_norm": 0.8614276051521301, "learning_rate": 5.8960328317373456e-05, "loss": 0.3474, "step": 900 }, { "epoch": 0.41, "eval_loss": 0.36591216921806335, "eval_runtime": 21.185, "eval_samples_per_second": 1.322, "eval_steps_per_second": 0.33, "step": 900 }, { "epoch": 0.41, "grad_norm": 0.8814240097999573, "learning_rate": 5.891472868217055e-05, "loss": 0.3623, "step": 901 }, { "epoch": 0.41, "grad_norm": 0.8524793982505798, "learning_rate": 5.886912904696763e-05, "loss": 0.3439, "step": 902 }, { "epoch": 0.41, "grad_norm": 0.8247084021568298, "learning_rate": 5.882352941176471e-05, "loss": 0.3596, "step": 903 }, { "epoch": 0.41, "grad_norm": 0.812872052192688, "learning_rate": 5.877792977656179e-05, "loss": 0.3365, "step": 904 }, { "epoch": 0.41, "grad_norm": 0.8451302647590637, "learning_rate": 5.873233014135887e-05, "loss": 0.3435, "step": 905 }, { "epoch": 0.41, "grad_norm": 0.8993464708328247, "learning_rate": 5.868673050615595e-05, "loss": 0.3688, "step": 906 }, { "epoch": 0.41, "grad_norm": 0.8517639636993408, "learning_rate": 5.8641130870953034e-05, "loss": 0.3709, "step": 907 }, { "epoch": 0.41, "grad_norm": 0.8076426386833191, "learning_rate": 5.859553123575011e-05, "loss": 0.3513, "step": 908 }, { "epoch": 0.41, "grad_norm": 0.8321362137794495, "learning_rate": 5.8549931600547205e-05, "loss": 0.3362, "step": 909 }, { "epoch": 0.41, "grad_norm": 0.841407299041748, "learning_rate": 5.8504331965344286e-05, "loss": 0.3402, "step": 910 }, { "epoch": 0.41, "eval_loss": 0.36606305837631226, "eval_runtime": 21.2447, "eval_samples_per_second": 1.318, "eval_steps_per_second": 0.329, "step": 910 }, { "epoch": 0.42, "grad_norm": 0.873389720916748, "learning_rate": 5.845873233014136e-05, "loss": 0.3674, "step": 911 }, { "epoch": 0.42, "grad_norm": 0.8563045859336853, "learning_rate": 5.841313269493844e-05, "loss": 0.3587, "step": 912 }, { "epoch": 0.42, "grad_norm": 0.8238916993141174, "learning_rate": 5.8367533059735525e-05, "loss": 0.3521, "step": 913 }, { "epoch": 0.42, "grad_norm": 0.8294292688369751, "learning_rate": 5.8321933424532607e-05, "loss": 0.3614, "step": 914 }, { "epoch": 0.42, "grad_norm": 0.8287543058395386, "learning_rate": 5.827633378932968e-05, "loss": 0.3596, "step": 915 }, { "epoch": 0.42, "grad_norm": 0.8150818347930908, "learning_rate": 5.823073415412676e-05, "loss": 0.3515, "step": 916 }, { "epoch": 0.42, "grad_norm": 0.8092970252037048, "learning_rate": 5.818513451892386e-05, "loss": 0.329, "step": 917 }, { "epoch": 0.42, "grad_norm": 0.8499705195426941, "learning_rate": 5.8139534883720933e-05, "loss": 0.3446, "step": 918 }, { "epoch": 0.42, "grad_norm": 0.8703744411468506, "learning_rate": 5.8093935248518015e-05, "loss": 0.3618, "step": 919 }, { "epoch": 0.42, "grad_norm": 0.8624708652496338, "learning_rate": 5.80483356133151e-05, "loss": 0.3596, "step": 920 }, { "epoch": 0.42, "eval_loss": 0.3661345839500427, "eval_runtime": 21.1403, "eval_samples_per_second": 1.324, "eval_steps_per_second": 0.331, "step": 920 }, { "epoch": 0.42, "grad_norm": 0.811722993850708, "learning_rate": 5.800273597811218e-05, "loss": 0.3702, "step": 921 }, { "epoch": 0.42, "grad_norm": 0.8618665337562561, "learning_rate": 5.795713634290926e-05, "loss": 0.361, "step": 922 }, { "epoch": 0.42, "grad_norm": 0.8876213431358337, "learning_rate": 5.7911536707706335e-05, "loss": 0.3639, "step": 923 }, { "epoch": 0.42, "grad_norm": 0.842745840549469, "learning_rate": 5.786593707250342e-05, "loss": 0.3543, "step": 924 }, { "epoch": 0.42, "grad_norm": 0.8399864435195923, "learning_rate": 5.782033743730051e-05, "loss": 0.3282, "step": 925 }, { "epoch": 0.42, "grad_norm": 0.8489199280738831, "learning_rate": 5.777473780209759e-05, "loss": 0.3351, "step": 926 }, { "epoch": 0.42, "grad_norm": 0.8790804743766785, "learning_rate": 5.772913816689467e-05, "loss": 0.3722, "step": 927 }, { "epoch": 0.42, "grad_norm": 0.8245604634284973, "learning_rate": 5.768353853169175e-05, "loss": 0.3429, "step": 928 }, { "epoch": 0.42, "grad_norm": 0.7950709462165833, "learning_rate": 5.763793889648883e-05, "loss": 0.3395, "step": 929 }, { "epoch": 0.42, "grad_norm": 0.810104250907898, "learning_rate": 5.759233926128591e-05, "loss": 0.3478, "step": 930 }, { "epoch": 0.42, "eval_loss": 0.3656521439552307, "eval_runtime": 21.2315, "eval_samples_per_second": 1.319, "eval_steps_per_second": 0.33, "step": 930 }, { "epoch": 0.42, "grad_norm": 0.8308596611022949, "learning_rate": 5.754673962608299e-05, "loss": 0.3392, "step": 931 }, { "epoch": 0.42, "grad_norm": 0.8465328812599182, "learning_rate": 5.750113999088007e-05, "loss": 0.3587, "step": 932 }, { "epoch": 0.43, "grad_norm": 0.8508004546165466, "learning_rate": 5.745554035567716e-05, "loss": 0.3449, "step": 933 }, { "epoch": 0.43, "grad_norm": 0.9344462156295776, "learning_rate": 5.740994072047424e-05, "loss": 0.3814, "step": 934 }, { "epoch": 0.43, "grad_norm": 0.8226639032363892, "learning_rate": 5.736434108527132e-05, "loss": 0.3393, "step": 935 }, { "epoch": 0.43, "grad_norm": 0.903837263584137, "learning_rate": 5.7318741450068405e-05, "loss": 0.3545, "step": 936 }, { "epoch": 0.43, "grad_norm": 0.927081823348999, "learning_rate": 5.7273141814865486e-05, "loss": 0.3647, "step": 937 }, { "epoch": 0.43, "grad_norm": 0.8148213028907776, "learning_rate": 5.722754217966256e-05, "loss": 0.3397, "step": 938 }, { "epoch": 0.43, "grad_norm": 0.8732878565788269, "learning_rate": 5.718194254445964e-05, "loss": 0.3533, "step": 939 }, { "epoch": 0.43, "grad_norm": 0.8293365836143494, "learning_rate": 5.7136342909256725e-05, "loss": 0.3572, "step": 940 }, { "epoch": 0.43, "eval_loss": 0.3642878532409668, "eval_runtime": 18.6197, "eval_samples_per_second": 1.504, "eval_steps_per_second": 0.376, "step": 940 }, { "epoch": 0.43, "grad_norm": 0.8569697141647339, "learning_rate": 5.709074327405381e-05, "loss": 0.3527, "step": 941 }, { "epoch": 0.43, "grad_norm": 0.8310784101486206, "learning_rate": 5.7045143638850895e-05, "loss": 0.3418, "step": 942 }, { "epoch": 0.43, "grad_norm": 0.803471028804779, "learning_rate": 5.699954400364798e-05, "loss": 0.3377, "step": 943 }, { "epoch": 0.43, "grad_norm": 0.8265700936317444, "learning_rate": 5.695394436844506e-05, "loss": 0.3421, "step": 944 }, { "epoch": 0.43, "grad_norm": 0.8034089803695679, "learning_rate": 5.6908344733242133e-05, "loss": 0.3437, "step": 945 }, { "epoch": 0.43, "grad_norm": 0.8499231934547424, "learning_rate": 5.6862745098039215e-05, "loss": 0.3532, "step": 946 }, { "epoch": 0.43, "grad_norm": 0.7843233942985535, "learning_rate": 5.68171454628363e-05, "loss": 0.3402, "step": 947 }, { "epoch": 0.43, "grad_norm": 0.7977408170700073, "learning_rate": 5.677154582763338e-05, "loss": 0.3481, "step": 948 }, { "epoch": 0.43, "grad_norm": 0.8667237758636475, "learning_rate": 5.672594619243047e-05, "loss": 0.3843, "step": 949 }, { "epoch": 0.43, "grad_norm": 0.8898354172706604, "learning_rate": 5.668034655722755e-05, "loss": 0.3631, "step": 950 }, { "epoch": 0.43, "eval_loss": 0.36420196294784546, "eval_runtime": 19.1437, "eval_samples_per_second": 1.463, "eval_steps_per_second": 0.366, "step": 950 }, { "epoch": 0.43, "grad_norm": 0.8387503027915955, "learning_rate": 5.663474692202463e-05, "loss": 0.3391, "step": 951 }, { "epoch": 0.43, "grad_norm": 0.8411070108413696, "learning_rate": 5.6589147286821706e-05, "loss": 0.3699, "step": 952 }, { "epoch": 0.43, "grad_norm": 0.8362002968788147, "learning_rate": 5.654354765161879e-05, "loss": 0.338, "step": 953 }, { "epoch": 0.43, "grad_norm": 0.8469303846359253, "learning_rate": 5.649794801641587e-05, "loss": 0.363, "step": 954 }, { "epoch": 0.44, "grad_norm": 0.8548648953437805, "learning_rate": 5.645234838121295e-05, "loss": 0.3344, "step": 955 }, { "epoch": 0.44, "grad_norm": 0.8648215532302856, "learning_rate": 5.640674874601003e-05, "loss": 0.3636, "step": 956 }, { "epoch": 0.44, "grad_norm": 0.8931896686553955, "learning_rate": 5.636114911080712e-05, "loss": 0.3479, "step": 957 }, { "epoch": 0.44, "grad_norm": 0.8487157821655273, "learning_rate": 5.63155494756042e-05, "loss": 0.3603, "step": 958 }, { "epoch": 0.44, "grad_norm": 0.852799117565155, "learning_rate": 5.6269949840401285e-05, "loss": 0.3711, "step": 959 }, { "epoch": 0.44, "grad_norm": 0.8263891339302063, "learning_rate": 5.622435020519836e-05, "loss": 0.3574, "step": 960 }, { "epoch": 0.44, "eval_loss": 0.363812118768692, "eval_runtime": 18.6019, "eval_samples_per_second": 1.505, "eval_steps_per_second": 0.376, "step": 960 }, { "epoch": 0.44, "grad_norm": 0.8231980800628662, "learning_rate": 5.617875056999544e-05, "loss": 0.3317, "step": 961 }, { "epoch": 0.44, "grad_norm": 0.8345970511436462, "learning_rate": 5.613315093479252e-05, "loss": 0.346, "step": 962 }, { "epoch": 0.44, "grad_norm": 0.8778778910636902, "learning_rate": 5.6087551299589605e-05, "loss": 0.341, "step": 963 }, { "epoch": 0.44, "grad_norm": 0.7894421219825745, "learning_rate": 5.604195166438668e-05, "loss": 0.3379, "step": 964 }, { "epoch": 0.44, "grad_norm": 0.895371675491333, "learning_rate": 5.5996352029183775e-05, "loss": 0.3797, "step": 965 }, { "epoch": 0.44, "grad_norm": 0.8161599636077881, "learning_rate": 5.595075239398086e-05, "loss": 0.3472, "step": 966 }, { "epoch": 0.44, "grad_norm": 0.8689946532249451, "learning_rate": 5.590515275877793e-05, "loss": 0.3463, "step": 967 }, { "epoch": 0.44, "grad_norm": 0.7838358879089355, "learning_rate": 5.585955312357501e-05, "loss": 0.32, "step": 968 }, { "epoch": 0.44, "grad_norm": 0.8021233677864075, "learning_rate": 5.5813953488372095e-05, "loss": 0.3386, "step": 969 }, { "epoch": 0.44, "grad_norm": 0.8686328530311584, "learning_rate": 5.576835385316918e-05, "loss": 0.3388, "step": 970 }, { "epoch": 0.44, "eval_loss": 0.36350157856941223, "eval_runtime": 18.792, "eval_samples_per_second": 1.49, "eval_steps_per_second": 0.372, "step": 970 }, { "epoch": 0.44, "grad_norm": 0.8454912304878235, "learning_rate": 5.572275421796626e-05, "loss": 0.3627, "step": 971 }, { "epoch": 0.44, "grad_norm": 0.8108564019203186, "learning_rate": 5.5677154582763333e-05, "loss": 0.347, "step": 972 }, { "epoch": 0.44, "grad_norm": 0.8135057091712952, "learning_rate": 5.563155494756043e-05, "loss": 0.3557, "step": 973 }, { "epoch": 0.44, "grad_norm": 0.8536540865898132, "learning_rate": 5.558595531235751e-05, "loss": 0.3771, "step": 974 }, { "epoch": 0.44, "grad_norm": 0.8181318640708923, "learning_rate": 5.5540355677154585e-05, "loss": 0.3505, "step": 975 }, { "epoch": 0.44, "grad_norm": 0.7815612554550171, "learning_rate": 5.549475604195167e-05, "loss": 0.3259, "step": 976 }, { "epoch": 0.45, "grad_norm": 0.8374220728874207, "learning_rate": 5.544915640674875e-05, "loss": 0.3544, "step": 977 }, { "epoch": 0.45, "grad_norm": 0.8432349562644958, "learning_rate": 5.540355677154583e-05, "loss": 0.3551, "step": 978 }, { "epoch": 0.45, "grad_norm": 0.824934720993042, "learning_rate": 5.5357957136342906e-05, "loss": 0.3477, "step": 979 }, { "epoch": 0.45, "grad_norm": 0.8221052289009094, "learning_rate": 5.531235750113999e-05, "loss": 0.3248, "step": 980 }, { "epoch": 0.45, "eval_loss": 0.3641625940799713, "eval_runtime": 18.8993, "eval_samples_per_second": 1.482, "eval_steps_per_second": 0.37, "step": 980 }, { "epoch": 0.45, "grad_norm": 0.7826463580131531, "learning_rate": 5.526675786593708e-05, "loss": 0.3287, "step": 981 }, { "epoch": 0.45, "grad_norm": 0.7870301008224487, "learning_rate": 5.522115823073416e-05, "loss": 0.3407, "step": 982 }, { "epoch": 0.45, "grad_norm": 0.8751205205917358, "learning_rate": 5.517555859553124e-05, "loss": 0.3495, "step": 983 }, { "epoch": 0.45, "grad_norm": 0.8503699898719788, "learning_rate": 5.512995896032832e-05, "loss": 0.3596, "step": 984 }, { "epoch": 0.45, "grad_norm": 0.8469080328941345, "learning_rate": 5.50843593251254e-05, "loss": 0.3334, "step": 985 }, { "epoch": 0.45, "grad_norm": 0.8288311958312988, "learning_rate": 5.503875968992248e-05, "loss": 0.3737, "step": 986 }, { "epoch": 0.45, "grad_norm": 0.8115249276161194, "learning_rate": 5.499316005471956e-05, "loss": 0.3358, "step": 987 }, { "epoch": 0.45, "grad_norm": 0.8277326226234436, "learning_rate": 5.494756041951664e-05, "loss": 0.3303, "step": 988 }, { "epoch": 0.45, "grad_norm": 0.8234649896621704, "learning_rate": 5.490196078431373e-05, "loss": 0.3494, "step": 989 }, { "epoch": 0.45, "grad_norm": 0.8039526343345642, "learning_rate": 5.485636114911081e-05, "loss": 0.3379, "step": 990 }, { "epoch": 0.45, "eval_loss": 0.36366531252861023, "eval_runtime": 19.7343, "eval_samples_per_second": 1.419, "eval_steps_per_second": 0.355, "step": 990 }, { "epoch": 0.45, "grad_norm": 0.8476756811141968, "learning_rate": 5.481076151390789e-05, "loss": 0.3373, "step": 991 }, { "epoch": 0.45, "grad_norm": 0.8155856132507324, "learning_rate": 5.4765161878704975e-05, "loss": 0.3464, "step": 992 }, { "epoch": 0.45, "grad_norm": 0.8660078644752502, "learning_rate": 5.471956224350206e-05, "loss": 0.3739, "step": 993 }, { "epoch": 0.45, "grad_norm": 0.8255999684333801, "learning_rate": 5.467396260829913e-05, "loss": 0.3531, "step": 994 }, { "epoch": 0.45, "grad_norm": 0.7718782424926758, "learning_rate": 5.462836297309621e-05, "loss": 0.3346, "step": 995 }, { "epoch": 0.45, "grad_norm": 0.8417358994483948, "learning_rate": 5.4582763337893295e-05, "loss": 0.3499, "step": 996 }, { "epoch": 0.45, "grad_norm": 0.8342063426971436, "learning_rate": 5.4537163702690384e-05, "loss": 0.3481, "step": 997 }, { "epoch": 0.45, "grad_norm": 0.8939498066902161, "learning_rate": 5.4491564067487465e-05, "loss": 0.345, "step": 998 }, { "epoch": 0.46, "grad_norm": 0.8737196922302246, "learning_rate": 5.444596443228455e-05, "loss": 0.356, "step": 999 }, { "epoch": 0.46, "grad_norm": 0.8749281764030457, "learning_rate": 5.440036479708163e-05, "loss": 0.3428, "step": 1000 }, { "epoch": 0.46, "eval_loss": 0.3630655109882355, "eval_runtime": 21.6489, "eval_samples_per_second": 1.293, "eval_steps_per_second": 0.323, "step": 1000 }, { "epoch": 0.46, "grad_norm": 0.8389559388160706, "learning_rate": 5.4354765161878704e-05, "loss": 0.3527, "step": 1001 }, { "epoch": 0.46, "grad_norm": 0.7941823601722717, "learning_rate": 5.4309165526675785e-05, "loss": 0.3312, "step": 1002 }, { "epoch": 0.46, "grad_norm": 0.8154469132423401, "learning_rate": 5.426356589147287e-05, "loss": 0.3512, "step": 1003 }, { "epoch": 0.46, "grad_norm": 0.8760425448417664, "learning_rate": 5.421796625626995e-05, "loss": 0.3367, "step": 1004 }, { "epoch": 0.46, "grad_norm": 0.821310818195343, "learning_rate": 5.417236662106704e-05, "loss": 0.353, "step": 1005 }, { "epoch": 0.46, "grad_norm": 0.8528419137001038, "learning_rate": 5.412676698586412e-05, "loss": 0.354, "step": 1006 }, { "epoch": 0.46, "grad_norm": 0.8211894035339355, "learning_rate": 5.40811673506612e-05, "loss": 0.3332, "step": 1007 }, { "epoch": 0.46, "grad_norm": 0.8660132884979248, "learning_rate": 5.403556771545828e-05, "loss": 0.3386, "step": 1008 }, { "epoch": 0.46, "grad_norm": 0.8995015621185303, "learning_rate": 5.398996808025536e-05, "loss": 0.3623, "step": 1009 }, { "epoch": 0.46, "grad_norm": 0.8141841888427734, "learning_rate": 5.394436844505244e-05, "loss": 0.3643, "step": 1010 }, { "epoch": 0.46, "eval_loss": 0.3631010949611664, "eval_runtime": 20.9081, "eval_samples_per_second": 1.339, "eval_steps_per_second": 0.335, "step": 1010 }, { "epoch": 0.46, "grad_norm": 0.828433632850647, "learning_rate": 5.389876880984952e-05, "loss": 0.3394, "step": 1011 }, { "epoch": 0.46, "grad_norm": 0.8452778458595276, "learning_rate": 5.38531691746466e-05, "loss": 0.35, "step": 1012 }, { "epoch": 0.46, "grad_norm": 0.8223903775215149, "learning_rate": 5.380756953944369e-05, "loss": 0.3459, "step": 1013 }, { "epoch": 0.46, "grad_norm": 0.8182603716850281, "learning_rate": 5.376196990424077e-05, "loss": 0.3388, "step": 1014 }, { "epoch": 0.46, "grad_norm": 0.8223605155944824, "learning_rate": 5.3716370269037855e-05, "loss": 0.3477, "step": 1015 }, { "epoch": 0.46, "grad_norm": 0.8775902390480042, "learning_rate": 5.367077063383493e-05, "loss": 0.348, "step": 1016 }, { "epoch": 0.46, "grad_norm": 0.8247773051261902, "learning_rate": 5.362517099863201e-05, "loss": 0.343, "step": 1017 }, { "epoch": 0.46, "grad_norm": 0.8566569089889526, "learning_rate": 5.357957136342909e-05, "loss": 0.3296, "step": 1018 }, { "epoch": 0.46, "grad_norm": 0.9025068879127502, "learning_rate": 5.3533971728226175e-05, "loss": 0.3685, "step": 1019 }, { "epoch": 0.46, "grad_norm": 0.8382214307785034, "learning_rate": 5.348837209302326e-05, "loss": 0.3598, "step": 1020 }, { "epoch": 0.46, "eval_loss": 0.3636544644832611, "eval_runtime": 20.2937, "eval_samples_per_second": 1.38, "eval_steps_per_second": 0.345, "step": 1020 }, { "epoch": 0.47, "grad_norm": 0.8766863942146301, "learning_rate": 5.3442772457820345e-05, "loss": 0.3596, "step": 1021 }, { "epoch": 0.47, "grad_norm": 0.8422759771347046, "learning_rate": 5.339717282261743e-05, "loss": 0.3441, "step": 1022 }, { "epoch": 0.47, "grad_norm": 0.8558328151702881, "learning_rate": 5.335157318741451e-05, "loss": 0.3508, "step": 1023 }, { "epoch": 0.47, "grad_norm": 0.8047720789909363, "learning_rate": 5.3305973552211584e-05, "loss": 0.3365, "step": 1024 }, { "epoch": 0.47, "grad_norm": 0.8321728110313416, "learning_rate": 5.3260373917008665e-05, "loss": 0.3579, "step": 1025 }, { "epoch": 0.47, "grad_norm": 0.8026608824729919, "learning_rate": 5.321477428180575e-05, "loss": 0.3358, "step": 1026 }, { "epoch": 0.47, "grad_norm": 0.8313252329826355, "learning_rate": 5.316917464660283e-05, "loss": 0.3308, "step": 1027 }, { "epoch": 0.47, "grad_norm": 0.858326256275177, "learning_rate": 5.3123575011399904e-05, "loss": 0.3489, "step": 1028 }, { "epoch": 0.47, "grad_norm": 0.8693019151687622, "learning_rate": 5.3077975376197e-05, "loss": 0.3528, "step": 1029 }, { "epoch": 0.47, "grad_norm": 0.8624293804168701, "learning_rate": 5.303237574099408e-05, "loss": 0.3656, "step": 1030 }, { "epoch": 0.47, "eval_loss": 0.36372342705726624, "eval_runtime": 20.3094, "eval_samples_per_second": 1.379, "eval_steps_per_second": 0.345, "step": 1030 }, { "epoch": 0.47, "grad_norm": 0.8259044289588928, "learning_rate": 5.2986776105791156e-05, "loss": 0.3261, "step": 1031 }, { "epoch": 0.47, "grad_norm": 0.8822616338729858, "learning_rate": 5.294117647058824e-05, "loss": 0.3518, "step": 1032 }, { "epoch": 0.47, "grad_norm": 0.8377732634544373, "learning_rate": 5.289557683538532e-05, "loss": 0.3557, "step": 1033 }, { "epoch": 0.47, "grad_norm": 0.8176842331886292, "learning_rate": 5.28499772001824e-05, "loss": 0.3542, "step": 1034 }, { "epoch": 0.47, "grad_norm": 0.8251992464065552, "learning_rate": 5.2804377564979476e-05, "loss": 0.3672, "step": 1035 }, { "epoch": 0.47, "grad_norm": 0.8290631175041199, "learning_rate": 5.275877792977656e-05, "loss": 0.3456, "step": 1036 }, { "epoch": 0.47, "grad_norm": 0.8259645700454712, "learning_rate": 5.271317829457365e-05, "loss": 0.3536, "step": 1037 }, { "epoch": 0.47, "grad_norm": 0.8718307018280029, "learning_rate": 5.266757865937073e-05, "loss": 0.3706, "step": 1038 }, { "epoch": 0.47, "grad_norm": 0.8760273456573486, "learning_rate": 5.262197902416781e-05, "loss": 0.3639, "step": 1039 }, { "epoch": 0.47, "grad_norm": 0.8457003235816956, "learning_rate": 5.257637938896489e-05, "loss": 0.3671, "step": 1040 }, { "epoch": 0.47, "eval_loss": 0.3634140193462372, "eval_runtime": 20.5539, "eval_samples_per_second": 1.362, "eval_steps_per_second": 0.341, "step": 1040 }, { "epoch": 0.47, "grad_norm": 0.8008033037185669, "learning_rate": 5.253077975376197e-05, "loss": 0.3534, "step": 1041 }, { "epoch": 0.47, "grad_norm": 0.8359259366989136, "learning_rate": 5.2485180118559055e-05, "loss": 0.342, "step": 1042 }, { "epoch": 0.48, "grad_norm": 0.8333999514579773, "learning_rate": 5.243958048335613e-05, "loss": 0.3505, "step": 1043 }, { "epoch": 0.48, "grad_norm": 0.8984323143959045, "learning_rate": 5.239398084815321e-05, "loss": 0.3435, "step": 1044 }, { "epoch": 0.48, "grad_norm": 0.7493898272514343, "learning_rate": 5.234838121295031e-05, "loss": 0.3247, "step": 1045 }, { "epoch": 0.48, "grad_norm": 0.8557037115097046, "learning_rate": 5.230278157774738e-05, "loss": 0.3829, "step": 1046 }, { "epoch": 0.48, "grad_norm": 0.8355690836906433, "learning_rate": 5.2257181942544463e-05, "loss": 0.346, "step": 1047 }, { "epoch": 0.48, "grad_norm": 0.8401297330856323, "learning_rate": 5.2211582307341545e-05, "loss": 0.3598, "step": 1048 }, { "epoch": 0.48, "grad_norm": 0.8262518644332886, "learning_rate": 5.216598267213863e-05, "loss": 0.3418, "step": 1049 }, { "epoch": 0.48, "grad_norm": 0.7955124974250793, "learning_rate": 5.21203830369357e-05, "loss": 0.334, "step": 1050 }, { "epoch": 0.48, "eval_loss": 0.36403360962867737, "eval_runtime": 20.7565, "eval_samples_per_second": 1.349, "eval_steps_per_second": 0.337, "step": 1050 }, { "epoch": 0.48, "grad_norm": 0.8252997994422913, "learning_rate": 5.2074783401732784e-05, "loss": 0.3431, "step": 1051 }, { "epoch": 0.48, "grad_norm": 0.8318145275115967, "learning_rate": 5.2029183766529865e-05, "loss": 0.3486, "step": 1052 }, { "epoch": 0.48, "grad_norm": 0.8297901153564453, "learning_rate": 5.1983584131326954e-05, "loss": 0.3472, "step": 1053 }, { "epoch": 0.48, "grad_norm": 0.7829481959342957, "learning_rate": 5.1937984496124036e-05, "loss": 0.3517, "step": 1054 }, { "epoch": 0.48, "grad_norm": 0.8852716684341431, "learning_rate": 5.189238486092112e-05, "loss": 0.3607, "step": 1055 }, { "epoch": 0.48, "grad_norm": 0.8906071186065674, "learning_rate": 5.18467852257182e-05, "loss": 0.3742, "step": 1056 }, { "epoch": 0.48, "grad_norm": 0.8188351392745972, "learning_rate": 5.180118559051528e-05, "loss": 0.3479, "step": 1057 }, { "epoch": 0.48, "grad_norm": 0.8125540018081665, "learning_rate": 5.1755585955312356e-05, "loss": 0.3599, "step": 1058 }, { "epoch": 0.48, "grad_norm": 0.8210964202880859, "learning_rate": 5.170998632010944e-05, "loss": 0.349, "step": 1059 }, { "epoch": 0.48, "grad_norm": 0.856060802936554, "learning_rate": 5.166438668490652e-05, "loss": 0.3565, "step": 1060 }, { "epoch": 0.48, "eval_loss": 0.3636583387851715, "eval_runtime": 21.8196, "eval_samples_per_second": 1.283, "eval_steps_per_second": 0.321, "step": 1060 }, { "epoch": 0.48, "grad_norm": 0.8330450654029846, "learning_rate": 5.161878704970361e-05, "loss": 0.3296, "step": 1061 }, { "epoch": 0.48, "grad_norm": 0.8015806078910828, "learning_rate": 5.157318741450069e-05, "loss": 0.3334, "step": 1062 }, { "epoch": 0.48, "grad_norm": 0.8128681182861328, "learning_rate": 5.152758777929777e-05, "loss": 0.3382, "step": 1063 }, { "epoch": 0.49, "grad_norm": 0.8020748496055603, "learning_rate": 5.148198814409485e-05, "loss": 0.3409, "step": 1064 }, { "epoch": 0.49, "grad_norm": 0.843084454536438, "learning_rate": 5.143638850889193e-05, "loss": 0.3456, "step": 1065 }, { "epoch": 0.49, "grad_norm": 0.7976772785186768, "learning_rate": 5.139078887368901e-05, "loss": 0.343, "step": 1066 }, { "epoch": 0.49, "grad_norm": 0.8992964029312134, "learning_rate": 5.134518923848609e-05, "loss": 0.3494, "step": 1067 }, { "epoch": 0.49, "grad_norm": 0.9563857913017273, "learning_rate": 5.129958960328317e-05, "loss": 0.3546, "step": 1068 }, { "epoch": 0.49, "grad_norm": 0.8422328233718872, "learning_rate": 5.125398996808026e-05, "loss": 0.3622, "step": 1069 }, { "epoch": 0.49, "grad_norm": 0.8861860036849976, "learning_rate": 5.120839033287734e-05, "loss": 0.3698, "step": 1070 }, { "epoch": 0.49, "eval_loss": 0.3621842563152313, "eval_runtime": 21.5719, "eval_samples_per_second": 1.298, "eval_steps_per_second": 0.324, "step": 1070 }, { "epoch": 0.49, "grad_norm": 0.8619980216026306, "learning_rate": 5.1162790697674425e-05, "loss": 0.3595, "step": 1071 }, { "epoch": 0.49, "grad_norm": 0.8314919471740723, "learning_rate": 5.111719106247151e-05, "loss": 0.3578, "step": 1072 }, { "epoch": 0.49, "grad_norm": 0.8684543371200562, "learning_rate": 5.107159142726858e-05, "loss": 0.3452, "step": 1073 }, { "epoch": 0.49, "grad_norm": 0.8409459590911865, "learning_rate": 5.1025991792065663e-05, "loss": 0.3541, "step": 1074 }, { "epoch": 0.49, "grad_norm": 0.814139187335968, "learning_rate": 5.0980392156862745e-05, "loss": 0.352, "step": 1075 }, { "epoch": 0.49, "grad_norm": 0.774848461151123, "learning_rate": 5.093479252165983e-05, "loss": 0.3324, "step": 1076 }, { "epoch": 0.49, "grad_norm": 0.8436389565467834, "learning_rate": 5.0889192886456915e-05, "loss": 0.3546, "step": 1077 }, { "epoch": 0.49, "grad_norm": 0.8201998472213745, "learning_rate": 5.0843593251254e-05, "loss": 0.3437, "step": 1078 }, { "epoch": 0.49, "grad_norm": 0.834858775138855, "learning_rate": 5.079799361605108e-05, "loss": 0.3442, "step": 1079 }, { "epoch": 0.49, "grad_norm": 0.8360298275947571, "learning_rate": 5.0752393980848154e-05, "loss": 0.3461, "step": 1080 }, { "epoch": 0.49, "eval_loss": 0.3615194857120514, "eval_runtime": 20.521, "eval_samples_per_second": 1.364, "eval_steps_per_second": 0.341, "step": 1080 }, { "epoch": 0.49, "grad_norm": 0.8249042630195618, "learning_rate": 5.0706794345645236e-05, "loss": 0.3574, "step": 1081 }, { "epoch": 0.49, "grad_norm": 0.8017203211784363, "learning_rate": 5.066119471044232e-05, "loss": 0.3428, "step": 1082 }, { "epoch": 0.49, "grad_norm": 0.8337559700012207, "learning_rate": 5.06155950752394e-05, "loss": 0.3539, "step": 1083 }, { "epoch": 0.49, "grad_norm": 0.8060609698295593, "learning_rate": 5.0569995440036474e-05, "loss": 0.3339, "step": 1084 }, { "epoch": 0.49, "grad_norm": 0.8409240245819092, "learning_rate": 5.052439580483357e-05, "loss": 0.3564, "step": 1085 }, { "epoch": 0.5, "grad_norm": 0.8593933582305908, "learning_rate": 5.047879616963065e-05, "loss": 0.3243, "step": 1086 }, { "epoch": 0.5, "grad_norm": 0.853076696395874, "learning_rate": 5.0433196534427726e-05, "loss": 0.3351, "step": 1087 }, { "epoch": 0.5, "grad_norm": 0.8284323811531067, "learning_rate": 5.038759689922481e-05, "loss": 0.3655, "step": 1088 }, { "epoch": 0.5, "grad_norm": 0.8908005356788635, "learning_rate": 5.034199726402189e-05, "loss": 0.3819, "step": 1089 }, { "epoch": 0.5, "grad_norm": 0.7974300980567932, "learning_rate": 5.029639762881897e-05, "loss": 0.3114, "step": 1090 }, { "epoch": 0.5, "eval_loss": 0.3615086078643799, "eval_runtime": 20.4658, "eval_samples_per_second": 1.368, "eval_steps_per_second": 0.342, "step": 1090 }, { "epoch": 0.5, "grad_norm": 0.8122878074645996, "learning_rate": 5.025079799361605e-05, "loss": 0.3515, "step": 1091 }, { "epoch": 0.5, "grad_norm": 0.8021461367607117, "learning_rate": 5.020519835841313e-05, "loss": 0.3382, "step": 1092 }, { "epoch": 0.5, "grad_norm": 0.8411031365394592, "learning_rate": 5.015959872321022e-05, "loss": 0.3486, "step": 1093 }, { "epoch": 0.5, "grad_norm": 0.8350054025650024, "learning_rate": 5.0113999088007305e-05, "loss": 0.3644, "step": 1094 }, { "epoch": 0.5, "grad_norm": 0.8409880995750427, "learning_rate": 5.006839945280438e-05, "loss": 0.3475, "step": 1095 }, { "epoch": 0.5, "grad_norm": 0.8329557776451111, "learning_rate": 5.002279981760146e-05, "loss": 0.3403, "step": 1096 }, { "epoch": 0.5, "grad_norm": 0.7804337739944458, "learning_rate": 4.997720018239854e-05, "loss": 0.3353, "step": 1097 }, { "epoch": 0.5, "grad_norm": 0.8142528533935547, "learning_rate": 4.9931600547195625e-05, "loss": 0.3405, "step": 1098 }, { "epoch": 0.5, "grad_norm": 0.8784443736076355, "learning_rate": 4.988600091199271e-05, "loss": 0.3688, "step": 1099 }, { "epoch": 0.5, "grad_norm": 0.8305411338806152, "learning_rate": 4.984040127678979e-05, "loss": 0.3334, "step": 1100 }, { "epoch": 0.5, "eval_loss": 0.36101606488227844, "eval_runtime": 20.3924, "eval_samples_per_second": 1.373, "eval_steps_per_second": 0.343, "step": 1100 }, { "epoch": 0.5, "grad_norm": 0.7925887703895569, "learning_rate": 4.979480164158687e-05, "loss": 0.3379, "step": 1101 }, { "epoch": 0.5, "grad_norm": 0.8849925398826599, "learning_rate": 4.974920200638395e-05, "loss": 0.3611, "step": 1102 }, { "epoch": 0.5, "grad_norm": 0.8023688197135925, "learning_rate": 4.9703602371181034e-05, "loss": 0.3452, "step": 1103 }, { "epoch": 0.5, "grad_norm": 0.813382625579834, "learning_rate": 4.9658002735978115e-05, "loss": 0.331, "step": 1104 }, { "epoch": 0.5, "grad_norm": 0.8702760934829712, "learning_rate": 4.96124031007752e-05, "loss": 0.3497, "step": 1105 }, { "epoch": 0.5, "grad_norm": 0.8400478363037109, "learning_rate": 4.956680346557228e-05, "loss": 0.3465, "step": 1106 }, { "epoch": 0.5, "grad_norm": 0.8609248399734497, "learning_rate": 4.952120383036936e-05, "loss": 0.3288, "step": 1107 }, { "epoch": 0.51, "grad_norm": 0.867631733417511, "learning_rate": 4.947560419516644e-05, "loss": 0.3582, "step": 1108 }, { "epoch": 0.51, "grad_norm": 0.804740846157074, "learning_rate": 4.9430004559963524e-05, "loss": 0.3348, "step": 1109 }, { "epoch": 0.51, "grad_norm": 0.8101495504379272, "learning_rate": 4.93844049247606e-05, "loss": 0.3542, "step": 1110 }, { "epoch": 0.51, "eval_loss": 0.3616204261779785, "eval_runtime": 20.7406, "eval_samples_per_second": 1.35, "eval_steps_per_second": 0.338, "step": 1110 }, { "epoch": 0.51, "grad_norm": 0.8318590521812439, "learning_rate": 4.933880528955769e-05, "loss": 0.3397, "step": 1111 }, { "epoch": 0.51, "grad_norm": 0.8036967515945435, "learning_rate": 4.929320565435477e-05, "loss": 0.349, "step": 1112 }, { "epoch": 0.51, "grad_norm": 0.8411818146705627, "learning_rate": 4.924760601915185e-05, "loss": 0.3442, "step": 1113 }, { "epoch": 0.51, "grad_norm": 0.8122812509536743, "learning_rate": 4.9202006383948926e-05, "loss": 0.3503, "step": 1114 }, { "epoch": 0.51, "grad_norm": 0.8358497023582458, "learning_rate": 4.9156406748746014e-05, "loss": 0.3533, "step": 1115 }, { "epoch": 0.51, "grad_norm": 0.83430415391922, "learning_rate": 4.9110807113543096e-05, "loss": 0.3624, "step": 1116 }, { "epoch": 0.51, "grad_norm": 0.8298221230506897, "learning_rate": 4.906520747834018e-05, "loss": 0.3448, "step": 1117 }, { "epoch": 0.51, "grad_norm": 0.827634871006012, "learning_rate": 4.901960784313725e-05, "loss": 0.3478, "step": 1118 }, { "epoch": 0.51, "grad_norm": 0.8373301029205322, "learning_rate": 4.897400820793434e-05, "loss": 0.3384, "step": 1119 }, { "epoch": 0.51, "grad_norm": 0.8340453505516052, "learning_rate": 4.892840857273142e-05, "loss": 0.3394, "step": 1120 }, { "epoch": 0.51, "eval_loss": 0.36215755343437195, "eval_runtime": 19.4441, "eval_samples_per_second": 1.44, "eval_steps_per_second": 0.36, "step": 1120 }, { "epoch": 0.51, "grad_norm": 0.8383544683456421, "learning_rate": 4.88828089375285e-05, "loss": 0.3594, "step": 1121 }, { "epoch": 0.51, "grad_norm": 0.7880938053131104, "learning_rate": 4.883720930232558e-05, "loss": 0.3327, "step": 1122 }, { "epoch": 0.51, "grad_norm": 0.7821892499923706, "learning_rate": 4.879160966712267e-05, "loss": 0.352, "step": 1123 }, { "epoch": 0.51, "grad_norm": 0.8121359348297119, "learning_rate": 4.874601003191975e-05, "loss": 0.3515, "step": 1124 }, { "epoch": 0.51, "grad_norm": 0.8127467036247253, "learning_rate": 4.8700410396716825e-05, "loss": 0.3505, "step": 1125 }, { "epoch": 0.51, "grad_norm": 0.8172454833984375, "learning_rate": 4.865481076151391e-05, "loss": 0.3329, "step": 1126 }, { "epoch": 0.51, "grad_norm": 0.8340194225311279, "learning_rate": 4.8609211126310995e-05, "loss": 0.3442, "step": 1127 }, { "epoch": 0.51, "grad_norm": 0.7899029850959778, "learning_rate": 4.856361149110808e-05, "loss": 0.3338, "step": 1128 }, { "epoch": 0.51, "grad_norm": 0.8151934742927551, "learning_rate": 4.851801185590515e-05, "loss": 0.3408, "step": 1129 }, { "epoch": 0.52, "grad_norm": 0.8011878728866577, "learning_rate": 4.8472412220702234e-05, "loss": 0.3335, "step": 1130 }, { "epoch": 0.52, "eval_loss": 0.3620659410953522, "eval_runtime": 21.8046, "eval_samples_per_second": 1.284, "eval_steps_per_second": 0.321, "step": 1130 }, { "epoch": 0.52, "grad_norm": 0.8504952788352966, "learning_rate": 4.842681258549932e-05, "loss": 0.3377, "step": 1131 }, { "epoch": 0.52, "grad_norm": 0.7817379236221313, "learning_rate": 4.8381212950296404e-05, "loss": 0.355, "step": 1132 }, { "epoch": 0.52, "grad_norm": 0.8368287086486816, "learning_rate": 4.833561331509348e-05, "loss": 0.3565, "step": 1133 }, { "epoch": 0.52, "grad_norm": 0.8199528455734253, "learning_rate": 4.829001367989056e-05, "loss": 0.3296, "step": 1134 }, { "epoch": 0.52, "grad_norm": 0.828828752040863, "learning_rate": 4.824441404468765e-05, "loss": 0.3358, "step": 1135 }, { "epoch": 0.52, "grad_norm": 0.7847192287445068, "learning_rate": 4.8198814409484724e-05, "loss": 0.3432, "step": 1136 }, { "epoch": 0.52, "grad_norm": 0.8164374828338623, "learning_rate": 4.8153214774281806e-05, "loss": 0.3334, "step": 1137 }, { "epoch": 0.52, "grad_norm": 0.8527355790138245, "learning_rate": 4.810761513907889e-05, "loss": 0.3697, "step": 1138 }, { "epoch": 0.52, "grad_norm": 0.7457334399223328, "learning_rate": 4.8062015503875976e-05, "loss": 0.3177, "step": 1139 }, { "epoch": 0.52, "grad_norm": 0.8588700294494629, "learning_rate": 4.801641586867305e-05, "loss": 0.3574, "step": 1140 }, { "epoch": 0.52, "eval_loss": 0.36230775713920593, "eval_runtime": 21.7776, "eval_samples_per_second": 1.286, "eval_steps_per_second": 0.321, "step": 1140 }, { "epoch": 0.52, "grad_norm": 0.8982169032096863, "learning_rate": 4.797081623347013e-05, "loss": 0.3492, "step": 1141 }, { "epoch": 0.52, "grad_norm": 0.8421137928962708, "learning_rate": 4.7925216598267214e-05, "loss": 0.3535, "step": 1142 }, { "epoch": 0.52, "grad_norm": 0.7602046132087708, "learning_rate": 4.78796169630643e-05, "loss": 0.307, "step": 1143 }, { "epoch": 0.52, "grad_norm": 0.8325467109680176, "learning_rate": 4.783401732786138e-05, "loss": 0.3388, "step": 1144 }, { "epoch": 0.52, "grad_norm": 0.8151599168777466, "learning_rate": 4.778841769265846e-05, "loss": 0.3447, "step": 1145 }, { "epoch": 0.52, "grad_norm": 0.8560123443603516, "learning_rate": 4.774281805745554e-05, "loss": 0.3515, "step": 1146 }, { "epoch": 0.52, "grad_norm": 0.8704785108566284, "learning_rate": 4.769721842225262e-05, "loss": 0.3558, "step": 1147 }, { "epoch": 0.52, "grad_norm": 0.8487253189086914, "learning_rate": 4.7651618787049705e-05, "loss": 0.3404, "step": 1148 }, { "epoch": 0.52, "grad_norm": 0.795734703540802, "learning_rate": 4.7606019151846787e-05, "loss": 0.3337, "step": 1149 }, { "epoch": 0.52, "grad_norm": 0.7966299653053284, "learning_rate": 4.756041951664387e-05, "loss": 0.3257, "step": 1150 }, { "epoch": 0.52, "eval_loss": 0.3623342514038086, "eval_runtime": 21.8906, "eval_samples_per_second": 1.279, "eval_steps_per_second": 0.32, "step": 1150 }, { "epoch": 0.52, "grad_norm": 0.8604366183280945, "learning_rate": 4.751481988144095e-05, "loss": 0.3352, "step": 1151 }, { "epoch": 0.53, "grad_norm": 0.8704828023910522, "learning_rate": 4.746922024623803e-05, "loss": 0.3767, "step": 1152 }, { "epoch": 0.53, "grad_norm": 0.8722444176673889, "learning_rate": 4.7423620611035114e-05, "loss": 0.3706, "step": 1153 }, { "epoch": 0.53, "grad_norm": 0.8082621097564697, "learning_rate": 4.7378020975832195e-05, "loss": 0.3292, "step": 1154 }, { "epoch": 0.53, "grad_norm": 0.8296574354171753, "learning_rate": 4.733242134062928e-05, "loss": 0.3493, "step": 1155 }, { "epoch": 0.53, "grad_norm": 0.8503522872924805, "learning_rate": 4.728682170542636e-05, "loss": 0.3528, "step": 1156 }, { "epoch": 0.53, "grad_norm": 0.8259211182594299, "learning_rate": 4.724122207022344e-05, "loss": 0.368, "step": 1157 }, { "epoch": 0.53, "grad_norm": 0.83866947889328, "learning_rate": 4.719562243502052e-05, "loss": 0.35, "step": 1158 }, { "epoch": 0.53, "grad_norm": 0.8358547687530518, "learning_rate": 4.7150022799817604e-05, "loss": 0.3318, "step": 1159 }, { "epoch": 0.53, "grad_norm": 0.8184478878974915, "learning_rate": 4.7104423164614686e-05, "loss": 0.3471, "step": 1160 }, { "epoch": 0.53, "eval_loss": 0.3614867329597473, "eval_runtime": 21.8624, "eval_samples_per_second": 1.281, "eval_steps_per_second": 0.32, "step": 1160 }, { "epoch": 0.53, "grad_norm": 0.820179283618927, "learning_rate": 4.705882352941177e-05, "loss": 0.3258, "step": 1161 }, { "epoch": 0.53, "grad_norm": 0.8562389612197876, "learning_rate": 4.701322389420885e-05, "loss": 0.3589, "step": 1162 }, { "epoch": 0.53, "grad_norm": 0.809478759765625, "learning_rate": 4.696762425900593e-05, "loss": 0.3172, "step": 1163 }, { "epoch": 0.53, "grad_norm": 0.8187942504882812, "learning_rate": 4.692202462380301e-05, "loss": 0.3636, "step": 1164 }, { "epoch": 0.53, "grad_norm": 0.8812757134437561, "learning_rate": 4.6876424988600094e-05, "loss": 0.3622, "step": 1165 }, { "epoch": 0.53, "grad_norm": 0.93978351354599, "learning_rate": 4.6830825353397176e-05, "loss": 0.3601, "step": 1166 }, { "epoch": 0.53, "grad_norm": 0.8277170062065125, "learning_rate": 4.678522571819426e-05, "loss": 0.3563, "step": 1167 }, { "epoch": 0.53, "grad_norm": 0.8791542053222656, "learning_rate": 4.673962608299134e-05, "loss": 0.344, "step": 1168 }, { "epoch": 0.53, "grad_norm": 0.848914384841919, "learning_rate": 4.669402644778842e-05, "loss": 0.352, "step": 1169 }, { "epoch": 0.53, "grad_norm": 0.8371862769126892, "learning_rate": 4.6648426812585496e-05, "loss": 0.343, "step": 1170 }, { "epoch": 0.53, "eval_loss": 0.3610856831073761, "eval_runtime": 21.8836, "eval_samples_per_second": 1.279, "eval_steps_per_second": 0.32, "step": 1170 }, { "epoch": 0.53, "grad_norm": 0.8674805164337158, "learning_rate": 4.6602827177382585e-05, "loss": 0.3488, "step": 1171 }, { "epoch": 0.53, "grad_norm": 0.8609682321548462, "learning_rate": 4.6557227542179666e-05, "loss": 0.3599, "step": 1172 }, { "epoch": 0.53, "grad_norm": 0.8340581655502319, "learning_rate": 4.651162790697675e-05, "loss": 0.3602, "step": 1173 }, { "epoch": 0.54, "grad_norm": 0.8438746333122253, "learning_rate": 4.646602827177382e-05, "loss": 0.3399, "step": 1174 }, { "epoch": 0.54, "grad_norm": 0.8461584448814392, "learning_rate": 4.642042863657091e-05, "loss": 0.3349, "step": 1175 }, { "epoch": 0.54, "grad_norm": 0.7988734245300293, "learning_rate": 4.637482900136799e-05, "loss": 0.3482, "step": 1176 }, { "epoch": 0.54, "grad_norm": 0.8099560737609863, "learning_rate": 4.6329229366165075e-05, "loss": 0.3378, "step": 1177 }, { "epoch": 0.54, "grad_norm": 0.854646623134613, "learning_rate": 4.628362973096215e-05, "loss": 0.3622, "step": 1178 }, { "epoch": 0.54, "grad_norm": 0.8347275853157043, "learning_rate": 4.623803009575924e-05, "loss": 0.3366, "step": 1179 }, { "epoch": 0.54, "grad_norm": 0.7979652881622314, "learning_rate": 4.619243046055632e-05, "loss": 0.3439, "step": 1180 }, { "epoch": 0.54, "eval_loss": 0.36132702231407166, "eval_runtime": 21.8492, "eval_samples_per_second": 1.282, "eval_steps_per_second": 0.32, "step": 1180 }, { "epoch": 0.54, "grad_norm": 0.822918176651001, "learning_rate": 4.61468308253534e-05, "loss": 0.3481, "step": 1181 }, { "epoch": 0.54, "grad_norm": 0.8427508473396301, "learning_rate": 4.610123119015048e-05, "loss": 0.3455, "step": 1182 }, { "epoch": 0.54, "grad_norm": 0.8091826438903809, "learning_rate": 4.6055631554947565e-05, "loss": 0.3353, "step": 1183 }, { "epoch": 0.54, "grad_norm": 0.8591803908348083, "learning_rate": 4.601003191974465e-05, "loss": 0.351, "step": 1184 }, { "epoch": 0.54, "grad_norm": 0.8574907779693604, "learning_rate": 4.596443228454172e-05, "loss": 0.3606, "step": 1185 }, { "epoch": 0.54, "grad_norm": 0.8384338021278381, "learning_rate": 4.5918832649338804e-05, "loss": 0.3347, "step": 1186 }, { "epoch": 0.54, "grad_norm": 0.8053162097930908, "learning_rate": 4.587323301413589e-05, "loss": 0.3493, "step": 1187 }, { "epoch": 0.54, "grad_norm": 0.85943603515625, "learning_rate": 4.5827633378932974e-05, "loss": 0.3411, "step": 1188 }, { "epoch": 0.54, "grad_norm": 0.8188199996948242, "learning_rate": 4.578203374373005e-05, "loss": 0.3289, "step": 1189 }, { "epoch": 0.54, "grad_norm": 0.8296828269958496, "learning_rate": 4.573643410852713e-05, "loss": 0.3434, "step": 1190 }, { "epoch": 0.54, "eval_loss": 0.3615628778934479, "eval_runtime": 21.7695, "eval_samples_per_second": 1.286, "eval_steps_per_second": 0.322, "step": 1190 }, { "epoch": 0.54, "grad_norm": 0.8322606682777405, "learning_rate": 4.569083447332422e-05, "loss": 0.338, "step": 1191 }, { "epoch": 0.54, "grad_norm": 0.8713050484657288, "learning_rate": 4.56452348381213e-05, "loss": 0.3368, "step": 1192 }, { "epoch": 0.54, "grad_norm": 0.7981169819831848, "learning_rate": 4.5599635202918376e-05, "loss": 0.3362, "step": 1193 }, { "epoch": 0.54, "grad_norm": 0.791098415851593, "learning_rate": 4.555403556771546e-05, "loss": 0.3333, "step": 1194 }, { "epoch": 0.54, "grad_norm": 0.8596270680427551, "learning_rate": 4.5508435932512546e-05, "loss": 0.3261, "step": 1195 }, { "epoch": 0.55, "grad_norm": 0.8467071652412415, "learning_rate": 4.546283629730962e-05, "loss": 0.3459, "step": 1196 }, { "epoch": 0.55, "grad_norm": 0.7838037014007568, "learning_rate": 4.54172366621067e-05, "loss": 0.3406, "step": 1197 }, { "epoch": 0.55, "grad_norm": 0.7928112745285034, "learning_rate": 4.5371637026903785e-05, "loss": 0.3379, "step": 1198 }, { "epoch": 0.55, "grad_norm": 0.8437274098396301, "learning_rate": 4.532603739170087e-05, "loss": 0.3406, "step": 1199 }, { "epoch": 0.55, "grad_norm": 0.8299893140792847, "learning_rate": 4.528043775649795e-05, "loss": 0.3472, "step": 1200 }, { "epoch": 0.55, "eval_loss": 0.36214447021484375, "eval_runtime": 21.8369, "eval_samples_per_second": 1.282, "eval_steps_per_second": 0.321, "step": 1200 }, { "epoch": 0.55, "grad_norm": 0.7983232736587524, "learning_rate": 4.523483812129503e-05, "loss": 0.3362, "step": 1201 }, { "epoch": 0.55, "grad_norm": 0.8538187146186829, "learning_rate": 4.518923848609211e-05, "loss": 0.3582, "step": 1202 }, { "epoch": 0.55, "grad_norm": 0.8160207271575928, "learning_rate": 4.51436388508892e-05, "loss": 0.3423, "step": 1203 }, { "epoch": 0.55, "grad_norm": 0.8224906921386719, "learning_rate": 4.5098039215686275e-05, "loss": 0.3291, "step": 1204 }, { "epoch": 0.55, "grad_norm": 0.7791831493377686, "learning_rate": 4.505243958048336e-05, "loss": 0.3451, "step": 1205 }, { "epoch": 0.55, "grad_norm": 0.7967703342437744, "learning_rate": 4.500683994528044e-05, "loss": 0.3322, "step": 1206 }, { "epoch": 0.55, "grad_norm": 0.7647725939750671, "learning_rate": 4.496124031007753e-05, "loss": 0.3225, "step": 1207 }, { "epoch": 0.55, "grad_norm": 0.8055102229118347, "learning_rate": 4.49156406748746e-05, "loss": 0.3294, "step": 1208 }, { "epoch": 0.55, "grad_norm": 0.7932788729667664, "learning_rate": 4.4870041039671684e-05, "loss": 0.3441, "step": 1209 }, { "epoch": 0.55, "grad_norm": 0.8258801698684692, "learning_rate": 4.4824441404468765e-05, "loss": 0.3605, "step": 1210 }, { "epoch": 0.55, "eval_loss": 0.361802339553833, "eval_runtime": 21.6768, "eval_samples_per_second": 1.292, "eval_steps_per_second": 0.323, "step": 1210 }, { "epoch": 0.55, "grad_norm": 0.7840272784233093, "learning_rate": 4.477884176926585e-05, "loss": 0.3479, "step": 1211 }, { "epoch": 0.55, "grad_norm": 0.8730911612510681, "learning_rate": 4.473324213406293e-05, "loss": 0.373, "step": 1212 }, { "epoch": 0.55, "grad_norm": 0.8281298875808716, "learning_rate": 4.468764249886001e-05, "loss": 0.3379, "step": 1213 }, { "epoch": 0.55, "grad_norm": 0.8440173864364624, "learning_rate": 4.464204286365709e-05, "loss": 0.3592, "step": 1214 }, { "epoch": 0.55, "grad_norm": 0.8705652952194214, "learning_rate": 4.4596443228454174e-05, "loss": 0.3622, "step": 1215 }, { "epoch": 0.55, "grad_norm": 0.7783262729644775, "learning_rate": 4.4550843593251256e-05, "loss": 0.3262, "step": 1216 }, { "epoch": 0.55, "grad_norm": 0.9026988744735718, "learning_rate": 4.450524395804834e-05, "loss": 0.3698, "step": 1217 }, { "epoch": 0.56, "grad_norm": 0.8218929767608643, "learning_rate": 4.445964432284542e-05, "loss": 0.3459, "step": 1218 }, { "epoch": 0.56, "grad_norm": 0.8687737584114075, "learning_rate": 4.44140446876425e-05, "loss": 0.3489, "step": 1219 }, { "epoch": 0.56, "grad_norm": 0.8133276700973511, "learning_rate": 4.436844505243958e-05, "loss": 0.3474, "step": 1220 }, { "epoch": 0.56, "eval_loss": 0.36127376556396484, "eval_runtime": 21.8837, "eval_samples_per_second": 1.279, "eval_steps_per_second": 0.32, "step": 1220 }, { "epoch": 0.56, "grad_norm": 0.820976972579956, "learning_rate": 4.4322845417236665e-05, "loss": 0.3535, "step": 1221 }, { "epoch": 0.56, "grad_norm": 0.7587445378303528, "learning_rate": 4.4277245782033746e-05, "loss": 0.345, "step": 1222 }, { "epoch": 0.56, "grad_norm": 0.7987104058265686, "learning_rate": 4.423164614683083e-05, "loss": 0.3429, "step": 1223 }, { "epoch": 0.56, "grad_norm": 0.8208971619606018, "learning_rate": 4.418604651162791e-05, "loss": 0.3439, "step": 1224 }, { "epoch": 0.56, "grad_norm": 0.8062512874603271, "learning_rate": 4.414044687642499e-05, "loss": 0.3532, "step": 1225 }, { "epoch": 0.56, "grad_norm": 0.8156399130821228, "learning_rate": 4.409484724122207e-05, "loss": 0.365, "step": 1226 }, { "epoch": 0.56, "grad_norm": 0.8011518716812134, "learning_rate": 4.4049247606019155e-05, "loss": 0.3288, "step": 1227 }, { "epoch": 0.56, "grad_norm": 0.7845913767814636, "learning_rate": 4.400364797081624e-05, "loss": 0.3299, "step": 1228 }, { "epoch": 0.56, "grad_norm": 0.8292993903160095, "learning_rate": 4.395804833561332e-05, "loss": 0.3666, "step": 1229 }, { "epoch": 0.56, "grad_norm": 0.8220998644828796, "learning_rate": 4.39124487004104e-05, "loss": 0.3381, "step": 1230 }, { "epoch": 0.56, "eval_loss": 0.3609188199043274, "eval_runtime": 21.8935, "eval_samples_per_second": 1.279, "eval_steps_per_second": 0.32, "step": 1230 }, { "epoch": 0.56, "grad_norm": 0.786451518535614, "learning_rate": 4.386684906520748e-05, "loss": 0.3329, "step": 1231 }, { "epoch": 0.56, "grad_norm": 0.83308345079422, "learning_rate": 4.3821249430004564e-05, "loss": 0.3515, "step": 1232 }, { "epoch": 0.56, "grad_norm": 0.8226833343505859, "learning_rate": 4.3775649794801645e-05, "loss": 0.3458, "step": 1233 }, { "epoch": 0.56, "grad_norm": 0.8319200873374939, "learning_rate": 4.373005015959872e-05, "loss": 0.3491, "step": 1234 }, { "epoch": 0.56, "grad_norm": 0.8553533554077148, "learning_rate": 4.368445052439581e-05, "loss": 0.3524, "step": 1235 }, { "epoch": 0.56, "grad_norm": 0.864627480506897, "learning_rate": 4.363885088919289e-05, "loss": 0.3572, "step": 1236 }, { "epoch": 0.56, "grad_norm": 0.8432021737098694, "learning_rate": 4.359325125398997e-05, "loss": 0.3543, "step": 1237 }, { "epoch": 0.56, "grad_norm": 0.7859142422676086, "learning_rate": 4.354765161878705e-05, "loss": 0.3284, "step": 1238 }, { "epoch": 0.56, "grad_norm": 0.8182294368743896, "learning_rate": 4.3502051983584136e-05, "loss": 0.3282, "step": 1239 }, { "epoch": 0.57, "grad_norm": 0.9091182947158813, "learning_rate": 4.345645234838122e-05, "loss": 0.3749, "step": 1240 }, { "epoch": 0.57, "eval_loss": 0.360344260931015, "eval_runtime": 21.9012, "eval_samples_per_second": 1.278, "eval_steps_per_second": 0.32, "step": 1240 }, { "epoch": 0.57, "grad_norm": 0.7812795042991638, "learning_rate": 4.34108527131783e-05, "loss": 0.3315, "step": 1241 }, { "epoch": 0.57, "grad_norm": 0.816305935382843, "learning_rate": 4.3365253077975374e-05, "loss": 0.3502, "step": 1242 }, { "epoch": 0.57, "grad_norm": 0.8034347891807556, "learning_rate": 4.331965344277246e-05, "loss": 0.3337, "step": 1243 }, { "epoch": 0.57, "grad_norm": 0.8315020203590393, "learning_rate": 4.3274053807569544e-05, "loss": 0.354, "step": 1244 }, { "epoch": 0.57, "grad_norm": 0.7251798510551453, "learning_rate": 4.322845417236662e-05, "loss": 0.3249, "step": 1245 }, { "epoch": 0.57, "grad_norm": 0.7859770059585571, "learning_rate": 4.31828545371637e-05, "loss": 0.3275, "step": 1246 }, { "epoch": 0.57, "grad_norm": 0.8476674556732178, "learning_rate": 4.313725490196079e-05, "loss": 0.3574, "step": 1247 }, { "epoch": 0.57, "grad_norm": 0.8440091609954834, "learning_rate": 4.309165526675787e-05, "loss": 0.3525, "step": 1248 }, { "epoch": 0.57, "grad_norm": 0.8422577977180481, "learning_rate": 4.3046055631554946e-05, "loss": 0.3585, "step": 1249 }, { "epoch": 0.57, "grad_norm": 0.8375608325004578, "learning_rate": 4.300045599635203e-05, "loss": 0.3625, "step": 1250 }, { "epoch": 0.57, "eval_loss": 0.360787957906723, "eval_runtime": 21.7575, "eval_samples_per_second": 1.287, "eval_steps_per_second": 0.322, "step": 1250 }, { "epoch": 0.57, "grad_norm": 0.8197726607322693, "learning_rate": 4.2954856361149116e-05, "loss": 0.3537, "step": 1251 }, { "epoch": 0.57, "grad_norm": 0.8626959919929504, "learning_rate": 4.29092567259462e-05, "loss": 0.3442, "step": 1252 }, { "epoch": 0.57, "grad_norm": 0.8891031742095947, "learning_rate": 4.286365709074327e-05, "loss": 0.3681, "step": 1253 }, { "epoch": 0.57, "grad_norm": 0.8085613250732422, "learning_rate": 4.2818057455540355e-05, "loss": 0.3586, "step": 1254 }, { "epoch": 0.57, "grad_norm": 0.8536873459815979, "learning_rate": 4.2772457820337443e-05, "loss": 0.3324, "step": 1255 }, { "epoch": 0.57, "grad_norm": 0.8423956632614136, "learning_rate": 4.272685818513452e-05, "loss": 0.3529, "step": 1256 }, { "epoch": 0.57, "grad_norm": 0.8388254642486572, "learning_rate": 4.26812585499316e-05, "loss": 0.3612, "step": 1257 }, { "epoch": 0.57, "grad_norm": 0.7667680978775024, "learning_rate": 4.263565891472868e-05, "loss": 0.3391, "step": 1258 }, { "epoch": 0.57, "grad_norm": 0.8259601593017578, "learning_rate": 4.259005927952577e-05, "loss": 0.3482, "step": 1259 }, { "epoch": 0.57, "grad_norm": 0.8969464898109436, "learning_rate": 4.2544459644322845e-05, "loss": 0.351, "step": 1260 }, { "epoch": 0.57, "eval_loss": 0.36102089285850525, "eval_runtime": 21.6213, "eval_samples_per_second": 1.295, "eval_steps_per_second": 0.324, "step": 1260 }, { "epoch": 0.57, "grad_norm": 0.8078624606132507, "learning_rate": 4.249886000911993e-05, "loss": 0.3494, "step": 1261 }, { "epoch": 0.58, "grad_norm": 0.8384256362915039, "learning_rate": 4.245326037391701e-05, "loss": 0.3574, "step": 1262 }, { "epoch": 0.58, "grad_norm": 0.8481121063232422, "learning_rate": 4.24076607387141e-05, "loss": 0.3517, "step": 1263 }, { "epoch": 0.58, "grad_norm": 0.8446504473686218, "learning_rate": 4.236206110351117e-05, "loss": 0.3495, "step": 1264 }, { "epoch": 0.58, "grad_norm": 0.906520664691925, "learning_rate": 4.2316461468308254e-05, "loss": 0.3628, "step": 1265 }, { "epoch": 0.58, "grad_norm": 0.8286429047584534, "learning_rate": 4.2270861833105336e-05, "loss": 0.3521, "step": 1266 }, { "epoch": 0.58, "grad_norm": 0.8214951753616333, "learning_rate": 4.2225262197902424e-05, "loss": 0.3601, "step": 1267 }, { "epoch": 0.58, "grad_norm": 0.8151431679725647, "learning_rate": 4.21796625626995e-05, "loss": 0.3565, "step": 1268 }, { "epoch": 0.58, "grad_norm": 0.8063027262687683, "learning_rate": 4.213406292749658e-05, "loss": 0.3363, "step": 1269 }, { "epoch": 0.58, "grad_norm": 0.8764293789863586, "learning_rate": 4.208846329229366e-05, "loss": 0.3683, "step": 1270 }, { "epoch": 0.58, "eval_loss": 0.36087459325790405, "eval_runtime": 21.8157, "eval_samples_per_second": 1.283, "eval_steps_per_second": 0.321, "step": 1270 }, { "epoch": 0.58, "grad_norm": 0.7735445499420166, "learning_rate": 4.2042863657090744e-05, "loss": 0.3313, "step": 1271 }, { "epoch": 0.58, "grad_norm": 0.8024297952651978, "learning_rate": 4.1997264021887826e-05, "loss": 0.337, "step": 1272 }, { "epoch": 0.58, "grad_norm": 0.78364098072052, "learning_rate": 4.195166438668491e-05, "loss": 0.3353, "step": 1273 }, { "epoch": 0.58, "grad_norm": 0.8018545508384705, "learning_rate": 4.190606475148199e-05, "loss": 0.333, "step": 1274 }, { "epoch": 0.58, "grad_norm": 0.8167896270751953, "learning_rate": 4.186046511627907e-05, "loss": 0.3374, "step": 1275 }, { "epoch": 0.58, "grad_norm": 0.8448525667190552, "learning_rate": 4.181486548107615e-05, "loss": 0.3631, "step": 1276 }, { "epoch": 0.58, "grad_norm": 0.8643104434013367, "learning_rate": 4.1769265845873235e-05, "loss": 0.3679, "step": 1277 }, { "epoch": 0.58, "grad_norm": 0.8329184651374817, "learning_rate": 4.1723666210670316e-05, "loss": 0.3334, "step": 1278 }, { "epoch": 0.58, "grad_norm": 0.8378819227218628, "learning_rate": 4.16780665754674e-05, "loss": 0.3605, "step": 1279 }, { "epoch": 0.58, "grad_norm": 0.8405564427375793, "learning_rate": 4.163246694026448e-05, "loss": 0.3465, "step": 1280 }, { "epoch": 0.58, "eval_loss": 0.3606376647949219, "eval_runtime": 25.4779, "eval_samples_per_second": 1.099, "eval_steps_per_second": 0.275, "step": 1280 }, { "epoch": 0.58, "grad_norm": 0.8125925064086914, "learning_rate": 4.158686730506156e-05, "loss": 0.3392, "step": 1281 }, { "epoch": 0.58, "grad_norm": 0.804631769657135, "learning_rate": 4.1541267669858643e-05, "loss": 0.3284, "step": 1282 }, { "epoch": 0.58, "grad_norm": 0.8230606913566589, "learning_rate": 4.1495668034655725e-05, "loss": 0.3483, "step": 1283 }, { "epoch": 0.59, "grad_norm": 0.8066771030426025, "learning_rate": 4.145006839945281e-05, "loss": 0.3195, "step": 1284 }, { "epoch": 0.59, "grad_norm": 0.8422403335571289, "learning_rate": 4.140446876424989e-05, "loss": 0.3749, "step": 1285 }, { "epoch": 0.59, "grad_norm": 0.8488630652427673, "learning_rate": 4.135886912904697e-05, "loss": 0.3597, "step": 1286 }, { "epoch": 0.59, "grad_norm": 0.8020509481430054, "learning_rate": 4.131326949384405e-05, "loss": 0.3333, "step": 1287 }, { "epoch": 0.59, "grad_norm": 0.7981416583061218, "learning_rate": 4.1267669858641134e-05, "loss": 0.335, "step": 1288 }, { "epoch": 0.59, "grad_norm": 0.8085934519767761, "learning_rate": 4.1222070223438216e-05, "loss": 0.3396, "step": 1289 }, { "epoch": 0.59, "grad_norm": 0.8348598480224609, "learning_rate": 4.11764705882353e-05, "loss": 0.3397, "step": 1290 }, { "epoch": 0.59, "eval_loss": 0.3603203594684601, "eval_runtime": 25.6317, "eval_samples_per_second": 1.092, "eval_steps_per_second": 0.273, "step": 1290 }, { "epoch": 0.59, "grad_norm": 0.7678222060203552, "learning_rate": 4.113087095303238e-05, "loss": 0.3341, "step": 1291 }, { "epoch": 0.59, "grad_norm": 0.823225200176239, "learning_rate": 4.108527131782946e-05, "loss": 0.3495, "step": 1292 }, { "epoch": 0.59, "grad_norm": 0.8174163699150085, "learning_rate": 4.103967168262654e-05, "loss": 0.3498, "step": 1293 }, { "epoch": 0.59, "grad_norm": 0.8279191851615906, "learning_rate": 4.099407204742362e-05, "loss": 0.3528, "step": 1294 }, { "epoch": 0.59, "grad_norm": 0.7891851663589478, "learning_rate": 4.0948472412220706e-05, "loss": 0.3408, "step": 1295 }, { "epoch": 0.59, "grad_norm": 0.8267097473144531, "learning_rate": 4.090287277701779e-05, "loss": 0.3595, "step": 1296 }, { "epoch": 0.59, "grad_norm": 0.819893479347229, "learning_rate": 4.085727314181487e-05, "loss": 0.3373, "step": 1297 }, { "epoch": 0.59, "grad_norm": 0.8141855001449585, "learning_rate": 4.0811673506611944e-05, "loss": 0.3517, "step": 1298 }, { "epoch": 0.59, "grad_norm": 0.787208080291748, "learning_rate": 4.076607387140903e-05, "loss": 0.3206, "step": 1299 }, { "epoch": 0.59, "grad_norm": 0.8328278064727783, "learning_rate": 4.0720474236206115e-05, "loss": 0.3601, "step": 1300 }, { "epoch": 0.59, "eval_loss": 0.3600243628025055, "eval_runtime": 34.1653, "eval_samples_per_second": 0.82, "eval_steps_per_second": 0.205, "step": 1300 }, { "epoch": 0.59, "grad_norm": 0.8156459331512451, "learning_rate": 4.0674874601003196e-05, "loss": 0.3377, "step": 1301 }, { "epoch": 0.59, "grad_norm": 0.8492475152015686, "learning_rate": 4.062927496580027e-05, "loss": 0.3469, "step": 1302 }, { "epoch": 0.59, "grad_norm": 0.8182046413421631, "learning_rate": 4.058367533059736e-05, "loss": 0.35, "step": 1303 }, { "epoch": 0.59, "grad_norm": 0.803213357925415, "learning_rate": 4.053807569539444e-05, "loss": 0.3506, "step": 1304 }, { "epoch": 0.59, "grad_norm": 0.7835839986801147, "learning_rate": 4.0492476060191516e-05, "loss": 0.3433, "step": 1305 }, { "epoch": 0.6, "grad_norm": 0.8448796272277832, "learning_rate": 4.04468764249886e-05, "loss": 0.3546, "step": 1306 }, { "epoch": 0.6, "grad_norm": 0.7899730205535889, "learning_rate": 4.040127678978569e-05, "loss": 0.3359, "step": 1307 }, { "epoch": 0.6, "grad_norm": 0.8046201467514038, "learning_rate": 4.035567715458277e-05, "loss": 0.3385, "step": 1308 }, { "epoch": 0.6, "grad_norm": 0.8358333706855774, "learning_rate": 4.0310077519379843e-05, "loss": 0.3716, "step": 1309 }, { "epoch": 0.6, "grad_norm": 0.8300046920776367, "learning_rate": 4.0264477884176925e-05, "loss": 0.3588, "step": 1310 }, { "epoch": 0.6, "eval_loss": 0.35997697710990906, "eval_runtime": 34.4834, "eval_samples_per_second": 0.812, "eval_steps_per_second": 0.203, "step": 1310 }, { "epoch": 0.6, "grad_norm": 0.8253521919250488, "learning_rate": 4.0218878248974014e-05, "loss": 0.3362, "step": 1311 }, { "epoch": 0.6, "grad_norm": 0.8703365325927734, "learning_rate": 4.0173278613771095e-05, "loss": 0.3591, "step": 1312 }, { "epoch": 0.6, "grad_norm": 0.8559762239456177, "learning_rate": 4.012767897856817e-05, "loss": 0.3345, "step": 1313 }, { "epoch": 0.6, "grad_norm": 0.8356379270553589, "learning_rate": 4.008207934336525e-05, "loss": 0.3718, "step": 1314 }, { "epoch": 0.6, "grad_norm": 0.7918616533279419, "learning_rate": 4.003647970816234e-05, "loss": 0.3308, "step": 1315 }, { "epoch": 0.6, "grad_norm": 0.898504912853241, "learning_rate": 3.999088007295942e-05, "loss": 0.3613, "step": 1316 }, { "epoch": 0.6, "grad_norm": 0.7918505072593689, "learning_rate": 3.99452804377565e-05, "loss": 0.3224, "step": 1317 }, { "epoch": 0.6, "grad_norm": 0.8192049264907837, "learning_rate": 3.989968080255358e-05, "loss": 0.329, "step": 1318 }, { "epoch": 0.6, "grad_norm": 0.8752269744873047, "learning_rate": 3.985408116735067e-05, "loss": 0.3533, "step": 1319 }, { "epoch": 0.6, "grad_norm": 0.7703496217727661, "learning_rate": 3.980848153214774e-05, "loss": 0.3143, "step": 1320 }, { "epoch": 0.6, "eval_loss": 0.3601928651332855, "eval_runtime": 25.5117, "eval_samples_per_second": 1.098, "eval_steps_per_second": 0.274, "step": 1320 }, { "epoch": 0.6, "grad_norm": 0.8335027694702148, "learning_rate": 3.9762881896944824e-05, "loss": 0.3324, "step": 1321 }, { "epoch": 0.6, "grad_norm": 0.8822119235992432, "learning_rate": 3.9717282261741906e-05, "loss": 0.3579, "step": 1322 }, { "epoch": 0.6, "grad_norm": 0.8622987866401672, "learning_rate": 3.9671682626538994e-05, "loss": 0.3675, "step": 1323 }, { "epoch": 0.6, "grad_norm": 0.8221805691719055, "learning_rate": 3.962608299133607e-05, "loss": 0.359, "step": 1324 }, { "epoch": 0.6, "grad_norm": 0.8118740320205688, "learning_rate": 3.958048335613315e-05, "loss": 0.3496, "step": 1325 }, { "epoch": 0.6, "grad_norm": 0.7761417031288147, "learning_rate": 3.953488372093023e-05, "loss": 0.3326, "step": 1326 }, { "epoch": 0.6, "grad_norm": 0.812454879283905, "learning_rate": 3.948928408572732e-05, "loss": 0.3555, "step": 1327 }, { "epoch": 0.61, "grad_norm": 0.8731850981712341, "learning_rate": 3.9443684450524396e-05, "loss": 0.3771, "step": 1328 }, { "epoch": 0.61, "grad_norm": 0.85962975025177, "learning_rate": 3.939808481532148e-05, "loss": 0.3599, "step": 1329 }, { "epoch": 0.61, "grad_norm": 0.8229592442512512, "learning_rate": 3.935248518011856e-05, "loss": 0.3607, "step": 1330 }, { "epoch": 0.61, "eval_loss": 0.35948753356933594, "eval_runtime": 25.3157, "eval_samples_per_second": 1.106, "eval_steps_per_second": 0.277, "step": 1330 }, { "epoch": 0.61, "grad_norm": 0.7954073548316956, "learning_rate": 3.930688554491564e-05, "loss": 0.3359, "step": 1331 }, { "epoch": 0.61, "grad_norm": 0.8094289302825928, "learning_rate": 3.926128590971272e-05, "loss": 0.3235, "step": 1332 }, { "epoch": 0.61, "grad_norm": 0.826915442943573, "learning_rate": 3.9215686274509805e-05, "loss": 0.3452, "step": 1333 }, { "epoch": 0.61, "grad_norm": 0.7665673494338989, "learning_rate": 3.917008663930689e-05, "loss": 0.3256, "step": 1334 }, { "epoch": 0.61, "grad_norm": 0.8052255511283875, "learning_rate": 3.912448700410397e-05, "loss": 0.3446, "step": 1335 }, { "epoch": 0.61, "grad_norm": 0.8459590077400208, "learning_rate": 3.907888736890105e-05, "loss": 0.3737, "step": 1336 }, { "epoch": 0.61, "grad_norm": 0.8796717524528503, "learning_rate": 3.903328773369813e-05, "loss": 0.3619, "step": 1337 }, { "epoch": 0.61, "grad_norm": 0.8221214413642883, "learning_rate": 3.8987688098495214e-05, "loss": 0.3447, "step": 1338 }, { "epoch": 0.61, "grad_norm": 0.8288754224777222, "learning_rate": 3.8942088463292295e-05, "loss": 0.3469, "step": 1339 }, { "epoch": 0.61, "grad_norm": 0.772335946559906, "learning_rate": 3.889648882808938e-05, "loss": 0.3336, "step": 1340 }, { "epoch": 0.61, "eval_loss": 0.35878506302833557, "eval_runtime": 19.9011, "eval_samples_per_second": 1.407, "eval_steps_per_second": 0.352, "step": 1340 }, { "epoch": 0.61, "grad_norm": 0.8370662927627563, "learning_rate": 3.885088919288646e-05, "loss": 0.3364, "step": 1341 }, { "epoch": 0.61, "grad_norm": 0.7943137288093567, "learning_rate": 3.880528955768354e-05, "loss": 0.3442, "step": 1342 }, { "epoch": 0.61, "grad_norm": 0.8786354660987854, "learning_rate": 3.875968992248062e-05, "loss": 0.347, "step": 1343 }, { "epoch": 0.61, "grad_norm": 0.8405556082725525, "learning_rate": 3.8714090287277704e-05, "loss": 0.3463, "step": 1344 }, { "epoch": 0.61, "grad_norm": 0.8621813058853149, "learning_rate": 3.8668490652074786e-05, "loss": 0.3443, "step": 1345 }, { "epoch": 0.61, "grad_norm": 0.8162899613380432, "learning_rate": 3.862289101687187e-05, "loss": 0.3503, "step": 1346 }, { "epoch": 0.61, "grad_norm": 0.7732406258583069, "learning_rate": 3.857729138166895e-05, "loss": 0.333, "step": 1347 }, { "epoch": 0.61, "grad_norm": 0.8653243780136108, "learning_rate": 3.853169174646603e-05, "loss": 0.3579, "step": 1348 }, { "epoch": 0.61, "grad_norm": 0.8177424073219299, "learning_rate": 3.848609211126311e-05, "loss": 0.3445, "step": 1349 }, { "epoch": 0.62, "grad_norm": 0.8780079483985901, "learning_rate": 3.8440492476060194e-05, "loss": 0.3396, "step": 1350 }, { "epoch": 0.62, "eval_loss": 0.35845258831977844, "eval_runtime": 20.8282, "eval_samples_per_second": 1.344, "eval_steps_per_second": 0.336, "step": 1350 }, { "epoch": 0.62, "grad_norm": 0.7994604110717773, "learning_rate": 3.8394892840857276e-05, "loss": 0.348, "step": 1351 }, { "epoch": 0.62, "grad_norm": 0.7904070615768433, "learning_rate": 3.834929320565436e-05, "loss": 0.3436, "step": 1352 }, { "epoch": 0.62, "grad_norm": 0.8547983765602112, "learning_rate": 3.830369357045144e-05, "loss": 0.3378, "step": 1353 }, { "epoch": 0.62, "grad_norm": 0.8214447498321533, "learning_rate": 3.8258093935248515e-05, "loss": 0.3486, "step": 1354 }, { "epoch": 0.62, "grad_norm": 0.8538534641265869, "learning_rate": 3.82124943000456e-05, "loss": 0.3672, "step": 1355 }, { "epoch": 0.62, "grad_norm": 0.821524441242218, "learning_rate": 3.8166894664842685e-05, "loss": 0.3508, "step": 1356 }, { "epoch": 0.62, "grad_norm": 0.8136518597602844, "learning_rate": 3.8121295029639767e-05, "loss": 0.3226, "step": 1357 }, { "epoch": 0.62, "grad_norm": 0.7952825427055359, "learning_rate": 3.807569539443684e-05, "loss": 0.3312, "step": 1358 }, { "epoch": 0.62, "grad_norm": 0.819299042224884, "learning_rate": 3.803009575923393e-05, "loss": 0.3374, "step": 1359 }, { "epoch": 0.62, "grad_norm": 0.8893455266952515, "learning_rate": 3.798449612403101e-05, "loss": 0.3271, "step": 1360 }, { "epoch": 0.62, "eval_loss": 0.3591363728046417, "eval_runtime": 18.335, "eval_samples_per_second": 1.527, "eval_steps_per_second": 0.382, "step": 1360 }, { "epoch": 0.62, "grad_norm": 0.839749813079834, "learning_rate": 3.7938896488828094e-05, "loss": 0.3406, "step": 1361 }, { "epoch": 0.62, "grad_norm": 0.7815828919410706, "learning_rate": 3.789329685362517e-05, "loss": 0.3298, "step": 1362 }, { "epoch": 0.62, "grad_norm": 0.8089067339897156, "learning_rate": 3.784769721842226e-05, "loss": 0.3381, "step": 1363 }, { "epoch": 0.62, "grad_norm": 0.8392083048820496, "learning_rate": 3.780209758321934e-05, "loss": 0.3536, "step": 1364 }, { "epoch": 0.62, "grad_norm": 0.8178284168243408, "learning_rate": 3.775649794801642e-05, "loss": 0.3363, "step": 1365 }, { "epoch": 0.62, "grad_norm": 0.8227528929710388, "learning_rate": 3.7710898312813495e-05, "loss": 0.3428, "step": 1366 }, { "epoch": 0.62, "grad_norm": 0.8406091928482056, "learning_rate": 3.7665298677610584e-05, "loss": 0.348, "step": 1367 }, { "epoch": 0.62, "grad_norm": 0.8545897603034973, "learning_rate": 3.7619699042407666e-05, "loss": 0.3517, "step": 1368 }, { "epoch": 0.62, "grad_norm": 0.8717309236526489, "learning_rate": 3.757409940720474e-05, "loss": 0.3684, "step": 1369 }, { "epoch": 0.62, "grad_norm": 0.86220383644104, "learning_rate": 3.752849977200182e-05, "loss": 0.3731, "step": 1370 }, { "epoch": 0.62, "eval_loss": 0.3583603501319885, "eval_runtime": 18.6501, "eval_samples_per_second": 1.501, "eval_steps_per_second": 0.375, "step": 1370 }, { "epoch": 0.62, "grad_norm": 0.829359233379364, "learning_rate": 3.748290013679891e-05, "loss": 0.3265, "step": 1371 }, { "epoch": 0.63, "grad_norm": 0.7688923478126526, "learning_rate": 3.743730050159599e-05, "loss": 0.327, "step": 1372 }, { "epoch": 0.63, "grad_norm": 0.7676702737808228, "learning_rate": 3.739170086639307e-05, "loss": 0.331, "step": 1373 }, { "epoch": 0.63, "grad_norm": 0.8761011958122253, "learning_rate": 3.734610123119015e-05, "loss": 0.3445, "step": 1374 }, { "epoch": 0.63, "grad_norm": 0.8697832226753235, "learning_rate": 3.730050159598724e-05, "loss": 0.3834, "step": 1375 }, { "epoch": 0.63, "grad_norm": 0.832823634147644, "learning_rate": 3.725490196078432e-05, "loss": 0.356, "step": 1376 }, { "epoch": 0.63, "grad_norm": 0.8047780394554138, "learning_rate": 3.7209302325581394e-05, "loss": 0.3407, "step": 1377 }, { "epoch": 0.63, "grad_norm": 0.8695047497749329, "learning_rate": 3.7163702690378476e-05, "loss": 0.3436, "step": 1378 }, { "epoch": 0.63, "grad_norm": 0.762008011341095, "learning_rate": 3.7118103055175565e-05, "loss": 0.3155, "step": 1379 }, { "epoch": 0.63, "grad_norm": 0.8362067937850952, "learning_rate": 3.707250341997264e-05, "loss": 0.3469, "step": 1380 }, { "epoch": 0.63, "eval_loss": 0.3580807149410248, "eval_runtime": 18.2133, "eval_samples_per_second": 1.537, "eval_steps_per_second": 0.384, "step": 1380 }, { "epoch": 0.63, "grad_norm": 0.7919859290122986, "learning_rate": 3.702690378476972e-05, "loss": 0.342, "step": 1381 }, { "epoch": 0.63, "grad_norm": 0.8330515027046204, "learning_rate": 3.69813041495668e-05, "loss": 0.3471, "step": 1382 }, { "epoch": 0.63, "grad_norm": 0.8035807609558105, "learning_rate": 3.693570451436389e-05, "loss": 0.3649, "step": 1383 }, { "epoch": 0.63, "grad_norm": 0.858091413974762, "learning_rate": 3.6890104879160967e-05, "loss": 0.3571, "step": 1384 }, { "epoch": 0.63, "grad_norm": 0.7906508445739746, "learning_rate": 3.684450524395805e-05, "loss": 0.3329, "step": 1385 }, { "epoch": 0.63, "grad_norm": 0.8505438566207886, "learning_rate": 3.679890560875513e-05, "loss": 0.3521, "step": 1386 }, { "epoch": 0.63, "grad_norm": 0.840091347694397, "learning_rate": 3.675330597355222e-05, "loss": 0.3495, "step": 1387 }, { "epoch": 0.63, "grad_norm": 0.886907696723938, "learning_rate": 3.6707706338349294e-05, "loss": 0.357, "step": 1388 }, { "epoch": 0.63, "grad_norm": 0.804918646812439, "learning_rate": 3.6662106703146375e-05, "loss": 0.3358, "step": 1389 }, { "epoch": 0.63, "grad_norm": 0.8357195854187012, "learning_rate": 3.661650706794346e-05, "loss": 0.3375, "step": 1390 }, { "epoch": 0.63, "eval_loss": 0.358403742313385, "eval_runtime": 18.0533, "eval_samples_per_second": 1.551, "eval_steps_per_second": 0.388, "step": 1390 }, { "epoch": 0.63, "grad_norm": 0.8863046765327454, "learning_rate": 3.6570907432740545e-05, "loss": 0.3587, "step": 1391 }, { "epoch": 0.63, "grad_norm": 0.8171054124832153, "learning_rate": 3.652530779753762e-05, "loss": 0.3584, "step": 1392 }, { "epoch": 0.63, "grad_norm": 0.7896978855133057, "learning_rate": 3.64797081623347e-05, "loss": 0.3112, "step": 1393 }, { "epoch": 0.64, "grad_norm": 0.8571012020111084, "learning_rate": 3.6434108527131784e-05, "loss": 0.3332, "step": 1394 }, { "epoch": 0.64, "grad_norm": 0.8435710668563843, "learning_rate": 3.6388508891928866e-05, "loss": 0.3551, "step": 1395 }, { "epoch": 0.64, "grad_norm": 0.8161331415176392, "learning_rate": 3.634290925672595e-05, "loss": 0.3492, "step": 1396 }, { "epoch": 0.64, "grad_norm": 0.840015172958374, "learning_rate": 3.629730962152303e-05, "loss": 0.3524, "step": 1397 }, { "epoch": 0.64, "grad_norm": 0.8551552891731262, "learning_rate": 3.625170998632011e-05, "loss": 0.3414, "step": 1398 }, { "epoch": 0.64, "grad_norm": 0.8473491668701172, "learning_rate": 3.620611035111719e-05, "loss": 0.3514, "step": 1399 }, { "epoch": 0.64, "grad_norm": 0.8031140565872192, "learning_rate": 3.6160510715914274e-05, "loss": 0.3202, "step": 1400 }, { "epoch": 0.64, "eval_loss": 0.35901978611946106, "eval_runtime": 18.7272, "eval_samples_per_second": 1.495, "eval_steps_per_second": 0.374, "step": 1400 }, { "epoch": 0.64, "grad_norm": 0.8324646353721619, "learning_rate": 3.6114911080711356e-05, "loss": 0.3578, "step": 1401 }, { "epoch": 0.64, "grad_norm": 0.7989433407783508, "learning_rate": 3.606931144550844e-05, "loss": 0.314, "step": 1402 }, { "epoch": 0.64, "grad_norm": 0.8086175322532654, "learning_rate": 3.602371181030552e-05, "loss": 0.3395, "step": 1403 }, { "epoch": 0.64, "grad_norm": 0.7840234041213989, "learning_rate": 3.59781121751026e-05, "loss": 0.3465, "step": 1404 }, { "epoch": 0.64, "grad_norm": 0.8186520934104919, "learning_rate": 3.593251253989968e-05, "loss": 0.3418, "step": 1405 }, { "epoch": 0.64, "grad_norm": 0.858751118183136, "learning_rate": 3.5886912904696765e-05, "loss": 0.3389, "step": 1406 }, { "epoch": 0.64, "grad_norm": 0.7823198437690735, "learning_rate": 3.5841313269493846e-05, "loss": 0.3357, "step": 1407 }, { "epoch": 0.64, "grad_norm": 0.7624536156654358, "learning_rate": 3.579571363429093e-05, "loss": 0.3165, "step": 1408 }, { "epoch": 0.64, "grad_norm": 0.7955945134162903, "learning_rate": 3.575011399908801e-05, "loss": 0.3462, "step": 1409 }, { "epoch": 0.64, "grad_norm": 0.8175010085105896, "learning_rate": 3.570451436388509e-05, "loss": 0.3358, "step": 1410 }, { "epoch": 0.64, "eval_loss": 0.35880425572395325, "eval_runtime": 18.8766, "eval_samples_per_second": 1.483, "eval_steps_per_second": 0.371, "step": 1410 }, { "epoch": 0.64, "grad_norm": 0.8709760904312134, "learning_rate": 3.565891472868217e-05, "loss": 0.3564, "step": 1411 }, { "epoch": 0.64, "grad_norm": 0.8882624506950378, "learning_rate": 3.5613315093479255e-05, "loss": 0.3559, "step": 1412 }, { "epoch": 0.64, "grad_norm": 0.7864699959754944, "learning_rate": 3.556771545827634e-05, "loss": 0.3234, "step": 1413 }, { "epoch": 0.64, "grad_norm": 0.8428051471710205, "learning_rate": 3.552211582307342e-05, "loss": 0.3641, "step": 1414 }, { "epoch": 0.65, "grad_norm": 0.8524724245071411, "learning_rate": 3.54765161878705e-05, "loss": 0.3408, "step": 1415 }, { "epoch": 0.65, "grad_norm": 0.8270473480224609, "learning_rate": 3.543091655266758e-05, "loss": 0.342, "step": 1416 }, { "epoch": 0.65, "grad_norm": 0.8461461663246155, "learning_rate": 3.5385316917464664e-05, "loss": 0.3481, "step": 1417 }, { "epoch": 0.65, "grad_norm": 0.8053926229476929, "learning_rate": 3.533971728226174e-05, "loss": 0.3428, "step": 1418 }, { "epoch": 0.65, "grad_norm": 0.8227868676185608, "learning_rate": 3.529411764705883e-05, "loss": 0.3371, "step": 1419 }, { "epoch": 0.65, "grad_norm": 0.8701817989349365, "learning_rate": 3.524851801185591e-05, "loss": 0.3509, "step": 1420 }, { "epoch": 0.65, "eval_loss": 0.3587776720523834, "eval_runtime": 18.3142, "eval_samples_per_second": 1.529, "eval_steps_per_second": 0.382, "step": 1420 }, { "epoch": 0.65, "grad_norm": 0.8183357119560242, "learning_rate": 3.520291837665299e-05, "loss": 0.3408, "step": 1421 }, { "epoch": 0.65, "grad_norm": 0.8657394051551819, "learning_rate": 3.5157318741450066e-05, "loss": 0.3462, "step": 1422 }, { "epoch": 0.65, "grad_norm": 0.8495250940322876, "learning_rate": 3.5111719106247154e-05, "loss": 0.3359, "step": 1423 }, { "epoch": 0.65, "grad_norm": 0.8178961873054504, "learning_rate": 3.5066119471044236e-05, "loss": 0.3522, "step": 1424 }, { "epoch": 0.65, "grad_norm": 0.8224758505821228, "learning_rate": 3.502051983584132e-05, "loss": 0.334, "step": 1425 }, { "epoch": 0.65, "grad_norm": 0.8101351261138916, "learning_rate": 3.497492020063839e-05, "loss": 0.3262, "step": 1426 }, { "epoch": 0.65, "grad_norm": 0.8682355880737305, "learning_rate": 3.492932056543548e-05, "loss": 0.3647, "step": 1427 }, { "epoch": 0.65, "grad_norm": 0.7884172797203064, "learning_rate": 3.488372093023256e-05, "loss": 0.3283, "step": 1428 }, { "epoch": 0.65, "grad_norm": 0.8620269894599915, "learning_rate": 3.483812129502964e-05, "loss": 0.3392, "step": 1429 }, { "epoch": 0.65, "grad_norm": 0.7871687412261963, "learning_rate": 3.479252165982672e-05, "loss": 0.3171, "step": 1430 }, { "epoch": 0.65, "eval_loss": 0.3581898510456085, "eval_runtime": 18.941, "eval_samples_per_second": 1.478, "eval_steps_per_second": 0.37, "step": 1430 }, { "epoch": 0.65, "grad_norm": 0.8699492812156677, "learning_rate": 3.474692202462381e-05, "loss": 0.3539, "step": 1431 }, { "epoch": 0.65, "grad_norm": 0.8600180149078369, "learning_rate": 3.470132238942089e-05, "loss": 0.3392, "step": 1432 }, { "epoch": 0.65, "grad_norm": 0.8551745414733887, "learning_rate": 3.4655722754217965e-05, "loss": 0.3701, "step": 1433 }, { "epoch": 0.65, "grad_norm": 0.8190971612930298, "learning_rate": 3.4610123119015046e-05, "loss": 0.3482, "step": 1434 }, { "epoch": 0.65, "grad_norm": 0.8227312564849854, "learning_rate": 3.4564523483812135e-05, "loss": 0.3611, "step": 1435 }, { "epoch": 0.65, "grad_norm": 0.8590283989906311, "learning_rate": 3.451892384860922e-05, "loss": 0.3432, "step": 1436 }, { "epoch": 0.66, "grad_norm": 0.8469401001930237, "learning_rate": 3.447332421340629e-05, "loss": 0.334, "step": 1437 }, { "epoch": 0.66, "grad_norm": 0.8209288120269775, "learning_rate": 3.442772457820337e-05, "loss": 0.34, "step": 1438 }, { "epoch": 0.66, "grad_norm": 0.8007102608680725, "learning_rate": 3.438212494300046e-05, "loss": 0.3601, "step": 1439 }, { "epoch": 0.66, "grad_norm": 0.8150454163551331, "learning_rate": 3.433652530779754e-05, "loss": 0.3407, "step": 1440 }, { "epoch": 0.66, "eval_loss": 0.3582390248775482, "eval_runtime": 18.6454, "eval_samples_per_second": 1.502, "eval_steps_per_second": 0.375, "step": 1440 }, { "epoch": 0.66, "grad_norm": 0.8899417519569397, "learning_rate": 3.429092567259462e-05, "loss": 0.367, "step": 1441 }, { "epoch": 0.66, "grad_norm": 0.8598084449768066, "learning_rate": 3.42453260373917e-05, "loss": 0.3476, "step": 1442 }, { "epoch": 0.66, "grad_norm": 0.8577296137809753, "learning_rate": 3.419972640218879e-05, "loss": 0.3553, "step": 1443 }, { "epoch": 0.66, "grad_norm": 0.805564820766449, "learning_rate": 3.4154126766985864e-05, "loss": 0.3295, "step": 1444 }, { "epoch": 0.66, "grad_norm": 0.8372154235839844, "learning_rate": 3.4108527131782945e-05, "loss": 0.3533, "step": 1445 }, { "epoch": 0.66, "grad_norm": 0.8153108358383179, "learning_rate": 3.406292749658003e-05, "loss": 0.3433, "step": 1446 }, { "epoch": 0.66, "grad_norm": 0.82663893699646, "learning_rate": 3.4017327861377116e-05, "loss": 0.352, "step": 1447 }, { "epoch": 0.66, "grad_norm": 0.8443247079849243, "learning_rate": 3.397172822617419e-05, "loss": 0.3464, "step": 1448 }, { "epoch": 0.66, "grad_norm": 0.8249718546867371, "learning_rate": 3.392612859097127e-05, "loss": 0.3408, "step": 1449 }, { "epoch": 0.66, "grad_norm": 0.873860239982605, "learning_rate": 3.3880528955768354e-05, "loss": 0.3575, "step": 1450 }, { "epoch": 0.66, "eval_loss": 0.35808515548706055, "eval_runtime": 18.3021, "eval_samples_per_second": 1.53, "eval_steps_per_second": 0.382, "step": 1450 }, { "epoch": 0.66, "grad_norm": 0.797887921333313, "learning_rate": 3.383492932056544e-05, "loss": 0.3355, "step": 1451 }, { "epoch": 0.66, "grad_norm": 0.8364313244819641, "learning_rate": 3.378932968536252e-05, "loss": 0.3445, "step": 1452 }, { "epoch": 0.66, "grad_norm": 0.7586292028427124, "learning_rate": 3.37437300501596e-05, "loss": 0.3252, "step": 1453 }, { "epoch": 0.66, "grad_norm": 0.8402112126350403, "learning_rate": 3.369813041495668e-05, "loss": 0.3474, "step": 1454 }, { "epoch": 0.66, "grad_norm": 0.7776329517364502, "learning_rate": 3.365253077975376e-05, "loss": 0.3259, "step": 1455 }, { "epoch": 0.66, "grad_norm": 0.8207614421844482, "learning_rate": 3.3606931144550845e-05, "loss": 0.3412, "step": 1456 }, { "epoch": 0.66, "grad_norm": 0.7944803833961487, "learning_rate": 3.3561331509347926e-05, "loss": 0.3524, "step": 1457 }, { "epoch": 0.66, "grad_norm": 0.8181450963020325, "learning_rate": 3.351573187414501e-05, "loss": 0.329, "step": 1458 }, { "epoch": 0.67, "grad_norm": 0.8156586289405823, "learning_rate": 3.347013223894209e-05, "loss": 0.3303, "step": 1459 }, { "epoch": 0.67, "grad_norm": 0.8118116855621338, "learning_rate": 3.342453260373917e-05, "loss": 0.3571, "step": 1460 }, { "epoch": 0.67, "eval_loss": 0.3578401207923889, "eval_runtime": 18.3051, "eval_samples_per_second": 1.53, "eval_steps_per_second": 0.382, "step": 1460 }, { "epoch": 0.67, "grad_norm": 0.8032804727554321, "learning_rate": 3.337893296853625e-05, "loss": 0.337, "step": 1461 }, { "epoch": 0.67, "grad_norm": 0.8342922329902649, "learning_rate": 3.3333333333333335e-05, "loss": 0.3402, "step": 1462 }, { "epoch": 0.67, "grad_norm": 0.7907533049583435, "learning_rate": 3.328773369813042e-05, "loss": 0.3387, "step": 1463 }, { "epoch": 0.67, "grad_norm": 0.830132782459259, "learning_rate": 3.32421340629275e-05, "loss": 0.3501, "step": 1464 }, { "epoch": 0.67, "grad_norm": 0.8640668392181396, "learning_rate": 3.319653442772458e-05, "loss": 0.3561, "step": 1465 }, { "epoch": 0.67, "grad_norm": 0.8216010928153992, "learning_rate": 3.315093479252166e-05, "loss": 0.3442, "step": 1466 }, { "epoch": 0.67, "grad_norm": 0.8106178641319275, "learning_rate": 3.3105335157318744e-05, "loss": 0.3355, "step": 1467 }, { "epoch": 0.67, "grad_norm": 0.7535154819488525, "learning_rate": 3.3059735522115825e-05, "loss": 0.3161, "step": 1468 }, { "epoch": 0.67, "grad_norm": 0.8195207715034485, "learning_rate": 3.301413588691291e-05, "loss": 0.3475, "step": 1469 }, { "epoch": 0.67, "grad_norm": 0.8379241228103638, "learning_rate": 3.296853625170999e-05, "loss": 0.3575, "step": 1470 }, { "epoch": 0.67, "eval_loss": 0.3576245903968811, "eval_runtime": 18.3426, "eval_samples_per_second": 1.527, "eval_steps_per_second": 0.382, "step": 1470 }, { "epoch": 0.67, "grad_norm": 0.8420901894569397, "learning_rate": 3.292293661650707e-05, "loss": 0.3288, "step": 1471 }, { "epoch": 0.67, "grad_norm": 0.8348906636238098, "learning_rate": 3.287733698130415e-05, "loss": 0.3527, "step": 1472 }, { "epoch": 0.67, "grad_norm": 0.8279021382331848, "learning_rate": 3.2831737346101234e-05, "loss": 0.3458, "step": 1473 }, { "epoch": 0.67, "grad_norm": 0.8192549347877502, "learning_rate": 3.2786137710898316e-05, "loss": 0.3648, "step": 1474 }, { "epoch": 0.67, "grad_norm": 0.8519864678382874, "learning_rate": 3.27405380756954e-05, "loss": 0.344, "step": 1475 }, { "epoch": 0.67, "grad_norm": 0.8401514887809753, "learning_rate": 3.269493844049248e-05, "loss": 0.3529, "step": 1476 }, { "epoch": 0.67, "grad_norm": 0.8426647186279297, "learning_rate": 3.264933880528956e-05, "loss": 0.3453, "step": 1477 }, { "epoch": 0.67, "grad_norm": 0.7950509786605835, "learning_rate": 3.2603739170086636e-05, "loss": 0.3472, "step": 1478 }, { "epoch": 0.67, "grad_norm": 0.8634794354438782, "learning_rate": 3.2558139534883724e-05, "loss": 0.3521, "step": 1479 }, { "epoch": 0.67, "grad_norm": 0.8343892693519592, "learning_rate": 3.2512539899680806e-05, "loss": 0.3439, "step": 1480 }, { "epoch": 0.67, "eval_loss": 0.35794949531555176, "eval_runtime": 19.3365, "eval_samples_per_second": 1.448, "eval_steps_per_second": 0.362, "step": 1480 }, { "epoch": 0.68, "grad_norm": 0.7715625762939453, "learning_rate": 3.246694026447789e-05, "loss": 0.3181, "step": 1481 }, { "epoch": 0.68, "grad_norm": 0.8029360771179199, "learning_rate": 3.242134062927496e-05, "loss": 0.3261, "step": 1482 }, { "epoch": 0.68, "grad_norm": 0.8179964423179626, "learning_rate": 3.237574099407205e-05, "loss": 0.3252, "step": 1483 }, { "epoch": 0.68, "grad_norm": 0.8506777882575989, "learning_rate": 3.233014135886913e-05, "loss": 0.3393, "step": 1484 }, { "epoch": 0.68, "grad_norm": 0.8550512790679932, "learning_rate": 3.2284541723666215e-05, "loss": 0.3387, "step": 1485 }, { "epoch": 0.68, "grad_norm": 0.8071249723434448, "learning_rate": 3.223894208846329e-05, "loss": 0.3279, "step": 1486 }, { "epoch": 0.68, "grad_norm": 0.809755265712738, "learning_rate": 3.219334245326038e-05, "loss": 0.3551, "step": 1487 }, { "epoch": 0.68, "grad_norm": 0.8375426530838013, "learning_rate": 3.214774281805746e-05, "loss": 0.3615, "step": 1488 }, { "epoch": 0.68, "grad_norm": 0.8345934152603149, "learning_rate": 3.2102143182854535e-05, "loss": 0.3507, "step": 1489 }, { "epoch": 0.68, "grad_norm": 0.8337054252624512, "learning_rate": 3.205654354765162e-05, "loss": 0.3466, "step": 1490 }, { "epoch": 0.68, "eval_loss": 0.3576383888721466, "eval_runtime": 19.4342, "eval_samples_per_second": 1.441, "eval_steps_per_second": 0.36, "step": 1490 }, { "epoch": 0.68, "grad_norm": 0.8991925716400146, "learning_rate": 3.2010943912448705e-05, "loss": 0.3451, "step": 1491 }, { "epoch": 0.68, "grad_norm": 0.8050253987312317, "learning_rate": 3.196534427724579e-05, "loss": 0.3535, "step": 1492 }, { "epoch": 0.68, "grad_norm": 0.8160567283630371, "learning_rate": 3.191974464204286e-05, "loss": 0.3394, "step": 1493 }, { "epoch": 0.68, "grad_norm": 0.8046148419380188, "learning_rate": 3.1874145006839944e-05, "loss": 0.3445, "step": 1494 }, { "epoch": 0.68, "grad_norm": 0.7577115297317505, "learning_rate": 3.182854537163703e-05, "loss": 0.3421, "step": 1495 }, { "epoch": 0.68, "grad_norm": 0.8331876993179321, "learning_rate": 3.1782945736434114e-05, "loss": 0.3487, "step": 1496 }, { "epoch": 0.68, "grad_norm": 0.7791025042533875, "learning_rate": 3.173734610123119e-05, "loss": 0.3306, "step": 1497 }, { "epoch": 0.68, "grad_norm": 0.8634000420570374, "learning_rate": 3.169174646602827e-05, "loss": 0.3486, "step": 1498 }, { "epoch": 0.68, "grad_norm": 0.8452799916267395, "learning_rate": 3.164614683082536e-05, "loss": 0.353, "step": 1499 }, { "epoch": 0.68, "grad_norm": 0.8176222443580627, "learning_rate": 3.160054719562244e-05, "loss": 0.342, "step": 1500 }, { "epoch": 0.68, "eval_loss": 0.35751423239707947, "eval_runtime": 18.9707, "eval_samples_per_second": 1.476, "eval_steps_per_second": 0.369, "step": 1500 }, { "epoch": 0.68, "grad_norm": 0.846010684967041, "learning_rate": 3.1554947560419516e-05, "loss": 0.3651, "step": 1501 }, { "epoch": 0.68, "grad_norm": 0.8323935270309448, "learning_rate": 3.15093479252166e-05, "loss": 0.3459, "step": 1502 }, { "epoch": 0.69, "grad_norm": 0.9228621125221252, "learning_rate": 3.1463748290013686e-05, "loss": 0.3485, "step": 1503 }, { "epoch": 0.69, "grad_norm": 0.8141658902168274, "learning_rate": 3.141814865481076e-05, "loss": 0.3349, "step": 1504 }, { "epoch": 0.69, "grad_norm": 0.8556486964225769, "learning_rate": 3.137254901960784e-05, "loss": 0.3566, "step": 1505 }, { "epoch": 0.69, "grad_norm": 0.858826220035553, "learning_rate": 3.1326949384404924e-05, "loss": 0.3347, "step": 1506 }, { "epoch": 0.69, "grad_norm": 0.8861152529716492, "learning_rate": 3.128134974920201e-05, "loss": 0.3613, "step": 1507 }, { "epoch": 0.69, "grad_norm": 0.8302914500236511, "learning_rate": 3.123575011399909e-05, "loss": 0.3429, "step": 1508 }, { "epoch": 0.69, "grad_norm": 0.8661092519760132, "learning_rate": 3.119015047879617e-05, "loss": 0.3577, "step": 1509 }, { "epoch": 0.69, "grad_norm": 0.813862681388855, "learning_rate": 3.114455084359325e-05, "loss": 0.3321, "step": 1510 }, { "epoch": 0.69, "eval_loss": 0.35740190744400024, "eval_runtime": 19.4999, "eval_samples_per_second": 1.436, "eval_steps_per_second": 0.359, "step": 1510 }, { "epoch": 0.69, "grad_norm": 0.7683899998664856, "learning_rate": 3.109895120839034e-05, "loss": 0.3428, "step": 1511 }, { "epoch": 0.69, "grad_norm": 0.8115766048431396, "learning_rate": 3.1053351573187415e-05, "loss": 0.3279, "step": 1512 }, { "epoch": 0.69, "grad_norm": 0.8759005665779114, "learning_rate": 3.1007751937984497e-05, "loss": 0.3459, "step": 1513 }, { "epoch": 0.69, "grad_norm": 0.8469499945640564, "learning_rate": 3.096215230278158e-05, "loss": 0.3339, "step": 1514 }, { "epoch": 0.69, "grad_norm": 0.8490378856658936, "learning_rate": 3.091655266757866e-05, "loss": 0.3347, "step": 1515 }, { "epoch": 0.69, "grad_norm": 0.814757764339447, "learning_rate": 3.087095303237574e-05, "loss": 0.3518, "step": 1516 }, { "epoch": 0.69, "grad_norm": 0.8592610955238342, "learning_rate": 3.0825353397172823e-05, "loss": 0.3453, "step": 1517 }, { "epoch": 0.69, "grad_norm": 0.7975971102714539, "learning_rate": 3.0779753761969905e-05, "loss": 0.3437, "step": 1518 }, { "epoch": 0.69, "grad_norm": 0.8102256655693054, "learning_rate": 3.073415412676699e-05, "loss": 0.3234, "step": 1519 }, { "epoch": 0.69, "grad_norm": 0.807258129119873, "learning_rate": 3.068855449156407e-05, "loss": 0.3484, "step": 1520 }, { "epoch": 0.69, "eval_loss": 0.3571014106273651, "eval_runtime": 19.2314, "eval_samples_per_second": 1.456, "eval_steps_per_second": 0.364, "step": 1520 }, { "epoch": 0.69, "grad_norm": 0.7736379504203796, "learning_rate": 3.064295485636115e-05, "loss": 0.3372, "step": 1521 }, { "epoch": 0.69, "grad_norm": 0.8541015982627869, "learning_rate": 3.059735522115823e-05, "loss": 0.3479, "step": 1522 }, { "epoch": 0.69, "grad_norm": 0.8304355144500732, "learning_rate": 3.0551755585955314e-05, "loss": 0.3629, "step": 1523 }, { "epoch": 0.69, "grad_norm": 0.8115606904029846, "learning_rate": 3.0506155950752396e-05, "loss": 0.3272, "step": 1524 }, { "epoch": 0.7, "grad_norm": 0.8201666474342346, "learning_rate": 3.0460556315549477e-05, "loss": 0.3396, "step": 1525 }, { "epoch": 0.7, "grad_norm": 0.8593149185180664, "learning_rate": 3.0414956680346556e-05, "loss": 0.3488, "step": 1526 }, { "epoch": 0.7, "grad_norm": 0.798021137714386, "learning_rate": 3.0369357045143644e-05, "loss": 0.3414, "step": 1527 }, { "epoch": 0.7, "grad_norm": 0.8162714838981628, "learning_rate": 3.0323757409940722e-05, "loss": 0.3575, "step": 1528 }, { "epoch": 0.7, "grad_norm": 0.8344952464103699, "learning_rate": 3.0278157774737804e-05, "loss": 0.351, "step": 1529 }, { "epoch": 0.7, "grad_norm": 0.8304796814918518, "learning_rate": 3.0232558139534883e-05, "loss": 0.3339, "step": 1530 }, { "epoch": 0.7, "eval_loss": 0.3572339713573456, "eval_runtime": 19.1453, "eval_samples_per_second": 1.463, "eval_steps_per_second": 0.366, "step": 1530 }, { "epoch": 0.7, "grad_norm": 0.8597332239151001, "learning_rate": 3.0186958504331968e-05, "loss": 0.355, "step": 1531 }, { "epoch": 0.7, "grad_norm": 0.7662696242332458, "learning_rate": 3.014135886912905e-05, "loss": 0.3448, "step": 1532 }, { "epoch": 0.7, "grad_norm": 0.784625768661499, "learning_rate": 3.009575923392613e-05, "loss": 0.3426, "step": 1533 }, { "epoch": 0.7, "grad_norm": 0.8529240489006042, "learning_rate": 3.005015959872321e-05, "loss": 0.3435, "step": 1534 }, { "epoch": 0.7, "grad_norm": 0.8391616940498352, "learning_rate": 3.0004559963520295e-05, "loss": 0.3536, "step": 1535 }, { "epoch": 0.7, "grad_norm": 0.7962397336959839, "learning_rate": 2.9958960328317376e-05, "loss": 0.3222, "step": 1536 }, { "epoch": 0.7, "grad_norm": 0.8026924729347229, "learning_rate": 2.9913360693114455e-05, "loss": 0.3562, "step": 1537 }, { "epoch": 0.7, "grad_norm": 0.8800778388977051, "learning_rate": 2.9867761057911536e-05, "loss": 0.3515, "step": 1538 }, { "epoch": 0.7, "grad_norm": 0.8260602951049805, "learning_rate": 2.982216142270862e-05, "loss": 0.3402, "step": 1539 }, { "epoch": 0.7, "grad_norm": 0.850034773349762, "learning_rate": 2.9776561787505703e-05, "loss": 0.3627, "step": 1540 }, { "epoch": 0.7, "eval_loss": 0.35721954703330994, "eval_runtime": 18.9282, "eval_samples_per_second": 1.479, "eval_steps_per_second": 0.37, "step": 1540 }, { "epoch": 0.7, "grad_norm": 0.8191645741462708, "learning_rate": 2.973096215230278e-05, "loss": 0.3532, "step": 1541 }, { "epoch": 0.7, "grad_norm": 0.8150588274002075, "learning_rate": 2.9685362517099863e-05, "loss": 0.3476, "step": 1542 }, { "epoch": 0.7, "grad_norm": 0.7702752947807312, "learning_rate": 2.963976288189695e-05, "loss": 0.3301, "step": 1543 }, { "epoch": 0.7, "grad_norm": 0.8159235715866089, "learning_rate": 2.959416324669403e-05, "loss": 0.3163, "step": 1544 }, { "epoch": 0.7, "grad_norm": 0.8318021893501282, "learning_rate": 2.954856361149111e-05, "loss": 0.3352, "step": 1545 }, { "epoch": 0.7, "grad_norm": 0.826701283454895, "learning_rate": 2.950296397628819e-05, "loss": 0.3357, "step": 1546 }, { "epoch": 0.71, "grad_norm": 0.8172712922096252, "learning_rate": 2.9457364341085275e-05, "loss": 0.3364, "step": 1547 }, { "epoch": 0.71, "grad_norm": 0.8200088739395142, "learning_rate": 2.9411764705882354e-05, "loss": 0.3312, "step": 1548 }, { "epoch": 0.71, "grad_norm": 0.8334987759590149, "learning_rate": 2.9366165070679435e-05, "loss": 0.348, "step": 1549 }, { "epoch": 0.71, "grad_norm": 0.768750011920929, "learning_rate": 2.9320565435476517e-05, "loss": 0.3383, "step": 1550 }, { "epoch": 0.71, "eval_loss": 0.35663700103759766, "eval_runtime": 19.07, "eval_samples_per_second": 1.468, "eval_steps_per_second": 0.367, "step": 1550 }, { "epoch": 0.71, "grad_norm": 0.8347567915916443, "learning_rate": 2.9274965800273602e-05, "loss": 0.3559, "step": 1551 }, { "epoch": 0.71, "grad_norm": 0.8524714708328247, "learning_rate": 2.922936616507068e-05, "loss": 0.3625, "step": 1552 }, { "epoch": 0.71, "grad_norm": 0.7744024991989136, "learning_rate": 2.9183766529867762e-05, "loss": 0.3524, "step": 1553 }, { "epoch": 0.71, "grad_norm": 0.8230989575386047, "learning_rate": 2.913816689466484e-05, "loss": 0.3545, "step": 1554 }, { "epoch": 0.71, "grad_norm": 0.7706303596496582, "learning_rate": 2.909256725946193e-05, "loss": 0.3356, "step": 1555 }, { "epoch": 0.71, "grad_norm": 0.8915783762931824, "learning_rate": 2.9046967624259008e-05, "loss": 0.3761, "step": 1556 }, { "epoch": 0.71, "grad_norm": 0.8355619311332703, "learning_rate": 2.900136798905609e-05, "loss": 0.3756, "step": 1557 }, { "epoch": 0.71, "grad_norm": 0.8191080093383789, "learning_rate": 2.8955768353853168e-05, "loss": 0.3315, "step": 1558 }, { "epoch": 0.71, "grad_norm": 0.8697982430458069, "learning_rate": 2.8910168718650256e-05, "loss": 0.3592, "step": 1559 }, { "epoch": 0.71, "grad_norm": 0.8937227725982666, "learning_rate": 2.8864569083447335e-05, "loss": 0.3317, "step": 1560 }, { "epoch": 0.71, "eval_loss": 0.3572239875793457, "eval_runtime": 19.0073, "eval_samples_per_second": 1.473, "eval_steps_per_second": 0.368, "step": 1560 }, { "epoch": 0.71, "grad_norm": 0.833889365196228, "learning_rate": 2.8818969448244416e-05, "loss": 0.3533, "step": 1561 }, { "epoch": 0.71, "grad_norm": 0.8254353404045105, "learning_rate": 2.8773369813041495e-05, "loss": 0.3368, "step": 1562 }, { "epoch": 0.71, "grad_norm": 0.8069315552711487, "learning_rate": 2.872777017783858e-05, "loss": 0.338, "step": 1563 }, { "epoch": 0.71, "grad_norm": 0.7715954184532166, "learning_rate": 2.868217054263566e-05, "loss": 0.3254, "step": 1564 }, { "epoch": 0.71, "grad_norm": 0.8389305472373962, "learning_rate": 2.8636570907432743e-05, "loss": 0.3439, "step": 1565 }, { "epoch": 0.71, "grad_norm": 0.8371710777282715, "learning_rate": 2.859097127222982e-05, "loss": 0.3387, "step": 1566 }, { "epoch": 0.71, "grad_norm": 0.7831338047981262, "learning_rate": 2.8545371637026907e-05, "loss": 0.3127, "step": 1567 }, { "epoch": 0.71, "grad_norm": 0.8159829378128052, "learning_rate": 2.849977200182399e-05, "loss": 0.3264, "step": 1568 }, { "epoch": 0.72, "grad_norm": 0.8167608380317688, "learning_rate": 2.8454172366621067e-05, "loss": 0.3364, "step": 1569 }, { "epoch": 0.72, "grad_norm": 0.8175981640815735, "learning_rate": 2.840857273141815e-05, "loss": 0.3491, "step": 1570 }, { "epoch": 0.72, "eval_loss": 0.3574839234352112, "eval_runtime": 19.2429, "eval_samples_per_second": 1.455, "eval_steps_per_second": 0.364, "step": 1570 }, { "epoch": 0.72, "grad_norm": 0.8386175036430359, "learning_rate": 2.8362973096215234e-05, "loss": 0.3485, "step": 1571 }, { "epoch": 0.72, "grad_norm": 0.816241443157196, "learning_rate": 2.8317373461012315e-05, "loss": 0.3353, "step": 1572 }, { "epoch": 0.72, "grad_norm": 0.8434162139892578, "learning_rate": 2.8271773825809394e-05, "loss": 0.3846, "step": 1573 }, { "epoch": 0.72, "grad_norm": 0.791397213935852, "learning_rate": 2.8226174190606475e-05, "loss": 0.3335, "step": 1574 }, { "epoch": 0.72, "grad_norm": 0.8274080157279968, "learning_rate": 2.818057455540356e-05, "loss": 0.35, "step": 1575 }, { "epoch": 0.72, "grad_norm": 0.7708328366279602, "learning_rate": 2.8134974920200642e-05, "loss": 0.3089, "step": 1576 }, { "epoch": 0.72, "grad_norm": 0.846105694770813, "learning_rate": 2.808937528499772e-05, "loss": 0.3332, "step": 1577 }, { "epoch": 0.72, "grad_norm": 0.7850958108901978, "learning_rate": 2.8043775649794802e-05, "loss": 0.3181, "step": 1578 }, { "epoch": 0.72, "grad_norm": 0.8704512119293213, "learning_rate": 2.7998176014591887e-05, "loss": 0.3559, "step": 1579 }, { "epoch": 0.72, "grad_norm": 0.8448445796966553, "learning_rate": 2.7952576379388966e-05, "loss": 0.3612, "step": 1580 }, { "epoch": 0.72, "eval_loss": 0.35740068554878235, "eval_runtime": 19.5965, "eval_samples_per_second": 1.429, "eval_steps_per_second": 0.357, "step": 1580 }, { "epoch": 0.72, "grad_norm": 0.8353627920150757, "learning_rate": 2.7906976744186048e-05, "loss": 0.3496, "step": 1581 }, { "epoch": 0.72, "grad_norm": 0.8289371132850647, "learning_rate": 2.786137710898313e-05, "loss": 0.3447, "step": 1582 }, { "epoch": 0.72, "grad_norm": 0.8660280704498291, "learning_rate": 2.7815777473780214e-05, "loss": 0.341, "step": 1583 }, { "epoch": 0.72, "grad_norm": 0.8957594633102417, "learning_rate": 2.7770177838577293e-05, "loss": 0.3216, "step": 1584 }, { "epoch": 0.72, "grad_norm": 0.81903475522995, "learning_rate": 2.7724578203374374e-05, "loss": 0.3418, "step": 1585 }, { "epoch": 0.72, "grad_norm": 0.8132171630859375, "learning_rate": 2.7678978568171453e-05, "loss": 0.3466, "step": 1586 }, { "epoch": 0.72, "grad_norm": 0.8006194829940796, "learning_rate": 2.763337893296854e-05, "loss": 0.3377, "step": 1587 }, { "epoch": 0.72, "grad_norm": 0.8752909302711487, "learning_rate": 2.758777929776562e-05, "loss": 0.369, "step": 1588 }, { "epoch": 0.72, "grad_norm": 0.8540804386138916, "learning_rate": 2.75421796625627e-05, "loss": 0.3354, "step": 1589 }, { "epoch": 0.72, "grad_norm": 0.8105359077453613, "learning_rate": 2.749658002735978e-05, "loss": 0.3293, "step": 1590 }, { "epoch": 0.72, "eval_loss": 0.3572061359882355, "eval_runtime": 19.0182, "eval_samples_per_second": 1.472, "eval_steps_per_second": 0.368, "step": 1590 }, { "epoch": 0.73, "grad_norm": 0.8374319672584534, "learning_rate": 2.7450980392156865e-05, "loss": 0.3393, "step": 1591 }, { "epoch": 0.73, "grad_norm": 0.8734089732170105, "learning_rate": 2.7405380756953947e-05, "loss": 0.3525, "step": 1592 }, { "epoch": 0.73, "grad_norm": 0.8762404918670654, "learning_rate": 2.735978112175103e-05, "loss": 0.3683, "step": 1593 }, { "epoch": 0.73, "grad_norm": 0.8304470777511597, "learning_rate": 2.7314181486548107e-05, "loss": 0.3502, "step": 1594 }, { "epoch": 0.73, "grad_norm": 0.8588269948959351, "learning_rate": 2.7268581851345192e-05, "loss": 0.3464, "step": 1595 }, { "epoch": 0.73, "grad_norm": 0.7775711417198181, "learning_rate": 2.7222982216142274e-05, "loss": 0.3206, "step": 1596 }, { "epoch": 0.73, "grad_norm": 0.8479073643684387, "learning_rate": 2.7177382580939352e-05, "loss": 0.3568, "step": 1597 }, { "epoch": 0.73, "grad_norm": 0.8442942500114441, "learning_rate": 2.7131782945736434e-05, "loss": 0.3389, "step": 1598 }, { "epoch": 0.73, "grad_norm": 0.8447399735450745, "learning_rate": 2.708618331053352e-05, "loss": 0.3375, "step": 1599 }, { "epoch": 0.73, "grad_norm": 0.8228472471237183, "learning_rate": 2.70405836753306e-05, "loss": 0.3229, "step": 1600 }, { "epoch": 0.73, "eval_loss": 0.35655805468559265, "eval_runtime": 19.4148, "eval_samples_per_second": 1.442, "eval_steps_per_second": 0.361, "step": 1600 }, { "epoch": 0.73, "grad_norm": 0.8076717853546143, "learning_rate": 2.699498404012768e-05, "loss": 0.3336, "step": 1601 }, { "epoch": 0.73, "grad_norm": 0.8026668429374695, "learning_rate": 2.694938440492476e-05, "loss": 0.3298, "step": 1602 }, { "epoch": 0.73, "grad_norm": 0.801601767539978, "learning_rate": 2.6903784769721846e-05, "loss": 0.3445, "step": 1603 }, { "epoch": 0.73, "grad_norm": 0.8218444585800171, "learning_rate": 2.6858185134518927e-05, "loss": 0.3617, "step": 1604 }, { "epoch": 0.73, "grad_norm": 0.8179977536201477, "learning_rate": 2.6812585499316006e-05, "loss": 0.3248, "step": 1605 }, { "epoch": 0.73, "grad_norm": 0.7940379977226257, "learning_rate": 2.6766985864113087e-05, "loss": 0.3401, "step": 1606 }, { "epoch": 0.73, "grad_norm": 0.8855268359184265, "learning_rate": 2.6721386228910173e-05, "loss": 0.3692, "step": 1607 }, { "epoch": 0.73, "grad_norm": 0.816184937953949, "learning_rate": 2.6675786593707254e-05, "loss": 0.3462, "step": 1608 }, { "epoch": 0.73, "grad_norm": 0.8183460831642151, "learning_rate": 2.6630186958504333e-05, "loss": 0.3547, "step": 1609 }, { "epoch": 0.73, "grad_norm": 0.8423659801483154, "learning_rate": 2.6584587323301414e-05, "loss": 0.3534, "step": 1610 }, { "epoch": 0.73, "eval_loss": 0.35616493225097656, "eval_runtime": 19.1242, "eval_samples_per_second": 1.464, "eval_steps_per_second": 0.366, "step": 1610 }, { "epoch": 0.73, "grad_norm": 0.8094700574874878, "learning_rate": 2.65389876880985e-05, "loss": 0.3299, "step": 1611 }, { "epoch": 0.73, "grad_norm": 0.793865442276001, "learning_rate": 2.6493388052895578e-05, "loss": 0.3344, "step": 1612 }, { "epoch": 0.74, "grad_norm": 0.8043534159660339, "learning_rate": 2.644778841769266e-05, "loss": 0.3337, "step": 1613 }, { "epoch": 0.74, "grad_norm": 0.8780481219291687, "learning_rate": 2.6402188782489738e-05, "loss": 0.3823, "step": 1614 }, { "epoch": 0.74, "grad_norm": 0.8131493330001831, "learning_rate": 2.6356589147286826e-05, "loss": 0.3338, "step": 1615 }, { "epoch": 0.74, "grad_norm": 0.7759178280830383, "learning_rate": 2.6310989512083905e-05, "loss": 0.3128, "step": 1616 }, { "epoch": 0.74, "grad_norm": 0.7824622392654419, "learning_rate": 2.6265389876880987e-05, "loss": 0.3189, "step": 1617 }, { "epoch": 0.74, "grad_norm": 0.8490877151489258, "learning_rate": 2.6219790241678065e-05, "loss": 0.358, "step": 1618 }, { "epoch": 0.74, "grad_norm": 0.8330671191215515, "learning_rate": 2.6174190606475153e-05, "loss": 0.3603, "step": 1619 }, { "epoch": 0.74, "grad_norm": 0.8165590167045593, "learning_rate": 2.6128590971272232e-05, "loss": 0.3551, "step": 1620 }, { "epoch": 0.74, "eval_loss": 0.35663822293281555, "eval_runtime": 19.6772, "eval_samples_per_second": 1.423, "eval_steps_per_second": 0.356, "step": 1620 }, { "epoch": 0.74, "grad_norm": 0.8029173612594604, "learning_rate": 2.6082991336069313e-05, "loss": 0.3549, "step": 1621 }, { "epoch": 0.74, "grad_norm": 0.8408702611923218, "learning_rate": 2.6037391700866392e-05, "loss": 0.3584, "step": 1622 }, { "epoch": 0.74, "grad_norm": 0.7862275242805481, "learning_rate": 2.5991792065663477e-05, "loss": 0.3143, "step": 1623 }, { "epoch": 0.74, "grad_norm": 0.8487365245819092, "learning_rate": 2.594619243046056e-05, "loss": 0.3433, "step": 1624 }, { "epoch": 0.74, "grad_norm": 0.8069978356361389, "learning_rate": 2.590059279525764e-05, "loss": 0.3384, "step": 1625 }, { "epoch": 0.74, "grad_norm": 0.81746906042099, "learning_rate": 2.585499316005472e-05, "loss": 0.3288, "step": 1626 }, { "epoch": 0.74, "grad_norm": 0.8378018140792847, "learning_rate": 2.5809393524851804e-05, "loss": 0.3524, "step": 1627 }, { "epoch": 0.74, "grad_norm": 0.7984631061553955, "learning_rate": 2.5763793889648886e-05, "loss": 0.3194, "step": 1628 }, { "epoch": 0.74, "grad_norm": 0.8371167182922363, "learning_rate": 2.5718194254445964e-05, "loss": 0.3591, "step": 1629 }, { "epoch": 0.74, "grad_norm": 0.8061546087265015, "learning_rate": 2.5672594619243046e-05, "loss": 0.3361, "step": 1630 }, { "epoch": 0.74, "eval_loss": 0.3569858968257904, "eval_runtime": 19.0111, "eval_samples_per_second": 1.473, "eval_steps_per_second": 0.368, "step": 1630 }, { "epoch": 0.74, "grad_norm": 0.8967863321304321, "learning_rate": 2.562699498404013e-05, "loss": 0.3553, "step": 1631 }, { "epoch": 0.74, "grad_norm": 0.8157939314842224, "learning_rate": 2.5581395348837212e-05, "loss": 0.3482, "step": 1632 }, { "epoch": 0.74, "grad_norm": 0.835759699344635, "learning_rate": 2.553579571363429e-05, "loss": 0.3522, "step": 1633 }, { "epoch": 0.74, "grad_norm": 0.8867244124412537, "learning_rate": 2.5490196078431373e-05, "loss": 0.366, "step": 1634 }, { "epoch": 0.75, "grad_norm": 0.8107739090919495, "learning_rate": 2.5444596443228458e-05, "loss": 0.3436, "step": 1635 }, { "epoch": 0.75, "grad_norm": 0.8030595183372498, "learning_rate": 2.539899680802554e-05, "loss": 0.3246, "step": 1636 }, { "epoch": 0.75, "grad_norm": 0.812812328338623, "learning_rate": 2.5353397172822618e-05, "loss": 0.3518, "step": 1637 }, { "epoch": 0.75, "grad_norm": 0.8139852285385132, "learning_rate": 2.53077975376197e-05, "loss": 0.3356, "step": 1638 }, { "epoch": 0.75, "grad_norm": 0.795472264289856, "learning_rate": 2.5262197902416785e-05, "loss": 0.3509, "step": 1639 }, { "epoch": 0.75, "grad_norm": 0.8404377698898315, "learning_rate": 2.5216598267213863e-05, "loss": 0.3487, "step": 1640 }, { "epoch": 0.75, "eval_loss": 0.3569706976413727, "eval_runtime": 19.0009, "eval_samples_per_second": 1.474, "eval_steps_per_second": 0.368, "step": 1640 }, { "epoch": 0.75, "grad_norm": 0.7934169173240662, "learning_rate": 2.5170998632010945e-05, "loss": 0.3427, "step": 1641 }, { "epoch": 0.75, "grad_norm": 0.790808379650116, "learning_rate": 2.5125398996808026e-05, "loss": 0.3334, "step": 1642 }, { "epoch": 0.75, "grad_norm": 0.828813910484314, "learning_rate": 2.507979936160511e-05, "loss": 0.3553, "step": 1643 }, { "epoch": 0.75, "grad_norm": 0.8527178764343262, "learning_rate": 2.503419972640219e-05, "loss": 0.3434, "step": 1644 }, { "epoch": 0.75, "grad_norm": 0.8209643959999084, "learning_rate": 2.498860009119927e-05, "loss": 0.3558, "step": 1645 }, { "epoch": 0.75, "grad_norm": 0.7981570363044739, "learning_rate": 2.4943000455996353e-05, "loss": 0.3192, "step": 1646 }, { "epoch": 0.75, "grad_norm": 0.8820638060569763, "learning_rate": 2.4897400820793435e-05, "loss": 0.3494, "step": 1647 }, { "epoch": 0.75, "grad_norm": 0.7953222393989563, "learning_rate": 2.4851801185590517e-05, "loss": 0.3244, "step": 1648 }, { "epoch": 0.75, "grad_norm": 0.8022490739822388, "learning_rate": 2.48062015503876e-05, "loss": 0.346, "step": 1649 }, { "epoch": 0.75, "grad_norm": 0.8578386306762695, "learning_rate": 2.476060191518468e-05, "loss": 0.3375, "step": 1650 }, { "epoch": 0.75, "eval_loss": 0.3572315573692322, "eval_runtime": 19.0785, "eval_samples_per_second": 1.468, "eval_steps_per_second": 0.367, "step": 1650 }, { "epoch": 0.75, "grad_norm": 0.8319166302680969, "learning_rate": 2.4715002279981762e-05, "loss": 0.351, "step": 1651 }, { "epoch": 0.75, "grad_norm": 0.840707004070282, "learning_rate": 2.4669402644778844e-05, "loss": 0.34, "step": 1652 }, { "epoch": 0.75, "grad_norm": 0.7389063835144043, "learning_rate": 2.4623803009575925e-05, "loss": 0.3147, "step": 1653 }, { "epoch": 0.75, "grad_norm": 0.7972502708435059, "learning_rate": 2.4578203374373007e-05, "loss": 0.3216, "step": 1654 }, { "epoch": 0.75, "grad_norm": 0.8653942942619324, "learning_rate": 2.453260373917009e-05, "loss": 0.3818, "step": 1655 }, { "epoch": 0.75, "grad_norm": 0.8010607361793518, "learning_rate": 2.448700410396717e-05, "loss": 0.3402, "step": 1656 }, { "epoch": 0.76, "grad_norm": 0.7780823111534119, "learning_rate": 2.444140446876425e-05, "loss": 0.3322, "step": 1657 }, { "epoch": 0.76, "grad_norm": 0.809596061706543, "learning_rate": 2.4395804833561334e-05, "loss": 0.346, "step": 1658 }, { "epoch": 0.76, "grad_norm": 0.8238269090652466, "learning_rate": 2.4350205198358412e-05, "loss": 0.3567, "step": 1659 }, { "epoch": 0.76, "grad_norm": 0.8283640742301941, "learning_rate": 2.4304605563155498e-05, "loss": 0.3527, "step": 1660 }, { "epoch": 0.76, "eval_loss": 0.3569982647895813, "eval_runtime": 19.2893, "eval_samples_per_second": 1.452, "eval_steps_per_second": 0.363, "step": 1660 }, { "epoch": 0.76, "grad_norm": 0.8536205887794495, "learning_rate": 2.4259005927952576e-05, "loss": 0.3364, "step": 1661 }, { "epoch": 0.76, "grad_norm": 0.8689197301864624, "learning_rate": 2.421340629274966e-05, "loss": 0.3603, "step": 1662 }, { "epoch": 0.76, "grad_norm": 0.861287534236908, "learning_rate": 2.416780665754674e-05, "loss": 0.3639, "step": 1663 }, { "epoch": 0.76, "grad_norm": 0.826146125793457, "learning_rate": 2.4122207022343825e-05, "loss": 0.3246, "step": 1664 }, { "epoch": 0.76, "grad_norm": 0.8066340684890747, "learning_rate": 2.4076607387140903e-05, "loss": 0.3522, "step": 1665 }, { "epoch": 0.76, "grad_norm": 0.8642566800117493, "learning_rate": 2.4031007751937988e-05, "loss": 0.3517, "step": 1666 }, { "epoch": 0.76, "grad_norm": 0.8636859059333801, "learning_rate": 2.3985408116735066e-05, "loss": 0.3675, "step": 1667 }, { "epoch": 0.76, "grad_norm": 0.7902210354804993, "learning_rate": 2.393980848153215e-05, "loss": 0.3311, "step": 1668 }, { "epoch": 0.76, "grad_norm": 0.8061851263046265, "learning_rate": 2.389420884632923e-05, "loss": 0.3454, "step": 1669 }, { "epoch": 0.76, "grad_norm": 0.7799895405769348, "learning_rate": 2.384860921112631e-05, "loss": 0.3306, "step": 1670 }, { "epoch": 0.76, "eval_loss": 0.3564049303531647, "eval_runtime": 18.9703, "eval_samples_per_second": 1.476, "eval_steps_per_second": 0.369, "step": 1670 }, { "epoch": 0.76, "grad_norm": 0.8299709558486938, "learning_rate": 2.3803009575923393e-05, "loss": 0.3483, "step": 1671 }, { "epoch": 0.76, "grad_norm": 0.8506625890731812, "learning_rate": 2.3757409940720475e-05, "loss": 0.3296, "step": 1672 }, { "epoch": 0.76, "grad_norm": 0.8220917582511902, "learning_rate": 2.3711810305517557e-05, "loss": 0.332, "step": 1673 }, { "epoch": 0.76, "grad_norm": 0.7895132303237915, "learning_rate": 2.366621067031464e-05, "loss": 0.3298, "step": 1674 }, { "epoch": 0.76, "grad_norm": 0.8111960291862488, "learning_rate": 2.362061103511172e-05, "loss": 0.3112, "step": 1675 }, { "epoch": 0.76, "grad_norm": 0.8449161648750305, "learning_rate": 2.3575011399908802e-05, "loss": 0.3542, "step": 1676 }, { "epoch": 0.76, "grad_norm": 0.8357505798339844, "learning_rate": 2.3529411764705884e-05, "loss": 0.3445, "step": 1677 }, { "epoch": 0.76, "grad_norm": 0.8511107563972473, "learning_rate": 2.3483812129502965e-05, "loss": 0.3456, "step": 1678 }, { "epoch": 0.77, "grad_norm": 0.8281843662261963, "learning_rate": 2.3438212494300047e-05, "loss": 0.347, "step": 1679 }, { "epoch": 0.77, "grad_norm": 0.8068907856941223, "learning_rate": 2.339261285909713e-05, "loss": 0.3376, "step": 1680 }, { "epoch": 0.77, "eval_loss": 0.35658687353134155, "eval_runtime": 19.0153, "eval_samples_per_second": 1.472, "eval_steps_per_second": 0.368, "step": 1680 }, { "epoch": 0.77, "grad_norm": 0.8254083395004272, "learning_rate": 2.334701322389421e-05, "loss": 0.3374, "step": 1681 }, { "epoch": 0.77, "grad_norm": 0.8004122376441956, "learning_rate": 2.3301413588691292e-05, "loss": 0.335, "step": 1682 }, { "epoch": 0.77, "grad_norm": 0.8271961212158203, "learning_rate": 2.3255813953488374e-05, "loss": 0.364, "step": 1683 }, { "epoch": 0.77, "grad_norm": 0.8114727735519409, "learning_rate": 2.3210214318285456e-05, "loss": 0.3342, "step": 1684 }, { "epoch": 0.77, "grad_norm": 0.8433076739311218, "learning_rate": 2.3164614683082538e-05, "loss": 0.3531, "step": 1685 }, { "epoch": 0.77, "grad_norm": 0.8108780980110168, "learning_rate": 2.311901504787962e-05, "loss": 0.3306, "step": 1686 }, { "epoch": 0.77, "grad_norm": 0.8376524448394775, "learning_rate": 2.30734154126767e-05, "loss": 0.3405, "step": 1687 }, { "epoch": 0.77, "grad_norm": 0.8186554908752441, "learning_rate": 2.3027815777473783e-05, "loss": 0.3366, "step": 1688 }, { "epoch": 0.77, "grad_norm": 0.7797662615776062, "learning_rate": 2.298221614227086e-05, "loss": 0.3315, "step": 1689 }, { "epoch": 0.77, "grad_norm": 0.7970744371414185, "learning_rate": 2.2936616507067946e-05, "loss": 0.3343, "step": 1690 }, { "epoch": 0.77, "eval_loss": 0.3567172586917877, "eval_runtime": 19.375, "eval_samples_per_second": 1.445, "eval_steps_per_second": 0.361, "step": 1690 }, { "epoch": 0.77, "grad_norm": 0.7889989614486694, "learning_rate": 2.2891016871865025e-05, "loss": 0.3281, "step": 1691 }, { "epoch": 0.77, "grad_norm": 0.8547121286392212, "learning_rate": 2.284541723666211e-05, "loss": 0.3495, "step": 1692 }, { "epoch": 0.77, "grad_norm": 0.8533220887184143, "learning_rate": 2.2799817601459188e-05, "loss": 0.3511, "step": 1693 }, { "epoch": 0.77, "grad_norm": 0.9014710783958435, "learning_rate": 2.2754217966256273e-05, "loss": 0.3481, "step": 1694 }, { "epoch": 0.77, "grad_norm": 0.8289618492126465, "learning_rate": 2.270861833105335e-05, "loss": 0.339, "step": 1695 }, { "epoch": 0.77, "grad_norm": 0.8249356150627136, "learning_rate": 2.2663018695850437e-05, "loss": 0.3538, "step": 1696 }, { "epoch": 0.77, "grad_norm": 0.8586651682853699, "learning_rate": 2.2617419060647515e-05, "loss": 0.359, "step": 1697 }, { "epoch": 0.77, "grad_norm": 0.8021293878555298, "learning_rate": 2.25718194254446e-05, "loss": 0.3428, "step": 1698 }, { "epoch": 0.77, "grad_norm": 0.8210233449935913, "learning_rate": 2.252621979024168e-05, "loss": 0.3393, "step": 1699 }, { "epoch": 0.77, "grad_norm": 0.8224427103996277, "learning_rate": 2.2480620155038764e-05, "loss": 0.3206, "step": 1700 }, { "epoch": 0.77, "eval_loss": 0.3561743199825287, "eval_runtime": 19.147, "eval_samples_per_second": 1.462, "eval_steps_per_second": 0.366, "step": 1700 }, { "epoch": 0.78, "grad_norm": 0.8186107873916626, "learning_rate": 2.2435020519835842e-05, "loss": 0.342, "step": 1701 }, { "epoch": 0.78, "grad_norm": 0.8090589046478271, "learning_rate": 2.2389420884632924e-05, "loss": 0.3182, "step": 1702 }, { "epoch": 0.78, "grad_norm": 0.8366579413414001, "learning_rate": 2.2343821249430005e-05, "loss": 0.3406, "step": 1703 }, { "epoch": 0.78, "grad_norm": 0.8310838341712952, "learning_rate": 2.2298221614227087e-05, "loss": 0.3156, "step": 1704 }, { "epoch": 0.78, "grad_norm": 0.829026997089386, "learning_rate": 2.225262197902417e-05, "loss": 0.3421, "step": 1705 }, { "epoch": 0.78, "grad_norm": 0.9288223385810852, "learning_rate": 2.220702234382125e-05, "loss": 0.3538, "step": 1706 }, { "epoch": 0.78, "grad_norm": 0.8505128026008606, "learning_rate": 2.2161422708618332e-05, "loss": 0.3605, "step": 1707 }, { "epoch": 0.78, "grad_norm": 0.8131338953971863, "learning_rate": 2.2115823073415414e-05, "loss": 0.3345, "step": 1708 }, { "epoch": 0.78, "grad_norm": 0.798833429813385, "learning_rate": 2.2070223438212496e-05, "loss": 0.3413, "step": 1709 }, { "epoch": 0.78, "grad_norm": 0.8337969183921814, "learning_rate": 2.2024623803009577e-05, "loss": 0.3494, "step": 1710 }, { "epoch": 0.78, "eval_loss": 0.3558688759803772, "eval_runtime": 19.1518, "eval_samples_per_second": 1.462, "eval_steps_per_second": 0.366, "step": 1710 }, { "epoch": 0.78, "grad_norm": 0.8072974681854248, "learning_rate": 2.197902416780666e-05, "loss": 0.3234, "step": 1711 }, { "epoch": 0.78, "grad_norm": 0.8112078309059143, "learning_rate": 2.193342453260374e-05, "loss": 0.3363, "step": 1712 }, { "epoch": 0.78, "grad_norm": 0.8293721079826355, "learning_rate": 2.1887824897400823e-05, "loss": 0.3503, "step": 1713 }, { "epoch": 0.78, "grad_norm": 0.8819985389709473, "learning_rate": 2.1842225262197904e-05, "loss": 0.3516, "step": 1714 }, { "epoch": 0.78, "grad_norm": 0.7671990990638733, "learning_rate": 2.1796625626994986e-05, "loss": 0.3333, "step": 1715 }, { "epoch": 0.78, "grad_norm": 0.845353364944458, "learning_rate": 2.1751025991792068e-05, "loss": 0.3412, "step": 1716 }, { "epoch": 0.78, "grad_norm": 0.8249948620796204, "learning_rate": 2.170542635658915e-05, "loss": 0.3608, "step": 1717 }, { "epoch": 0.78, "grad_norm": 0.7952206134796143, "learning_rate": 2.165982672138623e-05, "loss": 0.3408, "step": 1718 }, { "epoch": 0.78, "grad_norm": 0.8013413548469543, "learning_rate": 2.161422708618331e-05, "loss": 0.328, "step": 1719 }, { "epoch": 0.78, "grad_norm": 0.8645780682563782, "learning_rate": 2.1568627450980395e-05, "loss": 0.3608, "step": 1720 }, { "epoch": 0.78, "eval_loss": 0.3558249771595001, "eval_runtime": 18.9857, "eval_samples_per_second": 1.475, "eval_steps_per_second": 0.369, "step": 1720 }, { "epoch": 0.78, "grad_norm": 0.8145090937614441, "learning_rate": 2.1523027815777473e-05, "loss": 0.3374, "step": 1721 }, { "epoch": 0.78, "grad_norm": 0.8994247317314148, "learning_rate": 2.1477428180574558e-05, "loss": 0.3626, "step": 1722 }, { "epoch": 0.79, "grad_norm": 0.8226855397224426, "learning_rate": 2.1431828545371637e-05, "loss": 0.3389, "step": 1723 }, { "epoch": 0.79, "grad_norm": 0.8447384834289551, "learning_rate": 2.1386228910168722e-05, "loss": 0.3676, "step": 1724 }, { "epoch": 0.79, "grad_norm": 0.8145758509635925, "learning_rate": 2.13406292749658e-05, "loss": 0.3438, "step": 1725 }, { "epoch": 0.79, "grad_norm": 0.8028122782707214, "learning_rate": 2.1295029639762885e-05, "loss": 0.3405, "step": 1726 }, { "epoch": 0.79, "grad_norm": 0.8540904521942139, "learning_rate": 2.1249430004559964e-05, "loss": 0.3452, "step": 1727 }, { "epoch": 0.79, "grad_norm": 0.8273831605911255, "learning_rate": 2.120383036935705e-05, "loss": 0.3388, "step": 1728 }, { "epoch": 0.79, "grad_norm": 0.7830473780632019, "learning_rate": 2.1158230734154127e-05, "loss": 0.334, "step": 1729 }, { "epoch": 0.79, "grad_norm": 0.8468246459960938, "learning_rate": 2.1112631098951212e-05, "loss": 0.3433, "step": 1730 }, { "epoch": 0.79, "eval_loss": 0.3553706705570221, "eval_runtime": 18.8238, "eval_samples_per_second": 1.487, "eval_steps_per_second": 0.372, "step": 1730 }, { "epoch": 0.79, "grad_norm": 0.815864622592926, "learning_rate": 2.106703146374829e-05, "loss": 0.3376, "step": 1731 }, { "epoch": 0.79, "grad_norm": 0.804809033870697, "learning_rate": 2.1021431828545372e-05, "loss": 0.3272, "step": 1732 }, { "epoch": 0.79, "grad_norm": 0.7881250977516174, "learning_rate": 2.0975832193342454e-05, "loss": 0.3344, "step": 1733 }, { "epoch": 0.79, "grad_norm": 0.8245829343795776, "learning_rate": 2.0930232558139536e-05, "loss": 0.3478, "step": 1734 }, { "epoch": 0.79, "grad_norm": 0.8029740452766418, "learning_rate": 2.0884632922936617e-05, "loss": 0.3429, "step": 1735 }, { "epoch": 0.79, "grad_norm": 0.8557107448577881, "learning_rate": 2.08390332877337e-05, "loss": 0.3401, "step": 1736 }, { "epoch": 0.79, "grad_norm": 0.8481475114822388, "learning_rate": 2.079343365253078e-05, "loss": 0.334, "step": 1737 }, { "epoch": 0.79, "grad_norm": 0.8665690422058105, "learning_rate": 2.0747834017327863e-05, "loss": 0.3649, "step": 1738 }, { "epoch": 0.79, "grad_norm": 0.7907549738883972, "learning_rate": 2.0702234382124944e-05, "loss": 0.3306, "step": 1739 }, { "epoch": 0.79, "grad_norm": 0.811202347278595, "learning_rate": 2.0656634746922026e-05, "loss": 0.3336, "step": 1740 }, { "epoch": 0.79, "eval_loss": 0.3552098870277405, "eval_runtime": 19.0048, "eval_samples_per_second": 1.473, "eval_steps_per_second": 0.368, "step": 1740 }, { "epoch": 0.79, "grad_norm": 0.7975050210952759, "learning_rate": 2.0611035111719108e-05, "loss": 0.3339, "step": 1741 }, { "epoch": 0.79, "grad_norm": 0.8387274742126465, "learning_rate": 2.056543547651619e-05, "loss": 0.3443, "step": 1742 }, { "epoch": 0.79, "grad_norm": 0.8204060196876526, "learning_rate": 2.051983584131327e-05, "loss": 0.3371, "step": 1743 }, { "epoch": 0.79, "grad_norm": 0.8510503768920898, "learning_rate": 2.0474236206110353e-05, "loss": 0.3401, "step": 1744 }, { "epoch": 0.8, "grad_norm": 0.8482796549797058, "learning_rate": 2.0428636570907435e-05, "loss": 0.337, "step": 1745 }, { "epoch": 0.8, "grad_norm": 0.8208746910095215, "learning_rate": 2.0383036935704516e-05, "loss": 0.3312, "step": 1746 }, { "epoch": 0.8, "grad_norm": 0.8174223899841309, "learning_rate": 2.0337437300501598e-05, "loss": 0.341, "step": 1747 }, { "epoch": 0.8, "grad_norm": 0.805649995803833, "learning_rate": 2.029183766529868e-05, "loss": 0.3485, "step": 1748 }, { "epoch": 0.8, "grad_norm": 0.8207290768623352, "learning_rate": 2.0246238030095758e-05, "loss": 0.336, "step": 1749 }, { "epoch": 0.8, "grad_norm": 0.8540436029434204, "learning_rate": 2.0200638394892843e-05, "loss": 0.3536, "step": 1750 }, { "epoch": 0.8, "eval_loss": 0.3553099036216736, "eval_runtime": 18.9613, "eval_samples_per_second": 1.477, "eval_steps_per_second": 0.369, "step": 1750 }, { "epoch": 0.8, "grad_norm": 0.8083583116531372, "learning_rate": 2.0155038759689922e-05, "loss": 0.3209, "step": 1751 }, { "epoch": 0.8, "grad_norm": 0.826860249042511, "learning_rate": 2.0109439124487007e-05, "loss": 0.3353, "step": 1752 }, { "epoch": 0.8, "grad_norm": 0.8210761547088623, "learning_rate": 2.0063839489284085e-05, "loss": 0.3362, "step": 1753 }, { "epoch": 0.8, "grad_norm": 0.8487640619277954, "learning_rate": 2.001823985408117e-05, "loss": 0.3571, "step": 1754 }, { "epoch": 0.8, "grad_norm": 0.8051680326461792, "learning_rate": 1.997264021887825e-05, "loss": 0.3394, "step": 1755 }, { "epoch": 0.8, "grad_norm": 0.8569719791412354, "learning_rate": 1.9927040583675334e-05, "loss": 0.3325, "step": 1756 }, { "epoch": 0.8, "grad_norm": 0.8311398029327393, "learning_rate": 1.9881440948472412e-05, "loss": 0.3482, "step": 1757 }, { "epoch": 0.8, "grad_norm": 0.8272088766098022, "learning_rate": 1.9835841313269497e-05, "loss": 0.3486, "step": 1758 }, { "epoch": 0.8, "grad_norm": 1.4248172044754028, "learning_rate": 1.9790241678066576e-05, "loss": 0.3219, "step": 1759 }, { "epoch": 0.8, "grad_norm": 0.840705394744873, "learning_rate": 1.974464204286366e-05, "loss": 0.339, "step": 1760 }, { "epoch": 0.8, "eval_loss": 0.3546459376811981, "eval_runtime": 19.0023, "eval_samples_per_second": 1.474, "eval_steps_per_second": 0.368, "step": 1760 }, { "epoch": 0.8, "grad_norm": 0.8269831538200378, "learning_rate": 1.969904240766074e-05, "loss": 0.3262, "step": 1761 }, { "epoch": 0.8, "grad_norm": 0.8176106810569763, "learning_rate": 1.965344277245782e-05, "loss": 0.3215, "step": 1762 }, { "epoch": 0.8, "grad_norm": 0.8428852558135986, "learning_rate": 1.9607843137254903e-05, "loss": 0.3358, "step": 1763 }, { "epoch": 0.8, "grad_norm": 0.8273266553878784, "learning_rate": 1.9562243502051984e-05, "loss": 0.3521, "step": 1764 }, { "epoch": 0.8, "grad_norm": 0.8344984650611877, "learning_rate": 1.9516643866849066e-05, "loss": 0.3458, "step": 1765 }, { "epoch": 0.81, "grad_norm": 0.8323333859443665, "learning_rate": 1.9471044231646148e-05, "loss": 0.3543, "step": 1766 }, { "epoch": 0.81, "grad_norm": 0.8865499496459961, "learning_rate": 1.942544459644323e-05, "loss": 0.3572, "step": 1767 }, { "epoch": 0.81, "grad_norm": 0.8460800647735596, "learning_rate": 1.937984496124031e-05, "loss": 0.3425, "step": 1768 }, { "epoch": 0.81, "grad_norm": 0.830491304397583, "learning_rate": 1.9334245326037393e-05, "loss": 0.3302, "step": 1769 }, { "epoch": 0.81, "grad_norm": 0.831790030002594, "learning_rate": 1.9288645690834475e-05, "loss": 0.3305, "step": 1770 }, { "epoch": 0.81, "eval_loss": 0.35451439023017883, "eval_runtime": 18.9415, "eval_samples_per_second": 1.478, "eval_steps_per_second": 0.37, "step": 1770 }, { "epoch": 0.81, "grad_norm": 0.8443323373794556, "learning_rate": 1.9243046055631556e-05, "loss": 0.3514, "step": 1771 }, { "epoch": 0.81, "grad_norm": 0.8010796904563904, "learning_rate": 1.9197446420428638e-05, "loss": 0.3408, "step": 1772 }, { "epoch": 0.81, "grad_norm": 0.8368244767189026, "learning_rate": 1.915184678522572e-05, "loss": 0.349, "step": 1773 }, { "epoch": 0.81, "grad_norm": 0.8440476655960083, "learning_rate": 1.91062471500228e-05, "loss": 0.3515, "step": 1774 }, { "epoch": 0.81, "grad_norm": 0.8782945871353149, "learning_rate": 1.9060647514819883e-05, "loss": 0.3518, "step": 1775 }, { "epoch": 0.81, "grad_norm": 0.8282709121704102, "learning_rate": 1.9015047879616965e-05, "loss": 0.3378, "step": 1776 }, { "epoch": 0.81, "grad_norm": 0.8318287134170532, "learning_rate": 1.8969448244414047e-05, "loss": 0.3496, "step": 1777 }, { "epoch": 0.81, "grad_norm": 0.7970842123031616, "learning_rate": 1.892384860921113e-05, "loss": 0.3184, "step": 1778 }, { "epoch": 0.81, "grad_norm": 0.808746337890625, "learning_rate": 1.887824897400821e-05, "loss": 0.3422, "step": 1779 }, { "epoch": 0.81, "grad_norm": 0.844722330570221, "learning_rate": 1.8832649338805292e-05, "loss": 0.3535, "step": 1780 }, { "epoch": 0.81, "eval_loss": 0.35482144355773926, "eval_runtime": 20.0596, "eval_samples_per_second": 1.396, "eval_steps_per_second": 0.349, "step": 1780 }, { "epoch": 0.81, "grad_norm": 0.8527645468711853, "learning_rate": 1.878704970360237e-05, "loss": 0.3572, "step": 1781 }, { "epoch": 0.81, "grad_norm": 0.83580482006073, "learning_rate": 1.8741450068399455e-05, "loss": 0.3502, "step": 1782 }, { "epoch": 0.81, "grad_norm": 0.8025197386741638, "learning_rate": 1.8695850433196534e-05, "loss": 0.3241, "step": 1783 }, { "epoch": 0.81, "grad_norm": 0.7724499702453613, "learning_rate": 1.865025079799362e-05, "loss": 0.3245, "step": 1784 }, { "epoch": 0.81, "grad_norm": 0.8183644413948059, "learning_rate": 1.8604651162790697e-05, "loss": 0.3284, "step": 1785 }, { "epoch": 0.81, "grad_norm": 0.843235194683075, "learning_rate": 1.8559051527587782e-05, "loss": 0.3237, "step": 1786 }, { "epoch": 0.81, "grad_norm": 0.8600271940231323, "learning_rate": 1.851345189238486e-05, "loss": 0.3379, "step": 1787 }, { "epoch": 0.82, "grad_norm": 0.8103592395782471, "learning_rate": 1.8467852257181946e-05, "loss": 0.3451, "step": 1788 }, { "epoch": 0.82, "grad_norm": 0.9331439137458801, "learning_rate": 1.8422252621979024e-05, "loss": 0.3422, "step": 1789 }, { "epoch": 0.82, "grad_norm": 0.7992767095565796, "learning_rate": 1.837665298677611e-05, "loss": 0.3161, "step": 1790 }, { "epoch": 0.82, "eval_loss": 0.3550729751586914, "eval_runtime": 19.0724, "eval_samples_per_second": 1.468, "eval_steps_per_second": 0.367, "step": 1790 }, { "epoch": 0.82, "grad_norm": 0.8258519172668457, "learning_rate": 1.8331053351573188e-05, "loss": 0.3439, "step": 1791 }, { "epoch": 0.82, "grad_norm": 0.7907400727272034, "learning_rate": 1.8285453716370273e-05, "loss": 0.3261, "step": 1792 }, { "epoch": 0.82, "grad_norm": 0.8231396675109863, "learning_rate": 1.823985408116735e-05, "loss": 0.3452, "step": 1793 }, { "epoch": 0.82, "grad_norm": 0.8543897271156311, "learning_rate": 1.8194254445964433e-05, "loss": 0.3701, "step": 1794 }, { "epoch": 0.82, "grad_norm": 0.8704990744590759, "learning_rate": 1.8148654810761515e-05, "loss": 0.3536, "step": 1795 }, { "epoch": 0.82, "grad_norm": 0.846364438533783, "learning_rate": 1.8103055175558596e-05, "loss": 0.3428, "step": 1796 }, { "epoch": 0.82, "grad_norm": 0.871387779712677, "learning_rate": 1.8057455540355678e-05, "loss": 0.3574, "step": 1797 }, { "epoch": 0.82, "grad_norm": 0.8834625482559204, "learning_rate": 1.801185590515276e-05, "loss": 0.3618, "step": 1798 }, { "epoch": 0.82, "grad_norm": 0.8243701457977295, "learning_rate": 1.796625626994984e-05, "loss": 0.3322, "step": 1799 }, { "epoch": 0.82, "grad_norm": 0.8247841000556946, "learning_rate": 1.7920656634746923e-05, "loss": 0.3445, "step": 1800 }, { "epoch": 0.82, "eval_loss": 0.3546763062477112, "eval_runtime": 19.7315, "eval_samples_per_second": 1.419, "eval_steps_per_second": 0.355, "step": 1800 }, { "epoch": 0.82, "grad_norm": 0.8630883097648621, "learning_rate": 1.7875056999544005e-05, "loss": 0.3364, "step": 1801 }, { "epoch": 0.82, "grad_norm": 0.7862895131111145, "learning_rate": 1.7829457364341087e-05, "loss": 0.3234, "step": 1802 }, { "epoch": 0.82, "grad_norm": 0.8019478917121887, "learning_rate": 1.778385772913817e-05, "loss": 0.3326, "step": 1803 }, { "epoch": 0.82, "grad_norm": 0.8013519644737244, "learning_rate": 1.773825809393525e-05, "loss": 0.3209, "step": 1804 }, { "epoch": 0.82, "grad_norm": 0.8409405946731567, "learning_rate": 1.7692658458732332e-05, "loss": 0.337, "step": 1805 }, { "epoch": 0.82, "grad_norm": 0.8028327822685242, "learning_rate": 1.7647058823529414e-05, "loss": 0.3136, "step": 1806 }, { "epoch": 0.82, "grad_norm": 0.8128679990768433, "learning_rate": 1.7601459188326495e-05, "loss": 0.3217, "step": 1807 }, { "epoch": 0.82, "grad_norm": 0.8762690424919128, "learning_rate": 1.7555859553123577e-05, "loss": 0.3554, "step": 1808 }, { "epoch": 0.82, "grad_norm": 0.82847660779953, "learning_rate": 1.751025991792066e-05, "loss": 0.343, "step": 1809 }, { "epoch": 0.83, "grad_norm": 0.7954825162887573, "learning_rate": 1.746466028271774e-05, "loss": 0.3364, "step": 1810 }, { "epoch": 0.83, "eval_loss": 0.3547009825706482, "eval_runtime": 19.8414, "eval_samples_per_second": 1.411, "eval_steps_per_second": 0.353, "step": 1810 }, { "epoch": 0.83, "grad_norm": 0.8007568120956421, "learning_rate": 1.741906064751482e-05, "loss": 0.337, "step": 1811 }, { "epoch": 0.83, "grad_norm": 0.8554584980010986, "learning_rate": 1.7373461012311904e-05, "loss": 0.3576, "step": 1812 }, { "epoch": 0.83, "grad_norm": 0.8528220653533936, "learning_rate": 1.7327861377108982e-05, "loss": 0.3395, "step": 1813 }, { "epoch": 0.83, "grad_norm": 0.8310472965240479, "learning_rate": 1.7282261741906067e-05, "loss": 0.3271, "step": 1814 }, { "epoch": 0.83, "grad_norm": 0.7908942103385925, "learning_rate": 1.7236662106703146e-05, "loss": 0.3316, "step": 1815 }, { "epoch": 0.83, "grad_norm": 0.8840450644493103, "learning_rate": 1.719106247150023e-05, "loss": 0.3356, "step": 1816 }, { "epoch": 0.83, "grad_norm": 0.7694292068481445, "learning_rate": 1.714546283629731e-05, "loss": 0.3157, "step": 1817 }, { "epoch": 0.83, "grad_norm": 0.8402760028839111, "learning_rate": 1.7099863201094394e-05, "loss": 0.3444, "step": 1818 }, { "epoch": 0.83, "grad_norm": 0.8158255815505981, "learning_rate": 1.7054263565891473e-05, "loss": 0.3282, "step": 1819 }, { "epoch": 0.83, "grad_norm": 0.8101038336753845, "learning_rate": 1.7008663930688558e-05, "loss": 0.3357, "step": 1820 }, { "epoch": 0.83, "eval_loss": 0.35470905900001526, "eval_runtime": 22.0714, "eval_samples_per_second": 1.269, "eval_steps_per_second": 0.317, "step": 1820 }, { "epoch": 0.83, "grad_norm": 0.8301025032997131, "learning_rate": 1.6963064295485636e-05, "loss": 0.3401, "step": 1821 }, { "epoch": 0.83, "grad_norm": 0.8290889263153076, "learning_rate": 1.691746466028272e-05, "loss": 0.3482, "step": 1822 }, { "epoch": 0.83, "grad_norm": 0.8209647536277771, "learning_rate": 1.68718650250798e-05, "loss": 0.3564, "step": 1823 }, { "epoch": 0.83, "grad_norm": 0.8127288222312927, "learning_rate": 1.682626538987688e-05, "loss": 0.3335, "step": 1824 }, { "epoch": 0.83, "grad_norm": 0.7942671179771423, "learning_rate": 1.6780665754673963e-05, "loss": 0.3134, "step": 1825 }, { "epoch": 0.83, "grad_norm": 0.8012814521789551, "learning_rate": 1.6735066119471045e-05, "loss": 0.3361, "step": 1826 }, { "epoch": 0.83, "grad_norm": 0.8532559275627136, "learning_rate": 1.6689466484268127e-05, "loss": 0.3332, "step": 1827 }, { "epoch": 0.83, "grad_norm": 0.7754400372505188, "learning_rate": 1.664386684906521e-05, "loss": 0.318, "step": 1828 }, { "epoch": 0.83, "grad_norm": 0.8331125378608704, "learning_rate": 1.659826721386229e-05, "loss": 0.3358, "step": 1829 }, { "epoch": 0.83, "grad_norm": 0.817933201789856, "learning_rate": 1.6552667578659372e-05, "loss": 0.3332, "step": 1830 }, { "epoch": 0.83, "eval_loss": 0.354523628950119, "eval_runtime": 20.8147, "eval_samples_per_second": 1.345, "eval_steps_per_second": 0.336, "step": 1830 }, { "epoch": 0.83, "grad_norm": 0.7978525757789612, "learning_rate": 1.6507067943456454e-05, "loss": 0.3356, "step": 1831 }, { "epoch": 0.84, "grad_norm": 0.8010070323944092, "learning_rate": 1.6461468308253535e-05, "loss": 0.3361, "step": 1832 }, { "epoch": 0.84, "grad_norm": 0.8485461473464966, "learning_rate": 1.6415868673050617e-05, "loss": 0.3412, "step": 1833 }, { "epoch": 0.84, "grad_norm": 0.8424686789512634, "learning_rate": 1.63702690378477e-05, "loss": 0.3366, "step": 1834 }, { "epoch": 0.84, "grad_norm": 0.8157220482826233, "learning_rate": 1.632466940264478e-05, "loss": 0.3368, "step": 1835 }, { "epoch": 0.84, "grad_norm": 0.8582452535629272, "learning_rate": 1.6279069767441862e-05, "loss": 0.3612, "step": 1836 }, { "epoch": 0.84, "grad_norm": 0.8213275074958801, "learning_rate": 1.6233470132238944e-05, "loss": 0.341, "step": 1837 }, { "epoch": 0.84, "grad_norm": 0.8489278554916382, "learning_rate": 1.6187870497036026e-05, "loss": 0.3444, "step": 1838 }, { "epoch": 0.84, "grad_norm": 0.8359931707382202, "learning_rate": 1.6142270861833107e-05, "loss": 0.3361, "step": 1839 }, { "epoch": 0.84, "grad_norm": 0.7801775336265564, "learning_rate": 1.609667122663019e-05, "loss": 0.318, "step": 1840 }, { "epoch": 0.84, "eval_loss": 0.35444164276123047, "eval_runtime": 18.8656, "eval_samples_per_second": 1.484, "eval_steps_per_second": 0.371, "step": 1840 }, { "epoch": 0.84, "grad_norm": 0.8361044526100159, "learning_rate": 1.6051071591427267e-05, "loss": 0.3407, "step": 1841 }, { "epoch": 0.84, "grad_norm": 0.8520697951316833, "learning_rate": 1.6005471956224353e-05, "loss": 0.3422, "step": 1842 }, { "epoch": 0.84, "grad_norm": 0.7962945103645325, "learning_rate": 1.595987232102143e-05, "loss": 0.3387, "step": 1843 }, { "epoch": 0.84, "grad_norm": 0.8080679774284363, "learning_rate": 1.5914272685818516e-05, "loss": 0.3299, "step": 1844 }, { "epoch": 0.84, "grad_norm": 0.8117623925209045, "learning_rate": 1.5868673050615594e-05, "loss": 0.3327, "step": 1845 }, { "epoch": 0.84, "grad_norm": 0.852195143699646, "learning_rate": 1.582307341541268e-05, "loss": 0.3404, "step": 1846 }, { "epoch": 0.84, "grad_norm": 0.807907223701477, "learning_rate": 1.5777473780209758e-05, "loss": 0.3306, "step": 1847 }, { "epoch": 0.84, "grad_norm": 0.8273969888687134, "learning_rate": 1.5731874145006843e-05, "loss": 0.3441, "step": 1848 }, { "epoch": 0.84, "grad_norm": 0.8860800266265869, "learning_rate": 1.568627450980392e-05, "loss": 0.3282, "step": 1849 }, { "epoch": 0.84, "grad_norm": 0.8311830163002014, "learning_rate": 1.5640674874601006e-05, "loss": 0.3373, "step": 1850 }, { "epoch": 0.84, "eval_loss": 0.354708194732666, "eval_runtime": 19.9719, "eval_samples_per_second": 1.402, "eval_steps_per_second": 0.35, "step": 1850 }, { "epoch": 0.84, "grad_norm": 0.8048996329307556, "learning_rate": 1.5595075239398085e-05, "loss": 0.3388, "step": 1851 }, { "epoch": 0.84, "grad_norm": 0.7911148071289062, "learning_rate": 1.554947560419517e-05, "loss": 0.3256, "step": 1852 }, { "epoch": 0.84, "grad_norm": 0.8320807218551636, "learning_rate": 1.5503875968992248e-05, "loss": 0.3459, "step": 1853 }, { "epoch": 0.85, "grad_norm": 0.8004910945892334, "learning_rate": 1.545827633378933e-05, "loss": 0.3222, "step": 1854 }, { "epoch": 0.85, "grad_norm": 0.8975878357887268, "learning_rate": 1.5412676698586412e-05, "loss": 0.331, "step": 1855 }, { "epoch": 0.85, "grad_norm": 0.7683451771736145, "learning_rate": 1.5367077063383493e-05, "loss": 0.3269, "step": 1856 }, { "epoch": 0.85, "grad_norm": 0.8622913956642151, "learning_rate": 1.5321477428180575e-05, "loss": 0.3455, "step": 1857 }, { "epoch": 0.85, "grad_norm": 0.8696328997612, "learning_rate": 1.5275877792977657e-05, "loss": 0.3574, "step": 1858 }, { "epoch": 0.85, "grad_norm": 0.836586058139801, "learning_rate": 1.5230278157774739e-05, "loss": 0.3446, "step": 1859 }, { "epoch": 0.85, "grad_norm": 0.8396971225738525, "learning_rate": 1.5184678522571822e-05, "loss": 0.3375, "step": 1860 }, { "epoch": 0.85, "eval_loss": 0.35480833053588867, "eval_runtime": 20.1797, "eval_samples_per_second": 1.388, "eval_steps_per_second": 0.347, "step": 1860 }, { "epoch": 0.85, "grad_norm": 0.8306203484535217, "learning_rate": 1.5139078887368902e-05, "loss": 0.3373, "step": 1861 }, { "epoch": 0.85, "grad_norm": 0.803442656993866, "learning_rate": 1.5093479252165984e-05, "loss": 0.3243, "step": 1862 }, { "epoch": 0.85, "grad_norm": 0.8001576066017151, "learning_rate": 1.5047879616963066e-05, "loss": 0.3408, "step": 1863 }, { "epoch": 0.85, "grad_norm": 0.8718475103378296, "learning_rate": 1.5002279981760147e-05, "loss": 0.3316, "step": 1864 }, { "epoch": 0.85, "grad_norm": 0.8238996863365173, "learning_rate": 1.4956680346557227e-05, "loss": 0.3394, "step": 1865 }, { "epoch": 0.85, "grad_norm": 0.8301441669464111, "learning_rate": 1.491108071135431e-05, "loss": 0.326, "step": 1866 }, { "epoch": 0.85, "grad_norm": 0.8007735013961792, "learning_rate": 1.486548107615139e-05, "loss": 0.3308, "step": 1867 }, { "epoch": 0.85, "grad_norm": 0.8509636521339417, "learning_rate": 1.4819881440948474e-05, "loss": 0.3528, "step": 1868 }, { "epoch": 0.85, "grad_norm": 0.7968654632568359, "learning_rate": 1.4774281805745554e-05, "loss": 0.3355, "step": 1869 }, { "epoch": 0.85, "grad_norm": 0.7697134017944336, "learning_rate": 1.4728682170542638e-05, "loss": 0.3088, "step": 1870 }, { "epoch": 0.85, "eval_loss": 0.35483449697494507, "eval_runtime": 20.0293, "eval_samples_per_second": 1.398, "eval_steps_per_second": 0.349, "step": 1870 }, { "epoch": 0.85, "grad_norm": 0.8213467001914978, "learning_rate": 1.4683082535339718e-05, "loss": 0.3368, "step": 1871 }, { "epoch": 0.85, "grad_norm": 0.8055750131607056, "learning_rate": 1.4637482900136801e-05, "loss": 0.3236, "step": 1872 }, { "epoch": 0.85, "grad_norm": 0.8239523768424988, "learning_rate": 1.4591883264933881e-05, "loss": 0.336, "step": 1873 }, { "epoch": 0.85, "grad_norm": 0.8376775979995728, "learning_rate": 1.4546283629730965e-05, "loss": 0.3332, "step": 1874 }, { "epoch": 0.85, "grad_norm": 0.8294931650161743, "learning_rate": 1.4500683994528045e-05, "loss": 0.3466, "step": 1875 }, { "epoch": 0.86, "grad_norm": 0.8678231239318848, "learning_rate": 1.4455084359325128e-05, "loss": 0.3482, "step": 1876 }, { "epoch": 0.86, "grad_norm": 0.8058868050575256, "learning_rate": 1.4409484724122208e-05, "loss": 0.3438, "step": 1877 }, { "epoch": 0.86, "grad_norm": 0.8198367357254028, "learning_rate": 1.436388508891929e-05, "loss": 0.3411, "step": 1878 }, { "epoch": 0.86, "grad_norm": 0.8211104273796082, "learning_rate": 1.4318285453716372e-05, "loss": 0.3431, "step": 1879 }, { "epoch": 0.86, "grad_norm": 0.8408799171447754, "learning_rate": 1.4272685818513453e-05, "loss": 0.3262, "step": 1880 }, { "epoch": 0.86, "eval_loss": 0.3547046184539795, "eval_runtime": 19.7685, "eval_samples_per_second": 1.416, "eval_steps_per_second": 0.354, "step": 1880 }, { "epoch": 0.86, "grad_norm": 0.9130158424377441, "learning_rate": 1.4227086183310533e-05, "loss": 0.3624, "step": 1881 }, { "epoch": 0.86, "grad_norm": 0.9254191517829895, "learning_rate": 1.4181486548107617e-05, "loss": 0.3494, "step": 1882 }, { "epoch": 0.86, "grad_norm": 0.809674859046936, "learning_rate": 1.4135886912904697e-05, "loss": 0.3323, "step": 1883 }, { "epoch": 0.86, "grad_norm": 0.8233156204223633, "learning_rate": 1.409028727770178e-05, "loss": 0.3472, "step": 1884 }, { "epoch": 0.86, "grad_norm": 0.8512375950813293, "learning_rate": 1.404468764249886e-05, "loss": 0.3388, "step": 1885 }, { "epoch": 0.86, "grad_norm": 0.8502083420753479, "learning_rate": 1.3999088007295944e-05, "loss": 0.346, "step": 1886 }, { "epoch": 0.86, "grad_norm": 0.8712393641471863, "learning_rate": 1.3953488372093024e-05, "loss": 0.358, "step": 1887 }, { "epoch": 0.86, "grad_norm": 0.7779493927955627, "learning_rate": 1.3907888736890107e-05, "loss": 0.3361, "step": 1888 }, { "epoch": 0.86, "grad_norm": 0.8024623990058899, "learning_rate": 1.3862289101687187e-05, "loss": 0.339, "step": 1889 }, { "epoch": 0.86, "grad_norm": 0.8336456418037415, "learning_rate": 1.381668946648427e-05, "loss": 0.3388, "step": 1890 }, { "epoch": 0.86, "eval_loss": 0.3545686900615692, "eval_runtime": 20.1169, "eval_samples_per_second": 1.392, "eval_steps_per_second": 0.348, "step": 1890 }, { "epoch": 0.86, "grad_norm": 0.8277023434638977, "learning_rate": 1.377108983128135e-05, "loss": 0.3403, "step": 1891 }, { "epoch": 0.86, "grad_norm": 0.8159246444702148, "learning_rate": 1.3725490196078432e-05, "loss": 0.3469, "step": 1892 }, { "epoch": 0.86, "grad_norm": 0.8369227051734924, "learning_rate": 1.3679890560875514e-05, "loss": 0.3568, "step": 1893 }, { "epoch": 0.86, "grad_norm": 0.8269848227500916, "learning_rate": 1.3634290925672596e-05, "loss": 0.3436, "step": 1894 }, { "epoch": 0.86, "grad_norm": 0.9031785726547241, "learning_rate": 1.3588691290469676e-05, "loss": 0.359, "step": 1895 }, { "epoch": 0.86, "grad_norm": 0.8029106259346008, "learning_rate": 1.354309165526676e-05, "loss": 0.3232, "step": 1896 }, { "epoch": 0.86, "grad_norm": 0.8643222451210022, "learning_rate": 1.349749202006384e-05, "loss": 0.3411, "step": 1897 }, { "epoch": 0.87, "grad_norm": 0.8164620399475098, "learning_rate": 1.3451892384860923e-05, "loss": 0.3333, "step": 1898 }, { "epoch": 0.87, "grad_norm": 0.8040909171104431, "learning_rate": 1.3406292749658003e-05, "loss": 0.3334, "step": 1899 }, { "epoch": 0.87, "grad_norm": 0.8196666240692139, "learning_rate": 1.3360693114455086e-05, "loss": 0.3512, "step": 1900 }, { "epoch": 0.87, "eval_loss": 0.35445669293403625, "eval_runtime": 20.2351, "eval_samples_per_second": 1.384, "eval_steps_per_second": 0.346, "step": 1900 }, { "epoch": 0.87, "grad_norm": 0.8267871141433716, "learning_rate": 1.3315093479252166e-05, "loss": 0.3121, "step": 1901 }, { "epoch": 0.87, "grad_norm": 0.8019816875457764, "learning_rate": 1.326949384404925e-05, "loss": 0.3437, "step": 1902 }, { "epoch": 0.87, "grad_norm": 0.8552934527397156, "learning_rate": 1.322389420884633e-05, "loss": 0.3614, "step": 1903 }, { "epoch": 0.87, "grad_norm": 0.8025230169296265, "learning_rate": 1.3178294573643413e-05, "loss": 0.3104, "step": 1904 }, { "epoch": 0.87, "grad_norm": 0.8386091589927673, "learning_rate": 1.3132694938440493e-05, "loss": 0.3371, "step": 1905 }, { "epoch": 0.87, "grad_norm": 0.8556039929389954, "learning_rate": 1.3087095303237577e-05, "loss": 0.331, "step": 1906 }, { "epoch": 0.87, "grad_norm": 0.8279280066490173, "learning_rate": 1.3041495668034657e-05, "loss": 0.3388, "step": 1907 }, { "epoch": 0.87, "grad_norm": 0.8014733791351318, "learning_rate": 1.2995896032831738e-05, "loss": 0.3343, "step": 1908 }, { "epoch": 0.87, "grad_norm": 0.9208333492279053, "learning_rate": 1.295029639762882e-05, "loss": 0.3638, "step": 1909 }, { "epoch": 0.87, "grad_norm": 0.818374514579773, "learning_rate": 1.2904696762425902e-05, "loss": 0.3266, "step": 1910 }, { "epoch": 0.87, "eval_loss": 0.3544815480709076, "eval_runtime": 20.5453, "eval_samples_per_second": 1.363, "eval_steps_per_second": 0.341, "step": 1910 }, { "epoch": 0.87, "grad_norm": 0.8163886666297913, "learning_rate": 1.2859097127222982e-05, "loss": 0.3529, "step": 1911 }, { "epoch": 0.87, "grad_norm": 0.8497297167778015, "learning_rate": 1.2813497492020065e-05, "loss": 0.3364, "step": 1912 }, { "epoch": 0.87, "grad_norm": 0.814104437828064, "learning_rate": 1.2767897856817145e-05, "loss": 0.3446, "step": 1913 }, { "epoch": 0.87, "grad_norm": 0.856905996799469, "learning_rate": 1.2722298221614229e-05, "loss": 0.3456, "step": 1914 }, { "epoch": 0.87, "grad_norm": 0.816177248954773, "learning_rate": 1.2676698586411309e-05, "loss": 0.3304, "step": 1915 }, { "epoch": 0.87, "grad_norm": 0.7743247151374817, "learning_rate": 1.2631098951208392e-05, "loss": 0.3247, "step": 1916 }, { "epoch": 0.87, "grad_norm": 0.8495445251464844, "learning_rate": 1.2585499316005472e-05, "loss": 0.332, "step": 1917 }, { "epoch": 0.87, "grad_norm": 0.8720409274101257, "learning_rate": 1.2539899680802556e-05, "loss": 0.3723, "step": 1918 }, { "epoch": 0.87, "grad_norm": 0.8387491106987, "learning_rate": 1.2494300045599636e-05, "loss": 0.3447, "step": 1919 }, { "epoch": 0.88, "grad_norm": 0.8160862922668457, "learning_rate": 1.2448700410396718e-05, "loss": 0.3588, "step": 1920 }, { "epoch": 0.88, "eval_loss": 0.35426434874534607, "eval_runtime": 20.709, "eval_samples_per_second": 1.352, "eval_steps_per_second": 0.338, "step": 1920 }, { "epoch": 0.88, "grad_norm": 0.8311136364936829, "learning_rate": 1.24031007751938e-05, "loss": 0.3394, "step": 1921 }, { "epoch": 0.88, "grad_norm": 0.8703175187110901, "learning_rate": 1.2357501139990881e-05, "loss": 0.3543, "step": 1922 }, { "epoch": 0.88, "grad_norm": 0.812587320804596, "learning_rate": 1.2311901504787963e-05, "loss": 0.34, "step": 1923 }, { "epoch": 0.88, "grad_norm": 0.8485066294670105, "learning_rate": 1.2266301869585044e-05, "loss": 0.3396, "step": 1924 }, { "epoch": 0.88, "grad_norm": 0.8266170620918274, "learning_rate": 1.2220702234382125e-05, "loss": 0.3429, "step": 1925 }, { "epoch": 0.88, "grad_norm": 0.8497158288955688, "learning_rate": 1.2175102599179206e-05, "loss": 0.3378, "step": 1926 }, { "epoch": 0.88, "grad_norm": 0.8195666074752808, "learning_rate": 1.2129502963976288e-05, "loss": 0.3425, "step": 1927 }, { "epoch": 0.88, "grad_norm": 0.8512323498725891, "learning_rate": 1.208390332877337e-05, "loss": 0.3348, "step": 1928 }, { "epoch": 0.88, "grad_norm": 0.8279510140419006, "learning_rate": 1.2038303693570451e-05, "loss": 0.3457, "step": 1929 }, { "epoch": 0.88, "grad_norm": 0.8365693092346191, "learning_rate": 1.1992704058367533e-05, "loss": 0.3457, "step": 1930 }, { "epoch": 0.88, "eval_loss": 0.3541213572025299, "eval_runtime": 20.5411, "eval_samples_per_second": 1.363, "eval_steps_per_second": 0.341, "step": 1930 }, { "epoch": 0.88, "grad_norm": 0.8179658055305481, "learning_rate": 1.1947104423164615e-05, "loss": 0.3281, "step": 1931 }, { "epoch": 0.88, "grad_norm": 0.8157138228416443, "learning_rate": 1.1901504787961697e-05, "loss": 0.3489, "step": 1932 }, { "epoch": 0.88, "grad_norm": 0.8499870300292969, "learning_rate": 1.1855905152758778e-05, "loss": 0.3206, "step": 1933 }, { "epoch": 0.88, "grad_norm": 0.8814250230789185, "learning_rate": 1.181030551755586e-05, "loss": 0.352, "step": 1934 }, { "epoch": 0.88, "grad_norm": 0.8282210230827332, "learning_rate": 1.1764705882352942e-05, "loss": 0.3508, "step": 1935 }, { "epoch": 0.88, "grad_norm": 0.8317981958389282, "learning_rate": 1.1719106247150024e-05, "loss": 0.3455, "step": 1936 }, { "epoch": 0.88, "grad_norm": 0.8515220880508423, "learning_rate": 1.1673506611947105e-05, "loss": 0.3395, "step": 1937 }, { "epoch": 0.88, "grad_norm": 0.8439879417419434, "learning_rate": 1.1627906976744187e-05, "loss": 0.3325, "step": 1938 }, { "epoch": 0.88, "grad_norm": 0.8174811601638794, "learning_rate": 1.1582307341541269e-05, "loss": 0.3359, "step": 1939 }, { "epoch": 0.88, "grad_norm": 0.8104060888290405, "learning_rate": 1.153670770633835e-05, "loss": 0.3319, "step": 1940 }, { "epoch": 0.88, "eval_loss": 0.35404470562934875, "eval_runtime": 19.9655, "eval_samples_per_second": 1.402, "eval_steps_per_second": 0.351, "step": 1940 }, { "epoch": 0.88, "grad_norm": 0.7619262933731079, "learning_rate": 1.149110807113543e-05, "loss": 0.3273, "step": 1941 }, { "epoch": 0.89, "grad_norm": 0.7890914082527161, "learning_rate": 1.1445508435932512e-05, "loss": 0.327, "step": 1942 }, { "epoch": 0.89, "grad_norm": 0.8490578532218933, "learning_rate": 1.1399908800729594e-05, "loss": 0.3246, "step": 1943 }, { "epoch": 0.89, "grad_norm": 0.8122377991676331, "learning_rate": 1.1354309165526676e-05, "loss": 0.345, "step": 1944 }, { "epoch": 0.89, "grad_norm": 0.8249675631523132, "learning_rate": 1.1308709530323757e-05, "loss": 0.3319, "step": 1945 }, { "epoch": 0.89, "grad_norm": 0.8207200169563293, "learning_rate": 1.126310989512084e-05, "loss": 0.3414, "step": 1946 }, { "epoch": 0.89, "grad_norm": 0.8593723773956299, "learning_rate": 1.1217510259917921e-05, "loss": 0.3495, "step": 1947 }, { "epoch": 0.89, "grad_norm": 0.799826979637146, "learning_rate": 1.1171910624715003e-05, "loss": 0.3163, "step": 1948 }, { "epoch": 0.89, "grad_norm": 0.8258078694343567, "learning_rate": 1.1126310989512084e-05, "loss": 0.3364, "step": 1949 }, { "epoch": 0.89, "grad_norm": 0.8616490364074707, "learning_rate": 1.1080711354309166e-05, "loss": 0.345, "step": 1950 }, { "epoch": 0.89, "eval_loss": 0.35401859879493713, "eval_runtime": 20.6827, "eval_samples_per_second": 1.354, "eval_steps_per_second": 0.338, "step": 1950 }, { "epoch": 0.89, "grad_norm": 0.8457894325256348, "learning_rate": 1.1035111719106248e-05, "loss": 0.3563, "step": 1951 }, { "epoch": 0.89, "grad_norm": 0.8008802533149719, "learning_rate": 1.098951208390333e-05, "loss": 0.3286, "step": 1952 }, { "epoch": 0.89, "grad_norm": 0.8404101729393005, "learning_rate": 1.0943912448700411e-05, "loss": 0.3471, "step": 1953 }, { "epoch": 0.89, "grad_norm": 0.8518589735031128, "learning_rate": 1.0898312813497493e-05, "loss": 0.3488, "step": 1954 }, { "epoch": 0.89, "grad_norm": 0.8108507990837097, "learning_rate": 1.0852713178294575e-05, "loss": 0.3382, "step": 1955 }, { "epoch": 0.89, "grad_norm": 0.771907389163971, "learning_rate": 1.0807113543091655e-05, "loss": 0.3183, "step": 1956 }, { "epoch": 0.89, "grad_norm": 0.7840840816497803, "learning_rate": 1.0761513907888737e-05, "loss": 0.317, "step": 1957 }, { "epoch": 0.89, "grad_norm": 0.796921968460083, "learning_rate": 1.0715914272685818e-05, "loss": 0.3257, "step": 1958 }, { "epoch": 0.89, "grad_norm": 0.8107333183288574, "learning_rate": 1.06703146374829e-05, "loss": 0.3522, "step": 1959 }, { "epoch": 0.89, "grad_norm": 0.8181228041648865, "learning_rate": 1.0624715002279982e-05, "loss": 0.3404, "step": 1960 }, { "epoch": 0.89, "eval_loss": 0.3538888990879059, "eval_runtime": 20.3575, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.344, "step": 1960 }, { "epoch": 0.89, "grad_norm": 0.8123586773872375, "learning_rate": 1.0579115367077063e-05, "loss": 0.3273, "step": 1961 }, { "epoch": 0.89, "grad_norm": 0.8805697560310364, "learning_rate": 1.0533515731874145e-05, "loss": 0.3428, "step": 1962 }, { "epoch": 0.89, "grad_norm": 0.779415488243103, "learning_rate": 1.0487916096671227e-05, "loss": 0.3337, "step": 1963 }, { "epoch": 0.9, "grad_norm": 0.8597623705863953, "learning_rate": 1.0442316461468309e-05, "loss": 0.3286, "step": 1964 }, { "epoch": 0.9, "grad_norm": 0.7904038429260254, "learning_rate": 1.039671682626539e-05, "loss": 0.3436, "step": 1965 }, { "epoch": 0.9, "grad_norm": 0.8617693185806274, "learning_rate": 1.0351117191062472e-05, "loss": 0.3548, "step": 1966 }, { "epoch": 0.9, "grad_norm": 0.8419015407562256, "learning_rate": 1.0305517555859554e-05, "loss": 0.3329, "step": 1967 }, { "epoch": 0.9, "grad_norm": 0.8732928037643433, "learning_rate": 1.0259917920656636e-05, "loss": 0.3533, "step": 1968 }, { "epoch": 0.9, "grad_norm": 0.8149675726890564, "learning_rate": 1.0214318285453717e-05, "loss": 0.3471, "step": 1969 }, { "epoch": 0.9, "grad_norm": 0.8281083703041077, "learning_rate": 1.0168718650250799e-05, "loss": 0.3555, "step": 1970 }, { "epoch": 0.9, "eval_loss": 0.35382261872291565, "eval_runtime": 20.9018, "eval_samples_per_second": 1.34, "eval_steps_per_second": 0.335, "step": 1970 }, { "epoch": 0.9, "grad_norm": 0.8474620580673218, "learning_rate": 1.0123119015047879e-05, "loss": 0.3436, "step": 1971 }, { "epoch": 0.9, "grad_norm": 0.7994499206542969, "learning_rate": 1.0077519379844961e-05, "loss": 0.3294, "step": 1972 }, { "epoch": 0.9, "grad_norm": 0.8625378012657166, "learning_rate": 1.0031919744642043e-05, "loss": 0.326, "step": 1973 }, { "epoch": 0.9, "grad_norm": 0.9084400534629822, "learning_rate": 9.986320109439124e-06, "loss": 0.3488, "step": 1974 }, { "epoch": 0.9, "grad_norm": 0.8352153301239014, "learning_rate": 9.940720474236206e-06, "loss": 0.3397, "step": 1975 }, { "epoch": 0.9, "grad_norm": 0.8399555087089539, "learning_rate": 9.895120839033288e-06, "loss": 0.3468, "step": 1976 }, { "epoch": 0.9, "grad_norm": 0.8331840634346008, "learning_rate": 9.84952120383037e-06, "loss": 0.3369, "step": 1977 }, { "epoch": 0.9, "grad_norm": 0.9115501046180725, "learning_rate": 9.803921568627451e-06, "loss": 0.365, "step": 1978 }, { "epoch": 0.9, "grad_norm": 0.8095980882644653, "learning_rate": 9.758321933424533e-06, "loss": 0.3397, "step": 1979 }, { "epoch": 0.9, "grad_norm": 0.7896193861961365, "learning_rate": 9.712722298221615e-06, "loss": 0.3353, "step": 1980 }, { "epoch": 0.9, "eval_loss": 0.35391122102737427, "eval_runtime": 20.1611, "eval_samples_per_second": 1.389, "eval_steps_per_second": 0.347, "step": 1980 }, { "epoch": 0.9, "grad_norm": 0.7885720133781433, "learning_rate": 9.667122663018696e-06, "loss": 0.3188, "step": 1981 }, { "epoch": 0.9, "grad_norm": 0.7856804728507996, "learning_rate": 9.621523027815778e-06, "loss": 0.3222, "step": 1982 }, { "epoch": 0.9, "grad_norm": 0.868549644947052, "learning_rate": 9.57592339261286e-06, "loss": 0.3639, "step": 1983 }, { "epoch": 0.9, "grad_norm": 0.882917046546936, "learning_rate": 9.530323757409942e-06, "loss": 0.3644, "step": 1984 }, { "epoch": 0.9, "grad_norm": 0.8365908265113831, "learning_rate": 9.484724122207023e-06, "loss": 0.3341, "step": 1985 }, { "epoch": 0.91, "grad_norm": 0.8351644277572632, "learning_rate": 9.439124487004105e-06, "loss": 0.3459, "step": 1986 }, { "epoch": 0.91, "grad_norm": 0.85927414894104, "learning_rate": 9.393524851801185e-06, "loss": 0.36, "step": 1987 }, { "epoch": 0.91, "grad_norm": 0.7935860753059387, "learning_rate": 9.347925216598267e-06, "loss": 0.319, "step": 1988 }, { "epoch": 0.91, "grad_norm": 0.855145275592804, "learning_rate": 9.302325581395349e-06, "loss": 0.3292, "step": 1989 }, { "epoch": 0.91, "grad_norm": 0.8166558146476746, "learning_rate": 9.25672594619243e-06, "loss": 0.3407, "step": 1990 }, { "epoch": 0.91, "eval_loss": 0.3539559543132782, "eval_runtime": 20.7137, "eval_samples_per_second": 1.352, "eval_steps_per_second": 0.338, "step": 1990 }, { "epoch": 0.91, "grad_norm": 0.794497013092041, "learning_rate": 9.211126310989512e-06, "loss": 0.3297, "step": 1991 }, { "epoch": 0.91, "grad_norm": 0.9098047614097595, "learning_rate": 9.165526675786594e-06, "loss": 0.3416, "step": 1992 }, { "epoch": 0.91, "grad_norm": 0.7829924821853638, "learning_rate": 9.119927040583676e-06, "loss": 0.3399, "step": 1993 }, { "epoch": 0.91, "grad_norm": 0.8300178050994873, "learning_rate": 9.074327405380757e-06, "loss": 0.339, "step": 1994 }, { "epoch": 0.91, "grad_norm": 0.8618704676628113, "learning_rate": 9.028727770177839e-06, "loss": 0.3351, "step": 1995 }, { "epoch": 0.91, "grad_norm": 0.887930154800415, "learning_rate": 8.98312813497492e-06, "loss": 0.3402, "step": 1996 }, { "epoch": 0.91, "grad_norm": 0.809892475605011, "learning_rate": 8.937528499772002e-06, "loss": 0.3438, "step": 1997 }, { "epoch": 0.91, "grad_norm": 0.7907106280326843, "learning_rate": 8.891928864569084e-06, "loss": 0.3333, "step": 1998 }, { "epoch": 0.91, "grad_norm": 0.7651233077049255, "learning_rate": 8.846329229366166e-06, "loss": 0.3213, "step": 1999 }, { "epoch": 0.91, "grad_norm": 0.8054981231689453, "learning_rate": 8.800729594163248e-06, "loss": 0.328, "step": 2000 }, { "epoch": 0.91, "eval_loss": 0.35396915674209595, "eval_runtime": 19.9787, "eval_samples_per_second": 1.401, "eval_steps_per_second": 0.35, "step": 2000 }, { "epoch": 0.91, "grad_norm": 0.8074979782104492, "learning_rate": 8.75512995896033e-06, "loss": 0.336, "step": 2001 }, { "epoch": 0.91, "grad_norm": 0.8073883056640625, "learning_rate": 8.70953032375741e-06, "loss": 0.3306, "step": 2002 }, { "epoch": 0.91, "grad_norm": 0.8059557676315308, "learning_rate": 8.663930688554491e-06, "loss": 0.3364, "step": 2003 }, { "epoch": 0.91, "grad_norm": 0.8500943779945374, "learning_rate": 8.618331053351573e-06, "loss": 0.3388, "step": 2004 }, { "epoch": 0.91, "grad_norm": 0.8143544793128967, "learning_rate": 8.572731418148655e-06, "loss": 0.3297, "step": 2005 }, { "epoch": 0.91, "grad_norm": 0.8129506707191467, "learning_rate": 8.527131782945736e-06, "loss": 0.3421, "step": 2006 }, { "epoch": 0.91, "grad_norm": 0.8499677181243896, "learning_rate": 8.481532147742818e-06, "loss": 0.3311, "step": 2007 }, { "epoch": 0.92, "grad_norm": 0.7764211297035217, "learning_rate": 8.4359325125399e-06, "loss": 0.3198, "step": 2008 }, { "epoch": 0.92, "grad_norm": 0.8650667667388916, "learning_rate": 8.390332877336982e-06, "loss": 0.3479, "step": 2009 }, { "epoch": 0.92, "grad_norm": 0.8370277285575867, "learning_rate": 8.344733242134063e-06, "loss": 0.3457, "step": 2010 }, { "epoch": 0.92, "eval_loss": 0.3540601432323456, "eval_runtime": 20.9003, "eval_samples_per_second": 1.34, "eval_steps_per_second": 0.335, "step": 2010 }, { "epoch": 0.92, "grad_norm": 0.8421793580055237, "learning_rate": 8.299133606931145e-06, "loss": 0.3529, "step": 2011 }, { "epoch": 0.92, "grad_norm": 0.8460060358047485, "learning_rate": 8.253533971728227e-06, "loss": 0.3422, "step": 2012 }, { "epoch": 0.92, "grad_norm": 0.8409944176673889, "learning_rate": 8.207934336525308e-06, "loss": 0.334, "step": 2013 }, { "epoch": 0.92, "grad_norm": 0.8694727420806885, "learning_rate": 8.16233470132239e-06, "loss": 0.3621, "step": 2014 }, { "epoch": 0.92, "grad_norm": 0.813942015171051, "learning_rate": 8.116735066119472e-06, "loss": 0.3467, "step": 2015 }, { "epoch": 0.92, "grad_norm": 0.8313078880310059, "learning_rate": 8.071135430916554e-06, "loss": 0.3257, "step": 2016 }, { "epoch": 0.92, "grad_norm": 0.8491564989089966, "learning_rate": 8.025535795713634e-06, "loss": 0.3594, "step": 2017 }, { "epoch": 0.92, "grad_norm": 0.9133617877960205, "learning_rate": 7.979936160510715e-06, "loss": 0.3492, "step": 2018 }, { "epoch": 0.92, "grad_norm": 0.8548010587692261, "learning_rate": 7.934336525307797e-06, "loss": 0.3297, "step": 2019 }, { "epoch": 0.92, "grad_norm": 0.8131832480430603, "learning_rate": 7.888736890104879e-06, "loss": 0.3388, "step": 2020 }, { "epoch": 0.92, "eval_loss": 0.3540656864643097, "eval_runtime": 20.3087, "eval_samples_per_second": 1.379, "eval_steps_per_second": 0.345, "step": 2020 }, { "epoch": 0.92, "grad_norm": 0.8627143502235413, "learning_rate": 7.84313725490196e-06, "loss": 0.3451, "step": 2021 }, { "epoch": 0.92, "grad_norm": 0.8710396885871887, "learning_rate": 7.797537619699042e-06, "loss": 0.3491, "step": 2022 }, { "epoch": 0.92, "grad_norm": 0.8423619866371155, "learning_rate": 7.751937984496124e-06, "loss": 0.3351, "step": 2023 }, { "epoch": 0.92, "grad_norm": 0.7903690934181213, "learning_rate": 7.706338349293206e-06, "loss": 0.3269, "step": 2024 }, { "epoch": 0.92, "grad_norm": 0.8393657803535461, "learning_rate": 7.660738714090288e-06, "loss": 0.3591, "step": 2025 }, { "epoch": 0.92, "grad_norm": 0.818337619304657, "learning_rate": 7.615139078887369e-06, "loss": 0.3277, "step": 2026 }, { "epoch": 0.92, "grad_norm": 0.8536666035652161, "learning_rate": 7.569539443684451e-06, "loss": 0.3385, "step": 2027 }, { "epoch": 0.92, "grad_norm": 0.7855940461158752, "learning_rate": 7.523939808481533e-06, "loss": 0.3235, "step": 2028 }, { "epoch": 0.92, "grad_norm": 0.8503609895706177, "learning_rate": 7.478340173278614e-06, "loss": 0.3269, "step": 2029 }, { "epoch": 0.93, "grad_norm": 0.8175629377365112, "learning_rate": 7.432740538075695e-06, "loss": 0.3251, "step": 2030 }, { "epoch": 0.93, "eval_loss": 0.35404643416404724, "eval_runtime": 19.8105, "eval_samples_per_second": 1.413, "eval_steps_per_second": 0.353, "step": 2030 }, { "epoch": 0.93, "grad_norm": 0.8303265571594238, "learning_rate": 7.387140902872777e-06, "loss": 0.3384, "step": 2031 }, { "epoch": 0.93, "grad_norm": 0.8031489849090576, "learning_rate": 7.341541267669859e-06, "loss": 0.3457, "step": 2032 }, { "epoch": 0.93, "grad_norm": 0.8192217350006104, "learning_rate": 7.295941632466941e-06, "loss": 0.3284, "step": 2033 }, { "epoch": 0.93, "grad_norm": 0.8637712597846985, "learning_rate": 7.250341997264022e-06, "loss": 0.3571, "step": 2034 }, { "epoch": 0.93, "grad_norm": 0.8116298317909241, "learning_rate": 7.204742362061104e-06, "loss": 0.3367, "step": 2035 }, { "epoch": 0.93, "grad_norm": 0.8289496898651123, "learning_rate": 7.159142726858186e-06, "loss": 0.3421, "step": 2036 }, { "epoch": 0.93, "grad_norm": 0.7838993072509766, "learning_rate": 7.113543091655267e-06, "loss": 0.3025, "step": 2037 }, { "epoch": 0.93, "grad_norm": 0.7901455163955688, "learning_rate": 7.067943456452348e-06, "loss": 0.3371, "step": 2038 }, { "epoch": 0.93, "grad_norm": 0.8628932237625122, "learning_rate": 7.02234382124943e-06, "loss": 0.3383, "step": 2039 }, { "epoch": 0.93, "grad_norm": 0.8549111485481262, "learning_rate": 6.976744186046512e-06, "loss": 0.357, "step": 2040 }, { "epoch": 0.93, "eval_loss": 0.35389232635498047, "eval_runtime": 19.6965, "eval_samples_per_second": 1.422, "eval_steps_per_second": 0.355, "step": 2040 }, { "epoch": 0.93, "grad_norm": 0.8260614275932312, "learning_rate": 6.931144550843594e-06, "loss": 0.3206, "step": 2041 }, { "epoch": 0.93, "grad_norm": 0.841353714466095, "learning_rate": 6.885544915640675e-06, "loss": 0.3291, "step": 2042 }, { "epoch": 0.93, "grad_norm": 0.8368021845817566, "learning_rate": 6.839945280437757e-06, "loss": 0.3492, "step": 2043 }, { "epoch": 0.93, "grad_norm": 0.8355389833450317, "learning_rate": 6.794345645234838e-06, "loss": 0.3458, "step": 2044 }, { "epoch": 0.93, "grad_norm": 0.8132802248001099, "learning_rate": 6.74874601003192e-06, "loss": 0.3162, "step": 2045 }, { "epoch": 0.93, "grad_norm": 0.8105396032333374, "learning_rate": 6.7031463748290014e-06, "loss": 0.3063, "step": 2046 }, { "epoch": 0.93, "grad_norm": 0.780116617679596, "learning_rate": 6.657546739626083e-06, "loss": 0.3285, "step": 2047 }, { "epoch": 0.93, "grad_norm": 0.8291633725166321, "learning_rate": 6.611947104423165e-06, "loss": 0.3335, "step": 2048 }, { "epoch": 0.93, "grad_norm": 0.8367629051208496, "learning_rate": 6.566347469220247e-06, "loss": 0.3419, "step": 2049 }, { "epoch": 0.93, "grad_norm": 0.8254304528236389, "learning_rate": 6.520747834017328e-06, "loss": 0.3436, "step": 2050 }, { "epoch": 0.93, "eval_loss": 0.3537757098674774, "eval_runtime": 19.7608, "eval_samples_per_second": 1.417, "eval_steps_per_second": 0.354, "step": 2050 }, { "epoch": 0.93, "grad_norm": 0.7851077318191528, "learning_rate": 6.47514819881441e-06, "loss": 0.3306, "step": 2051 }, { "epoch": 0.94, "grad_norm": 0.8641871809959412, "learning_rate": 6.429548563611491e-06, "loss": 0.34, "step": 2052 }, { "epoch": 0.94, "grad_norm": 0.875881552696228, "learning_rate": 6.383948928408573e-06, "loss": 0.3491, "step": 2053 }, { "epoch": 0.94, "grad_norm": 0.8497762084007263, "learning_rate": 6.3383492932056544e-06, "loss": 0.3513, "step": 2054 }, { "epoch": 0.94, "grad_norm": 0.8041360974311829, "learning_rate": 6.292749658002736e-06, "loss": 0.3361, "step": 2055 }, { "epoch": 0.94, "grad_norm": 0.8663139343261719, "learning_rate": 6.247150022799818e-06, "loss": 0.3366, "step": 2056 }, { "epoch": 0.94, "grad_norm": 0.8280770182609558, "learning_rate": 6.2015503875969e-06, "loss": 0.3526, "step": 2057 }, { "epoch": 0.94, "grad_norm": 0.842253565788269, "learning_rate": 6.155950752393981e-06, "loss": 0.3274, "step": 2058 }, { "epoch": 0.94, "grad_norm": 0.8146688938140869, "learning_rate": 6.110351117191062e-06, "loss": 0.3423, "step": 2059 }, { "epoch": 0.94, "grad_norm": 0.8018572330474854, "learning_rate": 6.064751481988144e-06, "loss": 0.3169, "step": 2060 }, { "epoch": 0.94, "eval_loss": 0.35372114181518555, "eval_runtime": 19.7858, "eval_samples_per_second": 1.415, "eval_steps_per_second": 0.354, "step": 2060 }, { "epoch": 0.94, "grad_norm": 0.8574947118759155, "learning_rate": 6.019151846785226e-06, "loss": 0.3683, "step": 2061 }, { "epoch": 0.94, "grad_norm": 0.8088916540145874, "learning_rate": 5.9735522115823075e-06, "loss": 0.3295, "step": 2062 }, { "epoch": 0.94, "grad_norm": 0.7937730550765991, "learning_rate": 5.927952576379389e-06, "loss": 0.3345, "step": 2063 }, { "epoch": 0.94, "grad_norm": 0.8222320675849915, "learning_rate": 5.882352941176471e-06, "loss": 0.3271, "step": 2064 }, { "epoch": 0.94, "grad_norm": 0.8784458041191101, "learning_rate": 5.836753305973553e-06, "loss": 0.3435, "step": 2065 }, { "epoch": 0.94, "grad_norm": 0.857282817363739, "learning_rate": 5.791153670770634e-06, "loss": 0.3362, "step": 2066 }, { "epoch": 0.94, "grad_norm": 0.7424783110618591, "learning_rate": 5.745554035567715e-06, "loss": 0.3035, "step": 2067 }, { "epoch": 0.94, "grad_norm": 0.8212710618972778, "learning_rate": 5.699954400364797e-06, "loss": 0.3311, "step": 2068 }, { "epoch": 0.94, "grad_norm": 0.8231950998306274, "learning_rate": 5.654354765161879e-06, "loss": 0.3436, "step": 2069 }, { "epoch": 0.94, "grad_norm": 0.8013624548912048, "learning_rate": 5.6087551299589605e-06, "loss": 0.3244, "step": 2070 }, { "epoch": 0.94, "eval_loss": 0.3536633551120758, "eval_runtime": 19.7898, "eval_samples_per_second": 1.415, "eval_steps_per_second": 0.354, "step": 2070 }, { "epoch": 0.94, "grad_norm": 0.8232308030128479, "learning_rate": 5.563155494756042e-06, "loss": 0.3322, "step": 2071 }, { "epoch": 0.94, "grad_norm": 0.8284724354743958, "learning_rate": 5.517555859553124e-06, "loss": 0.3474, "step": 2072 }, { "epoch": 0.94, "grad_norm": 0.7981323599815369, "learning_rate": 5.471956224350206e-06, "loss": 0.3314, "step": 2073 }, { "epoch": 0.95, "grad_norm": 0.7892775535583496, "learning_rate": 5.426356589147287e-06, "loss": 0.3456, "step": 2074 }, { "epoch": 0.95, "grad_norm": 0.8134260177612305, "learning_rate": 5.380756953944368e-06, "loss": 0.3439, "step": 2075 }, { "epoch": 0.95, "grad_norm": 0.8565177917480469, "learning_rate": 5.33515731874145e-06, "loss": 0.3372, "step": 2076 }, { "epoch": 0.95, "grad_norm": 0.8024502396583557, "learning_rate": 5.289557683538532e-06, "loss": 0.3226, "step": 2077 }, { "epoch": 0.95, "grad_norm": 0.8735952973365784, "learning_rate": 5.2439580483356135e-06, "loss": 0.3674, "step": 2078 }, { "epoch": 0.95, "grad_norm": 0.8029111623764038, "learning_rate": 5.198358413132695e-06, "loss": 0.3293, "step": 2079 }, { "epoch": 0.95, "grad_norm": 0.7668672800064087, "learning_rate": 5.152758777929777e-06, "loss": 0.3062, "step": 2080 }, { "epoch": 0.95, "eval_loss": 0.35355642437934875, "eval_runtime": 19.8253, "eval_samples_per_second": 1.412, "eval_steps_per_second": 0.353, "step": 2080 }, { "epoch": 0.95, "grad_norm": 0.8596630692481995, "learning_rate": 5.107159142726859e-06, "loss": 0.3573, "step": 2081 }, { "epoch": 0.95, "grad_norm": 0.8058382868766785, "learning_rate": 5.0615595075239396e-06, "loss": 0.3208, "step": 2082 }, { "epoch": 0.95, "grad_norm": 0.8100380301475525, "learning_rate": 5.015959872321021e-06, "loss": 0.3228, "step": 2083 }, { "epoch": 0.95, "grad_norm": 0.8688908815383911, "learning_rate": 4.970360237118103e-06, "loss": 0.3434, "step": 2084 }, { "epoch": 0.95, "grad_norm": 0.8467679619789124, "learning_rate": 4.924760601915185e-06, "loss": 0.3249, "step": 2085 }, { "epoch": 0.95, "grad_norm": 0.8613777160644531, "learning_rate": 4.8791609667122665e-06, "loss": 0.341, "step": 2086 }, { "epoch": 0.95, "grad_norm": 0.8214209079742432, "learning_rate": 4.833561331509348e-06, "loss": 0.3267, "step": 2087 }, { "epoch": 0.95, "grad_norm": 0.8228405714035034, "learning_rate": 4.78796169630643e-06, "loss": 0.3432, "step": 2088 }, { "epoch": 0.95, "grad_norm": 0.8442954421043396, "learning_rate": 4.742362061103512e-06, "loss": 0.3347, "step": 2089 }, { "epoch": 0.95, "grad_norm": 0.8544167876243591, "learning_rate": 4.6967624259005926e-06, "loss": 0.344, "step": 2090 }, { "epoch": 0.95, "eval_loss": 0.35357967019081116, "eval_runtime": 19.8199, "eval_samples_per_second": 1.413, "eval_steps_per_second": 0.353, "step": 2090 }, { "epoch": 0.95, "grad_norm": 0.8619562983512878, "learning_rate": 4.651162790697674e-06, "loss": 0.3449, "step": 2091 }, { "epoch": 0.95, "grad_norm": 0.8240465521812439, "learning_rate": 4.605563155494756e-06, "loss": 0.3442, "step": 2092 }, { "epoch": 0.95, "grad_norm": 0.875579297542572, "learning_rate": 4.559963520291838e-06, "loss": 0.3475, "step": 2093 }, { "epoch": 0.95, "grad_norm": 0.8105667233467102, "learning_rate": 4.5143638850889195e-06, "loss": 0.3321, "step": 2094 }, { "epoch": 0.95, "grad_norm": 0.8081218004226685, "learning_rate": 4.468764249886001e-06, "loss": 0.333, "step": 2095 }, { "epoch": 0.96, "grad_norm": 0.8312126398086548, "learning_rate": 4.423164614683083e-06, "loss": 0.3474, "step": 2096 }, { "epoch": 0.96, "grad_norm": 0.8706237077713013, "learning_rate": 4.377564979480165e-06, "loss": 0.3505, "step": 2097 }, { "epoch": 0.96, "grad_norm": 0.8144000172615051, "learning_rate": 4.331965344277246e-06, "loss": 0.3418, "step": 2098 }, { "epoch": 0.96, "grad_norm": 0.8141364455223083, "learning_rate": 4.286365709074327e-06, "loss": 0.3185, "step": 2099 }, { "epoch": 0.96, "grad_norm": 0.8204436898231506, "learning_rate": 4.240766073871409e-06, "loss": 0.3527, "step": 2100 }, { "epoch": 0.96, "eval_loss": 0.35366660356521606, "eval_runtime": 20.5995, "eval_samples_per_second": 1.359, "eval_steps_per_second": 0.34, "step": 2100 }, { "epoch": 0.96, "grad_norm": 0.8323388695716858, "learning_rate": 4.195166438668491e-06, "loss": 0.3309, "step": 2101 }, { "epoch": 0.96, "grad_norm": 0.8300788998603821, "learning_rate": 4.1495668034655725e-06, "loss": 0.3347, "step": 2102 }, { "epoch": 0.96, "grad_norm": 0.8446533679962158, "learning_rate": 4.103967168262654e-06, "loss": 0.3426, "step": 2103 }, { "epoch": 0.96, "grad_norm": 0.8308296799659729, "learning_rate": 4.058367533059736e-06, "loss": 0.3418, "step": 2104 }, { "epoch": 0.96, "grad_norm": 0.829054057598114, "learning_rate": 4.012767897856817e-06, "loss": 0.3392, "step": 2105 }, { "epoch": 0.96, "grad_norm": 0.874564528465271, "learning_rate": 3.967168262653899e-06, "loss": 0.3456, "step": 2106 }, { "epoch": 0.96, "grad_norm": 0.8792354464530945, "learning_rate": 3.92156862745098e-06, "loss": 0.3558, "step": 2107 }, { "epoch": 0.96, "grad_norm": 0.9232382774353027, "learning_rate": 3.875968992248062e-06, "loss": 0.3672, "step": 2108 }, { "epoch": 0.96, "grad_norm": 0.8330267071723938, "learning_rate": 3.830369357045144e-06, "loss": 0.3293, "step": 2109 }, { "epoch": 0.96, "grad_norm": 0.7642355561256409, "learning_rate": 3.7847697218422255e-06, "loss": 0.3104, "step": 2110 }, { "epoch": 0.96, "eval_loss": 0.35369256138801575, "eval_runtime": 20.5064, "eval_samples_per_second": 1.365, "eval_steps_per_second": 0.341, "step": 2110 }, { "epoch": 0.96, "grad_norm": 0.8061398863792419, "learning_rate": 3.739170086639307e-06, "loss": 0.3413, "step": 2111 }, { "epoch": 0.96, "grad_norm": 0.7958340644836426, "learning_rate": 3.6935704514363886e-06, "loss": 0.3367, "step": 2112 }, { "epoch": 0.96, "grad_norm": 0.8436947464942932, "learning_rate": 3.6479708162334703e-06, "loss": 0.3375, "step": 2113 }, { "epoch": 0.96, "grad_norm": 0.7928789854049683, "learning_rate": 3.602371181030552e-06, "loss": 0.321, "step": 2114 }, { "epoch": 0.96, "grad_norm": 0.8269808292388916, "learning_rate": 3.5567715458276333e-06, "loss": 0.3343, "step": 2115 }, { "epoch": 0.96, "grad_norm": 0.8675149083137512, "learning_rate": 3.511171910624715e-06, "loss": 0.3571, "step": 2116 }, { "epoch": 0.97, "grad_norm": 0.8159371018409729, "learning_rate": 3.465572275421797e-06, "loss": 0.3379, "step": 2117 }, { "epoch": 0.97, "grad_norm": 0.8035234212875366, "learning_rate": 3.4199726402188785e-06, "loss": 0.3491, "step": 2118 }, { "epoch": 0.97, "grad_norm": 0.812272310256958, "learning_rate": 3.37437300501596e-06, "loss": 0.3532, "step": 2119 }, { "epoch": 0.97, "grad_norm": 0.791612982749939, "learning_rate": 3.3287733698130416e-06, "loss": 0.3355, "step": 2120 }, { "epoch": 0.97, "eval_loss": 0.35358789563179016, "eval_runtime": 21.0441, "eval_samples_per_second": 1.331, "eval_steps_per_second": 0.333, "step": 2120 }, { "epoch": 0.97, "grad_norm": 0.8322513699531555, "learning_rate": 3.2831737346101233e-06, "loss": 0.3336, "step": 2121 }, { "epoch": 0.97, "grad_norm": 0.8068985342979431, "learning_rate": 3.237574099407205e-06, "loss": 0.3309, "step": 2122 }, { "epoch": 0.97, "grad_norm": 0.7732910513877869, "learning_rate": 3.1919744642042864e-06, "loss": 0.3142, "step": 2123 }, { "epoch": 0.97, "grad_norm": 0.8337993621826172, "learning_rate": 3.146374829001368e-06, "loss": 0.3453, "step": 2124 }, { "epoch": 0.97, "grad_norm": 0.8158725500106812, "learning_rate": 3.10077519379845e-06, "loss": 0.3417, "step": 2125 }, { "epoch": 0.97, "grad_norm": 0.8460199236869812, "learning_rate": 3.055175558595531e-06, "loss": 0.3438, "step": 2126 }, { "epoch": 0.97, "grad_norm": 0.8237974643707275, "learning_rate": 3.009575923392613e-06, "loss": 0.3434, "step": 2127 }, { "epoch": 0.97, "grad_norm": 0.8849083781242371, "learning_rate": 2.9639762881896946e-06, "loss": 0.3442, "step": 2128 }, { "epoch": 0.97, "grad_norm": 0.8236368298530579, "learning_rate": 2.9183766529867763e-06, "loss": 0.3358, "step": 2129 }, { "epoch": 0.97, "grad_norm": 0.8945069909095764, "learning_rate": 2.8727770177838576e-06, "loss": 0.3566, "step": 2130 }, { "epoch": 0.97, "eval_loss": 0.3535517156124115, "eval_runtime": 21.3121, "eval_samples_per_second": 1.314, "eval_steps_per_second": 0.328, "step": 2130 }, { "epoch": 0.97, "grad_norm": 0.8623038530349731, "learning_rate": 2.8271773825809394e-06, "loss": 0.3519, "step": 2131 }, { "epoch": 0.97, "grad_norm": 0.796574056148529, "learning_rate": 2.781577747378021e-06, "loss": 0.3416, "step": 2132 }, { "epoch": 0.97, "grad_norm": 0.8572683930397034, "learning_rate": 2.735978112175103e-06, "loss": 0.3258, "step": 2133 }, { "epoch": 0.97, "grad_norm": 0.8047083616256714, "learning_rate": 2.690378476972184e-06, "loss": 0.3364, "step": 2134 }, { "epoch": 0.97, "grad_norm": 0.8221876621246338, "learning_rate": 2.644778841769266e-06, "loss": 0.342, "step": 2135 }, { "epoch": 0.97, "grad_norm": 0.8190683722496033, "learning_rate": 2.5991792065663476e-06, "loss": 0.3342, "step": 2136 }, { "epoch": 0.97, "grad_norm": 0.8451207280158997, "learning_rate": 2.5535795713634293e-06, "loss": 0.3588, "step": 2137 }, { "epoch": 0.97, "grad_norm": 0.8056630492210388, "learning_rate": 2.5079799361605106e-06, "loss": 0.3397, "step": 2138 }, { "epoch": 0.98, "grad_norm": 0.8281639218330383, "learning_rate": 2.4623803009575924e-06, "loss": 0.347, "step": 2139 }, { "epoch": 0.98, "grad_norm": 0.8217196464538574, "learning_rate": 2.416780665754674e-06, "loss": 0.3565, "step": 2140 }, { "epoch": 0.98, "eval_loss": 0.3535143733024597, "eval_runtime": 21.8883, "eval_samples_per_second": 1.279, "eval_steps_per_second": 0.32, "step": 2140 }, { "epoch": 0.98, "grad_norm": 0.8088732361793518, "learning_rate": 2.371181030551756e-06, "loss": 0.326, "step": 2141 }, { "epoch": 0.98, "grad_norm": 0.8405909538269043, "learning_rate": 2.325581395348837e-06, "loss": 0.3313, "step": 2142 }, { "epoch": 0.98, "grad_norm": 0.8254099488258362, "learning_rate": 2.279981760145919e-06, "loss": 0.3583, "step": 2143 }, { "epoch": 0.98, "grad_norm": 0.8337267637252808, "learning_rate": 2.2343821249430006e-06, "loss": 0.3323, "step": 2144 }, { "epoch": 0.98, "grad_norm": 0.8419988751411438, "learning_rate": 2.1887824897400824e-06, "loss": 0.3446, "step": 2145 }, { "epoch": 0.98, "grad_norm": 0.846906840801239, "learning_rate": 2.1431828545371637e-06, "loss": 0.3452, "step": 2146 }, { "epoch": 0.98, "grad_norm": 0.8782608509063721, "learning_rate": 2.0975832193342454e-06, "loss": 0.3571, "step": 2147 }, { "epoch": 0.98, "grad_norm": 0.7980951070785522, "learning_rate": 2.051983584131327e-06, "loss": 0.3397, "step": 2148 }, { "epoch": 0.98, "grad_norm": 0.8458917737007141, "learning_rate": 2.0063839489284084e-06, "loss": 0.3391, "step": 2149 }, { "epoch": 0.98, "grad_norm": 0.839817225933075, "learning_rate": 1.96078431372549e-06, "loss": 0.3563, "step": 2150 }, { "epoch": 0.98, "eval_loss": 0.3534523546695709, "eval_runtime": 23.8012, "eval_samples_per_second": 1.176, "eval_steps_per_second": 0.294, "step": 2150 }, { "epoch": 0.98, "grad_norm": 0.8007347583770752, "learning_rate": 1.915184678522572e-06, "loss": 0.3382, "step": 2151 }, { "epoch": 0.98, "grad_norm": 0.8078628778457642, "learning_rate": 1.8695850433196534e-06, "loss": 0.3225, "step": 2152 }, { "epoch": 0.98, "grad_norm": 0.855683445930481, "learning_rate": 1.8239854081167352e-06, "loss": 0.3462, "step": 2153 }, { "epoch": 0.98, "grad_norm": 0.796846866607666, "learning_rate": 1.7783857729138167e-06, "loss": 0.3505, "step": 2154 }, { "epoch": 0.98, "grad_norm": 0.8556407690048218, "learning_rate": 1.7327861377108984e-06, "loss": 0.344, "step": 2155 }, { "epoch": 0.98, "grad_norm": 0.8137206435203552, "learning_rate": 1.68718650250798e-06, "loss": 0.3218, "step": 2156 }, { "epoch": 0.98, "grad_norm": 0.8239472508430481, "learning_rate": 1.6415868673050617e-06, "loss": 0.3395, "step": 2157 }, { "epoch": 0.98, "grad_norm": 0.7975683808326721, "learning_rate": 1.5959872321021432e-06, "loss": 0.3258, "step": 2158 }, { "epoch": 0.98, "grad_norm": 0.8348971605300903, "learning_rate": 1.550387596899225e-06, "loss": 0.3425, "step": 2159 }, { "epoch": 0.98, "grad_norm": 0.892481803894043, "learning_rate": 1.5047879616963064e-06, "loss": 0.3692, "step": 2160 }, { "epoch": 0.98, "eval_loss": 0.3534722328186035, "eval_runtime": 21.8372, "eval_samples_per_second": 1.282, "eval_steps_per_second": 0.321, "step": 2160 }, { "epoch": 0.99, "grad_norm": 0.8270180821418762, "learning_rate": 1.4591883264933882e-06, "loss": 0.3434, "step": 2161 }, { "epoch": 0.99, "grad_norm": 0.8239341974258423, "learning_rate": 1.4135886912904697e-06, "loss": 0.3442, "step": 2162 }, { "epoch": 0.99, "grad_norm": 0.8776184916496277, "learning_rate": 1.3679890560875514e-06, "loss": 0.3642, "step": 2163 }, { "epoch": 0.99, "grad_norm": 0.8055011034011841, "learning_rate": 1.322389420884633e-06, "loss": 0.3332, "step": 2164 }, { "epoch": 0.99, "grad_norm": 0.8007059693336487, "learning_rate": 1.2767897856817147e-06, "loss": 0.3118, "step": 2165 }, { "epoch": 0.99, "grad_norm": 0.8044161796569824, "learning_rate": 1.2311901504787962e-06, "loss": 0.3235, "step": 2166 }, { "epoch": 0.99, "grad_norm": 0.7754327654838562, "learning_rate": 1.185590515275878e-06, "loss": 0.3149, "step": 2167 }, { "epoch": 0.99, "grad_norm": 0.8618872761726379, "learning_rate": 1.1399908800729594e-06, "loss": 0.332, "step": 2168 }, { "epoch": 0.99, "grad_norm": 0.8462168574333191, "learning_rate": 1.0943912448700412e-06, "loss": 0.35, "step": 2169 }, { "epoch": 0.99, "grad_norm": 0.8031334280967712, "learning_rate": 1.0487916096671227e-06, "loss": 0.3182, "step": 2170 }, { "epoch": 0.99, "eval_loss": 0.35348114371299744, "eval_runtime": 21.5618, "eval_samples_per_second": 1.299, "eval_steps_per_second": 0.325, "step": 2170 }, { "epoch": 0.99, "grad_norm": 0.7941566705703735, "learning_rate": 1.0031919744642042e-06, "loss": 0.3134, "step": 2171 }, { "epoch": 0.99, "grad_norm": 0.8218158483505249, "learning_rate": 9.57592339261286e-07, "loss": 0.3459, "step": 2172 }, { "epoch": 0.99, "grad_norm": 0.846577525138855, "learning_rate": 9.119927040583676e-07, "loss": 0.3406, "step": 2173 }, { "epoch": 0.99, "grad_norm": 0.8338779807090759, "learning_rate": 8.663930688554492e-07, "loss": 0.3367, "step": 2174 }, { "epoch": 0.99, "grad_norm": 0.8083705306053162, "learning_rate": 8.207934336525308e-07, "loss": 0.314, "step": 2175 }, { "epoch": 0.99, "grad_norm": 0.8483944535255432, "learning_rate": 7.751937984496125e-07, "loss": 0.3422, "step": 2176 }, { "epoch": 0.99, "grad_norm": 0.8029471039772034, "learning_rate": 7.295941632466941e-07, "loss": 0.3337, "step": 2177 }, { "epoch": 0.99, "grad_norm": 0.8467877507209778, "learning_rate": 6.839945280437757e-07, "loss": 0.3429, "step": 2178 }, { "epoch": 0.99, "grad_norm": 0.7840286493301392, "learning_rate": 6.383948928408573e-07, "loss": 0.3265, "step": 2179 }, { "epoch": 0.99, "grad_norm": 0.7920138835906982, "learning_rate": 5.92795257637939e-07, "loss": 0.3359, "step": 2180 }, { "epoch": 0.99, "eval_loss": 0.3534707725048065, "eval_runtime": 21.7256, "eval_samples_per_second": 1.289, "eval_steps_per_second": 0.322, "step": 2180 }, { "epoch": 0.99, "grad_norm": 0.8483693599700928, "learning_rate": 5.471956224350206e-07, "loss": 0.3586, "step": 2181 }, { "epoch": 0.99, "grad_norm": 0.8043666481971741, "learning_rate": 5.015959872321021e-07, "loss": 0.3351, "step": 2182 }, { "epoch": 1.0, "grad_norm": 0.7986137866973877, "learning_rate": 4.559963520291838e-07, "loss": 0.3371, "step": 2183 }, { "epoch": 1.0, "grad_norm": 0.8146182298660278, "learning_rate": 4.103967168262654e-07, "loss": 0.319, "step": 2184 }, { "epoch": 1.0, "grad_norm": 0.8116611242294312, "learning_rate": 3.6479708162334704e-07, "loss": 0.3309, "step": 2185 }, { "epoch": 1.0, "grad_norm": 0.834054172039032, "learning_rate": 3.1919744642042867e-07, "loss": 0.3504, "step": 2186 }, { "epoch": 1.0, "grad_norm": 0.7927460074424744, "learning_rate": 2.735978112175103e-07, "loss": 0.3502, "step": 2187 }, { "epoch": 1.0, "grad_norm": 0.8293411135673523, "learning_rate": 2.279981760145919e-07, "loss": 0.344, "step": 2188 }, { "epoch": 1.0, "grad_norm": 0.8744671940803528, "learning_rate": 1.8239854081167352e-07, "loss": 0.3274, "step": 2189 }, { "epoch": 1.0, "grad_norm": 0.9020984768867493, "learning_rate": 1.3679890560875515e-07, "loss": 0.3517, "step": 2190 }, { "epoch": 1.0, "eval_loss": 0.35346221923828125, "eval_runtime": 21.6993, "eval_samples_per_second": 1.29, "eval_steps_per_second": 0.323, "step": 2190 } ], "logging_steps": 1, "max_steps": 2193, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "total_flos": 1.2253539900158116e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }