|
{ |
|
"best_metric": 0.5852221250534058, |
|
"best_model_checkpoint": "/ephemeral/models/qwen-describe_tasks/checkpoint-200", |
|
"epoch": 3.571428571428571, |
|
"eval_steps": 200, |
|
"global_step": 800, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004464285714285714, |
|
"grad_norm": 10.935125350952148, |
|
"learning_rate": 0.0, |
|
"loss": 1.3545, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.008928571428571428, |
|
"grad_norm": 7.386569976806641, |
|
"learning_rate": 2.626495350371936e-06, |
|
"loss": 1.0726, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.013392857142857142, |
|
"grad_norm": 7.35675048828125, |
|
"learning_rate": 4.162896638657993e-06, |
|
"loss": 1.0713, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.017857142857142856, |
|
"grad_norm": 6.851252555847168, |
|
"learning_rate": 5.252990700743872e-06, |
|
"loss": 0.9934, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.022321428571428572, |
|
"grad_norm": 6.426003456115723, |
|
"learning_rate": 6.098533345119624e-06, |
|
"loss": 0.7561, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.026785714285714284, |
|
"grad_norm": 4.757523536682129, |
|
"learning_rate": 6.7893919890299284e-06, |
|
"loss": 0.7267, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.03125, |
|
"grad_norm": 3.9246585369110107, |
|
"learning_rate": 7.373504649628066e-06, |
|
"loss": 0.6757, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.03571428571428571, |
|
"grad_norm": 5.09559965133667, |
|
"learning_rate": 7.879486051115807e-06, |
|
"loss": 0.7255, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.04017857142857143, |
|
"grad_norm": 4.388647079467773, |
|
"learning_rate": 8.325793277315987e-06, |
|
"loss": 0.6737, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.044642857142857144, |
|
"grad_norm": 3.693603754043579, |
|
"learning_rate": 8.72502869549156e-06, |
|
"loss": 0.6406, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.049107142857142856, |
|
"grad_norm": 3.9717981815338135, |
|
"learning_rate": 9.086181061280522e-06, |
|
"loss": 0.587, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.05357142857142857, |
|
"grad_norm": 3.3187167644500732, |
|
"learning_rate": 9.415887339401865e-06, |
|
"loss": 0.5724, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.05803571428571429, |
|
"grad_norm": 3.3204233646392822, |
|
"learning_rate": 9.719187714029216e-06, |
|
"loss": 0.6456, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 3.1636106967926025, |
|
"learning_rate": 1e-05, |
|
"loss": 0.573, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.06696428571428571, |
|
"grad_norm": 3.5804100036621094, |
|
"learning_rate": 9.999986052613417e-06, |
|
"loss": 0.5863, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.07142857142857142, |
|
"grad_norm": 3.142025947570801, |
|
"learning_rate": 9.99994421053148e-06, |
|
"loss": 0.6012, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.07589285714285714, |
|
"grad_norm": 2.6348273754119873, |
|
"learning_rate": 9.999874473987653e-06, |
|
"loss": 0.5631, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.08035714285714286, |
|
"grad_norm": 2.788102149963379, |
|
"learning_rate": 9.999776843371027e-06, |
|
"loss": 0.6074, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.08482142857142858, |
|
"grad_norm": 2.5704562664031982, |
|
"learning_rate": 9.99965131922634e-06, |
|
"loss": 0.5218, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.08928571428571429, |
|
"grad_norm": 2.773559331893921, |
|
"learning_rate": 9.999497902253949e-06, |
|
"loss": 0.5921, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09375, |
|
"grad_norm": 2.585707664489746, |
|
"learning_rate": 9.999316593309849e-06, |
|
"loss": 0.603, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.09821428571428571, |
|
"grad_norm": 2.896451711654663, |
|
"learning_rate": 9.999107393405655e-06, |
|
"loss": 0.588, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.10267857142857142, |
|
"grad_norm": 2.606309652328491, |
|
"learning_rate": 9.998870303708601e-06, |
|
"loss": 0.4943, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.10714285714285714, |
|
"grad_norm": 2.6549859046936035, |
|
"learning_rate": 9.998605325541531e-06, |
|
"loss": 0.5824, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.11160714285714286, |
|
"grad_norm": 2.7567014694213867, |
|
"learning_rate": 9.998312460382895e-06, |
|
"loss": 0.5915, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.11607142857142858, |
|
"grad_norm": 2.674490451812744, |
|
"learning_rate": 9.997991709866738e-06, |
|
"loss": 0.5946, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.12053571428571429, |
|
"grad_norm": 2.0534379482269287, |
|
"learning_rate": 9.997643075782691e-06, |
|
"loss": 0.4713, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 2.5476534366607666, |
|
"learning_rate": 9.997266560075961e-06, |
|
"loss": 0.5565, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.12946428571428573, |
|
"grad_norm": 3.0594515800476074, |
|
"learning_rate": 9.996862164847323e-06, |
|
"loss": 0.5195, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.13392857142857142, |
|
"grad_norm": 2.6829309463500977, |
|
"learning_rate": 9.996429892353107e-06, |
|
"loss": 0.4949, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.13839285714285715, |
|
"grad_norm": 2.6954853534698486, |
|
"learning_rate": 9.99596974500518e-06, |
|
"loss": 0.5276, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 2.8353872299194336, |
|
"learning_rate": 9.995481725370941e-06, |
|
"loss": 0.5713, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.14732142857142858, |
|
"grad_norm": 2.503706693649292, |
|
"learning_rate": 9.994965836173303e-06, |
|
"loss": 0.5171, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.15178571428571427, |
|
"grad_norm": 2.876476526260376, |
|
"learning_rate": 9.994422080290675e-06, |
|
"loss": 0.5046, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"grad_norm": 2.6266987323760986, |
|
"learning_rate": 9.99385046075695e-06, |
|
"loss": 0.5366, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.16071428571428573, |
|
"grad_norm": 2.4998559951782227, |
|
"learning_rate": 9.993250980761487e-06, |
|
"loss": 0.5082, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.16517857142857142, |
|
"grad_norm": 2.6557369232177734, |
|
"learning_rate": 9.99262364364909e-06, |
|
"loss": 0.5126, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.16964285714285715, |
|
"grad_norm": 2.4919795989990234, |
|
"learning_rate": 9.991968452919999e-06, |
|
"loss": 0.459, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.17410714285714285, |
|
"grad_norm": 2.501042604446411, |
|
"learning_rate": 9.991285412229854e-06, |
|
"loss": 0.5021, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.17857142857142858, |
|
"grad_norm": 2.462800979614258, |
|
"learning_rate": 9.99057452538969e-06, |
|
"loss": 0.5227, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.18303571428571427, |
|
"grad_norm": 2.2987539768218994, |
|
"learning_rate": 9.989835796365911e-06, |
|
"loss": 0.491, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 2.5660648345947266, |
|
"learning_rate": 9.989069229280264e-06, |
|
"loss": 0.5264, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.19196428571428573, |
|
"grad_norm": 2.3724122047424316, |
|
"learning_rate": 9.988274828409821e-06, |
|
"loss": 0.5362, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.19642857142857142, |
|
"grad_norm": 2.1337292194366455, |
|
"learning_rate": 9.987452598186947e-06, |
|
"loss": 0.4827, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.20089285714285715, |
|
"grad_norm": 2.4473042488098145, |
|
"learning_rate": 9.986602543199292e-06, |
|
"loss": 0.5034, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.20535714285714285, |
|
"grad_norm": 2.3975181579589844, |
|
"learning_rate": 9.985724668189744e-06, |
|
"loss": 0.5155, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.20982142857142858, |
|
"grad_norm": 2.3558027744293213, |
|
"learning_rate": 9.98481897805642e-06, |
|
"loss": 0.5271, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.21428571428571427, |
|
"grad_norm": 2.3652427196502686, |
|
"learning_rate": 9.983885477852628e-06, |
|
"loss": 0.5966, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.21875, |
|
"grad_norm": 2.551260471343994, |
|
"learning_rate": 9.982924172786847e-06, |
|
"loss": 0.5586, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.22321428571428573, |
|
"grad_norm": 2.4399120807647705, |
|
"learning_rate": 9.981935068222687e-06, |
|
"loss": 0.5133, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.22767857142857142, |
|
"grad_norm": 2.097429037094116, |
|
"learning_rate": 9.980918169678872e-06, |
|
"loss": 0.439, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.23214285714285715, |
|
"grad_norm": 3.4427826404571533, |
|
"learning_rate": 9.979873482829199e-06, |
|
"loss": 0.5122, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.23660714285714285, |
|
"grad_norm": 2.506448268890381, |
|
"learning_rate": 9.978801013502511e-06, |
|
"loss": 0.5187, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.24107142857142858, |
|
"grad_norm": 2.153078556060791, |
|
"learning_rate": 9.977700767682665e-06, |
|
"loss": 0.4699, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.24553571428571427, |
|
"grad_norm": 2.5463476181030273, |
|
"learning_rate": 9.976572751508497e-06, |
|
"loss": 0.5425, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 2.3144137859344482, |
|
"learning_rate": 9.975416971273787e-06, |
|
"loss": 0.4973, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.2544642857142857, |
|
"grad_norm": 2.373551607131958, |
|
"learning_rate": 9.974233433427222e-06, |
|
"loss": 0.4538, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.25892857142857145, |
|
"grad_norm": 1.888371467590332, |
|
"learning_rate": 9.97302214457237e-06, |
|
"loss": 0.4525, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.26339285714285715, |
|
"grad_norm": 2.0926015377044678, |
|
"learning_rate": 9.971783111467635e-06, |
|
"loss": 0.4793, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.26785714285714285, |
|
"grad_norm": 2.090359926223755, |
|
"learning_rate": 9.970516341026211e-06, |
|
"loss": 0.4818, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.27232142857142855, |
|
"grad_norm": 2.268264055252075, |
|
"learning_rate": 9.969221840316066e-06, |
|
"loss": 0.5479, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.2767857142857143, |
|
"grad_norm": 2.044856309890747, |
|
"learning_rate": 9.967899616559879e-06, |
|
"loss": 0.4726, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.28125, |
|
"grad_norm": 2.361753225326538, |
|
"learning_rate": 9.966549677135015e-06, |
|
"loss": 0.4901, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 2.3921494483947754, |
|
"learning_rate": 9.965172029573479e-06, |
|
"loss": 0.4856, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.29017857142857145, |
|
"grad_norm": 2.134155750274658, |
|
"learning_rate": 9.96376668156187e-06, |
|
"loss": 0.4621, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.29464285714285715, |
|
"grad_norm": 2.1714282035827637, |
|
"learning_rate": 9.962333640941349e-06, |
|
"loss": 0.5009, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.29910714285714285, |
|
"grad_norm": 2.107659339904785, |
|
"learning_rate": 9.960872915707582e-06, |
|
"loss": 0.5125, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.30357142857142855, |
|
"grad_norm": 1.874833583831787, |
|
"learning_rate": 9.959384514010703e-06, |
|
"loss": 0.4568, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.3080357142857143, |
|
"grad_norm": 2.18188214302063, |
|
"learning_rate": 9.95786844415527e-06, |
|
"loss": 0.5336, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 1.9995635747909546, |
|
"learning_rate": 9.956324714600212e-06, |
|
"loss": 0.507, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.3169642857142857, |
|
"grad_norm": 2.198190212249756, |
|
"learning_rate": 9.95475333395879e-06, |
|
"loss": 0.4801, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.32142857142857145, |
|
"grad_norm": 2.370695114135742, |
|
"learning_rate": 9.95315431099854e-06, |
|
"loss": 0.4944, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.32589285714285715, |
|
"grad_norm": 2.126605272293091, |
|
"learning_rate": 9.951527654641231e-06, |
|
"loss": 0.4874, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.33035714285714285, |
|
"grad_norm": 2.093745231628418, |
|
"learning_rate": 9.949873373962814e-06, |
|
"loss": 0.5515, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.33482142857142855, |
|
"grad_norm": 1.9955790042877197, |
|
"learning_rate": 9.948191478193365e-06, |
|
"loss": 0.4524, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.3392857142857143, |
|
"grad_norm": 1.8829865455627441, |
|
"learning_rate": 9.946481976717046e-06, |
|
"loss": 0.4387, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.34375, |
|
"grad_norm": 1.8707062005996704, |
|
"learning_rate": 9.944744879072043e-06, |
|
"loss": 0.4467, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.3482142857142857, |
|
"grad_norm": 2.0644657611846924, |
|
"learning_rate": 9.942980194950511e-06, |
|
"loss": 0.5096, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.35267857142857145, |
|
"grad_norm": 2.6129536628723145, |
|
"learning_rate": 9.941187934198528e-06, |
|
"loss": 0.5458, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.35714285714285715, |
|
"grad_norm": 2.0972604751586914, |
|
"learning_rate": 9.939368106816038e-06, |
|
"loss": 0.4918, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.36160714285714285, |
|
"grad_norm": 2.2218029499053955, |
|
"learning_rate": 9.937520722956789e-06, |
|
"loss": 0.5811, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.36607142857142855, |
|
"grad_norm": 2.017685890197754, |
|
"learning_rate": 9.93564579292828e-06, |
|
"loss": 0.4626, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.3705357142857143, |
|
"grad_norm": 2.521484613418579, |
|
"learning_rate": 9.933743327191711e-06, |
|
"loss": 0.4818, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 2.1667404174804688, |
|
"learning_rate": 9.93181333636191e-06, |
|
"loss": 0.5324, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.3794642857142857, |
|
"grad_norm": 2.0817134380340576, |
|
"learning_rate": 9.929855831207288e-06, |
|
"loss": 0.4595, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.38392857142857145, |
|
"grad_norm": 1.9740232229232788, |
|
"learning_rate": 9.92787082264977e-06, |
|
"loss": 0.4228, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.38839285714285715, |
|
"grad_norm": 1.985465407371521, |
|
"learning_rate": 9.925858321764733e-06, |
|
"loss": 0.4971, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.39285714285714285, |
|
"grad_norm": 2.02755069732666, |
|
"learning_rate": 9.923818339780954e-06, |
|
"loss": 0.5322, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.39732142857142855, |
|
"grad_norm": 2.2033133506774902, |
|
"learning_rate": 9.921750888080534e-06, |
|
"loss": 0.4646, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.4017857142857143, |
|
"grad_norm": 2.1710963249206543, |
|
"learning_rate": 9.91965597819885e-06, |
|
"loss": 0.4665, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"grad_norm": 1.907015085220337, |
|
"learning_rate": 9.917533621824476e-06, |
|
"loss": 0.408, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.4107142857142857, |
|
"grad_norm": 1.8381679058074951, |
|
"learning_rate": 9.915383830799129e-06, |
|
"loss": 0.4057, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.41517857142857145, |
|
"grad_norm": 2.105541229248047, |
|
"learning_rate": 9.91320661711759e-06, |
|
"loss": 0.5089, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.41964285714285715, |
|
"grad_norm": 2.172125816345215, |
|
"learning_rate": 9.911001992927655e-06, |
|
"loss": 0.4856, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.42410714285714285, |
|
"grad_norm": 2.0441055297851562, |
|
"learning_rate": 9.908769970530049e-06, |
|
"loss": 0.505, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.42857142857142855, |
|
"grad_norm": 1.9920746088027954, |
|
"learning_rate": 9.90651056237837e-06, |
|
"loss": 0.4962, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.4330357142857143, |
|
"grad_norm": 2.0216598510742188, |
|
"learning_rate": 9.904223781079017e-06, |
|
"loss": 0.5041, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 2.212711811065674, |
|
"learning_rate": 9.901909639391111e-06, |
|
"loss": 0.5078, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.4419642857142857, |
|
"grad_norm": 2.3241405487060547, |
|
"learning_rate": 9.899568150226435e-06, |
|
"loss": 0.4316, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.44642857142857145, |
|
"grad_norm": 2.092745542526245, |
|
"learning_rate": 9.897199326649362e-06, |
|
"loss": 0.4653, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.45089285714285715, |
|
"grad_norm": 1.8443938493728638, |
|
"learning_rate": 9.894803181876765e-06, |
|
"loss": 0.4527, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.45535714285714285, |
|
"grad_norm": 1.827662467956543, |
|
"learning_rate": 9.892379729277972e-06, |
|
"loss": 0.455, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.45982142857142855, |
|
"grad_norm": 1.8256633281707764, |
|
"learning_rate": 9.889928982374663e-06, |
|
"loss": 0.369, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.4642857142857143, |
|
"grad_norm": 2.305311918258667, |
|
"learning_rate": 9.887450954840812e-06, |
|
"loss": 0.4713, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"grad_norm": 2.3441269397735596, |
|
"learning_rate": 9.884945660502607e-06, |
|
"loss": 0.4571, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.4732142857142857, |
|
"grad_norm": 1.9434335231781006, |
|
"learning_rate": 9.882413113338364e-06, |
|
"loss": 0.4814, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.47767857142857145, |
|
"grad_norm": 2.1630849838256836, |
|
"learning_rate": 9.879853327478465e-06, |
|
"loss": 0.4856, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.48214285714285715, |
|
"grad_norm": 2.32119083404541, |
|
"learning_rate": 9.877266317205268e-06, |
|
"loss": 0.4614, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.48660714285714285, |
|
"grad_norm": 2.141681671142578, |
|
"learning_rate": 9.874652096953028e-06, |
|
"loss": 0.4633, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.49107142857142855, |
|
"grad_norm": 2.1267635822296143, |
|
"learning_rate": 9.872010681307821e-06, |
|
"loss": 0.5275, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.4955357142857143, |
|
"grad_norm": 2.1307332515716553, |
|
"learning_rate": 9.869342085007458e-06, |
|
"loss": 0.4224, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.9681659936904907, |
|
"learning_rate": 9.866646322941405e-06, |
|
"loss": 0.4645, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.5044642857142857, |
|
"grad_norm": 1.8705610036849976, |
|
"learning_rate": 9.863923410150704e-06, |
|
"loss": 0.4672, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.5089285714285714, |
|
"grad_norm": 2.2226390838623047, |
|
"learning_rate": 9.861173361827876e-06, |
|
"loss": 0.4418, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.5133928571428571, |
|
"grad_norm": 1.979615569114685, |
|
"learning_rate": 9.858396193316853e-06, |
|
"loss": 0.4622, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.5178571428571429, |
|
"grad_norm": 2.0155093669891357, |
|
"learning_rate": 9.855591920112883e-06, |
|
"loss": 0.4585, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.5223214285714286, |
|
"grad_norm": 1.7425578832626343, |
|
"learning_rate": 9.85276055786244e-06, |
|
"loss": 0.4938, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.5267857142857143, |
|
"grad_norm": 2.0626533031463623, |
|
"learning_rate": 9.849902122363148e-06, |
|
"loss": 0.458, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"grad_norm": 1.8312275409698486, |
|
"learning_rate": 9.847016629563683e-06, |
|
"loss": 0.4615, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.5357142857142857, |
|
"grad_norm": 2.0553085803985596, |
|
"learning_rate": 9.844104095563689e-06, |
|
"loss": 0.5286, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.5401785714285714, |
|
"grad_norm": 1.95139741897583, |
|
"learning_rate": 9.841164536613685e-06, |
|
"loss": 0.4863, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.5446428571428571, |
|
"grad_norm": 1.9694424867630005, |
|
"learning_rate": 9.83819796911498e-06, |
|
"loss": 0.5279, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.5491071428571429, |
|
"grad_norm": 1.9648712873458862, |
|
"learning_rate": 9.83520440961957e-06, |
|
"loss": 0.3807, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.5535714285714286, |
|
"grad_norm": 1.8197687864303589, |
|
"learning_rate": 9.83218387483006e-06, |
|
"loss": 0.4943, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.5580357142857143, |
|
"grad_norm": 1.864090919494629, |
|
"learning_rate": 9.829136381599563e-06, |
|
"loss": 0.4804, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 1.827278971672058, |
|
"learning_rate": 9.826061946931605e-06, |
|
"loss": 0.4173, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.5669642857142857, |
|
"grad_norm": 1.8508820533752441, |
|
"learning_rate": 9.822960587980034e-06, |
|
"loss": 0.4831, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 1.801448941230774, |
|
"learning_rate": 9.81983232204892e-06, |
|
"loss": 0.4823, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.5758928571428571, |
|
"grad_norm": 1.9353251457214355, |
|
"learning_rate": 9.816677166592462e-06, |
|
"loss": 0.4074, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.5803571428571429, |
|
"grad_norm": 1.9287128448486328, |
|
"learning_rate": 9.81349513921489e-06, |
|
"loss": 0.4495, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5848214285714286, |
|
"grad_norm": 1.5913102626800537, |
|
"learning_rate": 9.810286257670365e-06, |
|
"loss": 0.4161, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.5892857142857143, |
|
"grad_norm": 1.9423521757125854, |
|
"learning_rate": 9.807050539862884e-06, |
|
"loss": 0.4254, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"grad_norm": 1.7807602882385254, |
|
"learning_rate": 9.803788003846175e-06, |
|
"loss": 0.4541, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.5982142857142857, |
|
"grad_norm": 1.9615179300308228, |
|
"learning_rate": 9.800498667823595e-06, |
|
"loss": 0.4375, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.6026785714285714, |
|
"grad_norm": 1.7444758415222168, |
|
"learning_rate": 9.797182550148039e-06, |
|
"loss": 0.4568, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.6071428571428571, |
|
"grad_norm": 1.9967138767242432, |
|
"learning_rate": 9.793839669321828e-06, |
|
"loss": 0.4152, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.6116071428571429, |
|
"grad_norm": 1.8935024738311768, |
|
"learning_rate": 9.790470043996604e-06, |
|
"loss": 0.4807, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.6160714285714286, |
|
"grad_norm": 1.8592950105667114, |
|
"learning_rate": 9.78707369297324e-06, |
|
"loss": 0.4849, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.6205357142857143, |
|
"grad_norm": 1.8688433170318604, |
|
"learning_rate": 9.783650635201714e-06, |
|
"loss": 0.5193, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 1.806913137435913, |
|
"learning_rate": 9.780200889781021e-06, |
|
"loss": 0.4224, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.6294642857142857, |
|
"grad_norm": 1.8969337940216064, |
|
"learning_rate": 9.776724475959061e-06, |
|
"loss": 0.4986, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.6339285714285714, |
|
"grad_norm": 1.5755013227462769, |
|
"learning_rate": 9.773221413132525e-06, |
|
"loss": 0.4343, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.6383928571428571, |
|
"grad_norm": 1.70070481300354, |
|
"learning_rate": 9.769691720846801e-06, |
|
"loss": 0.4163, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.6428571428571429, |
|
"grad_norm": 1.9572802782058716, |
|
"learning_rate": 9.766135418795848e-06, |
|
"loss": 0.4618, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.6473214285714286, |
|
"grad_norm": 1.9200794696807861, |
|
"learning_rate": 9.762552526822098e-06, |
|
"loss": 0.4258, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.6517857142857143, |
|
"grad_norm": 1.9915560483932495, |
|
"learning_rate": 9.758943064916342e-06, |
|
"loss": 0.4185, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.65625, |
|
"grad_norm": 1.6791561841964722, |
|
"learning_rate": 9.755307053217622e-06, |
|
"loss": 0.4074, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.6607142857142857, |
|
"grad_norm": 1.984868049621582, |
|
"learning_rate": 9.751644512013106e-06, |
|
"loss": 0.461, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.6651785714285714, |
|
"grad_norm": 1.925027847290039, |
|
"learning_rate": 9.74795546173799e-06, |
|
"loss": 0.4601, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.6696428571428571, |
|
"grad_norm": 2.0234742164611816, |
|
"learning_rate": 9.744239922975377e-06, |
|
"loss": 0.4874, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.6741071428571429, |
|
"grad_norm": 1.8171749114990234, |
|
"learning_rate": 9.740497916456163e-06, |
|
"loss": 0.4806, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.6785714285714286, |
|
"grad_norm": 1.8766131401062012, |
|
"learning_rate": 9.736729463058921e-06, |
|
"loss": 0.5144, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.6830357142857143, |
|
"grad_norm": 1.7700015306472778, |
|
"learning_rate": 9.732934583809782e-06, |
|
"loss": 0.4958, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 1.7756420373916626, |
|
"learning_rate": 9.729113299882324e-06, |
|
"loss": 0.4685, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.6919642857142857, |
|
"grad_norm": 1.7557669878005981, |
|
"learning_rate": 9.725265632597448e-06, |
|
"loss": 0.4736, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.6964285714285714, |
|
"grad_norm": 1.7786169052124023, |
|
"learning_rate": 9.721391603423263e-06, |
|
"loss": 0.4582, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.7008928571428571, |
|
"grad_norm": 1.7477728128433228, |
|
"learning_rate": 9.717491233974962e-06, |
|
"loss": 0.4399, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.7053571428571429, |
|
"grad_norm": 1.8034288883209229, |
|
"learning_rate": 9.713564546014707e-06, |
|
"loss": 0.4933, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.7098214285714286, |
|
"grad_norm": 1.7433615922927856, |
|
"learning_rate": 9.7096115614515e-06, |
|
"loss": 0.4367, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 1.9310708045959473, |
|
"learning_rate": 9.705632302341073e-06, |
|
"loss": 0.4674, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.71875, |
|
"grad_norm": 1.709022879600525, |
|
"learning_rate": 9.701626790885749e-06, |
|
"loss": 0.4361, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.7232142857142857, |
|
"grad_norm": 1.7623347043991089, |
|
"learning_rate": 9.69759504943433e-06, |
|
"loss": 0.4037, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.7276785714285714, |
|
"grad_norm": 1.6316179037094116, |
|
"learning_rate": 9.69353710048197e-06, |
|
"loss": 0.4707, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.7321428571428571, |
|
"grad_norm": 1.966737151145935, |
|
"learning_rate": 9.68945296667004e-06, |
|
"loss": 0.4493, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.7366071428571429, |
|
"grad_norm": 1.7973566055297852, |
|
"learning_rate": 9.685342670786025e-06, |
|
"loss": 0.5024, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.7410714285714286, |
|
"grad_norm": 2.026155710220337, |
|
"learning_rate": 9.681206235763367e-06, |
|
"loss": 0.4879, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.7455357142857143, |
|
"grad_norm": 1.679463267326355, |
|
"learning_rate": 9.677043684681358e-06, |
|
"loss": 0.4495, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.7095201015472412, |
|
"learning_rate": 9.672855040765006e-06, |
|
"loss": 0.3991, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.7544642857142857, |
|
"grad_norm": 1.7846328020095825, |
|
"learning_rate": 9.668640327384899e-06, |
|
"loss": 0.4653, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.7589285714285714, |
|
"grad_norm": 1.9439399242401123, |
|
"learning_rate": 9.664399568057087e-06, |
|
"loss": 0.4811, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.7633928571428571, |
|
"grad_norm": 1.8142772912979126, |
|
"learning_rate": 9.660132786442937e-06, |
|
"loss": 0.4638, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.7678571428571429, |
|
"grad_norm": 1.8977925777435303, |
|
"learning_rate": 9.655840006349014e-06, |
|
"loss": 0.4698, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.7723214285714286, |
|
"grad_norm": 1.624567985534668, |
|
"learning_rate": 9.651521251726936e-06, |
|
"loss": 0.4651, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.7767857142857143, |
|
"grad_norm": 1.6974815130233765, |
|
"learning_rate": 9.64717654667325e-06, |
|
"loss": 0.5096, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"grad_norm": 1.6671240329742432, |
|
"learning_rate": 9.642805915429291e-06, |
|
"loss": 0.4255, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.7857142857142857, |
|
"grad_norm": 1.703273057937622, |
|
"learning_rate": 9.638409382381052e-06, |
|
"loss": 0.4436, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.7901785714285714, |
|
"grad_norm": 1.924634575843811, |
|
"learning_rate": 9.633986972059047e-06, |
|
"loss": 0.4679, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.7946428571428571, |
|
"grad_norm": 1.6996525526046753, |
|
"learning_rate": 9.629538709138166e-06, |
|
"loss": 0.4836, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.7991071428571429, |
|
"grad_norm": 1.6877120733261108, |
|
"learning_rate": 9.625064618437549e-06, |
|
"loss": 0.4473, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.8035714285714286, |
|
"grad_norm": 1.6341931819915771, |
|
"learning_rate": 9.620564724920443e-06, |
|
"loss": 0.4279, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.8080357142857143, |
|
"grad_norm": 1.5455591678619385, |
|
"learning_rate": 9.616039053694058e-06, |
|
"loss": 0.4124, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"grad_norm": 1.6947259902954102, |
|
"learning_rate": 9.611487630009436e-06, |
|
"loss": 0.4061, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.8169642857142857, |
|
"grad_norm": 1.7641195058822632, |
|
"learning_rate": 9.606910479261301e-06, |
|
"loss": 0.4839, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.8214285714285714, |
|
"grad_norm": 1.513904333114624, |
|
"learning_rate": 9.602307626987925e-06, |
|
"loss": 0.403, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.8258928571428571, |
|
"grad_norm": 1.791146993637085, |
|
"learning_rate": 9.597679098870978e-06, |
|
"loss": 0.4782, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.8303571428571429, |
|
"grad_norm": 1.7051180601119995, |
|
"learning_rate": 9.593024920735393e-06, |
|
"loss": 0.4754, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.8348214285714286, |
|
"grad_norm": 1.749476671218872, |
|
"learning_rate": 9.588345118549214e-06, |
|
"loss": 0.435, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.8392857142857143, |
|
"grad_norm": 1.661120057106018, |
|
"learning_rate": 9.583639718423457e-06, |
|
"loss": 0.4536, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"grad_norm": 1.4928163290023804, |
|
"learning_rate": 9.57890874661196e-06, |
|
"loss": 0.375, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.8482142857142857, |
|
"grad_norm": 1.6453092098236084, |
|
"learning_rate": 9.57415222951124e-06, |
|
"loss": 0.445, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.8526785714285714, |
|
"grad_norm": 1.6496193408966064, |
|
"learning_rate": 9.569370193660348e-06, |
|
"loss": 0.4098, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": 1.779951810836792, |
|
"learning_rate": 9.564562665740708e-06, |
|
"loss": 0.5191, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.8616071428571429, |
|
"grad_norm": 1.5148218870162964, |
|
"learning_rate": 9.559729672575985e-06, |
|
"loss": 0.4001, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.8660714285714286, |
|
"grad_norm": 1.5799369812011719, |
|
"learning_rate": 9.554871241131923e-06, |
|
"loss": 0.4807, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.8705357142857143, |
|
"grad_norm": 1.6424708366394043, |
|
"learning_rate": 9.549987398516206e-06, |
|
"loss": 0.4851, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 1.766772985458374, |
|
"learning_rate": 9.54507817197829e-06, |
|
"loss": 0.4295, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.8794642857142857, |
|
"grad_norm": 1.6645870208740234, |
|
"learning_rate": 9.540143588909268e-06, |
|
"loss": 0.426, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.8839285714285714, |
|
"grad_norm": 1.8015894889831543, |
|
"learning_rate": 9.535183676841709e-06, |
|
"loss": 0.4602, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.8883928571428571, |
|
"grad_norm": 1.8687186241149902, |
|
"learning_rate": 9.530198463449507e-06, |
|
"loss": 0.4674, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.8928571428571429, |
|
"grad_norm": 1.8257641792297363, |
|
"learning_rate": 9.525187976547718e-06, |
|
"loss": 0.4584, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.8928571428571429, |
|
"eval_loss": 0.5852221250534058, |
|
"eval_runtime": 5.367, |
|
"eval_samples_per_second": 11.18, |
|
"eval_steps_per_second": 0.745, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.8973214285714286, |
|
"grad_norm": 1.5470904111862183, |
|
"learning_rate": 9.520152244092421e-06, |
|
"loss": 0.3601, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.9017857142857143, |
|
"grad_norm": 1.608096957206726, |
|
"learning_rate": 9.515091294180546e-06, |
|
"loss": 0.3992, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"grad_norm": 1.8043137788772583, |
|
"learning_rate": 9.510005155049729e-06, |
|
"loss": 0.4782, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.9107142857142857, |
|
"grad_norm": 1.6916441917419434, |
|
"learning_rate": 9.504893855078144e-06, |
|
"loss": 0.3765, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.9151785714285714, |
|
"grad_norm": 1.5904515981674194, |
|
"learning_rate": 9.499757422784358e-06, |
|
"loss": 0.4424, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.9196428571428571, |
|
"grad_norm": 1.6705337762832642, |
|
"learning_rate": 9.494595886827157e-06, |
|
"loss": 0.4984, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.9241071428571429, |
|
"grad_norm": 1.657344937324524, |
|
"learning_rate": 9.489409276005393e-06, |
|
"loss": 0.4511, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.9285714285714286, |
|
"grad_norm": 1.6433309316635132, |
|
"learning_rate": 9.48419761925783e-06, |
|
"loss": 0.5035, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.9330357142857143, |
|
"grad_norm": 1.81150484085083, |
|
"learning_rate": 9.478960945662974e-06, |
|
"loss": 0.3958, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 1.6642935276031494, |
|
"learning_rate": 9.473699284438908e-06, |
|
"loss": 0.4011, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.9419642857142857, |
|
"grad_norm": 1.8117976188659668, |
|
"learning_rate": 9.468412664943137e-06, |
|
"loss": 0.5123, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.9464285714285714, |
|
"grad_norm": 1.5370533466339111, |
|
"learning_rate": 9.463101116672423e-06, |
|
"loss": 0.4226, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.9508928571428571, |
|
"grad_norm": 2.033900499343872, |
|
"learning_rate": 9.457764669262615e-06, |
|
"loss": 0.4902, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.9553571428571429, |
|
"grad_norm": 1.672672986984253, |
|
"learning_rate": 9.452403352488488e-06, |
|
"loss": 0.442, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.9598214285714286, |
|
"grad_norm": 1.9266510009765625, |
|
"learning_rate": 9.447017196263578e-06, |
|
"loss": 0.4805, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.9642857142857143, |
|
"grad_norm": 1.629258155822754, |
|
"learning_rate": 9.441606230640012e-06, |
|
"loss": 0.444, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"grad_norm": 1.6848729848861694, |
|
"learning_rate": 9.436170485808338e-06, |
|
"loss": 0.4444, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.9732142857142857, |
|
"grad_norm": 1.693801999092102, |
|
"learning_rate": 9.430709992097364e-06, |
|
"loss": 0.4913, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.9776785714285714, |
|
"grad_norm": 1.5817019939422607, |
|
"learning_rate": 9.425224779973986e-06, |
|
"loss": 0.4554, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.9821428571428571, |
|
"grad_norm": 1.7226394414901733, |
|
"learning_rate": 9.41971488004301e-06, |
|
"loss": 0.5002, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.9866071428571429, |
|
"grad_norm": 1.7014926671981812, |
|
"learning_rate": 9.414180323046991e-06, |
|
"loss": 0.4582, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.9910714285714286, |
|
"grad_norm": 1.5330203771591187, |
|
"learning_rate": 9.408621139866067e-06, |
|
"loss": 0.3967, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.9955357142857143, |
|
"grad_norm": 1.6649432182312012, |
|
"learning_rate": 9.403037361517762e-06, |
|
"loss": 0.4442, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.5434422492980957, |
|
"learning_rate": 9.397429019156841e-06, |
|
"loss": 0.4281, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.0044642857142858, |
|
"grad_norm": 1.4803626537322998, |
|
"learning_rate": 9.391796144075123e-06, |
|
"loss": 0.3408, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.0089285714285714, |
|
"grad_norm": 1.7202813625335693, |
|
"learning_rate": 9.386138767701306e-06, |
|
"loss": 0.3212, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.0133928571428572, |
|
"grad_norm": 1.5383710861206055, |
|
"learning_rate": 9.380456921600785e-06, |
|
"loss": 0.3628, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.0178571428571428, |
|
"grad_norm": 1.598041296005249, |
|
"learning_rate": 9.374750637475499e-06, |
|
"loss": 0.3354, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.0223214285714286, |
|
"grad_norm": 1.483928918838501, |
|
"learning_rate": 9.36901994716373e-06, |
|
"loss": 0.3572, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.0267857142857142, |
|
"grad_norm": 1.4985754489898682, |
|
"learning_rate": 9.363264882639936e-06, |
|
"loss": 0.3117, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.03125, |
|
"grad_norm": 1.6541988849639893, |
|
"learning_rate": 9.357485476014573e-06, |
|
"loss": 0.3235, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.0357142857142858, |
|
"grad_norm": 1.459572672843933, |
|
"learning_rate": 9.351681759533914e-06, |
|
"loss": 0.2768, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.0401785714285714, |
|
"grad_norm": 1.7430468797683716, |
|
"learning_rate": 9.345853765579865e-06, |
|
"loss": 0.2995, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.0446428571428572, |
|
"grad_norm": 1.4982185363769531, |
|
"learning_rate": 9.340001526669794e-06, |
|
"loss": 0.2818, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.0491071428571428, |
|
"grad_norm": 1.6810327768325806, |
|
"learning_rate": 9.33412507545634e-06, |
|
"loss": 0.3382, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.0535714285714286, |
|
"grad_norm": 1.8060071468353271, |
|
"learning_rate": 9.32822444472724e-06, |
|
"loss": 0.343, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.0580357142857142, |
|
"grad_norm": 1.786448359489441, |
|
"learning_rate": 9.322299667405134e-06, |
|
"loss": 0.3411, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.0625, |
|
"grad_norm": 1.776330590248108, |
|
"learning_rate": 9.31635077654739e-06, |
|
"loss": 0.3569, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.0669642857142858, |
|
"grad_norm": 1.5425933599472046, |
|
"learning_rate": 9.310377805345926e-06, |
|
"loss": 0.2962, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.0714285714285714, |
|
"grad_norm": 1.6801799535751343, |
|
"learning_rate": 9.304380787127003e-06, |
|
"loss": 0.3163, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.0758928571428572, |
|
"grad_norm": 1.6421353816986084, |
|
"learning_rate": 9.298359755351065e-06, |
|
"loss": 0.3159, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.0803571428571428, |
|
"grad_norm": 1.7834713459014893, |
|
"learning_rate": 9.29231474361253e-06, |
|
"loss": 0.3795, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.0848214285714286, |
|
"grad_norm": 1.6435611248016357, |
|
"learning_rate": 9.28624578563962e-06, |
|
"loss": 0.3161, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.0892857142857142, |
|
"grad_norm": 1.579264760017395, |
|
"learning_rate": 9.280152915294162e-06, |
|
"loss": 0.3916, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.09375, |
|
"grad_norm": 1.5122257471084595, |
|
"learning_rate": 9.274036166571402e-06, |
|
"loss": 0.3642, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.0982142857142858, |
|
"grad_norm": 1.4102823734283447, |
|
"learning_rate": 9.267895573599819e-06, |
|
"loss": 0.3147, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.1026785714285714, |
|
"grad_norm": 1.5816137790679932, |
|
"learning_rate": 9.261731170640923e-06, |
|
"loss": 0.3346, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.1071428571428572, |
|
"grad_norm": 1.426952838897705, |
|
"learning_rate": 9.255542992089086e-06, |
|
"loss": 0.3033, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.1116071428571428, |
|
"grad_norm": 1.437669038772583, |
|
"learning_rate": 9.24933107247132e-06, |
|
"loss": 0.3195, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.1160714285714286, |
|
"grad_norm": 1.5356261730194092, |
|
"learning_rate": 9.243095446447113e-06, |
|
"loss": 0.306, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.1205357142857142, |
|
"grad_norm": 1.4559781551361084, |
|
"learning_rate": 9.23683614880822e-06, |
|
"loss": 0.3088, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"grad_norm": 1.5992493629455566, |
|
"learning_rate": 9.230553214478469e-06, |
|
"loss": 0.3614, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.1294642857142858, |
|
"grad_norm": 1.5381258726119995, |
|
"learning_rate": 9.224246678513569e-06, |
|
"loss": 0.322, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.1339285714285714, |
|
"grad_norm": 1.6599974632263184, |
|
"learning_rate": 9.217916576100922e-06, |
|
"loss": 0.3098, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.1383928571428572, |
|
"grad_norm": 1.5781593322753906, |
|
"learning_rate": 9.211562942559408e-06, |
|
"loss": 0.3397, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.1428571428571428, |
|
"grad_norm": 1.5014089345932007, |
|
"learning_rate": 9.20518581333921e-06, |
|
"loss": 0.3195, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.1473214285714286, |
|
"grad_norm": 1.5395718812942505, |
|
"learning_rate": 9.1987852240216e-06, |
|
"loss": 0.3319, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.1517857142857142, |
|
"grad_norm": 1.4616154432296753, |
|
"learning_rate": 9.192361210318745e-06, |
|
"loss": 0.3141, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.15625, |
|
"grad_norm": 1.5390483140945435, |
|
"learning_rate": 9.185913808073513e-06, |
|
"loss": 0.3062, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.1607142857142858, |
|
"grad_norm": 1.6422654390335083, |
|
"learning_rate": 9.179443053259263e-06, |
|
"loss": 0.3551, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.1651785714285714, |
|
"grad_norm": 1.5900368690490723, |
|
"learning_rate": 9.172948981979654e-06, |
|
"loss": 0.3655, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.1696428571428572, |
|
"grad_norm": 1.582152009010315, |
|
"learning_rate": 9.166431630468438e-06, |
|
"loss": 0.3418, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.1741071428571428, |
|
"grad_norm": 1.4746930599212646, |
|
"learning_rate": 9.159891035089262e-06, |
|
"loss": 0.2786, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.1785714285714286, |
|
"grad_norm": 1.5723954439163208, |
|
"learning_rate": 9.153327232335455e-06, |
|
"loss": 0.3347, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.1830357142857142, |
|
"grad_norm": 1.374165415763855, |
|
"learning_rate": 9.146740258829844e-06, |
|
"loss": 0.2811, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.1875, |
|
"grad_norm": 1.6520895957946777, |
|
"learning_rate": 9.140130151324526e-06, |
|
"loss": 0.3786, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.1919642857142858, |
|
"grad_norm": 1.3879092931747437, |
|
"learning_rate": 9.13349694670068e-06, |
|
"loss": 0.3063, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.1964285714285714, |
|
"grad_norm": 1.3920384645462036, |
|
"learning_rate": 9.126840681968357e-06, |
|
"loss": 0.3191, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.2008928571428572, |
|
"grad_norm": 1.6087769269943237, |
|
"learning_rate": 9.120161394266266e-06, |
|
"loss": 0.3101, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.2053571428571428, |
|
"grad_norm": 1.465286135673523, |
|
"learning_rate": 9.113459120861579e-06, |
|
"loss": 0.2833, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.2098214285714286, |
|
"grad_norm": 1.681369423866272, |
|
"learning_rate": 9.106733899149715e-06, |
|
"loss": 0.3389, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.2142857142857142, |
|
"grad_norm": 1.5745911598205566, |
|
"learning_rate": 9.099985766654132e-06, |
|
"loss": 0.3352, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.21875, |
|
"grad_norm": 1.3523564338684082, |
|
"learning_rate": 9.093214761026121e-06, |
|
"loss": 0.2964, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.2232142857142858, |
|
"grad_norm": 1.2804710865020752, |
|
"learning_rate": 9.08642092004459e-06, |
|
"loss": 0.2992, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.2276785714285714, |
|
"grad_norm": 1.521187424659729, |
|
"learning_rate": 9.079604281615868e-06, |
|
"loss": 0.3212, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.2321428571428572, |
|
"grad_norm": 1.3694134950637817, |
|
"learning_rate": 9.072764883773464e-06, |
|
"loss": 0.3195, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.2366071428571428, |
|
"grad_norm": 1.4687600135803223, |
|
"learning_rate": 9.065902764677897e-06, |
|
"loss": 0.2878, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.2410714285714286, |
|
"grad_norm": 1.5896199941635132, |
|
"learning_rate": 9.059017962616435e-06, |
|
"loss": 0.3482, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.2455357142857142, |
|
"grad_norm": 1.6721631288528442, |
|
"learning_rate": 9.052110516002925e-06, |
|
"loss": 0.3629, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.703574299812317, |
|
"learning_rate": 9.04518046337755e-06, |
|
"loss": 0.3618, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.2544642857142856, |
|
"grad_norm": 1.704490065574646, |
|
"learning_rate": 9.038227843406628e-06, |
|
"loss": 0.3397, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.2589285714285714, |
|
"grad_norm": 1.508264422416687, |
|
"learning_rate": 9.031252694882386e-06, |
|
"loss": 0.3405, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.2633928571428572, |
|
"grad_norm": 1.4581125974655151, |
|
"learning_rate": 9.024255056722753e-06, |
|
"loss": 0.3227, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.2678571428571428, |
|
"grad_norm": 1.4935333728790283, |
|
"learning_rate": 9.017234967971143e-06, |
|
"loss": 0.3248, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.2723214285714286, |
|
"grad_norm": 1.5179234743118286, |
|
"learning_rate": 9.010192467796228e-06, |
|
"loss": 0.3257, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.2767857142857144, |
|
"grad_norm": 1.5567351579666138, |
|
"learning_rate": 9.003127595491723e-06, |
|
"loss": 0.3393, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.28125, |
|
"grad_norm": 1.6843305826187134, |
|
"learning_rate": 8.996040390476177e-06, |
|
"loss": 0.365, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.2857142857142856, |
|
"grad_norm": 1.558060646057129, |
|
"learning_rate": 8.988930892292738e-06, |
|
"loss": 0.3617, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.2901785714285714, |
|
"grad_norm": 1.5220246315002441, |
|
"learning_rate": 8.981799140608938e-06, |
|
"loss": 0.3399, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.2946428571428572, |
|
"grad_norm": 1.5785568952560425, |
|
"learning_rate": 8.974645175216478e-06, |
|
"loss": 0.3267, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.2991071428571428, |
|
"grad_norm": 1.5936439037322998, |
|
"learning_rate": 8.967469036030996e-06, |
|
"loss": 0.3337, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.3035714285714286, |
|
"grad_norm": 1.7652866840362549, |
|
"learning_rate": 8.960270763091853e-06, |
|
"loss": 0.3109, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.3080357142857144, |
|
"grad_norm": 1.3734923601150513, |
|
"learning_rate": 8.953050396561904e-06, |
|
"loss": 0.3026, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.3125, |
|
"grad_norm": 1.570879578590393, |
|
"learning_rate": 8.94580797672727e-06, |
|
"loss": 0.3305, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.3169642857142856, |
|
"grad_norm": 1.5227439403533936, |
|
"learning_rate": 8.938543543997129e-06, |
|
"loss": 0.2904, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.3214285714285714, |
|
"grad_norm": 1.756388783454895, |
|
"learning_rate": 8.931257138903474e-06, |
|
"loss": 0.3017, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.3258928571428572, |
|
"grad_norm": 1.714189052581787, |
|
"learning_rate": 8.923948802100891e-06, |
|
"loss": 0.3486, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.3303571428571428, |
|
"grad_norm": 1.7315165996551514, |
|
"learning_rate": 8.916618574366338e-06, |
|
"loss": 0.3542, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.3348214285714286, |
|
"grad_norm": 1.5031179189682007, |
|
"learning_rate": 8.909266496598917e-06, |
|
"loss": 0.3139, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.3392857142857144, |
|
"grad_norm": 1.3531993627548218, |
|
"learning_rate": 8.901892609819632e-06, |
|
"loss": 0.2723, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.34375, |
|
"grad_norm": 1.5571861267089844, |
|
"learning_rate": 8.894496955171182e-06, |
|
"loss": 0.36, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.3482142857142856, |
|
"grad_norm": 1.662018060684204, |
|
"learning_rate": 8.887079573917713e-06, |
|
"loss": 0.3305, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.3526785714285714, |
|
"grad_norm": 1.696288824081421, |
|
"learning_rate": 8.879640507444598e-06, |
|
"loss": 0.3286, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.3571428571428572, |
|
"grad_norm": 1.5394744873046875, |
|
"learning_rate": 8.872179797258202e-06, |
|
"loss": 0.3549, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.3616071428571428, |
|
"grad_norm": 1.6615936756134033, |
|
"learning_rate": 8.86469748498565e-06, |
|
"loss": 0.3198, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.3660714285714286, |
|
"grad_norm": 1.4944754838943481, |
|
"learning_rate": 8.8571936123746e-06, |
|
"loss": 0.3068, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.3705357142857144, |
|
"grad_norm": 1.4707030057907104, |
|
"learning_rate": 8.849668221293e-06, |
|
"loss": 0.3293, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"grad_norm": 1.4484243392944336, |
|
"learning_rate": 8.842121353728867e-06, |
|
"loss": 0.3217, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.3794642857142856, |
|
"grad_norm": 1.544541835784912, |
|
"learning_rate": 8.834553051790044e-06, |
|
"loss": 0.3514, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.3839285714285714, |
|
"grad_norm": 1.3725217580795288, |
|
"learning_rate": 8.826963357703964e-06, |
|
"loss": 0.2865, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.3883928571428572, |
|
"grad_norm": 1.586159110069275, |
|
"learning_rate": 8.819352313817424e-06, |
|
"loss": 0.3376, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 1.3928571428571428, |
|
"grad_norm": 1.4718273878097534, |
|
"learning_rate": 8.811719962596338e-06, |
|
"loss": 0.3243, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.3973214285714286, |
|
"grad_norm": 1.41029953956604, |
|
"learning_rate": 8.804066346625506e-06, |
|
"loss": 0.3142, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 1.4017857142857144, |
|
"grad_norm": 1.6441102027893066, |
|
"learning_rate": 8.796391508608372e-06, |
|
"loss": 0.362, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 1.40625, |
|
"grad_norm": 1.501712441444397, |
|
"learning_rate": 8.788695491366795e-06, |
|
"loss": 0.3175, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.4107142857142856, |
|
"grad_norm": 1.4669523239135742, |
|
"learning_rate": 8.780978337840796e-06, |
|
"loss": 0.3059, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 1.4151785714285714, |
|
"grad_norm": 1.505470871925354, |
|
"learning_rate": 8.773240091088335e-06, |
|
"loss": 0.3163, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 1.4196428571428572, |
|
"grad_norm": 1.5221257209777832, |
|
"learning_rate": 8.765480794285054e-06, |
|
"loss": 0.3329, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.4241071428571428, |
|
"grad_norm": 1.4438836574554443, |
|
"learning_rate": 8.757700490724046e-06, |
|
"loss": 0.3109, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 1.5612000226974487, |
|
"learning_rate": 8.749899223815618e-06, |
|
"loss": 0.3512, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.4330357142857144, |
|
"grad_norm": 1.447957158088684, |
|
"learning_rate": 8.742077037087032e-06, |
|
"loss": 0.3244, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.4375, |
|
"grad_norm": 1.4214884042739868, |
|
"learning_rate": 8.734233974182276e-06, |
|
"loss": 0.3149, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 1.4419642857142856, |
|
"grad_norm": 1.732344388961792, |
|
"learning_rate": 8.726370078861825e-06, |
|
"loss": 0.341, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 1.4464285714285714, |
|
"grad_norm": 1.5918134450912476, |
|
"learning_rate": 8.718485395002377e-06, |
|
"loss": 0.3636, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.4508928571428572, |
|
"grad_norm": 1.486258625984192, |
|
"learning_rate": 8.710579966596625e-06, |
|
"loss": 0.3428, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.4553571428571428, |
|
"grad_norm": 1.4833279848098755, |
|
"learning_rate": 8.702653837753005e-06, |
|
"loss": 0.3205, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 1.4598214285714286, |
|
"grad_norm": 1.620224118232727, |
|
"learning_rate": 8.694707052695459e-06, |
|
"loss": 0.3322, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 1.4642857142857144, |
|
"grad_norm": 1.5433918237686157, |
|
"learning_rate": 8.686739655763166e-06, |
|
"loss": 0.3492, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 1.46875, |
|
"grad_norm": 1.4457906484603882, |
|
"learning_rate": 8.678751691410323e-06, |
|
"loss": 0.3102, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 1.4732142857142856, |
|
"grad_norm": 1.4180353879928589, |
|
"learning_rate": 8.670743204205875e-06, |
|
"loss": 0.3432, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.4776785714285714, |
|
"grad_norm": 1.3930200338363647, |
|
"learning_rate": 8.662714238833278e-06, |
|
"loss": 0.294, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 1.4821428571428572, |
|
"grad_norm": 1.436591625213623, |
|
"learning_rate": 8.654664840090247e-06, |
|
"loss": 0.3666, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 1.4866071428571428, |
|
"grad_norm": 1.6145936250686646, |
|
"learning_rate": 8.6465950528885e-06, |
|
"loss": 0.323, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 1.4910714285714286, |
|
"grad_norm": 1.4172477722167969, |
|
"learning_rate": 8.638504922253518e-06, |
|
"loss": 0.3138, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 1.4955357142857144, |
|
"grad_norm": 1.4870661497116089, |
|
"learning_rate": 8.63039449332429e-06, |
|
"loss": 0.3337, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.487033724784851, |
|
"learning_rate": 8.62226381135305e-06, |
|
"loss": 0.3157, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.5044642857142856, |
|
"grad_norm": 1.5234925746917725, |
|
"learning_rate": 8.614112921705045e-06, |
|
"loss": 0.3067, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 1.5089285714285714, |
|
"grad_norm": 1.5859482288360596, |
|
"learning_rate": 8.605941869858265e-06, |
|
"loss": 0.3364, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 1.5133928571428572, |
|
"grad_norm": 1.5042221546173096, |
|
"learning_rate": 8.597750701403197e-06, |
|
"loss": 0.3187, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 1.5178571428571428, |
|
"grad_norm": 1.379291296005249, |
|
"learning_rate": 8.589539462042566e-06, |
|
"loss": 0.3073, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.5223214285714286, |
|
"grad_norm": 1.4755321741104126, |
|
"learning_rate": 8.581308197591088e-06, |
|
"loss": 0.309, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 1.5267857142857144, |
|
"grad_norm": 1.5573457479476929, |
|
"learning_rate": 8.573056953975208e-06, |
|
"loss": 0.3501, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 1.53125, |
|
"grad_norm": 1.4575328826904297, |
|
"learning_rate": 8.56478577723284e-06, |
|
"loss": 0.3452, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 1.5357142857142856, |
|
"grad_norm": 1.4823976755142212, |
|
"learning_rate": 8.556494713513123e-06, |
|
"loss": 0.3331, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 1.5401785714285714, |
|
"grad_norm": 1.4269355535507202, |
|
"learning_rate": 8.548183809076146e-06, |
|
"loss": 0.3107, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.5446428571428572, |
|
"grad_norm": 1.4220494031906128, |
|
"learning_rate": 8.539853110292708e-06, |
|
"loss": 0.3319, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 1.5491071428571428, |
|
"grad_norm": 1.5970823764801025, |
|
"learning_rate": 8.531502663644046e-06, |
|
"loss": 0.3548, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 1.5535714285714286, |
|
"grad_norm": 1.4499046802520752, |
|
"learning_rate": 8.523132515721586e-06, |
|
"loss": 0.3369, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.5580357142857144, |
|
"grad_norm": 1.3973332643508911, |
|
"learning_rate": 8.51474271322667e-06, |
|
"loss": 0.3464, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 1.7323225736618042, |
|
"learning_rate": 8.506333302970306e-06, |
|
"loss": 0.3218, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.5669642857142856, |
|
"grad_norm": 1.3297864198684692, |
|
"learning_rate": 8.497904331872909e-06, |
|
"loss": 0.301, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.5714285714285714, |
|
"grad_norm": 1.5161700248718262, |
|
"learning_rate": 8.489455846964027e-06, |
|
"loss": 0.3468, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.5758928571428572, |
|
"grad_norm": 1.4331910610198975, |
|
"learning_rate": 8.480987895382086e-06, |
|
"loss": 0.3613, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 1.5803571428571428, |
|
"grad_norm": 1.5763874053955078, |
|
"learning_rate": 8.472500524374129e-06, |
|
"loss": 0.357, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.5848214285714286, |
|
"grad_norm": 1.50728440284729, |
|
"learning_rate": 8.463993781295552e-06, |
|
"loss": 0.3252, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.5892857142857144, |
|
"grad_norm": 1.4221165180206299, |
|
"learning_rate": 8.45546771360983e-06, |
|
"loss": 0.3036, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 1.59375, |
|
"grad_norm": 1.430577278137207, |
|
"learning_rate": 8.44692236888827e-06, |
|
"loss": 0.3187, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.5982142857142856, |
|
"grad_norm": 1.316603422164917, |
|
"learning_rate": 8.43835779480973e-06, |
|
"loss": 0.3275, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 1.6026785714285714, |
|
"grad_norm": 1.6066330671310425, |
|
"learning_rate": 8.429774039160355e-06, |
|
"loss": 0.336, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 1.6071428571428572, |
|
"grad_norm": 1.6857646703720093, |
|
"learning_rate": 8.421171149833322e-06, |
|
"loss": 0.3346, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.6116071428571428, |
|
"grad_norm": 1.6161949634552002, |
|
"learning_rate": 8.412549174828558e-06, |
|
"loss": 0.3806, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 1.6160714285714286, |
|
"grad_norm": 1.5804469585418701, |
|
"learning_rate": 8.403908162252481e-06, |
|
"loss": 0.3332, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 1.6205357142857144, |
|
"grad_norm": 1.568248987197876, |
|
"learning_rate": 8.395248160317728e-06, |
|
"loss": 0.3175, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"grad_norm": 1.4972864389419556, |
|
"learning_rate": 8.386569217342893e-06, |
|
"loss": 0.3233, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.6294642857142856, |
|
"grad_norm": 1.5257365703582764, |
|
"learning_rate": 8.377871381752246e-06, |
|
"loss": 0.3273, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.6339285714285714, |
|
"grad_norm": 1.4747686386108398, |
|
"learning_rate": 8.369154702075466e-06, |
|
"loss": 0.3382, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.6383928571428572, |
|
"grad_norm": 1.5610405206680298, |
|
"learning_rate": 8.360419226947383e-06, |
|
"loss": 0.3203, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 1.6428571428571428, |
|
"grad_norm": 1.4387973546981812, |
|
"learning_rate": 8.351665005107686e-06, |
|
"loss": 0.3271, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.6473214285714286, |
|
"grad_norm": 1.5783442258834839, |
|
"learning_rate": 8.34289208540067e-06, |
|
"loss": 0.3576, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.6517857142857144, |
|
"grad_norm": 1.5295450687408447, |
|
"learning_rate": 8.334100516774946e-06, |
|
"loss": 0.3488, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.65625, |
|
"grad_norm": 1.5279836654663086, |
|
"learning_rate": 8.325290348283186e-06, |
|
"loss": 0.358, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.6607142857142856, |
|
"grad_norm": 1.5376560688018799, |
|
"learning_rate": 8.316461629081833e-06, |
|
"loss": 0.3025, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.6651785714285714, |
|
"grad_norm": 1.4480183124542236, |
|
"learning_rate": 8.307614408430839e-06, |
|
"loss": 0.3609, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.6696428571428572, |
|
"grad_norm": 1.5485166311264038, |
|
"learning_rate": 8.298748735693382e-06, |
|
"loss": 0.3405, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 1.6741071428571428, |
|
"grad_norm": 1.408075213432312, |
|
"learning_rate": 8.289864660335595e-06, |
|
"loss": 0.322, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.6785714285714286, |
|
"grad_norm": 1.5506187677383423, |
|
"learning_rate": 8.280962231926288e-06, |
|
"loss": 0.3385, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.6830357142857144, |
|
"grad_norm": 1.6020299196243286, |
|
"learning_rate": 8.27204150013667e-06, |
|
"loss": 0.3353, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 1.6875, |
|
"grad_norm": 1.4672306776046753, |
|
"learning_rate": 8.263102514740082e-06, |
|
"loss": 0.3086, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.6919642857142856, |
|
"grad_norm": 1.5637385845184326, |
|
"learning_rate": 8.2541453256117e-06, |
|
"loss": 0.3136, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 1.6964285714285714, |
|
"grad_norm": 1.446553111076355, |
|
"learning_rate": 8.245169982728276e-06, |
|
"loss": 0.3389, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.7008928571428572, |
|
"grad_norm": 1.4007227420806885, |
|
"learning_rate": 8.23617653616785e-06, |
|
"loss": 0.3095, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.7053571428571428, |
|
"grad_norm": 1.454406976699829, |
|
"learning_rate": 8.227165036109468e-06, |
|
"loss": 0.329, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 1.7098214285714286, |
|
"grad_norm": 1.7223042249679565, |
|
"learning_rate": 8.218135532832909e-06, |
|
"loss": 0.3662, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.7142857142857144, |
|
"grad_norm": 1.5587764978408813, |
|
"learning_rate": 8.209088076718398e-06, |
|
"loss": 0.3348, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.71875, |
|
"grad_norm": 1.602912187576294, |
|
"learning_rate": 8.20002271824633e-06, |
|
"loss": 0.3736, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.7232142857142856, |
|
"grad_norm": 1.5459792613983154, |
|
"learning_rate": 8.190939507996992e-06, |
|
"loss": 0.3559, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.7276785714285714, |
|
"grad_norm": 1.5487347841262817, |
|
"learning_rate": 8.181838496650266e-06, |
|
"loss": 0.3466, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.7321428571428572, |
|
"grad_norm": 1.5751219987869263, |
|
"learning_rate": 8.17271973498536e-06, |
|
"loss": 0.3541, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.7366071428571428, |
|
"grad_norm": 1.5197848081588745, |
|
"learning_rate": 8.163583273880519e-06, |
|
"loss": 0.3276, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.7410714285714286, |
|
"grad_norm": 1.4693080186843872, |
|
"learning_rate": 8.154429164312742e-06, |
|
"loss": 0.3051, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.7455357142857144, |
|
"grad_norm": 1.4455931186676025, |
|
"learning_rate": 8.145257457357502e-06, |
|
"loss": 0.328, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 1.5701878070831299, |
|
"learning_rate": 8.136068204188448e-06, |
|
"loss": 0.3301, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.7544642857142856, |
|
"grad_norm": 1.358162522315979, |
|
"learning_rate": 8.12686145607714e-06, |
|
"loss": 0.3074, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.7589285714285714, |
|
"grad_norm": 1.8741075992584229, |
|
"learning_rate": 8.11763726439274e-06, |
|
"loss": 0.3883, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.7633928571428572, |
|
"grad_norm": 1.5605037212371826, |
|
"learning_rate": 8.108395680601742e-06, |
|
"loss": 0.3213, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.7678571428571428, |
|
"grad_norm": 1.5065466165542603, |
|
"learning_rate": 8.099136756267682e-06, |
|
"loss": 0.3213, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.7723214285714286, |
|
"grad_norm": 1.3987689018249512, |
|
"learning_rate": 8.089860543050843e-06, |
|
"loss": 0.324, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.7767857142857144, |
|
"grad_norm": 1.6072906255722046, |
|
"learning_rate": 8.080567092707973e-06, |
|
"loss": 0.344, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.78125, |
|
"grad_norm": 1.4562393426895142, |
|
"learning_rate": 8.071256457091994e-06, |
|
"loss": 0.3173, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.7857142857142856, |
|
"grad_norm": 1.4061006307601929, |
|
"learning_rate": 8.06192868815172e-06, |
|
"loss": 0.321, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.7857142857142856, |
|
"eval_loss": 0.5984869003295898, |
|
"eval_runtime": 4.436, |
|
"eval_samples_per_second": 13.526, |
|
"eval_steps_per_second": 0.902, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.7901785714285714, |
|
"grad_norm": 1.521924614906311, |
|
"learning_rate": 8.05258383793155e-06, |
|
"loss": 0.3542, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 1.7946428571428572, |
|
"grad_norm": 1.4396589994430542, |
|
"learning_rate": 8.043221958571193e-06, |
|
"loss": 0.3303, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.7991071428571428, |
|
"grad_norm": 1.5028449296951294, |
|
"learning_rate": 8.033843102305376e-06, |
|
"loss": 0.3436, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 1.8035714285714286, |
|
"grad_norm": 1.3993929624557495, |
|
"learning_rate": 8.024447321463545e-06, |
|
"loss": 0.3207, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 1.8080357142857144, |
|
"grad_norm": 1.4850343465805054, |
|
"learning_rate": 8.015034668469576e-06, |
|
"loss": 0.3495, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.8125, |
|
"grad_norm": 1.5270286798477173, |
|
"learning_rate": 8.005605195841485e-06, |
|
"loss": 0.3224, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.8169642857142856, |
|
"grad_norm": 1.462645411491394, |
|
"learning_rate": 7.996158956191135e-06, |
|
"loss": 0.2767, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 1.8214285714285714, |
|
"grad_norm": 1.292880654335022, |
|
"learning_rate": 7.986696002223936e-06, |
|
"loss": 0.2946, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.8258928571428572, |
|
"grad_norm": 1.4748491048812866, |
|
"learning_rate": 7.97721638673856e-06, |
|
"loss": 0.3342, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 1.8303571428571428, |
|
"grad_norm": 1.4022036790847778, |
|
"learning_rate": 7.967720162626643e-06, |
|
"loss": 0.333, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.8348214285714286, |
|
"grad_norm": 1.479123830795288, |
|
"learning_rate": 7.958207382872486e-06, |
|
"loss": 0.3424, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.8392857142857144, |
|
"grad_norm": 1.455062747001648, |
|
"learning_rate": 7.94867810055276e-06, |
|
"loss": 0.304, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.84375, |
|
"grad_norm": 1.506971001625061, |
|
"learning_rate": 7.93913236883622e-06, |
|
"loss": 0.3409, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 1.8482142857142856, |
|
"grad_norm": 1.5958027839660645, |
|
"learning_rate": 7.929570240983393e-06, |
|
"loss": 0.3264, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.8526785714285714, |
|
"grad_norm": 1.3807499408721924, |
|
"learning_rate": 7.919991770346295e-06, |
|
"loss": 0.3436, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.8571428571428572, |
|
"grad_norm": 1.3814480304718018, |
|
"learning_rate": 7.910397010368122e-06, |
|
"loss": 0.3288, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.8616071428571428, |
|
"grad_norm": 1.412415862083435, |
|
"learning_rate": 7.900786014582957e-06, |
|
"loss": 0.353, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.8660714285714286, |
|
"grad_norm": 1.3772637844085693, |
|
"learning_rate": 7.891158836615472e-06, |
|
"loss": 0.2992, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.8705357142857144, |
|
"grad_norm": 1.425160527229309, |
|
"learning_rate": 7.881515530180629e-06, |
|
"loss": 0.3536, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 1.4070899486541748, |
|
"learning_rate": 7.871856149083377e-06, |
|
"loss": 0.3252, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.8794642857142856, |
|
"grad_norm": 1.3919873237609863, |
|
"learning_rate": 7.862180747218354e-06, |
|
"loss": 0.3378, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 1.8839285714285714, |
|
"grad_norm": 1.4947540760040283, |
|
"learning_rate": 7.852489378569588e-06, |
|
"loss": 0.3383, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.8883928571428572, |
|
"grad_norm": 1.500719428062439, |
|
"learning_rate": 7.84278209721019e-06, |
|
"loss": 0.3414, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.8928571428571428, |
|
"grad_norm": 1.202958583831787, |
|
"learning_rate": 7.83305895730206e-06, |
|
"loss": 0.2683, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.8973214285714286, |
|
"grad_norm": 1.518190860748291, |
|
"learning_rate": 7.823320013095578e-06, |
|
"loss": 0.3315, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.9017857142857144, |
|
"grad_norm": 1.435086965560913, |
|
"learning_rate": 7.81356531892931e-06, |
|
"loss": 0.3238, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.90625, |
|
"grad_norm": 1.4547661542892456, |
|
"learning_rate": 7.803794929229689e-06, |
|
"loss": 0.3171, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 1.9107142857142856, |
|
"grad_norm": 1.5060296058654785, |
|
"learning_rate": 7.794008898510731e-06, |
|
"loss": 0.348, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.9151785714285714, |
|
"grad_norm": 1.5503387451171875, |
|
"learning_rate": 7.784207281373716e-06, |
|
"loss": 0.3573, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.9196428571428572, |
|
"grad_norm": 1.4911384582519531, |
|
"learning_rate": 7.774390132506892e-06, |
|
"loss": 0.3234, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.9241071428571428, |
|
"grad_norm": 1.5123614072799683, |
|
"learning_rate": 7.764557506685162e-06, |
|
"loss": 0.3838, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 1.9285714285714286, |
|
"grad_norm": 1.529229760169983, |
|
"learning_rate": 7.754709458769787e-06, |
|
"loss": 0.3425, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.9330357142857144, |
|
"grad_norm": 1.4191179275512695, |
|
"learning_rate": 7.744846043708076e-06, |
|
"loss": 0.3175, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 1.9375, |
|
"grad_norm": 1.5373347997665405, |
|
"learning_rate": 7.734967316533074e-06, |
|
"loss": 0.3379, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.9419642857142856, |
|
"grad_norm": 1.562549114227295, |
|
"learning_rate": 7.725073332363265e-06, |
|
"loss": 0.3521, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.9464285714285714, |
|
"grad_norm": 1.4550033807754517, |
|
"learning_rate": 7.715164146402259e-06, |
|
"loss": 0.3314, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.9508928571428572, |
|
"grad_norm": 1.5194437503814697, |
|
"learning_rate": 7.705239813938486e-06, |
|
"loss": 0.3369, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 1.9553571428571428, |
|
"grad_norm": 1.331009030342102, |
|
"learning_rate": 7.69530039034488e-06, |
|
"loss": 0.3156, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.9598214285714286, |
|
"grad_norm": 1.4136159420013428, |
|
"learning_rate": 7.685345931078579e-06, |
|
"loss": 0.352, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 1.9642857142857144, |
|
"grad_norm": 1.4941082000732422, |
|
"learning_rate": 7.675376491680617e-06, |
|
"loss": 0.3487, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.96875, |
|
"grad_norm": 1.5224647521972656, |
|
"learning_rate": 7.665392127775605e-06, |
|
"loss": 0.3472, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.9732142857142856, |
|
"grad_norm": 1.389153242111206, |
|
"learning_rate": 7.65539289507143e-06, |
|
"loss": 0.3188, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.9776785714285714, |
|
"grad_norm": 1.3954931497573853, |
|
"learning_rate": 7.645378849358931e-06, |
|
"loss": 0.2805, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 1.9821428571428572, |
|
"grad_norm": 1.4153132438659668, |
|
"learning_rate": 7.635350046511609e-06, |
|
"loss": 0.3351, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.9866071428571428, |
|
"grad_norm": 1.4916757345199585, |
|
"learning_rate": 7.625306542485289e-06, |
|
"loss": 0.3388, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.9910714285714286, |
|
"grad_norm": 1.395013689994812, |
|
"learning_rate": 7.615248393317833e-06, |
|
"loss": 0.3422, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.9955357142857144, |
|
"grad_norm": 1.6745942831039429, |
|
"learning_rate": 7.605175655128809e-06, |
|
"loss": 0.3205, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.5079350471496582, |
|
"learning_rate": 7.595088384119186e-06, |
|
"loss": 0.3489, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 2.0044642857142856, |
|
"grad_norm": 1.3217769861221313, |
|
"learning_rate": 7.58498663657102e-06, |
|
"loss": 0.2103, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 2.0089285714285716, |
|
"grad_norm": 1.3270319700241089, |
|
"learning_rate": 7.57487046884714e-06, |
|
"loss": 0.2176, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.013392857142857, |
|
"grad_norm": 1.1652079820632935, |
|
"learning_rate": 7.5647399373908296e-06, |
|
"loss": 0.1737, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 2.017857142857143, |
|
"grad_norm": 1.2553536891937256, |
|
"learning_rate": 7.554595098725515e-06, |
|
"loss": 0.2092, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 2.0223214285714284, |
|
"grad_norm": 1.1758341789245605, |
|
"learning_rate": 7.544436009454454e-06, |
|
"loss": 0.1817, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 2.0267857142857144, |
|
"grad_norm": 1.359291434288025, |
|
"learning_rate": 7.534262726260413e-06, |
|
"loss": 0.2095, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 2.03125, |
|
"grad_norm": 1.4187179803848267, |
|
"learning_rate": 7.524075305905351e-06, |
|
"loss": 0.199, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.0357142857142856, |
|
"grad_norm": 1.6442433595657349, |
|
"learning_rate": 7.513873805230111e-06, |
|
"loss": 0.2029, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 2.0401785714285716, |
|
"grad_norm": 1.8652024269104004, |
|
"learning_rate": 7.5036582811540935e-06, |
|
"loss": 0.2194, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 2.044642857142857, |
|
"grad_norm": 1.7254689931869507, |
|
"learning_rate": 7.493428790674943e-06, |
|
"loss": 0.1686, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 2.049107142857143, |
|
"grad_norm": 1.8289271593093872, |
|
"learning_rate": 7.483185390868232e-06, |
|
"loss": 0.1941, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 2.0535714285714284, |
|
"grad_norm": 1.7519882917404175, |
|
"learning_rate": 7.472928138887134e-06, |
|
"loss": 0.1833, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.0580357142857144, |
|
"grad_norm": 1.9071825742721558, |
|
"learning_rate": 7.462657091962122e-06, |
|
"loss": 0.202, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 2.0625, |
|
"grad_norm": 1.690094232559204, |
|
"learning_rate": 7.452372307400626e-06, |
|
"loss": 0.1903, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 2.0669642857142856, |
|
"grad_norm": 1.6332119703292847, |
|
"learning_rate": 7.442073842586733e-06, |
|
"loss": 0.1868, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 2.0714285714285716, |
|
"grad_norm": 1.5547043085098267, |
|
"learning_rate": 7.43176175498086e-06, |
|
"loss": 0.17, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 2.075892857142857, |
|
"grad_norm": 1.6427596807479858, |
|
"learning_rate": 7.421436102119427e-06, |
|
"loss": 0.2107, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.080357142857143, |
|
"grad_norm": 1.4404045343399048, |
|
"learning_rate": 7.411096941614543e-06, |
|
"loss": 0.1707, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 2.0848214285714284, |
|
"grad_norm": 1.3924144506454468, |
|
"learning_rate": 7.400744331153684e-06, |
|
"loss": 0.1847, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 2.0892857142857144, |
|
"grad_norm": 1.2086669206619263, |
|
"learning_rate": 7.390378328499372e-06, |
|
"loss": 0.1564, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 2.09375, |
|
"grad_norm": 1.4519152641296387, |
|
"learning_rate": 7.3799989914888506e-06, |
|
"loss": 0.1997, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 2.0982142857142856, |
|
"grad_norm": 1.478129506111145, |
|
"learning_rate": 7.3696063780337566e-06, |
|
"loss": 0.1925, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.1026785714285716, |
|
"grad_norm": 1.650312900543213, |
|
"learning_rate": 7.359200546119813e-06, |
|
"loss": 0.1883, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 2.107142857142857, |
|
"grad_norm": 1.4365777969360352, |
|
"learning_rate": 7.3487815538064865e-06, |
|
"loss": 0.1868, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 2.111607142857143, |
|
"grad_norm": 1.342756748199463, |
|
"learning_rate": 7.338349459226678e-06, |
|
"loss": 0.1692, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 2.1160714285714284, |
|
"grad_norm": 1.5859627723693848, |
|
"learning_rate": 7.327904320586387e-06, |
|
"loss": 0.201, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 2.1205357142857144, |
|
"grad_norm": 1.499635100364685, |
|
"learning_rate": 7.3174461961644e-06, |
|
"loss": 0.1817, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"grad_norm": 2.32686710357666, |
|
"learning_rate": 7.3069751443119505e-06, |
|
"loss": 0.1704, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 2.1294642857142856, |
|
"grad_norm": 1.412901759147644, |
|
"learning_rate": 7.296491223452407e-06, |
|
"loss": 0.1828, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 2.1339285714285716, |
|
"grad_norm": 1.5171948671340942, |
|
"learning_rate": 7.285994492080934e-06, |
|
"loss": 0.1953, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 2.138392857142857, |
|
"grad_norm": 1.5599428415298462, |
|
"learning_rate": 7.275485008764183e-06, |
|
"loss": 0.1923, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 2.142857142857143, |
|
"grad_norm": 1.5545156002044678, |
|
"learning_rate": 7.2649628321399415e-06, |
|
"loss": 0.19, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.1473214285714284, |
|
"grad_norm": 1.479979157447815, |
|
"learning_rate": 7.254428020916829e-06, |
|
"loss": 0.1862, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 2.1517857142857144, |
|
"grad_norm": 1.4323021173477173, |
|
"learning_rate": 7.243880633873957e-06, |
|
"loss": 0.1968, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 2.15625, |
|
"grad_norm": 1.5759376287460327, |
|
"learning_rate": 7.2333207298606075e-06, |
|
"loss": 0.1912, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 2.1607142857142856, |
|
"grad_norm": 1.4067718982696533, |
|
"learning_rate": 7.222748367795892e-06, |
|
"loss": 0.1968, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 2.1651785714285716, |
|
"grad_norm": 1.4289413690567017, |
|
"learning_rate": 7.212163606668442e-06, |
|
"loss": 0.181, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 2.169642857142857, |
|
"grad_norm": 1.386183738708496, |
|
"learning_rate": 7.201566505536065e-06, |
|
"loss": 0.1859, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 2.174107142857143, |
|
"grad_norm": 1.5737900733947754, |
|
"learning_rate": 7.190957123525417e-06, |
|
"loss": 0.1975, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 2.1785714285714284, |
|
"grad_norm": 1.4346133470535278, |
|
"learning_rate": 7.180335519831685e-06, |
|
"loss": 0.1988, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 2.1830357142857144, |
|
"grad_norm": 1.6407471895217896, |
|
"learning_rate": 7.169701753718232e-06, |
|
"loss": 0.1942, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"grad_norm": 1.4272671937942505, |
|
"learning_rate": 7.159055884516297e-06, |
|
"loss": 0.1709, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.1919642857142856, |
|
"grad_norm": 1.4495371580123901, |
|
"learning_rate": 7.148397971624636e-06, |
|
"loss": 0.1857, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 2.1964285714285716, |
|
"grad_norm": 1.474555253982544, |
|
"learning_rate": 7.137728074509211e-06, |
|
"loss": 0.1994, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 2.200892857142857, |
|
"grad_norm": 1.3996185064315796, |
|
"learning_rate": 7.127046252702847e-06, |
|
"loss": 0.1976, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 2.205357142857143, |
|
"grad_norm": 1.4525631666183472, |
|
"learning_rate": 7.116352565804904e-06, |
|
"loss": 0.1781, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 2.2098214285714284, |
|
"grad_norm": 1.3915441036224365, |
|
"learning_rate": 7.105647073480939e-06, |
|
"loss": 0.1934, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 2.2142857142857144, |
|
"grad_norm": 1.548956274986267, |
|
"learning_rate": 7.0949298354623855e-06, |
|
"loss": 0.1956, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 2.21875, |
|
"grad_norm": 1.67276930809021, |
|
"learning_rate": 7.084200911546205e-06, |
|
"loss": 0.2031, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 2.2232142857142856, |
|
"grad_norm": 1.4230951070785522, |
|
"learning_rate": 7.073460361594565e-06, |
|
"loss": 0.1747, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 2.2276785714285716, |
|
"grad_norm": 1.3589422702789307, |
|
"learning_rate": 7.0627082455344984e-06, |
|
"loss": 0.1697, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 2.232142857142857, |
|
"grad_norm": 1.5158501863479614, |
|
"learning_rate": 7.0519446233575715e-06, |
|
"loss": 0.1931, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.236607142857143, |
|
"grad_norm": 1.584936499595642, |
|
"learning_rate": 7.041169555119552e-06, |
|
"loss": 0.1835, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 2.2410714285714284, |
|
"grad_norm": 1.75810968875885, |
|
"learning_rate": 7.030383100940068e-06, |
|
"loss": 0.2101, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 2.2455357142857144, |
|
"grad_norm": 1.5574495792388916, |
|
"learning_rate": 7.019585321002276e-06, |
|
"loss": 0.1967, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 1.4675954580307007, |
|
"learning_rate": 7.008776275552522e-06, |
|
"loss": 0.1835, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 2.2544642857142856, |
|
"grad_norm": 1.5428667068481445, |
|
"learning_rate": 6.997956024900014e-06, |
|
"loss": 0.1758, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 2.2589285714285716, |
|
"grad_norm": 1.7386614084243774, |
|
"learning_rate": 6.9871246294164775e-06, |
|
"loss": 0.2004, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 2.263392857142857, |
|
"grad_norm": 1.6915313005447388, |
|
"learning_rate": 6.9762821495358194e-06, |
|
"loss": 0.2123, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 2.267857142857143, |
|
"grad_norm": 1.512141227722168, |
|
"learning_rate": 6.965428645753792e-06, |
|
"loss": 0.1849, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 2.2723214285714284, |
|
"grad_norm": 1.3899809122085571, |
|
"learning_rate": 6.954564178627655e-06, |
|
"loss": 0.1755, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 2.2767857142857144, |
|
"grad_norm": 1.446179747581482, |
|
"learning_rate": 6.943688808775843e-06, |
|
"loss": 0.2002, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.28125, |
|
"grad_norm": 1.4694817066192627, |
|
"learning_rate": 6.9328025968776155e-06, |
|
"loss": 0.1733, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 2.2857142857142856, |
|
"grad_norm": 1.4220669269561768, |
|
"learning_rate": 6.921905603672733e-06, |
|
"loss": 0.1865, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 2.2901785714285716, |
|
"grad_norm": 1.5075308084487915, |
|
"learning_rate": 6.910997889961098e-06, |
|
"loss": 0.2068, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 2.294642857142857, |
|
"grad_norm": 1.433173418045044, |
|
"learning_rate": 6.900079516602445e-06, |
|
"loss": 0.1646, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 2.299107142857143, |
|
"grad_norm": 1.5466556549072266, |
|
"learning_rate": 6.889150544515972e-06, |
|
"loss": 0.1876, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 2.3035714285714284, |
|
"grad_norm": 1.5720268487930298, |
|
"learning_rate": 6.8782110346800155e-06, |
|
"loss": 0.1895, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 2.3080357142857144, |
|
"grad_norm": 1.5228114128112793, |
|
"learning_rate": 6.867261048131712e-06, |
|
"loss": 0.1775, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 2.3125, |
|
"grad_norm": 1.5427687168121338, |
|
"learning_rate": 6.856300645966645e-06, |
|
"loss": 0.1949, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 2.3169642857142856, |
|
"grad_norm": 1.4784200191497803, |
|
"learning_rate": 6.845329889338519e-06, |
|
"loss": 0.1972, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 2.3214285714285716, |
|
"grad_norm": 1.5751314163208008, |
|
"learning_rate": 6.834348839458806e-06, |
|
"loss": 0.192, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.325892857142857, |
|
"grad_norm": 1.4170998334884644, |
|
"learning_rate": 6.823357557596416e-06, |
|
"loss": 0.1739, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 2.330357142857143, |
|
"grad_norm": 1.5813453197479248, |
|
"learning_rate": 6.81235610507734e-06, |
|
"loss": 0.1973, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 2.3348214285714284, |
|
"grad_norm": 1.4130178689956665, |
|
"learning_rate": 6.801344543284324e-06, |
|
"loss": 0.1784, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 2.3392857142857144, |
|
"grad_norm": 1.503487229347229, |
|
"learning_rate": 6.790322933656515e-06, |
|
"loss": 0.1758, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 2.34375, |
|
"grad_norm": 1.5043838024139404, |
|
"learning_rate": 6.779291337689122e-06, |
|
"loss": 0.1931, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 2.3482142857142856, |
|
"grad_norm": 1.5936574935913086, |
|
"learning_rate": 6.768249816933074e-06, |
|
"loss": 0.2102, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 2.3526785714285716, |
|
"grad_norm": 1.39701247215271, |
|
"learning_rate": 6.757198432994674e-06, |
|
"loss": 0.1888, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 2.357142857142857, |
|
"grad_norm": 1.4147759675979614, |
|
"learning_rate": 6.7461372475352585e-06, |
|
"loss": 0.1728, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 2.361607142857143, |
|
"grad_norm": 1.5155245065689087, |
|
"learning_rate": 6.73506632227085e-06, |
|
"loss": 0.2029, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 2.3660714285714284, |
|
"grad_norm": 1.3971562385559082, |
|
"learning_rate": 6.723985718971818e-06, |
|
"loss": 0.1712, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.3705357142857144, |
|
"grad_norm": 1.4848732948303223, |
|
"learning_rate": 6.712895499462527e-06, |
|
"loss": 0.1778, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"grad_norm": 1.3537763357162476, |
|
"learning_rate": 6.701795725620995e-06, |
|
"loss": 0.1807, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 2.3794642857142856, |
|
"grad_norm": 1.278257131576538, |
|
"learning_rate": 6.69068645937855e-06, |
|
"loss": 0.1739, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 2.3839285714285716, |
|
"grad_norm": 1.4302947521209717, |
|
"learning_rate": 6.6795677627194835e-06, |
|
"loss": 0.1757, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 2.388392857142857, |
|
"grad_norm": 1.5175752639770508, |
|
"learning_rate": 6.668439697680704e-06, |
|
"loss": 0.1744, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 2.392857142857143, |
|
"grad_norm": 1.582108974456787, |
|
"learning_rate": 6.65730232635139e-06, |
|
"loss": 0.2061, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 2.3973214285714284, |
|
"grad_norm": 1.3695740699768066, |
|
"learning_rate": 6.6461557108726435e-06, |
|
"loss": 0.1861, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 2.4017857142857144, |
|
"grad_norm": 1.4187901020050049, |
|
"learning_rate": 6.634999913437148e-06, |
|
"loss": 0.1646, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 2.40625, |
|
"grad_norm": 1.5142133235931396, |
|
"learning_rate": 6.623834996288816e-06, |
|
"loss": 0.1878, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 2.4107142857142856, |
|
"grad_norm": 1.4513888359069824, |
|
"learning_rate": 6.6126610217224405e-06, |
|
"loss": 0.2083, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.4151785714285716, |
|
"grad_norm": 1.6881784200668335, |
|
"learning_rate": 6.601478052083356e-06, |
|
"loss": 0.215, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 2.419642857142857, |
|
"grad_norm": 1.5719410181045532, |
|
"learning_rate": 6.59028614976708e-06, |
|
"loss": 0.2007, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 2.424107142857143, |
|
"grad_norm": 1.5767160654067993, |
|
"learning_rate": 6.579085377218973e-06, |
|
"loss": 0.1993, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 2.4285714285714284, |
|
"grad_norm": 1.4855055809020996, |
|
"learning_rate": 6.567875796933888e-06, |
|
"loss": 0.1962, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 2.4330357142857144, |
|
"grad_norm": 1.4299588203430176, |
|
"learning_rate": 6.556657471455817e-06, |
|
"loss": 0.1904, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 2.4375, |
|
"grad_norm": 1.4422001838684082, |
|
"learning_rate": 6.54543046337755e-06, |
|
"loss": 0.211, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 2.4419642857142856, |
|
"grad_norm": 1.6163188219070435, |
|
"learning_rate": 6.534194835340321e-06, |
|
"loss": 0.2123, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 2.4464285714285716, |
|
"grad_norm": 1.485174298286438, |
|
"learning_rate": 6.522950650033454e-06, |
|
"loss": 0.1834, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 2.450892857142857, |
|
"grad_norm": 1.5538384914398193, |
|
"learning_rate": 6.511697970194024e-06, |
|
"loss": 0.1982, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 2.455357142857143, |
|
"grad_norm": 1.3437016010284424, |
|
"learning_rate": 6.500436858606501e-06, |
|
"loss": 0.1809, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.4598214285714284, |
|
"grad_norm": 1.5390422344207764, |
|
"learning_rate": 6.4891673781023975e-06, |
|
"loss": 0.1861, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 2.4642857142857144, |
|
"grad_norm": 1.5996623039245605, |
|
"learning_rate": 6.477889591559926e-06, |
|
"loss": 0.2132, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 2.46875, |
|
"grad_norm": 1.3555296659469604, |
|
"learning_rate": 6.466603561903633e-06, |
|
"loss": 0.1709, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 2.4732142857142856, |
|
"grad_norm": 1.4055936336517334, |
|
"learning_rate": 6.455309352104065e-06, |
|
"loss": 0.1931, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 2.4776785714285716, |
|
"grad_norm": 1.490340232849121, |
|
"learning_rate": 6.444007025177407e-06, |
|
"loss": 0.1923, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 2.482142857142857, |
|
"grad_norm": 1.451197862625122, |
|
"learning_rate": 6.4326966441851355e-06, |
|
"loss": 0.1822, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 2.486607142857143, |
|
"grad_norm": 1.33744215965271, |
|
"learning_rate": 6.4213782722336625e-06, |
|
"loss": 0.1557, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 2.4910714285714284, |
|
"grad_norm": 1.7766042947769165, |
|
"learning_rate": 6.4100519724739875e-06, |
|
"loss": 0.2113, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 2.4955357142857144, |
|
"grad_norm": 1.5189027786254883, |
|
"learning_rate": 6.3987178081013446e-06, |
|
"loss": 0.1917, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.4922457933425903, |
|
"learning_rate": 6.387375842354843e-06, |
|
"loss": 0.1928, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.5044642857142856, |
|
"grad_norm": 1.5204309225082397, |
|
"learning_rate": 6.376026138517125e-06, |
|
"loss": 0.1852, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 2.508928571428571, |
|
"grad_norm": 1.5281089544296265, |
|
"learning_rate": 6.364668759914005e-06, |
|
"loss": 0.1955, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 2.513392857142857, |
|
"grad_norm": 1.4325025081634521, |
|
"learning_rate": 6.353303769914121e-06, |
|
"loss": 0.2076, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 2.517857142857143, |
|
"grad_norm": 1.4212669134140015, |
|
"learning_rate": 6.341931231928577e-06, |
|
"loss": 0.1674, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 2.522321428571429, |
|
"grad_norm": 1.6153303384780884, |
|
"learning_rate": 6.330551209410593e-06, |
|
"loss": 0.1878, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 2.5267857142857144, |
|
"grad_norm": 1.956329345703125, |
|
"learning_rate": 6.319163765855146e-06, |
|
"loss": 0.2096, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 2.53125, |
|
"grad_norm": 1.5983258485794067, |
|
"learning_rate": 6.307768964798623e-06, |
|
"loss": 0.1859, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 2.5357142857142856, |
|
"grad_norm": 1.6349304914474487, |
|
"learning_rate": 6.296366869818458e-06, |
|
"loss": 0.2049, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 2.540178571428571, |
|
"grad_norm": 1.6352767944335938, |
|
"learning_rate": 6.284957544532783e-06, |
|
"loss": 0.2139, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 2.544642857142857, |
|
"grad_norm": 1.6097818613052368, |
|
"learning_rate": 6.273541052600074e-06, |
|
"loss": 0.2227, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.549107142857143, |
|
"grad_norm": 1.4204213619232178, |
|
"learning_rate": 6.2621174577187895e-06, |
|
"loss": 0.1626, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 2.553571428571429, |
|
"grad_norm": 1.5108147859573364, |
|
"learning_rate": 6.250686823627022e-06, |
|
"loss": 0.1906, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 2.5580357142857144, |
|
"grad_norm": 1.318894863128662, |
|
"learning_rate": 6.239249214102139e-06, |
|
"loss": 0.1644, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 2.5625, |
|
"grad_norm": 1.4823791980743408, |
|
"learning_rate": 6.2278046929604265e-06, |
|
"loss": 0.1953, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 2.5669642857142856, |
|
"grad_norm": 1.440727949142456, |
|
"learning_rate": 6.216353324056732e-06, |
|
"loss": 0.182, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 2.571428571428571, |
|
"grad_norm": 1.3238186836242676, |
|
"learning_rate": 6.204895171284115e-06, |
|
"loss": 0.1706, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 2.575892857142857, |
|
"grad_norm": 1.4335119724273682, |
|
"learning_rate": 6.193430298573481e-06, |
|
"loss": 0.2052, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 2.580357142857143, |
|
"grad_norm": 1.4563027620315552, |
|
"learning_rate": 6.181958769893234e-06, |
|
"loss": 0.2076, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 2.584821428571429, |
|
"grad_norm": 1.4558268785476685, |
|
"learning_rate": 6.17048064924891e-06, |
|
"loss": 0.2074, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 2.5892857142857144, |
|
"grad_norm": 1.4916455745697021, |
|
"learning_rate": 6.158996000682829e-06, |
|
"loss": 0.1868, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.59375, |
|
"grad_norm": 1.5160123109817505, |
|
"learning_rate": 6.147504888273736e-06, |
|
"loss": 0.1952, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 2.5982142857142856, |
|
"grad_norm": 1.518829107284546, |
|
"learning_rate": 6.136007376136429e-06, |
|
"loss": 0.2029, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 2.602678571428571, |
|
"grad_norm": 1.2285252809524536, |
|
"learning_rate": 6.124503528421429e-06, |
|
"loss": 0.1513, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 2.607142857142857, |
|
"grad_norm": 1.465366244316101, |
|
"learning_rate": 6.112993409314594e-06, |
|
"loss": 0.183, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 2.611607142857143, |
|
"grad_norm": 1.3672759532928467, |
|
"learning_rate": 6.101477083036783e-06, |
|
"loss": 0.1763, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 2.616071428571429, |
|
"grad_norm": 1.5228221416473389, |
|
"learning_rate": 6.0899546138434785e-06, |
|
"loss": 0.2012, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 2.6205357142857144, |
|
"grad_norm": 1.5724462270736694, |
|
"learning_rate": 6.0784260660244475e-06, |
|
"loss": 0.1689, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"grad_norm": 1.5473616123199463, |
|
"learning_rate": 6.066891503903363e-06, |
|
"loss": 0.1887, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 2.6294642857142856, |
|
"grad_norm": 1.4898332357406616, |
|
"learning_rate": 6.0553509918374635e-06, |
|
"loss": 0.1752, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 2.633928571428571, |
|
"grad_norm": 1.443292498588562, |
|
"learning_rate": 6.0438045942171775e-06, |
|
"loss": 0.1846, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.638392857142857, |
|
"grad_norm": 1.4720582962036133, |
|
"learning_rate": 6.032252375465778e-06, |
|
"loss": 0.1795, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 2.642857142857143, |
|
"grad_norm": 1.4213638305664062, |
|
"learning_rate": 6.020694400039017e-06, |
|
"loss": 0.1745, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 2.647321428571429, |
|
"grad_norm": 1.5590949058532715, |
|
"learning_rate": 6.009130732424758e-06, |
|
"loss": 0.1866, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 2.6517857142857144, |
|
"grad_norm": 1.6477134227752686, |
|
"learning_rate": 5.997561437142636e-06, |
|
"loss": 0.2065, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"grad_norm": 12.8923921585083, |
|
"learning_rate": 5.985986578743676e-06, |
|
"loss": 0.1789, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 2.6607142857142856, |
|
"grad_norm": 1.4159936904907227, |
|
"learning_rate": 5.974406221809949e-06, |
|
"loss": 0.1875, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 2.665178571428571, |
|
"grad_norm": 1.6385328769683838, |
|
"learning_rate": 5.962820430954198e-06, |
|
"loss": 0.2118, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 2.669642857142857, |
|
"grad_norm": 1.4674583673477173, |
|
"learning_rate": 5.951229270819494e-06, |
|
"loss": 0.1992, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 2.674107142857143, |
|
"grad_norm": 1.3399938344955444, |
|
"learning_rate": 5.9396328060788576e-06, |
|
"loss": 0.1903, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 2.678571428571429, |
|
"grad_norm": 1.516242265701294, |
|
"learning_rate": 5.928031101434908e-06, |
|
"loss": 0.1835, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.678571428571429, |
|
"eval_loss": 0.6832678914070129, |
|
"eval_runtime": 4.5972, |
|
"eval_samples_per_second": 13.051, |
|
"eval_steps_per_second": 0.87, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.6830357142857144, |
|
"grad_norm": 1.591635823249817, |
|
"learning_rate": 5.916424221619507e-06, |
|
"loss": 0.2111, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 2.6875, |
|
"grad_norm": 1.3842867612838745, |
|
"learning_rate": 5.904812231393383e-06, |
|
"loss": 0.1948, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 2.6919642857142856, |
|
"grad_norm": 1.2690483331680298, |
|
"learning_rate": 5.893195195545784e-06, |
|
"loss": 0.1747, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 2.696428571428571, |
|
"grad_norm": 1.5160295963287354, |
|
"learning_rate": 5.8815731788941064e-06, |
|
"loss": 0.185, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 2.700892857142857, |
|
"grad_norm": 1.5125572681427002, |
|
"learning_rate": 5.86994624628354e-06, |
|
"loss": 0.1879, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 2.705357142857143, |
|
"grad_norm": 1.4671415090560913, |
|
"learning_rate": 5.858314462586697e-06, |
|
"loss": 0.1826, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 2.709821428571429, |
|
"grad_norm": 1.5814452171325684, |
|
"learning_rate": 5.846677892703268e-06, |
|
"loss": 0.2095, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 2.7142857142857144, |
|
"grad_norm": 1.3826875686645508, |
|
"learning_rate": 5.835036601559634e-06, |
|
"loss": 0.1791, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 2.71875, |
|
"grad_norm": 1.4754812717437744, |
|
"learning_rate": 5.82339065410853e-06, |
|
"loss": 0.1866, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 2.7232142857142856, |
|
"grad_norm": 1.541463017463684, |
|
"learning_rate": 5.811740115328665e-06, |
|
"loss": 0.1812, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.727678571428571, |
|
"grad_norm": 1.5529158115386963, |
|
"learning_rate": 5.800085050224367e-06, |
|
"loss": 0.198, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 2.732142857142857, |
|
"grad_norm": 1.5426830053329468, |
|
"learning_rate": 5.7884255238252175e-06, |
|
"loss": 0.1981, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 2.736607142857143, |
|
"grad_norm": 1.610823154449463, |
|
"learning_rate": 5.776761601185692e-06, |
|
"loss": 0.1886, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 2.741071428571429, |
|
"grad_norm": 1.5132354497909546, |
|
"learning_rate": 5.765093347384793e-06, |
|
"loss": 0.1957, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 2.7455357142857144, |
|
"grad_norm": 1.4942512512207031, |
|
"learning_rate": 5.753420827525691e-06, |
|
"loss": 0.2006, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 1.5573617219924927, |
|
"learning_rate": 5.741744106735354e-06, |
|
"loss": 0.1947, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 2.7544642857142856, |
|
"grad_norm": 1.4934524297714233, |
|
"learning_rate": 5.730063250164196e-06, |
|
"loss": 0.177, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 2.758928571428571, |
|
"grad_norm": 1.6269088983535767, |
|
"learning_rate": 5.718378322985702e-06, |
|
"loss": 0.1986, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 2.763392857142857, |
|
"grad_norm": 1.3660807609558105, |
|
"learning_rate": 5.70668939039607e-06, |
|
"loss": 0.1755, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 2.767857142857143, |
|
"grad_norm": 1.5547926425933838, |
|
"learning_rate": 5.694996517613847e-06, |
|
"loss": 0.2123, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.772321428571429, |
|
"grad_norm": 1.579438328742981, |
|
"learning_rate": 5.683299769879562e-06, |
|
"loss": 0.1965, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 2.7767857142857144, |
|
"grad_norm": 1.6027774810791016, |
|
"learning_rate": 5.6715992124553685e-06, |
|
"loss": 0.1928, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 2.78125, |
|
"grad_norm": 1.556585669517517, |
|
"learning_rate": 5.6598949106246734e-06, |
|
"loss": 0.203, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 2.7857142857142856, |
|
"grad_norm": 1.7546970844268799, |
|
"learning_rate": 5.648186929691776e-06, |
|
"loss": 0.2128, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 2.790178571428571, |
|
"grad_norm": 1.5303380489349365, |
|
"learning_rate": 5.6364753349815035e-06, |
|
"loss": 0.1956, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 2.794642857142857, |
|
"grad_norm": 1.4398573637008667, |
|
"learning_rate": 5.624760191838845e-06, |
|
"loss": 0.1818, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 2.799107142857143, |
|
"grad_norm": 1.4977809190750122, |
|
"learning_rate": 5.61304156562859e-06, |
|
"loss": 0.2104, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 2.803571428571429, |
|
"grad_norm": 1.465651512145996, |
|
"learning_rate": 5.60131952173496e-06, |
|
"loss": 0.1905, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 2.8080357142857144, |
|
"grad_norm": 1.5513821840286255, |
|
"learning_rate": 5.589594125561246e-06, |
|
"loss": 0.201, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"grad_norm": 1.4582335948944092, |
|
"learning_rate": 5.577865442529447e-06, |
|
"loss": 0.2003, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.8169642857142856, |
|
"grad_norm": 1.513297438621521, |
|
"learning_rate": 5.566133538079893e-06, |
|
"loss": 0.1771, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 2.821428571428571, |
|
"grad_norm": 1.5092986822128296, |
|
"learning_rate": 5.554398477670895e-06, |
|
"loss": 0.185, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 2.825892857142857, |
|
"grad_norm": 1.3965249061584473, |
|
"learning_rate": 5.54266032677837e-06, |
|
"loss": 0.1931, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 2.830357142857143, |
|
"grad_norm": 1.5133389234542847, |
|
"learning_rate": 5.53091915089548e-06, |
|
"loss": 0.1805, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 2.834821428571429, |
|
"grad_norm": 1.5035574436187744, |
|
"learning_rate": 5.5191750155322595e-06, |
|
"loss": 0.1936, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 2.8392857142857144, |
|
"grad_norm": 1.4027115106582642, |
|
"learning_rate": 5.507427986215265e-06, |
|
"loss": 0.188, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 2.84375, |
|
"grad_norm": 1.6537582874298096, |
|
"learning_rate": 5.49567812848719e-06, |
|
"loss": 0.1989, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 2.8482142857142856, |
|
"grad_norm": 1.6560035943984985, |
|
"learning_rate": 5.483925507906514e-06, |
|
"loss": 0.2003, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 2.852678571428571, |
|
"grad_norm": 1.5221431255340576, |
|
"learning_rate": 5.4721701900471335e-06, |
|
"loss": 0.1898, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 1.5042264461517334, |
|
"learning_rate": 5.460412240497993e-06, |
|
"loss": 0.1964, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.861607142857143, |
|
"grad_norm": 1.5381810665130615, |
|
"learning_rate": 5.448651724862716e-06, |
|
"loss": 0.185, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 2.866071428571429, |
|
"grad_norm": 1.4257649183273315, |
|
"learning_rate": 5.436888708759253e-06, |
|
"loss": 0.189, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 2.8705357142857144, |
|
"grad_norm": 1.5205702781677246, |
|
"learning_rate": 5.425123257819494e-06, |
|
"loss": 0.2022, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"grad_norm": 1.4278076887130737, |
|
"learning_rate": 5.413355437688926e-06, |
|
"loss": 0.1724, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 2.8794642857142856, |
|
"grad_norm": 1.529142141342163, |
|
"learning_rate": 5.401585314026248e-06, |
|
"loss": 0.1946, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 2.883928571428571, |
|
"grad_norm": 1.5609850883483887, |
|
"learning_rate": 5.3898129525030105e-06, |
|
"loss": 0.2121, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 2.888392857142857, |
|
"grad_norm": 1.690317153930664, |
|
"learning_rate": 5.378038418803256e-06, |
|
"loss": 0.2023, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 2.892857142857143, |
|
"grad_norm": 1.5702646970748901, |
|
"learning_rate": 5.366261778623143e-06, |
|
"loss": 0.2036, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 2.897321428571429, |
|
"grad_norm": 1.6084643602371216, |
|
"learning_rate": 5.354483097670584e-06, |
|
"loss": 0.2009, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 2.9017857142857144, |
|
"grad_norm": 1.7028254270553589, |
|
"learning_rate": 5.342702441664875e-06, |
|
"loss": 0.2093, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.90625, |
|
"grad_norm": 1.5291582345962524, |
|
"learning_rate": 5.3309198763363345e-06, |
|
"loss": 0.2031, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 2.9107142857142856, |
|
"grad_norm": 1.5348578691482544, |
|
"learning_rate": 5.319135467425937e-06, |
|
"loss": 0.199, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 2.915178571428571, |
|
"grad_norm": 1.3724780082702637, |
|
"learning_rate": 5.3073492806849405e-06, |
|
"loss": 0.1765, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 2.919642857142857, |
|
"grad_norm": 1.4268510341644287, |
|
"learning_rate": 5.295561381874518e-06, |
|
"loss": 0.17, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 2.924107142857143, |
|
"grad_norm": 1.462509036064148, |
|
"learning_rate": 5.2837718367654036e-06, |
|
"loss": 0.1968, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 2.928571428571429, |
|
"grad_norm": 1.5478267669677734, |
|
"learning_rate": 5.2719807111375096e-06, |
|
"loss": 0.2011, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 2.9330357142857144, |
|
"grad_norm": 1.6338170766830444, |
|
"learning_rate": 5.260188070779573e-06, |
|
"loss": 0.1974, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 2.9375, |
|
"grad_norm": 1.4555277824401855, |
|
"learning_rate": 5.248393981488777e-06, |
|
"loss": 0.1859, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 2.9419642857142856, |
|
"grad_norm": 1.3970043659210205, |
|
"learning_rate": 5.236598509070389e-06, |
|
"loss": 0.1768, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 2.946428571428571, |
|
"grad_norm": 1.533681869506836, |
|
"learning_rate": 5.2248017193374e-06, |
|
"loss": 0.2143, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.950892857142857, |
|
"grad_norm": 1.6132566928863525, |
|
"learning_rate": 5.2130036781101455e-06, |
|
"loss": 0.1895, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 2.955357142857143, |
|
"grad_norm": 1.5554327964782715, |
|
"learning_rate": 5.201204451215945e-06, |
|
"loss": 0.1883, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 2.959821428571429, |
|
"grad_norm": 1.3858691453933716, |
|
"learning_rate": 5.18940410448873e-06, |
|
"loss": 0.1775, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 2.9642857142857144, |
|
"grad_norm": 1.49596107006073, |
|
"learning_rate": 5.1776027037686895e-06, |
|
"loss": 0.1754, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 2.96875, |
|
"grad_norm": 1.5804927349090576, |
|
"learning_rate": 5.165800314901883e-06, |
|
"loss": 0.1922, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 2.9732142857142856, |
|
"grad_norm": 1.5328700542449951, |
|
"learning_rate": 5.15399700373989e-06, |
|
"loss": 0.1994, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 2.977678571428571, |
|
"grad_norm": 1.593167781829834, |
|
"learning_rate": 5.142192836139432e-06, |
|
"loss": 0.1943, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 2.982142857142857, |
|
"grad_norm": 1.4839582443237305, |
|
"learning_rate": 5.130387877962012e-06, |
|
"loss": 0.1697, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 2.986607142857143, |
|
"grad_norm": 1.5925782918930054, |
|
"learning_rate": 5.118582195073542e-06, |
|
"loss": 0.1986, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 2.991071428571429, |
|
"grad_norm": 1.4743068218231201, |
|
"learning_rate": 5.1067758533439804e-06, |
|
"loss": 0.1959, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.9955357142857144, |
|
"grad_norm": 1.3711611032485962, |
|
"learning_rate": 5.094968918646954e-06, |
|
"loss": 0.1737, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.3573774099349976, |
|
"learning_rate": 5.0831614568594105e-06, |
|
"loss": 0.1787, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 3.0044642857142856, |
|
"grad_norm": 1.1417394876480103, |
|
"learning_rate": 5.071353533861225e-06, |
|
"loss": 0.114, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 3.0089285714285716, |
|
"grad_norm": 1.105907678604126, |
|
"learning_rate": 5.059545215534859e-06, |
|
"loss": 0.1086, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 3.013392857142857, |
|
"grad_norm": 1.189517617225647, |
|
"learning_rate": 5.047736567764967e-06, |
|
"loss": 0.1138, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 3.017857142857143, |
|
"grad_norm": 1.0811058282852173, |
|
"learning_rate": 5.0359276564380514e-06, |
|
"loss": 0.1067, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 3.0223214285714284, |
|
"grad_norm": 1.2044225931167603, |
|
"learning_rate": 5.024118547442083e-06, |
|
"loss": 0.0996, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 3.0267857142857144, |
|
"grad_norm": 1.1959599256515503, |
|
"learning_rate": 5.012309306666129e-06, |
|
"loss": 0.0986, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 3.03125, |
|
"grad_norm": 1.1417187452316284, |
|
"learning_rate": 5.000500000000001e-06, |
|
"loss": 0.0816, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 3.0357142857142856, |
|
"grad_norm": 1.2518037557601929, |
|
"learning_rate": 4.988690693333873e-06, |
|
"loss": 0.0855, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 3.0401785714285716, |
|
"grad_norm": 1.4060600996017456, |
|
"learning_rate": 4.97688145255792e-06, |
|
"loss": 0.0997, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 3.044642857142857, |
|
"grad_norm": 1.3264249563217163, |
|
"learning_rate": 4.965072343561948e-06, |
|
"loss": 0.0907, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 3.049107142857143, |
|
"grad_norm": 1.5176544189453125, |
|
"learning_rate": 4.953263432235034e-06, |
|
"loss": 0.0882, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 3.0535714285714284, |
|
"grad_norm": 1.6541138887405396, |
|
"learning_rate": 4.941454784465144e-06, |
|
"loss": 0.0876, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 3.0580357142857144, |
|
"grad_norm": 1.5412651300430298, |
|
"learning_rate": 4.929646466138777e-06, |
|
"loss": 0.0923, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 3.0625, |
|
"grad_norm": 1.4274966716766357, |
|
"learning_rate": 4.917838543140591e-06, |
|
"loss": 0.0732, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 3.0669642857142856, |
|
"grad_norm": 1.5422570705413818, |
|
"learning_rate": 4.906031081353047e-06, |
|
"loss": 0.0882, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 3.0714285714285716, |
|
"grad_norm": 1.7544273138046265, |
|
"learning_rate": 4.8942241466560226e-06, |
|
"loss": 0.0925, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 3.075892857142857, |
|
"grad_norm": 1.6329741477966309, |
|
"learning_rate": 4.882417804926457e-06, |
|
"loss": 0.0908, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 3.080357142857143, |
|
"grad_norm": 1.8030346632003784, |
|
"learning_rate": 4.870612122037989e-06, |
|
"loss": 0.1051, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 3.0848214285714284, |
|
"grad_norm": 1.8434675931930542, |
|
"learning_rate": 4.858807163860569e-06, |
|
"loss": 0.1039, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 3.0892857142857144, |
|
"grad_norm": 1.7246928215026855, |
|
"learning_rate": 4.847002996260113e-06, |
|
"loss": 0.0913, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 3.09375, |
|
"grad_norm": 1.6196149587631226, |
|
"learning_rate": 4.835199685098117e-06, |
|
"loss": 0.1038, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 3.0982142857142856, |
|
"grad_norm": 1.6847678422927856, |
|
"learning_rate": 4.823397296231313e-06, |
|
"loss": 0.0855, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 3.1026785714285716, |
|
"grad_norm": 1.7434442043304443, |
|
"learning_rate": 4.8115958955112715e-06, |
|
"loss": 0.094, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 3.107142857142857, |
|
"grad_norm": 1.6175901889801025, |
|
"learning_rate": 4.799795548784058e-06, |
|
"loss": 0.0908, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 3.111607142857143, |
|
"grad_norm": 1.6020538806915283, |
|
"learning_rate": 4.787996321889856e-06, |
|
"loss": 0.0943, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 3.1160714285714284, |
|
"grad_norm": 1.4542748928070068, |
|
"learning_rate": 4.7761982806626015e-06, |
|
"loss": 0.098, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 3.1205357142857144, |
|
"grad_norm": 1.4568634033203125, |
|
"learning_rate": 4.764401490929613e-06, |
|
"loss": 0.0884, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 1.4934730529785156, |
|
"learning_rate": 4.752606018511225e-06, |
|
"loss": 0.0941, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.1294642857142856, |
|
"grad_norm": 1.4770476818084717, |
|
"learning_rate": 4.740811929220429e-06, |
|
"loss": 0.1118, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 3.1339285714285716, |
|
"grad_norm": 1.416353464126587, |
|
"learning_rate": 4.729019288862492e-06, |
|
"loss": 0.0996, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 3.138392857142857, |
|
"grad_norm": 1.4401273727416992, |
|
"learning_rate": 4.717228163234599e-06, |
|
"loss": 0.092, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 3.142857142857143, |
|
"grad_norm": 1.1891121864318848, |
|
"learning_rate": 4.705438618125482e-06, |
|
"loss": 0.0741, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 3.1473214285714284, |
|
"grad_norm": 1.386759877204895, |
|
"learning_rate": 4.693650719315062e-06, |
|
"loss": 0.0998, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 3.1517857142857144, |
|
"grad_norm": 1.3926823139190674, |
|
"learning_rate": 4.681864532574064e-06, |
|
"loss": 0.1015, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 3.15625, |
|
"grad_norm": 1.3910696506500244, |
|
"learning_rate": 4.670080123663668e-06, |
|
"loss": 0.1002, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 3.1607142857142856, |
|
"grad_norm": 1.2177248001098633, |
|
"learning_rate": 4.658297558335127e-06, |
|
"loss": 0.0903, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 3.1651785714285716, |
|
"grad_norm": 1.434577465057373, |
|
"learning_rate": 4.64651690232942e-06, |
|
"loss": 0.0956, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 3.169642857142857, |
|
"grad_norm": 1.2175168991088867, |
|
"learning_rate": 4.634738221376858e-06, |
|
"loss": 0.0805, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 3.174107142857143, |
|
"grad_norm": 1.2530947923660278, |
|
"learning_rate": 4.622961581196743e-06, |
|
"loss": 0.0804, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 3.1785714285714284, |
|
"grad_norm": 1.273105502128601, |
|
"learning_rate": 4.611187047496989e-06, |
|
"loss": 0.0912, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 3.1830357142857144, |
|
"grad_norm": 1.1457451581954956, |
|
"learning_rate": 4.599414685973754e-06, |
|
"loss": 0.0794, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 3.1875, |
|
"grad_norm": 1.20325767993927, |
|
"learning_rate": 4.587644562311077e-06, |
|
"loss": 0.0809, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 3.1919642857142856, |
|
"grad_norm": 1.5332928895950317, |
|
"learning_rate": 4.575876742180506e-06, |
|
"loss": 0.0959, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 3.1964285714285716, |
|
"grad_norm": 1.1909514665603638, |
|
"learning_rate": 4.56411129124075e-06, |
|
"loss": 0.0843, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 3.200892857142857, |
|
"grad_norm": 1.472267508506775, |
|
"learning_rate": 4.552348275137285e-06, |
|
"loss": 0.1156, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 3.205357142857143, |
|
"grad_norm": 1.532912254333496, |
|
"learning_rate": 4.54058775950201e-06, |
|
"loss": 0.0935, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 3.2098214285714284, |
|
"grad_norm": 1.4942517280578613, |
|
"learning_rate": 4.528829809952867e-06, |
|
"loss": 0.0945, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 3.2142857142857144, |
|
"grad_norm": 1.5307639837265015, |
|
"learning_rate": 4.517074492093487e-06, |
|
"loss": 0.1106, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 3.21875, |
|
"grad_norm": 1.477367639541626, |
|
"learning_rate": 4.505321871512813e-06, |
|
"loss": 0.1024, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 3.2232142857142856, |
|
"grad_norm": 1.6967626810073853, |
|
"learning_rate": 4.493572013784737e-06, |
|
"loss": 0.1039, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 3.2276785714285716, |
|
"grad_norm": 1.4869396686553955, |
|
"learning_rate": 4.481824984467742e-06, |
|
"loss": 0.0929, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 3.232142857142857, |
|
"grad_norm": 1.466221570968628, |
|
"learning_rate": 4.470080849104521e-06, |
|
"loss": 0.1008, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 3.236607142857143, |
|
"grad_norm": 1.7747437953948975, |
|
"learning_rate": 4.458339673221631e-06, |
|
"loss": 0.1099, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 3.2410714285714284, |
|
"grad_norm": 1.5948566198349, |
|
"learning_rate": 4.446601522329105e-06, |
|
"loss": 0.0876, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 3.2455357142857144, |
|
"grad_norm": 1.3861619234085083, |
|
"learning_rate": 4.434866461920108e-06, |
|
"loss": 0.093, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 1.7178670167922974, |
|
"learning_rate": 4.4231345574705555e-06, |
|
"loss": 0.1074, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 3.2544642857142856, |
|
"grad_norm": 1.6227957010269165, |
|
"learning_rate": 4.4114058744387535e-06, |
|
"loss": 0.0984, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 3.2589285714285716, |
|
"grad_norm": 1.5968222618103027, |
|
"learning_rate": 4.399680478265042e-06, |
|
"loss": 0.1015, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 3.263392857142857, |
|
"grad_norm": 1.6373909711837769, |
|
"learning_rate": 4.387958434371413e-06, |
|
"loss": 0.099, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 3.267857142857143, |
|
"grad_norm": 1.3308444023132324, |
|
"learning_rate": 4.376239808161157e-06, |
|
"loss": 0.0819, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 3.2723214285714284, |
|
"grad_norm": 1.4825462102890015, |
|
"learning_rate": 4.364524665018496e-06, |
|
"loss": 0.0872, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 3.2767857142857144, |
|
"grad_norm": 1.5847879648208618, |
|
"learning_rate": 4.3528130703082245e-06, |
|
"loss": 0.0873, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 3.28125, |
|
"grad_norm": 1.3668209314346313, |
|
"learning_rate": 4.341105089375328e-06, |
|
"loss": 0.0843, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 3.2857142857142856, |
|
"grad_norm": 1.4361255168914795, |
|
"learning_rate": 4.329400787544633e-06, |
|
"loss": 0.093, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 3.2901785714285716, |
|
"grad_norm": 1.2126491069793701, |
|
"learning_rate": 4.317700230120438e-06, |
|
"loss": 0.0825, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 3.294642857142857, |
|
"grad_norm": 1.4545817375183105, |
|
"learning_rate": 4.306003482386156e-06, |
|
"loss": 0.1061, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 3.299107142857143, |
|
"grad_norm": 1.3296757936477661, |
|
"learning_rate": 4.2943106096039315e-06, |
|
"loss": 0.0888, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 3.3035714285714284, |
|
"grad_norm": 1.4436862468719482, |
|
"learning_rate": 4.282621677014299e-06, |
|
"loss": 0.0955, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.3080357142857144, |
|
"grad_norm": 1.313293218612671, |
|
"learning_rate": 4.270936749835805e-06, |
|
"loss": 0.0979, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 3.3125, |
|
"grad_norm": 1.2750566005706787, |
|
"learning_rate": 4.259255893264647e-06, |
|
"loss": 0.0907, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 3.3169642857142856, |
|
"grad_norm": 1.6833667755126953, |
|
"learning_rate": 4.247579172474312e-06, |
|
"loss": 0.1025, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 3.3214285714285716, |
|
"grad_norm": 1.4311575889587402, |
|
"learning_rate": 4.235906652615207e-06, |
|
"loss": 0.1006, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 3.325892857142857, |
|
"grad_norm": 1.5903573036193848, |
|
"learning_rate": 4.224238398814309e-06, |
|
"loss": 0.0953, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 3.330357142857143, |
|
"grad_norm": 1.3978747129440308, |
|
"learning_rate": 4.212574476174784e-06, |
|
"loss": 0.095, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 3.3348214285714284, |
|
"grad_norm": 1.4329924583435059, |
|
"learning_rate": 4.2009149497756355e-06, |
|
"loss": 0.1007, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 3.3392857142857144, |
|
"grad_norm": 1.2830747365951538, |
|
"learning_rate": 4.189259884671336e-06, |
|
"loss": 0.0862, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 3.34375, |
|
"grad_norm": 1.3609569072723389, |
|
"learning_rate": 4.177609345891472e-06, |
|
"loss": 0.0875, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 3.3482142857142856, |
|
"grad_norm": 1.3331035375595093, |
|
"learning_rate": 4.165963398440368e-06, |
|
"loss": 0.1016, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.3526785714285716, |
|
"grad_norm": 1.4151555299758911, |
|
"learning_rate": 4.1543221072967334e-06, |
|
"loss": 0.0975, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 3.357142857142857, |
|
"grad_norm": 1.490128755569458, |
|
"learning_rate": 4.142685537413303e-06, |
|
"loss": 0.0915, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 3.361607142857143, |
|
"grad_norm": 1.3559775352478027, |
|
"learning_rate": 4.1310537537164615e-06, |
|
"loss": 0.0979, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 3.3660714285714284, |
|
"grad_norm": 1.4377248287200928, |
|
"learning_rate": 4.119426821105895e-06, |
|
"loss": 0.0935, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 3.3705357142857144, |
|
"grad_norm": 1.4320459365844727, |
|
"learning_rate": 4.107804804454215e-06, |
|
"loss": 0.0976, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"grad_norm": 1.473273754119873, |
|
"learning_rate": 4.096187768606617e-06, |
|
"loss": 0.0949, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 3.3794642857142856, |
|
"grad_norm": 1.240604043006897, |
|
"learning_rate": 4.084575778380495e-06, |
|
"loss": 0.077, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 3.3839285714285716, |
|
"grad_norm": 1.6010382175445557, |
|
"learning_rate": 4.072968898565094e-06, |
|
"loss": 0.0987, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 3.388392857142857, |
|
"grad_norm": 1.5959616899490356, |
|
"learning_rate": 4.061367193921145e-06, |
|
"loss": 0.1125, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 3.392857142857143, |
|
"grad_norm": 1.5466870069503784, |
|
"learning_rate": 4.049770729180508e-06, |
|
"loss": 0.0985, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 3.3973214285714284, |
|
"grad_norm": 1.485848307609558, |
|
"learning_rate": 4.038179569045803e-06, |
|
"loss": 0.0942, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 3.4017857142857144, |
|
"grad_norm": 1.4044053554534912, |
|
"learning_rate": 4.026593778190052e-06, |
|
"loss": 0.096, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 3.40625, |
|
"grad_norm": 1.2948501110076904, |
|
"learning_rate": 4.015013421256324e-06, |
|
"loss": 0.0852, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 3.4107142857142856, |
|
"grad_norm": 1.2491077184677124, |
|
"learning_rate": 4.0034385628573655e-06, |
|
"loss": 0.0814, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 3.4151785714285716, |
|
"grad_norm": 1.4824570417404175, |
|
"learning_rate": 3.991869267575243e-06, |
|
"loss": 0.0848, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 3.419642857142857, |
|
"grad_norm": 1.6389641761779785, |
|
"learning_rate": 3.9803055999609855e-06, |
|
"loss": 0.0929, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 3.424107142857143, |
|
"grad_norm": 1.5796787738800049, |
|
"learning_rate": 3.9687476245342234e-06, |
|
"loss": 0.1006, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 3.4285714285714284, |
|
"grad_norm": 1.4374067783355713, |
|
"learning_rate": 3.957195405782824e-06, |
|
"loss": 0.0938, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 3.4330357142857144, |
|
"grad_norm": 1.5482842922210693, |
|
"learning_rate": 3.9456490081625396e-06, |
|
"loss": 0.1003, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"grad_norm": 1.366696834564209, |
|
"learning_rate": 3.934108496096638e-06, |
|
"loss": 0.0896, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 3.4419642857142856, |
|
"grad_norm": 1.4534419775009155, |
|
"learning_rate": 3.922573933975555e-06, |
|
"loss": 0.0954, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 3.4464285714285716, |
|
"grad_norm": 1.3738148212432861, |
|
"learning_rate": 3.911045386156523e-06, |
|
"loss": 0.0986, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 3.450892857142857, |
|
"grad_norm": 1.4549438953399658, |
|
"learning_rate": 3.899522916963219e-06, |
|
"loss": 0.0937, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 3.455357142857143, |
|
"grad_norm": 1.5725458860397339, |
|
"learning_rate": 3.888006590685407e-06, |
|
"loss": 0.0884, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 3.4598214285714284, |
|
"grad_norm": 1.409229040145874, |
|
"learning_rate": 3.876496471578572e-06, |
|
"loss": 0.0985, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 3.4642857142857144, |
|
"grad_norm": 1.4481651782989502, |
|
"learning_rate": 3.864992623863572e-06, |
|
"loss": 0.0993, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 3.46875, |
|
"grad_norm": 1.3321104049682617, |
|
"learning_rate": 3.853495111726265e-06, |
|
"loss": 0.0938, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 3.4732142857142856, |
|
"grad_norm": 1.501019835472107, |
|
"learning_rate": 3.84200399931717e-06, |
|
"loss": 0.0955, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 3.4776785714285716, |
|
"grad_norm": 1.3454251289367676, |
|
"learning_rate": 3.8305193507510905e-06, |
|
"loss": 0.0955, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 3.482142857142857, |
|
"grad_norm": 1.5015486478805542, |
|
"learning_rate": 3.819041230106768e-06, |
|
"loss": 0.1041, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.486607142857143, |
|
"grad_norm": 1.1266005039215088, |
|
"learning_rate": 3.807569701426519e-06, |
|
"loss": 0.0815, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 3.4910714285714284, |
|
"grad_norm": 1.322966456413269, |
|
"learning_rate": 3.7961048287158865e-06, |
|
"loss": 0.088, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 3.4955357142857144, |
|
"grad_norm": 1.31028151512146, |
|
"learning_rate": 3.784646675943269e-06, |
|
"loss": 0.0905, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 1.4570220708847046, |
|
"learning_rate": 3.773195307039574e-06, |
|
"loss": 0.0966, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 3.5044642857142856, |
|
"grad_norm": 1.4365588426589966, |
|
"learning_rate": 3.7617507858978615e-06, |
|
"loss": 0.0924, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 3.508928571428571, |
|
"grad_norm": 1.2722663879394531, |
|
"learning_rate": 3.7503131763729785e-06, |
|
"loss": 0.0861, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 3.513392857142857, |
|
"grad_norm": 1.4443727731704712, |
|
"learning_rate": 3.738882542281212e-06, |
|
"loss": 0.0943, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 3.517857142857143, |
|
"grad_norm": 1.4455920457839966, |
|
"learning_rate": 3.727458947399927e-06, |
|
"loss": 0.0924, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 3.522321428571429, |
|
"grad_norm": 1.2739661931991577, |
|
"learning_rate": 3.7160424554672187e-06, |
|
"loss": 0.0938, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 3.5267857142857144, |
|
"grad_norm": 1.3610420227050781, |
|
"learning_rate": 3.7046331301815435e-06, |
|
"loss": 0.0928, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 3.53125, |
|
"grad_norm": 1.360996127128601, |
|
"learning_rate": 3.6932310352013796e-06, |
|
"loss": 0.0848, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 3.5357142857142856, |
|
"grad_norm": 1.3822091817855835, |
|
"learning_rate": 3.6818362341448545e-06, |
|
"loss": 0.0937, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 3.540178571428571, |
|
"grad_norm": 1.5511473417282104, |
|
"learning_rate": 3.670448790589408e-06, |
|
"loss": 0.0964, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 3.544642857142857, |
|
"grad_norm": 1.322980284690857, |
|
"learning_rate": 3.659068768071425e-06, |
|
"loss": 0.0828, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 3.549107142857143, |
|
"grad_norm": 1.4569833278656006, |
|
"learning_rate": 3.6476962300858793e-06, |
|
"loss": 0.0982, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 3.553571428571429, |
|
"grad_norm": 1.2515519857406616, |
|
"learning_rate": 3.6363312400859963e-06, |
|
"loss": 0.0874, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 3.5580357142857144, |
|
"grad_norm": 1.5618796348571777, |
|
"learning_rate": 3.6249738614828765e-06, |
|
"loss": 0.0951, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 3.5625, |
|
"grad_norm": 1.530705213546753, |
|
"learning_rate": 3.613624157645159e-06, |
|
"loss": 0.0983, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 3.5669642857142856, |
|
"grad_norm": 1.6114871501922607, |
|
"learning_rate": 3.6022821918986563e-06, |
|
"loss": 0.0946, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 3.571428571428571, |
|
"grad_norm": 1.43599271774292, |
|
"learning_rate": 3.590948027526012e-06, |
|
"loss": 0.0989, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.571428571428571, |
|
"eval_loss": 0.843691885471344, |
|
"eval_runtime": 4.4852, |
|
"eval_samples_per_second": 13.377, |
|
"eval_steps_per_second": 0.892, |
|
"step": 800 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 1344, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 6, |
|
"save_steps": 200, |
|
"total_flos": 1.444165112574247e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|