{
  "best_metric": 0.5934335589408875,
  "best_model_checkpoint": "/ephemeral/models/qwen-describe_tasks_uniform_1slice/checkpoint-171",
  "epoch": 6.0,
  "eval_steps": 57,
  "global_step": 1344,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.004464285714285714, "grad_norm": 11.030957221984863, "learning_rate": 0.0, "loss": 1.3544, "step": 1 },
    { "epoch": 0.008928571428571428, "grad_norm": 7.5021185874938965, "learning_rate": 2.626495350371936e-06, "loss": 1.081, "step": 2 },
    { "epoch": 0.013392857142857142, "grad_norm": 7.396653652191162, "learning_rate": 4.162896638657993e-06, "loss": 1.0844, "step": 3 },
    { "epoch": 0.017857142857142856, "grad_norm": 6.913914203643799, "learning_rate": 5.252990700743872e-06, "loss": 0.9979, "step": 4 },
    { "epoch": 0.022321428571428572, "grad_norm": 6.602043628692627, "learning_rate": 6.098533345119624e-06, "loss": 0.7692, "step": 5 },
    { "epoch": 0.026785714285714284, "grad_norm": 4.961360454559326, "learning_rate": 6.7893919890299284e-06, "loss": 0.7355, "step": 6 },
    { "epoch": 0.03125, "grad_norm": 3.8967952728271484, "learning_rate": 7.373504649628066e-06, "loss": 0.6837, "step": 7 },
    { "epoch": 0.03571428571428571, "grad_norm": 5.37336540222168, "learning_rate": 7.879486051115807e-06, "loss": 0.7389, "step": 8 },
    { "epoch": 0.04017857142857143, "grad_norm": 4.335018157958984, "learning_rate": 8.325793277315987e-06, "loss": 0.6734, "step": 9 },
    { "epoch": 0.044642857142857144, "grad_norm": 3.5685887336730957, "learning_rate": 8.72502869549156e-06, "loss": 0.6469, "step": 10 },
    { "epoch": 0.049107142857142856, "grad_norm": 3.8976097106933594, "learning_rate": 9.086181061280522e-06, "loss": 0.5893, "step": 11 },
    { "epoch": 0.05357142857142857, "grad_norm": 3.3721907138824463, "learning_rate": 9.415887339401865e-06, "loss": 0.5792, "step": 12 },
    { "epoch": 0.05803571428571429, "grad_norm": 3.204407215118408, "learning_rate": 9.719187714029216e-06, "loss": 0.6511, "step": 13 },
    { "epoch": 0.0625, "grad_norm": 2.965876817703247, "learning_rate": 1e-05, "loss": 0.5734, "step": 14 },
    { "epoch": 0.06696428571428571, "grad_norm": 3.5149614810943604, "learning_rate": 9.999986052613417e-06, "loss": 0.5893, "step": 15 },
    { "epoch": 0.07142857142857142, "grad_norm": 3.0840070247650146, "learning_rate": 9.99994421053148e-06, "loss": 0.6035, "step": 16 },
    { "epoch": 0.07589285714285714, "grad_norm": 2.63883113861084, "learning_rate": 9.999874473987653e-06, "loss": 0.5674, "step": 17 },
    { "epoch": 0.08035714285714286, "grad_norm": 2.7957351207733154, "learning_rate": 9.999776843371027e-06, "loss": 0.6093, "step": 18 },
    { "epoch": 0.08482142857142858, "grad_norm": 2.5564727783203125, "learning_rate": 9.99965131922634e-06, "loss": 0.5237, "step": 19 },
    { "epoch": 0.08928571428571429, "grad_norm": 2.794530153274536, "learning_rate": 9.999497902253949e-06, "loss": 0.5938, "step": 20 },
    { "epoch": 0.09375, "grad_norm": 2.564220428466797, "learning_rate": 9.999316593309849e-06, "loss": 0.605, "step": 21 },
    { "epoch": 0.09821428571428571, "grad_norm": 2.927748918533325, "learning_rate": 9.999107393405655e-06, "loss": 0.5935, "step": 22 },
    { "epoch": 0.10267857142857142, "grad_norm": 3.0531842708587646, "learning_rate": 9.998870303708601e-06, "loss": 0.4989, "step": 23 },
    { "epoch": 0.10714285714285714, "grad_norm": 2.589479923248291, "learning_rate": 9.998605325541531e-06, "loss": 0.5839, "step": 24 },
    { "epoch": 0.11160714285714286, "grad_norm": 2.739987850189209, "learning_rate": 9.998312460382895e-06, "loss": 0.5932, "step": 25 },
    { "epoch": 0.11607142857142858, "grad_norm": 2.6279497146606445, "learning_rate": 9.997991709866738e-06, "loss": 0.5965, "step": 26 },
    { "epoch": 0.12053571428571429, "grad_norm": 2.1124532222747803, "learning_rate": 9.997643075782691e-06, "loss": 0.475, "step": 27 },
    { "epoch": 0.125, "grad_norm": 2.6372640132904053, "learning_rate": 9.997266560075961e-06, "loss": 0.5592, "step": 28 },
    { "epoch": 0.12946428571428573, "grad_norm": 3.1374351978302, "learning_rate": 9.996862164847323e-06, "loss": 0.5208, "step": 29 },
    { "epoch": 0.13392857142857142, "grad_norm": 2.652562379837036, "learning_rate": 9.996429892353107e-06, "loss": 0.4983, "step": 30 },
    { "epoch": 0.13839285714285715, "grad_norm": 2.6861250400543213, "learning_rate": 9.99596974500518e-06, "loss": 0.5353, "step": 31 },
    { "epoch": 0.14285714285714285, "grad_norm": 2.821516990661621, "learning_rate": 9.995481725370941e-06, "loss": 0.5777, "step": 32 },
    { "epoch": 0.14732142857142858, "grad_norm": 2.599327325820923, "learning_rate": 9.994965836173303e-06, "loss": 0.5224, "step": 33 },
    { "epoch": 0.15178571428571427, "grad_norm": 3.059565544128418, "learning_rate": 9.994422080290675e-06, "loss": 0.5117, "step": 34 },
    { "epoch": 0.15625, "grad_norm": 2.7172787189483643, "learning_rate": 9.99385046075695e-06, "loss": 0.5436, "step": 35 },
    { "epoch": 0.16071428571428573, "grad_norm": 2.469592571258545, "learning_rate": 9.993250980761487e-06, "loss": 0.5107, "step": 36 },
    { "epoch": 0.16517857142857142, "grad_norm": 2.5674359798431396, "learning_rate": 9.99262364364909e-06, "loss": 0.5125, "step": 37 },
    { "epoch": 0.16964285714285715, "grad_norm": 2.548276901245117, "learning_rate": 9.991968452919999e-06, "loss": 0.4619, "step": 38 },
    { "epoch": 0.17410714285714285, "grad_norm": 2.671201229095459, "learning_rate": 9.991285412229854e-06, "loss": 0.5114, "step": 39 },
    { "epoch": 0.17857142857142858, "grad_norm": 2.5654966831207275, "learning_rate": 9.99057452538969e-06, "loss": 0.5276, "step": 40 },
    { "epoch": 0.18303571428571427, "grad_norm": 2.3860409259796143, "learning_rate": 9.989835796365911e-06, "loss": 0.4979, "step": 41 },
    { "epoch": 0.1875, "grad_norm": 2.5768401622772217, "learning_rate": 9.989069229280264e-06, "loss": 0.5313, "step": 42 },
    { "epoch": 0.19196428571428573, "grad_norm": 2.3968393802642822, "learning_rate": 9.988274828409821e-06, "loss": 0.5388, "step": 43 },
    { "epoch": 0.19642857142857142, "grad_norm": 2.1060421466827393, "learning_rate": 9.987452598186947e-06, "loss": 0.487, "step": 44 },
    { "epoch": 0.20089285714285715, "grad_norm": 2.427931070327759, "learning_rate": 9.986602543199292e-06, "loss": 0.5106, "step": 45 },
    { "epoch": 0.20535714285714285, "grad_norm": 2.4082748889923096, "learning_rate": 9.985724668189744e-06, "loss": 0.5157, "step": 46 },
    { "epoch": 0.20982142857142858, "grad_norm": 2.3398637771606445, "learning_rate": 9.98481897805642e-06, "loss": 0.5287, "step": 47 },
    { "epoch": 0.21428571428571427, "grad_norm": 2.396620750427246, "learning_rate": 9.983885477852628e-06, "loss": 0.5999, "step": 48 },
    { "epoch": 0.21875, "grad_norm": 2.5863988399505615, "learning_rate": 9.982924172786847e-06, "loss": 0.5622, "step": 49 },
    { "epoch": 0.22321428571428573, "grad_norm": 2.56740665435791, "learning_rate": 9.981935068222687e-06, "loss": 0.5167, "step": 50 },
    { "epoch": 0.22767857142857142, "grad_norm": 2.1008498668670654, "learning_rate": 9.980918169678872e-06, "loss": 0.4426, "step": 51 },
    { "epoch": 0.23214285714285715, "grad_norm": 3.540738821029663, "learning_rate": 9.979873482829199e-06, "loss": 0.5136, "step": 52 },
    { "epoch": 0.23660714285714285, "grad_norm": 2.5687336921691895, "learning_rate": 9.978801013502511e-06, "loss": 0.5229, "step": 53 },
    { "epoch": 0.24107142857142858, "grad_norm": 2.21964168548584, "learning_rate": 9.977700767682665e-06, "loss": 0.4715, "step": 54 },
    { "epoch": 0.24553571428571427, "grad_norm": 2.621365547180176, "learning_rate": 9.976572751508497e-06, "loss": 0.5466, "step": 55 },
    { "epoch": 0.25, "grad_norm": 2.3530430793762207, "learning_rate": 9.975416971273787e-06, "loss": 0.5023, "step": 56 },
    { "epoch": 0.2544642857142857, "grad_norm": 2.344916820526123, "learning_rate": 9.974233433427222e-06, "loss": 0.4531, "step": 57 },
    { "epoch": 0.2544642857142857, "eval_loss": 0.6133683323860168, "eval_runtime": 3.5046, "eval_samples_per_second": 17.12, "eval_steps_per_second": 1.141, "step": 57 },
    { "epoch": 0.25892857142857145, "grad_norm": 1.9012365341186523, "learning_rate": 9.97302214457237e-06, "loss": 0.4561, "step": 58 },
    { "epoch": 0.26339285714285715, "grad_norm": 2.1153564453125, "learning_rate": 9.971783111467635e-06, "loss": 0.4818, "step": 59 },
    { "epoch": 0.26785714285714285, "grad_norm": 2.0768792629241943, "learning_rate": 9.970516341026211e-06, "loss": 0.4822, "step": 60 },
    { "epoch": 0.27232142857142855, "grad_norm": 2.338013172149658, "learning_rate": 9.969221840316066e-06, "loss": 0.5536, "step": 61 },
    { "epoch": 0.2767857142857143, "grad_norm": 2.0366456508636475, "learning_rate": 9.967899616559879e-06, "loss": 0.4722, "step": 62 },
    { "epoch": 0.28125, "grad_norm": 2.3978164196014404, "learning_rate": 9.966549677135015e-06, "loss": 0.4918, "step": 63 },
    { "epoch": 0.2857142857142857, "grad_norm": 2.4098730087280273, "learning_rate": 9.965172029573479e-06, "loss": 0.4849, "step": 64 },
    { "epoch": 0.29017857142857145, "grad_norm": 2.1472318172454834, "learning_rate": 9.96376668156187e-06, "loss": 0.4636, "step": 65 },
    { "epoch": 0.29464285714285715, "grad_norm": 2.1821885108947754, "learning_rate": 9.962333640941349e-06, "loss": 0.5001, "step": 66 },
    { "epoch": 0.29910714285714285, "grad_norm": 2.1246650218963623, "learning_rate": 9.960872915707582e-06, "loss": 0.5134, "step": 67 },
    { "epoch": 0.30357142857142855, "grad_norm": 1.918825626373291, "learning_rate": 9.959384514010703e-06, "loss": 0.4598, "step": 68 },
    { "epoch": 0.3080357142857143, "grad_norm": 2.18090558052063, "learning_rate": 9.95786844415527e-06, "loss": 0.535, "step": 69 },
    { "epoch": 0.3125, "grad_norm": 2.012615203857422, "learning_rate": 9.956324714600212e-06, "loss": 0.5072, "step": 70 },
    { "epoch": 0.3169642857142857, "grad_norm": 2.2373433113098145, "learning_rate": 9.95475333395879e-06, "loss": 0.4815, "step": 71 },
    { "epoch": 0.32142857142857145, "grad_norm": 2.2920775413513184, "learning_rate": 9.95315431099854e-06, "loss": 0.4943, "step": 72 },
    { "epoch": 0.32589285714285715, "grad_norm": 2.103111743927002, "learning_rate": 9.951527654641231e-06, "loss": 0.4899, "step": 73 },
    { "epoch": 0.33035714285714285, "grad_norm": 2.1315195560455322, "learning_rate": 9.949873373962814e-06, "loss": 0.5516, "step": 74 },
    { "epoch": 0.33482142857142855, "grad_norm": 2.0358729362487793, "learning_rate": 9.948191478193365e-06, "loss": 0.4557, "step": 75 },
    { "epoch": 0.3392857142857143, "grad_norm": 1.9133963584899902, "learning_rate": 9.946481976717046e-06, "loss": 0.4406, "step": 76 },
    { "epoch": 0.34375, "grad_norm": 1.8832437992095947, "learning_rate": 9.944744879072043e-06, "loss": 0.4518, "step": 77 },
    { "epoch": 0.3482142857142857, "grad_norm": 2.119741439819336, "learning_rate": 9.942980194950511e-06, "loss": 0.5114, "step": 78 },
    { "epoch": 0.35267857142857145, "grad_norm": 2.543752670288086, "learning_rate": 9.941187934198528e-06, "loss": 0.546, "step": 79 },
    { "epoch": 0.35714285714285715, "grad_norm": 2.132417917251587, "learning_rate": 9.939368106816038e-06, "loss": 0.4919, "step": 80 },
    { "epoch": 0.36160714285714285, "grad_norm": 2.2283120155334473, "learning_rate": 9.937520722956789e-06, "loss": 0.5794, "step": 81 },
    { "epoch": 0.36607142857142855, "grad_norm": 2.0629560947418213, "learning_rate": 9.93564579292828e-06, "loss": 0.4665, "step": 82 },
    { "epoch": 0.3705357142857143, "grad_norm": 3.117424249649048, "learning_rate": 9.933743327191711e-06, "loss": 0.4809, "step": 83 },
    { "epoch": 0.375, "grad_norm": 2.185500144958496, "learning_rate": 9.93181333636191e-06, "loss": 0.5341, "step": 84 },
    { "epoch": 0.3794642857142857, "grad_norm": 2.1382153034210205, "learning_rate": 9.929855831207288e-06, "loss": 0.4604, "step": 85 },
    { "epoch": 0.38392857142857145, "grad_norm": 2.0280189514160156, "learning_rate": 9.92787082264977e-06, "loss": 0.4251, "step": 86 },
    { "epoch": 0.38839285714285715, "grad_norm": 1.953497290611267, "learning_rate": 9.925858321764733e-06, "loss": 0.5032, "step": 87 },
    { "epoch": 0.39285714285714285, "grad_norm": 2.049941062927246, "learning_rate": 9.923818339780954e-06, "loss": 0.5318, "step": 88 },
    { "epoch": 0.39732142857142855, "grad_norm": 2.191439390182495, "learning_rate": 9.921750888080534e-06, "loss": 0.4664, "step": 89 },
    { "epoch": 0.4017857142857143, "grad_norm": 2.2432761192321777, "learning_rate": 9.91965597819885e-06, "loss": 0.4706, "step": 90 },
    { "epoch": 0.40625, "grad_norm": 1.9182389974594116, "learning_rate": 9.917533621824476e-06, "loss": 0.4089, "step": 91 },
    { "epoch": 0.4107142857142857, "grad_norm": 1.8381813764572144, "learning_rate": 9.915383830799129e-06, "loss": 0.4085, "step": 92 },
    { "epoch": 0.41517857142857145, "grad_norm": 2.1741788387298584, "learning_rate": 9.91320661711759e-06, "loss": 0.5138, "step": 93 },
    { "epoch": 0.41964285714285715, "grad_norm": 2.1558420658111572, "learning_rate": 9.911001992927655e-06, "loss": 0.4869, "step": 94 },
    { "epoch": 0.42410714285714285, "grad_norm": 2.0384535789489746, "learning_rate": 9.908769970530049e-06, "loss": 0.5034, "step": 95 },
    { "epoch": 0.42857142857142855, "grad_norm": 2.0241775512695312, "learning_rate": 9.90651056237837e-06, "loss": 0.5012, "step": 96 },
    { "epoch": 0.4330357142857143, "grad_norm": 2.040360689163208, "learning_rate": 9.904223781079017e-06, "loss": 0.5085, "step": 97 },
    { "epoch": 0.4375, "grad_norm": 2.2121329307556152, "learning_rate": 9.901909639391111e-06, "loss": 0.5127, "step": 98 },
    { "epoch": 0.4419642857142857, "grad_norm": 2.294875383377075, "learning_rate": 9.899568150226435e-06, "loss": 0.4317, "step": 99 },
    { "epoch": 0.44642857142857145, "grad_norm": 2.159607410430908, "learning_rate": 9.897199326649362e-06, "loss": 0.4729, "step": 100 },
    { "epoch": 0.45089285714285715, "grad_norm": 1.8338195085525513, "learning_rate": 9.894803181876765e-06, "loss": 0.4525, "step": 101 },
    { "epoch": 0.45535714285714285, "grad_norm": 1.8389171361923218, "learning_rate": 9.892379729277972e-06, "loss": 0.4579, "step": 102 },
    { "epoch": 0.45982142857142855, "grad_norm": 1.8244115114212036, "learning_rate": 9.889928982374663e-06, "loss": 0.37, "step": 103 },
    { "epoch": 0.4642857142857143, "grad_norm": 2.351200819015503, "learning_rate": 9.887450954840812e-06, "loss": 0.4758, "step": 104 },
    { "epoch": 0.46875, "grad_norm": 2.3291563987731934, "learning_rate": 9.884945660502607e-06, "loss": 0.4573, "step": 105 },
    { "epoch": 0.4732142857142857, "grad_norm": 1.9490771293640137, "learning_rate": 9.882413113338364e-06, "loss": 0.487, "step": 106 },
    { "epoch": 0.47767857142857145, "grad_norm": 2.136775255203247, "learning_rate": 9.879853327478465e-06, "loss": 0.4859, "step": 107 },
    { "epoch": 0.48214285714285715, "grad_norm": 2.3490424156188965, "learning_rate": 9.877266317205268e-06, "loss": 0.4642, "step": 108 },
    { "epoch": 0.48660714285714285, "grad_norm": 2.140052556991577, "learning_rate": 9.874652096953028e-06, "loss": 0.4657, "step": 109 },
    { "epoch": 0.49107142857142855, "grad_norm": 2.1019790172576904, "learning_rate": 9.872010681307821e-06, "loss": 0.5293, "step": 110 },
    { "epoch": 0.4955357142857143, "grad_norm": 2.066462755203247, "learning_rate": 9.869342085007458e-06, "loss": 0.4226, "step": 111 },
    { "epoch": 0.5, "grad_norm": 1.9048563241958618, "learning_rate": 9.866646322941405e-06, "loss": 0.4657, "step": 112 },
    { "epoch": 0.5044642857142857, "grad_norm": 1.835811734199524, "learning_rate": 9.863923410150704e-06, "loss": 0.4676, "step": 113 },
    { "epoch": 0.5089285714285714, "grad_norm": 2.1999454498291016, "learning_rate": 9.861173361827876e-06, "loss": 0.4419, "step": 114 },
    { "epoch": 0.5089285714285714, "eval_loss": 0.6013900637626648, "eval_runtime": 3.4939, "eval_samples_per_second": 17.173, "eval_steps_per_second": 1.145, "step": 114 },
    { "epoch": 0.5133928571428571, "grad_norm": 1.9584894180297852, "learning_rate": 9.858396193316853e-06, "loss": 0.4626, "step": 115 },
    { "epoch": 0.5178571428571429, "grad_norm": 1.9644426107406616, "learning_rate": 9.855591920112883e-06, "loss": 0.4588, "step": 116 },
    { "epoch": 0.5223214285714286, "grad_norm": 1.712542176246643, "learning_rate": 9.85276055786244e-06, "loss": 0.4963, "step": 117 },
    { "epoch": 0.5267857142857143, "grad_norm": 2.043294906616211, "learning_rate": 9.849902122363148e-06, "loss": 0.4623, "step": 118 },
    { "epoch": 0.53125, "grad_norm": 1.8057119846343994, "learning_rate": 9.847016629563683e-06, "loss": 0.4633, "step": 119 },
    { "epoch": 0.5357142857142857, "grad_norm": 2.072108030319214, "learning_rate": 9.844104095563689e-06, "loss": 0.5328, "step": 120 },
    { "epoch": 0.5401785714285714, "grad_norm": 2.03574800491333, "learning_rate": 9.841164536613685e-06, "loss": 0.4889, "step": 121 },
    { "epoch": 0.5446428571428571, "grad_norm": 1.9355207681655884, "learning_rate": 9.83819796911498e-06, "loss": 0.53, "step": 122 },
    { "epoch": 0.5491071428571429, "grad_norm": 1.9122416973114014, "learning_rate": 9.83520440961957e-06, "loss": 0.3849, "step": 123 },
    { "epoch": 0.5535714285714286, "grad_norm": 1.7899999618530273, "learning_rate": 9.83218387483006e-06, "loss": 0.4963, "step": 124 },
    { "epoch": 0.5580357142857143, "grad_norm": 1.8941091299057007, "learning_rate": 9.829136381599563e-06, "loss": 0.4781, "step": 125 },
    { "epoch": 0.5625, "grad_norm": 1.7798947095870972, "learning_rate": 9.826061946931605e-06, "loss": 0.4165, "step": 126 },
    { "epoch": 0.5669642857142857, "grad_norm": 1.9236619472503662, "learning_rate": 9.822960587980034e-06, "loss": 0.4841, "step": 127 },
    { "epoch": 0.5714285714285714, "grad_norm": 1.7892321348190308, "learning_rate": 9.81983232204892e-06, "loss": 0.4859, "step": 128 },
    { "epoch": 0.5758928571428571, "grad_norm": 1.9513256549835205, "learning_rate": 9.816677166592462e-06, "loss": 0.4075, "step": 129 },
    { "epoch": 0.5803571428571429, "grad_norm": 1.8629826307296753, "learning_rate": 9.81349513921489e-06, "loss": 0.4481, "step": 130 },
    { "epoch": 0.5848214285714286, "grad_norm": 1.646044135093689, "learning_rate": 9.810286257670365e-06, "loss": 0.4173, "step": 131 },
    { "epoch": 0.5892857142857143, "grad_norm": 2.023775339126587, "learning_rate": 9.807050539862884e-06, "loss": 0.4307, "step": 132 },
    { "epoch": 0.59375, "grad_norm": 1.763723611831665, "learning_rate": 9.803788003846175e-06, "loss": 0.4549, "step": 133 },
    { "epoch": 0.5982142857142857, "grad_norm": 1.967829942703247, "learning_rate": 9.800498667823595e-06, "loss": 0.4401, "step": 134 },
    { "epoch": 0.6026785714285714, "grad_norm": 1.759050726890564, "learning_rate": 9.797182550148039e-06, "loss": 0.4588, "step": 135 },
    { "epoch": 0.6071428571428571, "grad_norm": 1.9382320642471313, "learning_rate": 9.793839669321828e-06, "loss": 0.4162, "step": 136 },
    { "epoch": 0.6116071428571429, "grad_norm": 1.9155679941177368, "learning_rate": 9.790470043996604e-06, "loss": 0.4854, "step": 137 },
    { "epoch": 0.6160714285714286, "grad_norm": 1.8347245454788208, "learning_rate": 9.78707369297324e-06, "loss": 0.4797, "step": 138 },
    { "epoch": 0.6205357142857143, "grad_norm": 1.8676778078079224, "learning_rate": 9.783650635201714e-06, "loss": 0.5195, "step": 139 },
    { "epoch": 0.625, "grad_norm": 1.8418855667114258, "learning_rate": 9.780200889781021e-06, "loss": 0.4262, "step": 140 },
    { "epoch": 0.6294642857142857, "grad_norm": 1.934958815574646, "learning_rate": 9.776724475959061e-06, "loss": 0.5029, "step": 141 },
    { "epoch": 0.6339285714285714, "grad_norm": 1.6102712154388428, "learning_rate": 9.773221413132525e-06, "loss": 0.4372, "step": 142 },
    { "epoch": 0.6383928571428571, "grad_norm": 1.7224174737930298, "learning_rate": 9.769691720846801e-06, "loss": 0.4196, "step": 143 },
    { "epoch": 0.6428571428571429, "grad_norm": 1.9889607429504395, "learning_rate": 9.766135418795848e-06, "loss": 0.4587, "step": 144 },
    { "epoch": 0.6473214285714286, "grad_norm": 1.9140293598175049, "learning_rate": 9.762552526822098e-06, "loss": 0.4267, "step": 145 },
    { "epoch": 0.6517857142857143, "grad_norm": 1.9280264377593994, "learning_rate": 9.758943064916342e-06, "loss": 0.4194, "step": 146 },
    { "epoch": 0.65625, "grad_norm": 1.6826374530792236, "learning_rate": 9.755307053217622e-06, "loss": 0.4084, "step": 147 },
    { "epoch": 0.6607142857142857, "grad_norm": 2.0488829612731934, "learning_rate": 9.751644512013106e-06, "loss": 0.4627, "step": 148 },
    { "epoch": 0.6651785714285714, "grad_norm": 1.9179582595825195, "learning_rate": 9.74795546173799e-06, "loss": 0.4595, "step": 149 },
    { "epoch": 0.6696428571428571, "grad_norm": 2.0788657665252686, "learning_rate": 9.744239922975377e-06, "loss": 0.4916, "step": 150 },
    { "epoch": 0.6741071428571429, "grad_norm": 1.7633529901504517, "learning_rate": 9.740497916456163e-06, "loss": 0.4813, "step": 151 },
    { "epoch": 0.6785714285714286, "grad_norm": 1.8860759735107422, "learning_rate": 9.736729463058921e-06, "loss": 0.5132, "step": 152 },
    { "epoch": 0.6830357142857143, "grad_norm": 1.725066065788269, "learning_rate": 9.732934583809782e-06, "loss": 0.4973, "step": 153 },
    { "epoch": 0.6875, "grad_norm": 1.7949278354644775, "learning_rate": 9.729113299882324e-06, "loss": 0.4693, "step": 154 },
    { "epoch": 0.6919642857142857, "grad_norm": 1.7584055662155151, "learning_rate": 9.725265632597448e-06, "loss": 0.4737, "step": 155 },
    { "epoch": 0.6964285714285714, "grad_norm": 1.766491174697876, "learning_rate": 9.721391603423263e-06, "loss": 0.4587, "step": 156 },
    { "epoch": 0.7008928571428571, "grad_norm": 1.7748725414276123, "learning_rate": 9.717491233974962e-06, "loss": 0.441, "step": 157 },
    { "epoch": 0.7053571428571429, "grad_norm": 1.8221688270568848, "learning_rate": 9.713564546014707e-06, "loss": 0.4942, "step": 158 },
    { "epoch": 0.7098214285714286, "grad_norm": 1.745335578918457, "learning_rate": 9.7096115614515e-06, "loss": 0.4398, "step": 159 },
    { "epoch": 0.7142857142857143, "grad_norm": 1.9421252012252808, "learning_rate": 9.705632302341073e-06, "loss": 0.4656, "step": 160 },
    { "epoch": 0.71875, "grad_norm": 1.720116138458252, "learning_rate": 9.701626790885749e-06, "loss": 0.4361, "step": 161 },
    { "epoch": 0.7232142857142857, "grad_norm": 1.8272347450256348, "learning_rate": 9.69759504943433e-06, "loss": 0.4062, "step": 162 },
    { "epoch": 0.7276785714285714, "grad_norm": 1.6778669357299805, "learning_rate": 9.69353710048197e-06, "loss": 0.4743, "step": 163 },
    { "epoch": 0.7321428571428571, "grad_norm": 1.8853040933609009, "learning_rate": 9.68945296667004e-06, "loss": 0.4481, "step": 164 },
    { "epoch": 0.7366071428571429, "grad_norm": 1.826070785522461, "learning_rate": 9.685342670786025e-06, "loss": 0.5064, "step": 165 },
    { "epoch": 0.7410714285714286, "grad_norm": 2.058189868927002, "learning_rate": 9.681206235763367e-06, "loss": 0.488, "step": 166 },
    { "epoch": 0.7455357142857143, "grad_norm": 1.6986044645309448, "learning_rate": 9.677043684681358e-06, "loss": 0.4543, "step": 167 },
    { "epoch": 0.75, "grad_norm": 1.7179213762283325, "learning_rate": 9.672855040765006e-06, "loss": 0.4026, "step": 168 },
    { "epoch": 0.7544642857142857, "grad_norm": 1.7653132677078247, "learning_rate": 9.668640327384899e-06, "loss": 0.4711, "step": 169 },
    { "epoch": 0.7589285714285714, "grad_norm": 1.9445300102233887, "learning_rate": 9.664399568057087e-06, "loss": 0.4854, "step": 170 },
    { "epoch": 0.7633928571428571, "grad_norm": 1.8008228540420532, "learning_rate": 9.660132786442937e-06, "loss": 0.4672, "step": 171 },
    { "epoch": 0.7633928571428571, "eval_loss": 0.5934335589408875, "eval_runtime": 3.4095, "eval_samples_per_second": 17.598, "eval_steps_per_second": 1.173, "step": 171 },
    { "epoch": 0.7678571428571429, "grad_norm": 1.8834118843078613, "learning_rate": 9.655840006349014e-06, "loss": 0.4698, "step": 172 },
    { "epoch": 0.7723214285714286, "grad_norm": 1.642304539680481, "learning_rate": 9.651521251726936e-06, "loss": 0.4687, "step": 173 },
    { "epoch": 0.7767857142857143, "grad_norm": 1.7180356979370117, "learning_rate": 9.64717654667325e-06, "loss": 0.5118, "step": 174 },
    { "epoch": 0.78125, "grad_norm": 1.6180371046066284, "learning_rate": 9.642805915429291e-06, "loss": 0.4241, "step": 175 },
    { "epoch": 0.7857142857142857, "grad_norm": 1.7322157621383667, "learning_rate": 9.638409382381052e-06, "loss": 0.4465, "step": 176 },
    { "epoch": 0.7901785714285714, "grad_norm": 1.9031792879104614, "learning_rate": 9.633986972059047e-06, "loss": 0.4716, "step": 177 },
    { "epoch": 0.7946428571428571, "grad_norm": 1.7738388776779175, "learning_rate": 9.629538709138166e-06, "loss": 0.4896, "step": 178 },
    { "epoch": 0.7991071428571429, "grad_norm": 1.725033164024353, "learning_rate": 9.625064618437549e-06, "loss": 0.4508, "step": 179 },
    { "epoch": 0.8035714285714286, "grad_norm": 1.687565565109253, "learning_rate": 9.620564724920443e-06, "loss": 0.4277, "step": 180 },
    { "epoch": 0.8080357142857143, "grad_norm": 1.5750185251235962, "learning_rate": 9.616039053694058e-06, "loss": 0.4168, "step": 181 },
    { "epoch": 0.8125, "grad_norm": 1.7201429605484009, "learning_rate": 9.611487630009436e-06, "loss": 0.4097, "step": 182 },
    { "epoch": 0.8169642857142857, "grad_norm": 1.8075188398361206, "learning_rate": 9.606910479261301e-06, "loss": 0.4857, "step": 183 },
    { "epoch": 0.8214285714285714, "grad_norm": 1.4948920011520386, "learning_rate": 9.602307626987925e-06, "loss": 0.4004, "step": 184 },
    { "epoch": 0.8258928571428571, "grad_norm": 1.8233128786087036, "learning_rate": 9.597679098870978e-06, "loss": 0.4808, "step": 185 },
    { "epoch": 0.8303571428571429, "grad_norm": 1.698188304901123, "learning_rate": 9.593024920735393e-06, "loss": 0.4741, "step": 186 },
    { "epoch": 0.8348214285714286, "grad_norm": 1.7174502611160278, "learning_rate": 9.588345118549214e-06, "loss": 0.4313, "step": 187 },
    { "epoch": 0.8392857142857143, "grad_norm": 1.6591637134552002, "learning_rate": 9.583639718423457e-06, "loss": 0.453, "step": 188 },
    { "epoch": 0.84375, "grad_norm": 1.4759143590927124, "learning_rate": 9.57890874661196e-06, "loss": 0.3768, "step": 189 },
    { "epoch": 0.8482142857142857, "grad_norm": 1.6473032236099243, "learning_rate": 9.57415222951124e-06, "loss": 0.4448, "step": 190 },
    { "epoch": 0.8526785714285714, "grad_norm": 1.6935791969299316, "learning_rate": 9.569370193660348e-06, "loss": 0.4112, "step": 191 },
    { "epoch": 0.8571428571428571, "grad_norm": 1.8300622701644897, "learning_rate": 9.564562665740708e-06, "loss": 0.5206, "step": 192 },
    { "epoch": 0.8616071428571429, "grad_norm": 1.512624979019165, "learning_rate": 9.559729672575985e-06, "loss": 0.4014, "step": 193 },
    { "epoch": 0.8660714285714286, "grad_norm": 1.5721509456634521, "learning_rate": 9.554871241131923e-06, "loss": 0.4845, "step": 194 },
    { "epoch": 0.8705357142857143, "grad_norm": 1.6285744905471802, "learning_rate": 9.549987398516206e-06, "loss": 0.486, "step": 195 },
    { "epoch": 0.875, "grad_norm": 1.7633181810379028, "learning_rate": 9.54507817197829e-06, "loss": 0.429, "step": 196 },
    { "epoch": 0.8794642857142857, "grad_norm": 1.6904608011245728, "learning_rate": 9.540143588909268e-06, "loss": 0.4263, "step": 197 },
    { "epoch": 0.8839285714285714, "grad_norm": 1.7905316352844238, "learning_rate": 9.535183676841709e-06, "loss": 0.457, "step": 198 },
    { "epoch": 0.8883928571428571, "grad_norm": 1.888141393661499, "learning_rate": 9.530198463449507e-06, "loss": 0.4649, "step": 199 },
    { "epoch": 0.8928571428571429, "grad_norm": 1.853598713874817, "learning_rate": 9.525187976547718e-06, "loss": 0.46, "step": 200 },
    { "epoch": 0.8973214285714286, "grad_norm": 1.5846953392028809, "learning_rate": 9.520152244092421e-06, "loss": 0.362, "step": 201 },
    { "epoch": 0.9017857142857143, "grad_norm": 1.608396053314209, "learning_rate": 9.515091294180546e-06, "loss": 0.4001, "step": 202 },
    { "epoch": 0.90625, "grad_norm": 1.8720006942749023, "learning_rate": 9.510005155049729e-06, "loss": 0.48, "step": 203 },
    { "epoch": 0.9107142857142857, "grad_norm": 1.741791009902954, "learning_rate": 9.504893855078144e-06, "loss": 0.381, "step": 204 },
    { "epoch": 0.9151785714285714, "grad_norm": 1.5903561115264893, "learning_rate": 9.499757422784358e-06, "loss": 0.4422, "step": 205 },
    { "epoch": 0.9196428571428571, "grad_norm": 1.6735451221466064, "learning_rate": 9.494595886827157e-06, "loss": 0.4973, "step": 206 },
    { "epoch": 0.9241071428571429, "grad_norm": 1.6444984674453735, "learning_rate": 9.489409276005393e-06, "loss": 0.4464, "step": 207 },
    { "epoch": 0.9285714285714286, "grad_norm": 1.6434756517410278, "learning_rate": 9.48419761925783e-06, "loss": 0.5094, "step": 208 },
    { "epoch": 0.9330357142857143, "grad_norm": 1.8670734167099, "learning_rate": 9.478960945662974e-06, "loss": 0.3986, "step": 209 },
    { "epoch": 0.9375, "grad_norm": 1.7340041399002075, "learning_rate": 9.473699284438908e-06, "loss": 0.4024, "step": 210 },
    { "epoch": 0.9419642857142857, "grad_norm": 1.8192938566207886, "learning_rate": 9.468412664943137e-06, "loss": 0.515, "step": 211 },
    { "epoch": 0.9464285714285714, "grad_norm": 1.5921608209609985, "learning_rate": 9.463101116672423e-06, "loss": 0.4257, "step": 212 },
    { "epoch": 0.9508928571428571, "grad_norm": 2.0308141708374023, "learning_rate": 9.457764669262615e-06, "loss": 0.4918, "step": 213 },
    { "epoch": 0.9553571428571429, "grad_norm": 1.6878437995910645, "learning_rate": 9.452403352488488e-06, "loss": 0.4433, "step": 214 },
    { "epoch": 0.9598214285714286, "grad_norm": 1.9883629083633423, "learning_rate": 9.447017196263578e-06, "loss": 0.4836, "step": 215 },
    { "epoch": 0.9642857142857143, "grad_norm": 1.67111337184906, "learning_rate": 9.441606230640012e-06, "loss": 0.448, "step": 216 },
    { "epoch": 0.96875, "grad_norm": 1.7054740190505981, "learning_rate": 9.436170485808338e-06, "loss": 0.449, "step": 217 },
    { "epoch": 0.9732142857142857, "grad_norm": 1.690224289894104, "learning_rate": 9.430709992097364e-06, "loss": 0.4924, "step": 218 },
    { "epoch": 0.9776785714285714, "grad_norm": 1.5849170684814453, "learning_rate": 9.425224779973986e-06, "loss": 0.4583, "step": 219 },
    { "epoch": 0.9821428571428571, "grad_norm": 1.726426124572754, "learning_rate": 9.41971488004301e-06, "loss": 0.4982, "step": 220 },
    { "epoch": 0.9866071428571429, "grad_norm": 1.7829738855361938, "learning_rate": 9.414180323046991e-06, "loss": 0.4642, "step": 221 },
    { "epoch": 0.9910714285714286, "grad_norm": 1.6020630598068237, "learning_rate": 9.408621139866067e-06, "loss": 0.3982, "step": 222 },
    { "epoch": 0.9955357142857143, "grad_norm": 1.7414110898971558, "learning_rate": 9.403037361517762e-06, "loss": 0.4485, "step": 223 },
    { "epoch": 1.0, "grad_norm": 1.4676933288574219, "learning_rate": 9.397429019156841e-06, "loss": 0.4268, "step": 224 },
    { "epoch": 1.0044642857142858, "grad_norm": 1.4603513479232788, "learning_rate": 9.391796144075123e-06, "loss": 0.341, "step": 225 },
    { "epoch": 1.0089285714285714, "grad_norm": 1.742700457572937, "learning_rate": 9.386138767701306e-06, "loss": 0.3227, "step": 226 },
    { "epoch": 1.0133928571428572, "grad_norm": 1.5671008825302124, "learning_rate": 9.380456921600785e-06, "loss": 0.3646, "step": 227 },
    { "epoch": 1.0178571428571428, "grad_norm": 1.6107045412063599, "learning_rate": 9.374750637475499e-06, "loss": 0.3366, "step": 228 },
    { "epoch": 1.0178571428571428, "eval_loss": 0.5993052124977112, "eval_runtime": 3.4011, "eval_samples_per_second": 17.641, "eval_steps_per_second": 1.176, "step": 228 },
    { "epoch": 1.0223214285714286, "grad_norm": 1.5372443199157715, "learning_rate": 9.36901994716373e-06, "loss": 0.3582, "step": 229 },
    { "epoch": 1.0267857142857142, "grad_norm": 1.5077764987945557, "learning_rate": 9.363264882639936e-06, "loss": 0.3136, "step": 230 },
    { "epoch": 1.03125, "grad_norm": 1.6883026361465454, "learning_rate": 9.357485476014573e-06, "loss": 0.3196, "step": 231 },
    { "epoch": 1.0357142857142858, "grad_norm": 1.380987524986267, "learning_rate": 9.351681759533914e-06, "loss": 0.2765, "step": 232 },
    { "epoch": 1.0401785714285714, "grad_norm": 1.6997498273849487, "learning_rate": 9.345853765579865e-06, "loss": 0.3005, "step": 233 },
    { "epoch": 1.0446428571428572, "grad_norm": 1.550504446029663, "learning_rate": 9.340001526669794e-06, "loss": 0.2848, "step": 234 },
    { "epoch": 1.0491071428571428, "grad_norm": 1.7140612602233887, "learning_rate": 9.33412507545634e-06, "loss": 0.3395, "step": 235 },
    { "epoch": 1.0535714285714286, "grad_norm": 1.783673644065857, "learning_rate": 9.32822444472724e-06, "loss": 0.3454, "step": 236 },
    { "epoch": 1.0580357142857142, "grad_norm": 1.7617640495300293, "learning_rate": 9.322299667405134e-06, "loss": 0.3444, "step": 237 },
    { "epoch": 1.0625, "grad_norm": 1.7787531614303589, "learning_rate": 9.31635077654739e-06, "loss": 0.3595, "step": 238 },
    { "epoch": 1.0669642857142858, "grad_norm": 1.4490877389907837, "learning_rate": 9.310377805345926e-06, "loss": 0.2953, "step": 239 },
    { "epoch": 1.0714285714285714, "grad_norm": 1.6861026287078857, "learning_rate": 9.304380787127003e-06, "loss": 0.3182, "step": 240 },
    { "epoch": 1.0758928571428572, "grad_norm": 1.58506441116333, "learning_rate": 9.298359755351065e-06, "loss": 0.3151, "step": 241 },
    { "epoch": 1.0803571428571428, "grad_norm": 1.7218842506408691, "learning_rate": 9.29231474361253e-06, "loss": 0.3803, "step": 242 },
    { "epoch": 1.0848214285714286, "grad_norm": 1.6209449768066406, "learning_rate": 9.28624578563962e-06, "loss": 0.3161, "step": 243 },
    { "epoch": 1.0892857142857142, "grad_norm": 1.5501459836959839, "learning_rate": 9.280152915294162e-06, "loss": 0.3917, "step": 244 },
    { "epoch": 1.09375, "grad_norm": 1.5111147165298462, "learning_rate": 9.274036166571402e-06, "loss": 0.3649, "step": 245 },
    { "epoch": 1.0982142857142858, "grad_norm": 1.4189096689224243, "learning_rate": 9.267895573599819e-06, "loss": 0.3162, "step": 246 },
    { "epoch": 1.1026785714285714, "grad_norm": 1.57675039768219, "learning_rate": 9.261731170640923e-06, "loss": 0.3331, "step": 247 },
    { "epoch": 1.1071428571428572, "grad_norm": 1.4706840515136719, "learning_rate": 9.255542992089086e-06, "loss": 0.3054, "step": 248 },
    { "epoch": 1.1116071428571428, "grad_norm": 1.4259796142578125, "learning_rate": 9.24933107247132e-06, "loss": 0.3209, "step": 249 },
    { "epoch": 1.1160714285714286, "grad_norm": 1.526444435119629, "learning_rate": 9.243095446447113e-06, "loss": 0.3066, "step": 250 },
    { "epoch": 1.1205357142857142, "grad_norm": 1.4234224557876587, "learning_rate": 9.23683614880822e-06, "loss": 0.3107, "step": 251 },
    { "epoch": 1.125, "grad_norm": 1.6062707901000977, "learning_rate": 9.230553214478469e-06, "loss": 0.3621, "step": 252 },
    { "epoch": 1.1294642857142858, "grad_norm": 1.5600666999816895, "learning_rate": 9.224246678513569e-06, "loss": 0.3225, "step": 253 },
    { "epoch": 1.1339285714285714, "grad_norm": 1.6989763975143433, "learning_rate": 9.217916576100922e-06, "loss": 0.3107, "step": 254 },
    { "epoch": 1.1383928571428572, "grad_norm": 1.603786587715149, "learning_rate": 9.211562942559408e-06, "loss": 0.3434, "step": 255 },
    { "epoch": 1.1428571428571428, "grad_norm": 1.5290991067886353, "learning_rate": 9.20518581333921e-06, "loss": 0.3242, "step": 256 },
    { "epoch": 1.1473214285714286, "grad_norm": 1.4983583688735962, "learning_rate": 9.1987852240216e-06, "loss": 0.3315, "step": 257 },
    { "epoch": 1.1517857142857142, "grad_norm": 1.5081202983856201, "learning_rate": 9.192361210318745e-06, "loss": 0.3156, "step": 258 },
    { "epoch": 1.15625, "grad_norm": 1.575416088104248, "learning_rate": 9.185913808073513e-06, "loss": 0.3083, "step": 259 },
    { "epoch": 1.1607142857142858, "grad_norm": 1.6067839860916138, "learning_rate": 9.179443053259263e-06, "loss": 0.3597, "step": 260 },
    { "epoch": 1.1651785714285714, "grad_norm": 1.5789872407913208, "learning_rate": 9.172948981979654e-06, "loss": 0.3639, "step": 261 },
    { "epoch": 1.1696428571428572, "grad_norm": 1.6132181882858276, "learning_rate": 9.166431630468438e-06, "loss": 0.3473, "step": 262 },
    { "epoch": 1.1741071428571428, "grad_norm": 1.4658143520355225, "learning_rate": 9.159891035089262e-06, "loss": 0.275, "step": 263 },
    { "epoch": 1.1785714285714286, "grad_norm": 1.6115272045135498, "learning_rate": 9.153327232335455e-06, "loss": 0.337, "step": 264 },
    { "epoch": 1.1830357142857142, "grad_norm": 1.3804121017456055, "learning_rate": 9.146740258829844e-06, "loss": 0.2805, "step": 265 },
    { "epoch": 1.1875, "grad_norm": 1.6690940856933594, "learning_rate": 9.140130151324526e-06, "loss": 0.3812, "step": 266 },
    { "epoch": 1.1919642857142858, "grad_norm": 1.4038200378417969, "learning_rate": 9.13349694670068e-06, "loss": 0.3075, "step": 267 },
    { "epoch": 1.1964285714285714, "grad_norm": 1.4030662775039673, "learning_rate": 9.126840681968357e-06, "loss": 0.3192, "step": 268 },
    { "epoch": 1.2008928571428572, "grad_norm": 1.6158102750778198, "learning_rate": 9.120161394266266e-06, "loss": 0.3123, "step": 269 },
    { "epoch": 1.2053571428571428, "grad_norm": 1.463722586631775, "learning_rate": 9.113459120861579e-06, "loss": 0.2831, "step": 270 },
    { "epoch": 1.2098214285714286, "grad_norm": 1.7193071842193604, "learning_rate": 9.106733899149715e-06, "loss": 0.3422, "step": 271 },
    { "epoch": 1.2142857142857142, "grad_norm": 1.5576410293579102, "learning_rate": 9.099985766654132e-06, "loss": 0.3379, "step": 272 },
    { "epoch": 1.21875, "grad_norm": 1.4065855741500854, "learning_rate": 9.093214761026121e-06, "loss": 0.2972, "step": 273 },
    { "epoch": 1.2232142857142858, "grad_norm": 1.2878752946853638, "learning_rate": 9.08642092004459e-06, "loss": 0.301, "step": 274 },
    { "epoch": 1.2276785714285714, "grad_norm": 1.5659253597259521, "learning_rate": 9.079604281615868e-06, "loss": 0.3259, "step": 275 },
    { "epoch": 1.2321428571428572, "grad_norm": 1.4330716133117676, "learning_rate": 9.072764883773464e-06, "loss": 0.3241, "step": 276 },
    { "epoch": 1.2366071428571428, "grad_norm": 1.465423583984375, "learning_rate": 9.065902764677897e-06, "loss": 0.2896, "step": 277 },
    { "epoch": 1.2410714285714286, "grad_norm": 1.60066819190979, "learning_rate": 9.059017962616435e-06, "loss": 0.3486, "step": 278 },
    { "epoch": 1.2455357142857142, "grad_norm": 1.688224196434021, "learning_rate": 9.052110516002925e-06, "loss": 0.3625, "step": 279 },
    { "epoch": 1.25, "grad_norm": 1.7048122882843018, "learning_rate": 9.04518046337755e-06, "loss": 0.3627, "step": 280 },
    { "epoch": 1.2544642857142856, "grad_norm": 1.699864149093628, "learning_rate": 9.038227843406628e-06, "loss": 0.3429, "step": 281 },
    { "epoch": 1.2589285714285714, "grad_norm": 1.503157377243042, "learning_rate": 9.031252694882386e-06, "loss": 0.3415, "step": 282 },
    { "epoch": 1.2633928571428572, "grad_norm": 1.475395917892456, "learning_rate": 9.024255056722753e-06, "loss": 0.3211, "step": 283 },
    { "epoch": 1.2678571428571428, "grad_norm": 1.4830482006072998, "learning_rate": 9.017234967971143e-06, "loss": 0.3258, "step": 284 },
    { "epoch": 1.2723214285714286, "grad_norm": 1.499875783920288, "learning_rate": 9.010192467796228e-06, "loss": 0.3257, "step": 285 },
    { "epoch": 1.2723214285714286, "eval_loss": 0.6085591912269592, "eval_runtime": 3.4657, "eval_samples_per_second": 17.313, "eval_steps_per_second": 1.154, "step": 285 },
    { "epoch": 1.2767857142857144, "grad_norm": 1.5229018926620483, "learning_rate": 9.003127595491723e-06, "loss": 0.3377, "step": 286 },
    { "epoch": 1.28125, "grad_norm": 1.5909078121185303, "learning_rate": 8.996040390476177e-06, "loss": 0.3675, "step": 287 },
    { "epoch": 1.2857142857142856, "grad_norm": 1.553013801574707, "learning_rate": 8.988930892292738e-06, "loss": 0.3638, "step": 288 },
    { "epoch": 1.2901785714285714, "grad_norm": 1.5105496644973755, "learning_rate": 8.981799140608938e-06, "loss": 0.3437, "step": 289 },
    { "epoch": 1.2946428571428572, "grad_norm": 1.563785195350647, "learning_rate": 8.974645175216478e-06, "loss": 0.3299, "step": 290 },
    { "epoch": 1.2991071428571428, "grad_norm": 1.5746451616287231, "learning_rate": 8.967469036030996e-06, "loss": 0.3344, "step": 291 },
    { "epoch": 1.3035714285714286, "grad_norm": 1.735245943069458, "learning_rate": 8.960270763091853e-06, "loss": 0.3105, "step": 292 },
    { "epoch": 1.3080357142857144, "grad_norm": 1.3750625848770142, "learning_rate": 8.953050396561904e-06, "loss": 0.3074, "step": 293 },
    { "epoch": 1.3125, "grad_norm": 1.5357894897460938, "learning_rate": 8.94580797672727e-06, "loss": 0.3292, "step": 294 },
    { "epoch": 1.3169642857142856, "grad_norm": 1.5508729219436646, "learning_rate": 8.938543543997129e-06, "loss": 0.2959, "step": 295 },
    { "epoch": 1.3214285714285714, "grad_norm": 1.7151294946670532, "learning_rate": 8.931257138903474e-06, "loss": 0.3023, "step": 296 },
    { "epoch": 1.3258928571428572, "grad_norm": 1.6977832317352295, "learning_rate": 8.923948802100891e-06, "loss": 0.352, "step": 297 },
    { "epoch": 1.3303571428571428, "grad_norm": 1.724831461906433, "learning_rate": 8.916618574366338e-06, "loss": 0.3564, "step": 298 },
    { "epoch": 1.3348214285714286, "grad_norm": 1.5105293989181519, "learning_rate": 8.909266496598917e-06, "loss": 0.315, "step": 299 },
    { "epoch": 1.3392857142857144, "grad_norm": 1.3590584993362427, "learning_rate": 8.901892609819632e-06, "loss": 0.2753, "step": 300 },
    { "epoch": 1.34375, "grad_norm": 1.5936496257781982, "learning_rate": 8.894496955171182e-06, "loss": 0.357, "step": 301 },
    { "epoch": 1.3482142857142856, "grad_norm": 1.6498358249664307, "learning_rate": 8.887079573917713e-06, "loss": 0.3318, "step": 302 },
    { "epoch": 1.3526785714285714, "grad_norm": 1.6604431867599487, "learning_rate": 8.879640507444598e-06, "loss": 0.3314, "step": 303 },
    { "epoch": 1.3571428571428572, "grad_norm": 1.5546607971191406, "learning_rate": 8.872179797258202e-06, "loss": 0.3534, "step": 304 },
    { "epoch": 1.3616071428571428, "grad_norm": 1.6600899696350098, "learning_rate": 8.86469748498565e-06, "loss": 0.3231, "step": 305 },
    { "epoch": 1.3660714285714286, "grad_norm": 1.4908391237258911, "learning_rate": 8.8571936123746e-06, "loss": 0.308, "step": 306 },
    { "epoch": 1.3705357142857144, "grad_norm": 1.4231539964675903, "learning_rate": 8.849668221293e-06, "loss": 0.3309, "step": 307 },
    { "epoch": 1.375, "grad_norm": 1.4389381408691406, "learning_rate": 8.842121353728867e-06, "loss": 0.3231, "step": 308 },
    { "epoch": 1.3794642857142856, "grad_norm": 1.5428330898284912, "learning_rate": 8.834553051790044e-06, "loss": 0.3523, "step": 309 },
    { "epoch": 1.3839285714285714, "grad_norm": 1.3884952068328857, "learning_rate": 8.826963357703964e-06, "loss": 0.2901, "step": 310 },
    { "epoch": 1.3883928571428572, "grad_norm": 1.549255609512329, "learning_rate": 8.819352313817424e-06, "loss": 0.3387, "step": 311 },
    { "epoch": 1.3928571428571428, "grad_norm": 1.483252763748169, "learning_rate": 8.811719962596338e-06, "loss": 0.3238, "step": 312 },
    { "epoch": 1.3973214285714286, "grad_norm": 1.4185503721237183, "learning_rate": 8.804066346625506e-06, "loss": 0.314, "step": 313 },
    { "epoch": 1.4017857142857144, "grad_norm": 1.6147483587265015, "learning_rate": 8.796391508608372e-06, "loss": 0.3628, "step": 314 },
    { "epoch": 1.40625, "grad_norm": 1.513182282447815, "learning_rate": 8.788695491366795e-06, "loss": 0.3209, "step": 315 },
    { "epoch": 1.4107142857142856, "grad_norm": 1.4564861059188843, "learning_rate": 8.780978337840796e-06, "loss": 0.3037, "step": 316 },
    { "epoch": 1.4151785714285714, "grad_norm": 1.4925004243850708, "learning_rate": 8.773240091088335e-06, "loss": 0.3171, "step": 317 },
    { "epoch": 1.4196428571428572, "grad_norm": 1.5218322277069092, "learning_rate": 8.765480794285054e-06, "loss": 0.3329, "step": 318 },
    { "epoch": 1.4241071428571428, "grad_norm": 1.4326412677764893, "learning_rate": 8.757700490724046e-06, "loss": 0.3119, "step": 319 },
    { "epoch": 1.4285714285714286, "grad_norm": 1.6294825077056885, "learning_rate": 8.749899223815618e-06, "loss": 0.3568, "step": 320 },
    { "epoch": 1.4330357142857144, "grad_norm": 1.4534196853637695, "learning_rate": 8.742077037087032e-06, "loss": 0.3247, "step": 321 },
    { "epoch": 1.4375, "grad_norm": 1.4942129850387573, "learning_rate": 8.734233974182276e-06, "loss": 0.3189, "step": 322 },
    { "epoch": 1.4419642857142856, "grad_norm": 1.6860888004302979, "learning_rate": 8.726370078861825e-06, "loss": 0.3466, "step": 323 },
    { "epoch": 1.4464285714285714, "grad_norm": 1.5800039768218994, "learning_rate": 8.718485395002377e-06, "loss": 0.3636, "step": 324 },
    { "epoch": 1.4508928571428572, "grad_norm": 1.4802786111831665, "learning_rate": 8.710579966596625e-06, "loss": 0.3442, "step": 325 },
    { "epoch": 1.4553571428571428, "grad_norm": 1.4833781719207764, "learning_rate": 8.702653837753005e-06, "loss": 0.3162, "step": 326 },
    { "epoch": 1.4598214285714286, "grad_norm": 1.6002053022384644, "learning_rate": 8.694707052695459e-06, "loss": 0.3312, "step": 327 },
    { "epoch": 1.4642857142857144, "grad_norm": 1.5287590026855469, "learning_rate": 8.686739655763166e-06, "loss": 0.3525, "step": 328 },
    { "epoch": 1.46875, "grad_norm": 1.4307093620300293, "learning_rate": 8.678751691410323e-06, "loss": 0.3121, "step": 329 },
    { "epoch": 1.4732142857142856, "grad_norm": 1.4360722303390503, "learning_rate": 8.670743204205875e-06, "loss": 0.3461, "step": 330 },
    { "epoch": 1.4776785714285714, "grad_norm": 1.3801894187927246, "learning_rate": 8.662714238833278e-06, "loss": 0.2942, "step": 331 },
    { "epoch": 1.4821428571428572, "grad_norm": 1.4250258207321167, "learning_rate": 8.654664840090247e-06, "loss": 0.3665, "step": 332 },
    { "epoch": 1.4866071428571428, "grad_norm": 1.5609809160232544, "learning_rate": 8.6465950528885e-06, "loss": 0.3197, "step": 333 },
    { "epoch": 1.4910714285714286, "grad_norm": 1.4296934604644775, "learning_rate": 8.638504922253518e-06, "loss": 0.3132, "step": 334 },
    { "epoch": 1.4955357142857144, "grad_norm": 1.4732247591018677, "learning_rate": 8.63039449332429e-06, "loss": 0.3335, "step": 335 },
    { "epoch": 1.5, "grad_norm": 1.5229887962341309, "learning_rate": 8.62226381135305e-06, "loss": 0.3197, "step": 336 },
    { "epoch": 1.5044642857142856, "grad_norm": 1.5183864831924438, "learning_rate": 8.614112921705045e-06, "loss": 0.3109, "step": 337 },
    { "epoch": 1.5089285714285714, "grad_norm": 1.5982593297958374, "learning_rate": 8.605941869858265e-06, "loss": 0.3363, "step": 338 },
    { "epoch": 1.5133928571428572, "grad_norm": 1.5128353834152222, "learning_rate": 8.597750701403197e-06, "loss": 0.3177, "step": 339 },
    { "epoch": 1.5178571428571428, "grad_norm": 1.383318305015564, "learning_rate": 8.589539462042566e-06, "loss": 0.3099, "step": 340 },
    { "epoch": 1.5223214285714286, "grad_norm": 1.4567229747772217, "learning_rate": 8.581308197591088e-06, "loss": 0.3105, "step": 341 },
    { "epoch": 1.5267857142857144, "grad_norm": 1.5694351196289062, "learning_rate": 8.573056953975208e-06, "loss": 0.3521, "step": 342 },
    { "epoch": 1.5267857142857144, "eval_loss": 0.6054974794387817, "eval_runtime": 3.4752, "eval_samples_per_second": 17.265, "eval_steps_per_second": 1.151, "step": 342 },
    { "epoch": 1.53125, "grad_norm": 1.499888300895691, "learning_rate": 8.56478577723284e-06, "loss": 0.3472, "step": 343 },
    { "epoch": 1.5357142857142856, "grad_norm": 1.5075103044509888, "learning_rate": 8.556494713513123e-06, "loss": 0.3353, "step": 344 },
    { "epoch": 1.5401785714285714, "grad_norm": 1.4308085441589355, "learning_rate": 8.548183809076146e-06, "loss": 0.31, "step": 345 },
    { "epoch": 1.5446428571428572, "grad_norm": 1.4208667278289795, "learning_rate": 8.539853110292708e-06, "loss": 0.3331, "step": 346 },
    { "epoch": 1.5491071428571428, "grad_norm": 1.6008033752441406, "learning_rate": 8.531502663644046e-06, "loss": 0.3568, "step": 347 },
    { "epoch": 1.5535714285714286, "grad_norm": 1.471472978591919, "learning_rate": 8.523132515721586e-06, "loss": 0.3379, "step": 348 },
    { "epoch": 1.5580357142857144, "grad_norm": 1.4036483764648438, "learning_rate": 8.51474271322667e-06, "loss": 0.3488, "step": 349 },
    { "epoch": 1.5625, "grad_norm": 1.4578651189804077, "learning_rate": 8.506333302970306e-06, "loss": 0.3206, "step": 350 },
    { "epoch": 1.5669642857142856, "grad_norm": 1.3492405414581299, "learning_rate": 8.497904331872909e-06, "loss": 0.2995, "step": 351 },
    { "epoch": 1.5714285714285714, "grad_norm": 1.5430017709732056, "learning_rate": 8.489455846964027e-06, "loss": 0.3503, "step": 352 },
    { "epoch": 1.5758928571428572, "grad_norm": 1.4253814220428467, "learning_rate": 8.480987895382086e-06, "loss": 0.3621, "step": 353 },
    { "epoch": 1.5803571428571428, "grad_norm": 1.615607500076294, "learning_rate": 8.472500524374129e-06, "loss": 0.3578, "step": 354 },
    { "epoch": 1.5848214285714286, "grad_norm": 1.4999765157699585, "learning_rate": 8.463993781295552e-06, "loss": 0.3241, "step": 355 },
    { "epoch": 1.5892857142857144, "grad_norm": 1.4261177778244019, "learning_rate": 8.45546771360983e-06, "loss": 0.3021, "step": 356 },
    { "epoch": 1.59375, "grad_norm": 1.4563229084014893, "learning_rate": 8.44692236888827e-06, "loss": 0.3194, "step": 357 },
    { "epoch": 1.5982142857142856, "grad_norm": 1.3366395235061646, "learning_rate": 8.43835779480973e-06, "loss": 0.3295, "step": 358 },
    { "epoch": 1.6026785714285714, "grad_norm": 1.6574403047561646, "learning_rate": 8.429774039160355e-06, "loss": 0.3375, "step": 359 },
    { "epoch": 1.6071428571428572, "grad_norm": 1.7103718519210815, "learning_rate": 8.421171149833322e-06, "loss": 0.3367, "step": 360 },
    { "epoch": 1.6116071428571428, "grad_norm": 1.6518172025680542, "learning_rate": 8.412549174828558e-06, "loss": 0.3802, "step": 361 },
    { "epoch": 1.6160714285714286, "grad_norm": 1.59016752243042, "learning_rate": 8.403908162252481e-06, "loss": 0.3342, "step": 362 },
    { "epoch": 1.6205357142857144, "grad_norm": 1.6090682744979858, "learning_rate": 8.395248160317728e-06, "loss": 0.3196, "step": 363 },
    { "epoch": 1.625, "grad_norm": 1.4735345840454102, "learning_rate": 8.386569217342893e-06, "loss": 0.3211, "step": 364 },
    { "epoch": 1.6294642857142856, "grad_norm": 1.554945707321167, "learning_rate": 8.377871381752246e-06, "loss": 0.3309, "step": 365 },
    { "epoch": 1.6339285714285714, "grad_norm": 1.4980627298355103, "learning_rate": 8.369154702075466e-06, "loss": 0.3394, "step": 366 },
    { "epoch": 1.6383928571428572, "grad_norm": 1.5292226076126099, "learning_rate": 8.360419226947383e-06, "loss": 0.319, "step": 367
|
}, |
|
{ |
|
"epoch": 1.6428571428571428, |
|
"grad_norm": 1.5913363695144653, |
|
"learning_rate": 8.351665005107686e-06, |
|
"loss": 0.3279, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.6473214285714286, |
|
"grad_norm": 1.526674509048462, |
|
"learning_rate": 8.34289208540067e-06, |
|
"loss": 0.3564, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.6517857142857144, |
|
"grad_norm": 1.5326498746871948, |
|
"learning_rate": 8.334100516774946e-06, |
|
"loss": 0.3462, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.65625, |
|
"grad_norm": 1.512019395828247, |
|
"learning_rate": 8.325290348283186e-06, |
|
"loss": 0.3624, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.6607142857142856, |
|
"grad_norm": 1.5395188331604004, |
|
"learning_rate": 8.316461629081833e-06, |
|
"loss": 0.3028, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.6651785714285714, |
|
"grad_norm": 1.4373003244400024, |
|
"learning_rate": 8.307614408430839e-06, |
|
"loss": 0.3623, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.6696428571428572, |
|
"grad_norm": 1.5330880880355835, |
|
"learning_rate": 8.298748735693382e-06, |
|
"loss": 0.3412, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 1.6741071428571428, |
|
"grad_norm": 1.408527135848999, |
|
"learning_rate": 8.289864660335595e-06, |
|
"loss": 0.3248, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.6785714285714286, |
|
"grad_norm": 1.5303860902786255, |
|
"learning_rate": 8.280962231926288e-06, |
|
"loss": 0.3394, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.6830357142857144, |
|
"grad_norm": 1.5478315353393555, |
|
"learning_rate": 8.27204150013667e-06, |
|
"loss": 0.334, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 1.6875, |
|
"grad_norm": 1.4640541076660156, |
|
"learning_rate": 8.263102514740082e-06, |
|
"loss": 0.3116, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.6919642857142856, |
|
"grad_norm": 1.5886211395263672, |
|
"learning_rate": 8.2541453256117e-06, |
|
"loss": 0.3146, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 1.6964285714285714, |
|
"grad_norm": 1.450217366218567, |
|
"learning_rate": 8.245169982728276e-06, |
|
"loss": 0.3425, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.7008928571428572, |
|
"grad_norm": 1.4083800315856934, |
|
"learning_rate": 8.23617653616785e-06, |
|
"loss": 0.3078, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.7053571428571428, |
|
"grad_norm": 1.4849356412887573, |
|
"learning_rate": 8.227165036109468e-06, |
|
"loss": 0.328, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 1.7098214285714286, |
|
"grad_norm": 1.6867343187332153, |
|
"learning_rate": 8.218135532832909e-06, |
|
"loss": 0.3688, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.7142857142857144, |
|
"grad_norm": 1.5400327444076538, |
|
"learning_rate": 8.209088076718398e-06, |
|
"loss": 0.3378, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.71875, |
|
"grad_norm": 1.5973129272460938, |
|
"learning_rate": 8.20002271824633e-06, |
|
"loss": 0.3734, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.7232142857142856, |
|
"grad_norm": 1.532235860824585, |
|
"learning_rate": 8.190939507996992e-06, |
|
"loss": 0.3582, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.7276785714285714, |
|
"grad_norm": 1.5533599853515625, |
|
"learning_rate": 8.181838496650266e-06, |
|
"loss": 0.3479, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.7321428571428572, |
|
"grad_norm": 1.6157996654510498, |
|
"learning_rate": 8.17271973498536e-06, |
|
"loss": 0.3596, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.7366071428571428, |
|
"grad_norm": 1.5097051858901978, |
|
"learning_rate": 8.163583273880519e-06, |
|
"loss": 0.3298, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.7410714285714286, |
|
"grad_norm": 1.4383480548858643, |
|
"learning_rate": 8.154429164312742e-06, |
|
"loss": 0.3054, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.7455357142857144, |
|
"grad_norm": 1.43813157081604, |
|
"learning_rate": 8.145257457357502e-06, |
|
"loss": 0.3249, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 1.5752897262573242, |
|
"learning_rate": 8.136068204188448e-06, |
|
"loss": 0.3342, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.7544642857142856, |
|
"grad_norm": 1.3768292665481567, |
|
"learning_rate": 8.12686145607714e-06, |
|
"loss": 0.3069, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.7589285714285714, |
|
"grad_norm": 1.6397638320922852, |
|
"learning_rate": 8.11763726439274e-06, |
|
"loss": 0.3843, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.7633928571428572, |
|
"grad_norm": 1.5874100923538208, |
|
"learning_rate": 8.108395680601742e-06, |
|
"loss": 0.3226, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.7678571428571428, |
|
"grad_norm": 1.5031986236572266, |
|
"learning_rate": 8.099136756267682e-06, |
|
"loss": 0.3176, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.7723214285714286, |
|
"grad_norm": 1.420452356338501, |
|
"learning_rate": 8.089860543050843e-06, |
|
"loss": 0.3259, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.7767857142857144, |
|
"grad_norm": 1.6253381967544556, |
|
"learning_rate": 8.080567092707973e-06, |
|
"loss": 0.3457, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.78125, |
|
"grad_norm": 1.4841629266738892, |
|
"learning_rate": 8.071256457091994e-06, |
|
"loss": 0.3201, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.78125, |
|
"eval_loss": 0.6055915951728821, |
|
"eval_runtime": 3.4629, |
|
"eval_samples_per_second": 17.327, |
|
"eval_steps_per_second": 1.155, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.7857142857142856, |
|
"grad_norm": 1.4217731952667236, |
|
"learning_rate": 8.06192868815172e-06, |
|
"loss": 0.3229, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.7901785714285714, |
|
"grad_norm": 1.5509140491485596, |
|
"learning_rate": 8.05258383793155e-06, |
|
"loss": 0.3586, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 1.7946428571428572, |
|
"grad_norm": 1.4150621891021729, |
|
"learning_rate": 8.043221958571193e-06, |
|
"loss": 0.3282, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.7991071428571428, |
|
"grad_norm": 1.4854953289031982, |
|
"learning_rate": 8.033843102305376e-06, |
|
"loss": 0.3426, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 1.8035714285714286, |
|
"grad_norm": 1.3903532028198242, |
|
"learning_rate": 8.024447321463545e-06, |
|
"loss": 0.3215, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 1.8080357142857144, |
|
"grad_norm": 1.4859530925750732, |
|
"learning_rate": 8.015034668469576e-06, |
|
"loss": 0.3505, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.8125, |
|
"grad_norm": 1.5304354429244995, |
|
"learning_rate": 8.005605195841485e-06, |
|
"loss": 0.323, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.8169642857142856, |
|
"grad_norm": 1.44026517868042, |
|
"learning_rate": 7.996158956191135e-06, |
|
"loss": 0.2791, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 1.8214285714285714, |
|
"grad_norm": 1.3094749450683594, |
|
"learning_rate": 7.986696002223936e-06, |
|
"loss": 0.2975, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.8258928571428572, |
|
"grad_norm": 1.4733068943023682, |
|
"learning_rate": 7.97721638673856e-06, |
|
"loss": 0.3331, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 1.8303571428571428, |
|
"grad_norm": 1.403172254562378, |
|
"learning_rate": 7.967720162626643e-06, |
|
"loss": 0.3336, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.8348214285714286, |
|
"grad_norm": 1.4719949960708618, |
|
"learning_rate": 7.958207382872486e-06, |
|
"loss": 0.3417, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.8392857142857144, |
|
"grad_norm": 1.4690333604812622, |
|
"learning_rate": 7.94867810055276e-06, |
|
"loss": 0.307, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.84375, |
|
"grad_norm": 1.5231949090957642, |
|
"learning_rate": 7.93913236883622e-06, |
|
"loss": 0.3421, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 1.8482142857142856, |
|
"grad_norm": 1.6096547842025757, |
|
"learning_rate": 7.929570240983393e-06, |
|
"loss": 0.3273, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.8526785714285714, |
|
"grad_norm": 1.3807638883590698, |
|
"learning_rate": 7.919991770346295e-06, |
|
"loss": 0.3431, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.8571428571428572, |
|
"grad_norm": 1.413569688796997, |
|
"learning_rate": 7.910397010368122e-06, |
|
"loss": 0.33, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.8616071428571428, |
|
"grad_norm": 1.435351014137268, |
|
"learning_rate": 7.900786014582957e-06, |
|
"loss": 0.3581, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.8660714285714286, |
|
"grad_norm": 1.4194928407669067, |
|
"learning_rate": 7.891158836615472e-06, |
|
"loss": 0.2989, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.8705357142857144, |
|
"grad_norm": 1.4297808408737183, |
|
"learning_rate": 7.881515530180629e-06, |
|
"loss": 0.3542, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 1.4238284826278687, |
|
"learning_rate": 7.871856149083377e-06, |
|
"loss": 0.3257, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.8794642857142856, |
|
"grad_norm": 1.3757729530334473, |
|
"learning_rate": 7.862180747218354e-06, |
|
"loss": 0.3359, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 1.8839285714285714, |
|
"grad_norm": 1.5045543909072876, |
|
"learning_rate": 7.852489378569588e-06, |
|
"loss": 0.3401, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.8883928571428572, |
|
"grad_norm": 1.4873552322387695, |
|
"learning_rate": 7.84278209721019e-06, |
|
"loss": 0.3424, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.8928571428571428, |
|
"grad_norm": 1.2040387392044067, |
|
"learning_rate": 7.83305895730206e-06, |
|
"loss": 0.2713, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.8973214285714286, |
|
"grad_norm": 1.522424340248108, |
|
"learning_rate": 7.823320013095578e-06, |
|
"loss": 0.3308, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.9017857142857144, |
|
"grad_norm": 1.4239548444747925, |
|
"learning_rate": 7.81356531892931e-06, |
|
"loss": 0.3225, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.90625, |
|
"grad_norm": 1.491533875465393, |
|
"learning_rate": 7.803794929229689e-06, |
|
"loss": 0.3185, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 1.9107142857142856, |
|
"grad_norm": 1.5129714012145996, |
|
"learning_rate": 7.794008898510731e-06, |
|
"loss": 0.3497, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.9151785714285714, |
|
"grad_norm": 1.5560288429260254, |
|
"learning_rate": 7.784207281373716e-06, |
|
"loss": 0.36, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.9196428571428572, |
|
"grad_norm": 1.469099998474121, |
|
"learning_rate": 7.774390132506892e-06, |
|
"loss": 0.3236, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.9241071428571428, |
|
"grad_norm": 1.5299158096313477, |
|
"learning_rate": 7.764557506685162e-06, |
|
"loss": 0.3857, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 1.9285714285714286, |
|
"grad_norm": 1.5339336395263672, |
|
"learning_rate": 7.754709458769787e-06, |
|
"loss": 0.3451, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.9330357142857144, |
|
"grad_norm": 1.4419758319854736, |
|
"learning_rate": 7.744846043708076e-06, |
|
"loss": 0.3194, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 1.9375, |
|
"grad_norm": 1.580003023147583, |
|
"learning_rate": 7.734967316533074e-06, |
|
"loss": 0.3381, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.9419642857142856, |
|
"grad_norm": 1.5472431182861328, |
|
"learning_rate": 7.725073332363265e-06, |
|
"loss": 0.3528, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.9464285714285714, |
|
"grad_norm": 1.468518853187561, |
|
"learning_rate": 7.715164146402259e-06, |
|
"loss": 0.3329, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.9508928571428572, |
|
"grad_norm": 1.5422027111053467, |
|
"learning_rate": 7.705239813938486e-06, |
|
"loss": 0.3392, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 1.9553571428571428, |
|
"grad_norm": 1.3538919687271118, |
|
"learning_rate": 7.69530039034488e-06, |
|
"loss": 0.3179, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.9598214285714286, |
|
"grad_norm": 1.3986053466796875, |
|
"learning_rate": 7.685345931078579e-06, |
|
"loss": 0.3533, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 1.9642857142857144, |
|
"grad_norm": 1.484214186668396, |
|
"learning_rate": 7.675376491680617e-06, |
|
"loss": 0.3466, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.96875, |
|
"grad_norm": 1.5053229331970215, |
|
"learning_rate": 7.665392127775605e-06, |
|
"loss": 0.3464, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.9732142857142856, |
|
"grad_norm": 1.4051772356033325, |
|
"learning_rate": 7.65539289507143e-06, |
|
"loss": 0.3184, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.9776785714285714, |
|
"grad_norm": 1.3952518701553345, |
|
"learning_rate": 7.645378849358931e-06, |
|
"loss": 0.2796, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 1.9821428571428572, |
|
"grad_norm": 1.409744381904602, |
|
"learning_rate": 7.635350046511609e-06, |
|
"loss": 0.3369, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.9866071428571428, |
|
"grad_norm": 1.5328154563903809, |
|
"learning_rate": 7.625306542485289e-06, |
|
"loss": 0.3415, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.9910714285714286, |
|
"grad_norm": 1.4076818227767944, |
|
"learning_rate": 7.615248393317833e-06, |
|
"loss": 0.3431, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.9955357142857144, |
|
"grad_norm": 1.6852420568466187, |
|
"learning_rate": 7.605175655128809e-06, |
|
"loss": 0.3177, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.4953469038009644, |
|
"learning_rate": 7.595088384119186e-06, |
|
"loss": 0.3471, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 2.0044642857142856, |
|
"grad_norm": 1.3344584703445435, |
|
"learning_rate": 7.58498663657102e-06, |
|
"loss": 0.2074, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 2.0089285714285716, |
|
"grad_norm": 1.3186531066894531, |
|
"learning_rate": 7.57487046884714e-06, |
|
"loss": 0.2191, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.013392857142857, |
|
"grad_norm": 1.155795931816101, |
|
"learning_rate": 7.5647399373908296e-06, |
|
"loss": 0.1733, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 2.017857142857143, |
|
"grad_norm": 1.2673778533935547, |
|
"learning_rate": 7.554595098725515e-06, |
|
"loss": 0.2102, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 2.0223214285714284, |
|
"grad_norm": 1.1908435821533203, |
|
"learning_rate": 7.544436009454454e-06, |
|
"loss": 0.1796, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 2.0267857142857144, |
|
"grad_norm": 1.380186676979065, |
|
"learning_rate": 7.534262726260413e-06, |
|
"loss": 0.2077, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 2.03125, |
|
"grad_norm": 1.4421330690383911, |
|
"learning_rate": 7.524075305905351e-06, |
|
"loss": 0.1972, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.0357142857142856, |
|
"grad_norm": 1.6806164979934692, |
|
"learning_rate": 7.513873805230111e-06, |
|
"loss": 0.2034, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 2.0357142857142856, |
|
"eval_loss": 0.6833150386810303, |
|
"eval_runtime": 3.4113, |
|
"eval_samples_per_second": 17.588, |
|
"eval_steps_per_second": 1.173, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 2.0401785714285716, |
|
"grad_norm": 1.8982454538345337, |
|
"learning_rate": 7.5036582811540935e-06, |
|
"loss": 0.2251, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 2.044642857142857, |
|
"grad_norm": 1.90556001663208, |
|
"learning_rate": 7.493428790674943e-06, |
|
"loss": 0.1763, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 2.049107142857143, |
|
"grad_norm": 1.8609174489974976, |
|
"learning_rate": 7.483185390868232e-06, |
|
"loss": 0.1971, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 2.0535714285714284, |
|
"grad_norm": 1.6848630905151367, |
|
"learning_rate": 7.472928138887134e-06, |
|
"loss": 0.1822, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.0580357142857144, |
|
"grad_norm": 1.8364639282226562, |
|
"learning_rate": 7.462657091962122e-06, |
|
"loss": 0.2003, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 2.0625, |
|
"grad_norm": 1.6772575378417969, |
|
"learning_rate": 7.452372307400626e-06, |
|
"loss": 0.1848, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 2.0669642857142856, |
|
"grad_norm": 1.6733424663543701, |
|
"learning_rate": 7.442073842586733e-06, |
|
"loss": 0.1877, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 2.0714285714285716, |
|
"grad_norm": 1.5740326642990112, |
|
"learning_rate": 7.43176175498086e-06, |
|
"loss": 0.1693, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 2.075892857142857, |
|
"grad_norm": 1.621090054512024, |
|
"learning_rate": 7.421436102119427e-06, |
|
"loss": 0.2113, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.080357142857143, |
|
"grad_norm": 1.4219175577163696, |
|
"learning_rate": 7.411096941614543e-06, |
|
"loss": 0.1687, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 2.0848214285714284, |
|
"grad_norm": 1.4048285484313965, |
|
"learning_rate": 7.400744331153684e-06, |
|
"loss": 0.1832, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 2.0892857142857144, |
|
"grad_norm": 1.2322577238082886, |
|
"learning_rate": 7.390378328499372e-06, |
|
"loss": 0.1558, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 2.09375, |
|
"grad_norm": 1.5150905847549438, |
|
"learning_rate": 7.3799989914888506e-06, |
|
"loss": 0.2065, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 2.0982142857142856, |
|
"grad_norm": 1.4450438022613525, |
|
"learning_rate": 7.3696063780337566e-06, |
|
"loss": 0.1938, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.1026785714285716, |
|
"grad_norm": 1.6060644388198853, |
|
"learning_rate": 7.359200546119813e-06, |
|
"loss": 0.1865, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 2.107142857142857, |
|
"grad_norm": 1.4423115253448486, |
|
"learning_rate": 7.3487815538064865e-06, |
|
"loss": 0.1867, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 2.111607142857143, |
|
"grad_norm": 1.3800395727157593, |
|
"learning_rate": 7.338349459226678e-06, |
|
"loss": 0.1674, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 2.1160714285714284, |
|
"grad_norm": 1.5749934911727905, |
|
"learning_rate": 7.327904320586387e-06, |
|
"loss": 0.2004, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 2.1205357142857144, |
|
"grad_norm": 1.5107316970825195, |
|
"learning_rate": 7.3174461961644e-06, |
|
"loss": 0.1827, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"grad_norm": 1.3470244407653809, |
|
"learning_rate": 7.3069751443119505e-06, |
|
"loss": 0.1696, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 2.1294642857142856, |
|
"grad_norm": 1.3709880113601685, |
|
"learning_rate": 7.296491223452407e-06, |
|
"loss": 0.1768, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 2.1339285714285716, |
|
"grad_norm": 1.4795838594436646, |
|
"learning_rate": 7.285994492080934e-06, |
|
"loss": 0.1915, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 2.138392857142857, |
|
"grad_norm": 1.5192097425460815, |
|
"learning_rate": 7.275485008764183e-06, |
|
"loss": 0.192, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 2.142857142857143, |
|
"grad_norm": 1.7044153213500977, |
|
"learning_rate": 7.2649628321399415e-06, |
|
"loss": 0.1945, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.1473214285714284, |
|
"grad_norm": 1.4719630479812622, |
|
"learning_rate": 7.254428020916829e-06, |
|
"loss": 0.1842, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 2.1517857142857144, |
|
"grad_norm": 1.4197455644607544, |
|
"learning_rate": 7.243880633873957e-06, |
|
"loss": 0.1977, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 2.15625, |
|
"grad_norm": 1.5410860776901245, |
|
"learning_rate": 7.2333207298606075e-06, |
|
"loss": 0.1899, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 2.1607142857142856, |
|
"grad_norm": 1.4274650812149048, |
|
"learning_rate": 7.222748367795892e-06, |
|
"loss": 0.1977, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 2.1651785714285716, |
|
"grad_norm": 1.4208344221115112, |
|
"learning_rate": 7.212163606668442e-06, |
|
"loss": 0.1799, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 2.169642857142857, |
|
"grad_norm": 1.4087142944335938, |
|
"learning_rate": 7.201566505536065e-06, |
|
"loss": 0.1876, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 2.174107142857143, |
|
"grad_norm": 1.5957927703857422, |
|
"learning_rate": 7.190957123525417e-06, |
|
"loss": 0.1926, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 2.1785714285714284, |
|
"grad_norm": 1.4640475511550903, |
|
"learning_rate": 7.180335519831685e-06, |
|
"loss": 0.1962, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 2.1830357142857144, |
|
"grad_norm": 1.6668205261230469, |
|
"learning_rate": 7.169701753718232e-06, |
|
"loss": 0.1923, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"grad_norm": 1.4836277961730957, |
|
"learning_rate": 7.159055884516297e-06, |
|
"loss": 0.1698, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.1919642857142856, |
|
"grad_norm": 1.4313628673553467, |
|
"learning_rate": 7.148397971624636e-06, |
|
"loss": 0.1823, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 2.1964285714285716, |
|
"grad_norm": 1.4819891452789307, |
|
"learning_rate": 7.137728074509211e-06, |
|
"loss": 0.2001, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 2.200892857142857, |
|
"grad_norm": 1.4184925556182861, |
|
"learning_rate": 7.127046252702847e-06, |
|
"loss": 0.1985, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 2.205357142857143, |
|
"grad_norm": 1.4677350521087646, |
|
"learning_rate": 7.116352565804904e-06, |
|
"loss": 0.1803, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 2.2098214285714284, |
|
"grad_norm": 1.4225105047225952, |
|
"learning_rate": 7.105647073480939e-06, |
|
"loss": 0.1934, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 2.2142857142857144, |
|
"grad_norm": 1.5292863845825195, |
|
"learning_rate": 7.0949298354623855e-06, |
|
"loss": 0.1983, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 2.21875, |
|
"grad_norm": 1.6830313205718994, |
|
"learning_rate": 7.084200911546205e-06, |
|
"loss": 0.2033, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 2.2232142857142856, |
|
"grad_norm": 1.462639331817627, |
|
"learning_rate": 7.073460361594565e-06, |
|
"loss": 0.1775, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 2.2276785714285716, |
|
"grad_norm": 1.3703210353851318, |
|
"learning_rate": 7.0627082455344984e-06, |
|
"loss": 0.167, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 2.232142857142857, |
|
"grad_norm": 1.4754226207733154, |
|
"learning_rate": 7.0519446233575715e-06, |
|
"loss": 0.1945, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.236607142857143, |
|
"grad_norm": 1.5474300384521484, |
|
"learning_rate": 7.041169555119552e-06, |
|
"loss": 0.1792, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 2.2410714285714284, |
|
"grad_norm": 1.745498776435852, |
|
"learning_rate": 7.030383100940068e-06, |
|
"loss": 0.2106, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 2.2455357142857144, |
|
"grad_norm": 1.510685920715332, |
|
"learning_rate": 7.019585321002276e-06, |
|
"loss": 0.1952, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 1.4274405241012573, |
|
"learning_rate": 7.008776275552522e-06, |
|
"loss": 0.1823, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 2.2544642857142856, |
|
"grad_norm": 1.5458096265792847, |
|
"learning_rate": 6.997956024900014e-06, |
|
"loss": 0.1768, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 2.2589285714285716, |
|
"grad_norm": 1.645890474319458, |
|
"learning_rate": 6.9871246294164775e-06, |
|
"loss": 0.1986, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 2.263392857142857, |
|
"grad_norm": 1.677795171737671, |
|
"learning_rate": 6.9762821495358194e-06, |
|
"loss": 0.2074, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 2.267857142857143, |
|
"grad_norm": 1.4883826971054077, |
|
"learning_rate": 6.965428645753792e-06, |
|
"loss": 0.1853, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 2.2723214285714284, |
|
"grad_norm": 1.3959269523620605, |
|
"learning_rate": 6.954564178627655e-06, |
|
"loss": 0.1731, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 2.2767857142857144, |
|
"grad_norm": 1.4190350770950317, |
|
"learning_rate": 6.943688808775843e-06, |
|
"loss": 0.1996, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.28125, |
|
"grad_norm": 1.4735363721847534, |
|
"learning_rate": 6.9328025968776155e-06, |
|
"loss": 0.1716, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 2.2857142857142856, |
|
"grad_norm": 1.458999752998352, |
|
"learning_rate": 6.921905603672733e-06, |
|
"loss": 0.1894, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 2.2901785714285716, |
|
"grad_norm": 1.522794246673584, |
|
"learning_rate": 6.910997889961098e-06, |
|
"loss": 0.2079, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 2.2901785714285716, |
|
"eval_loss": 0.6830385327339172, |
|
"eval_runtime": 3.1216, |
|
"eval_samples_per_second": 19.221, |
|
"eval_steps_per_second": 1.281, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 2.294642857142857, |
|
"grad_norm": 1.3990702629089355, |
|
"learning_rate": 6.900079516602445e-06, |
|
"loss": 0.1609, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 2.299107142857143, |
|
"grad_norm": 1.5663005113601685, |
|
"learning_rate": 6.889150544515972e-06, |
|
"loss": 0.1873, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 2.3035714285714284, |
|
"grad_norm": 1.6061094999313354, |
|
"learning_rate": 6.8782110346800155e-06, |
|
"loss": 0.1913, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 2.3080357142857144, |
|
"grad_norm": 1.5188199281692505, |
|
"learning_rate": 6.867261048131712e-06, |
|
"loss": 0.1748, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 2.3125, |
|
"grad_norm": 1.574725866317749, |
|
"learning_rate": 6.856300645966645e-06, |
|
"loss": 0.1963, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 2.3169642857142856, |
|
"grad_norm": 1.462173581123352, |
|
"learning_rate": 6.845329889338519e-06, |
|
"loss": 0.1942, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 2.3214285714285716, |
|
"grad_norm": 1.5870957374572754, |
|
"learning_rate": 6.834348839458806e-06, |
|
"loss": 0.1953, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.325892857142857, |
|
"grad_norm": 1.4638605117797852, |
|
"learning_rate": 6.823357557596416e-06, |
|
"loss": 0.173, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 2.330357142857143, |
|
"grad_norm": 1.6311603784561157, |
|
"learning_rate": 6.81235610507734e-06, |
|
"loss": 0.1966, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 2.3348214285714284, |
|
"grad_norm": 1.397397756576538, |
|
"learning_rate": 6.801344543284324e-06, |
|
"loss": 0.1798, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 2.3392857142857144, |
|
"grad_norm": 1.4249341487884521, |
|
"learning_rate": 6.790322933656515e-06, |
|
"loss": 0.1752, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 2.34375, |
|
"grad_norm": 1.5181248188018799, |
|
"learning_rate": 6.779291337689122e-06, |
|
"loss": 0.1919, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 2.3482142857142856, |
|
"grad_norm": 1.5693392753601074, |
|
"learning_rate": 6.768249816933074e-06, |
|
"loss": 0.2068, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 2.3526785714285716, |
|
"grad_norm": 1.3944882154464722, |
|
"learning_rate": 6.757198432994674e-06, |
|
"loss": 0.1921, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 2.357142857142857, |
|
"grad_norm": 1.467804193496704, |
|
"learning_rate": 6.7461372475352585e-06, |
|
"loss": 0.1736, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 2.361607142857143, |
|
"grad_norm": 1.5602641105651855, |
|
"learning_rate": 6.73506632227085e-06, |
|
"loss": 0.2016, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 2.3660714285714284, |
|
"grad_norm": 1.4512310028076172, |
|
"learning_rate": 6.723985718971818e-06, |
|
"loss": 0.1734, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.3705357142857144, |
|
"grad_norm": 1.5196360349655151, |
|
"learning_rate": 6.712895499462527e-06, |
|
"loss": 0.1815, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"grad_norm": 1.38957679271698, |
|
"learning_rate": 6.701795725620995e-06, |
|
"loss": 0.1815, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 2.3794642857142856, |
|
"grad_norm": 1.3005454540252686, |
|
"learning_rate": 6.69068645937855e-06, |
|
"loss": 0.1751, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 2.3839285714285716, |
|
"grad_norm": 1.3817120790481567, |
|
"learning_rate": 6.6795677627194835e-06, |
|
"loss": 0.1728, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 2.388392857142857, |
|
"grad_norm": 1.559007167816162, |
|
"learning_rate": 6.668439697680704e-06, |
|
"loss": 0.1765, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 2.392857142857143, |
|
"grad_norm": 1.5712296962738037, |
|
"learning_rate": 6.65730232635139e-06, |
|
"loss": 0.2081, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 2.3973214285714284, |
|
"grad_norm": 1.3464829921722412, |
|
"learning_rate": 6.6461557108726435e-06, |
|
"loss": 0.1874, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 2.4017857142857144, |
|
"grad_norm": 1.4296271800994873, |
|
"learning_rate": 6.634999913437148e-06, |
|
"loss": 0.1625, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 2.40625, |
|
"grad_norm": 1.5054280757904053, |
|
"learning_rate": 6.623834996288816e-06, |
|
"loss": 0.185, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 2.4107142857142856, |
|
"grad_norm": 1.50546133518219, |
|
"learning_rate": 6.6126610217224405e-06, |
|
"loss": 0.2131, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.4151785714285716, |
|
"grad_norm": 1.6412122249603271, |
|
"learning_rate": 6.601478052083356e-06, |
|
"loss": 0.2126, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 2.419642857142857, |
|
"grad_norm": 1.5263912677764893, |
|
"learning_rate": 6.59028614976708e-06, |
|
"loss": 0.2023, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 2.424107142857143, |
|
"grad_norm": 1.551417589187622, |
|
"learning_rate": 6.579085377218973e-06, |
|
"loss": 0.2003, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 2.4285714285714284, |
|
"grad_norm": 1.6211479902267456, |
|
"learning_rate": 6.567875796933888e-06, |
|
"loss": 0.2003, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 2.4330357142857144, |
|
"grad_norm": 1.471747875213623, |
|
"learning_rate": 6.556657471455817e-06, |
|
"loss": 0.1965, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 2.4375, |
|
"grad_norm": 1.4076069593429565, |
|
"learning_rate": 6.54543046337755e-06, |
|
"loss": 0.2109, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 2.4419642857142856, |
|
"grad_norm": 1.5845184326171875, |
|
"learning_rate": 6.534194835340321e-06, |
|
"loss": 0.2135, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 2.4464285714285716, |
|
"grad_norm": 1.4862483739852905, |
|
"learning_rate": 6.522950650033454e-06, |
|
"loss": 0.1866, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 2.450892857142857, |
|
"grad_norm": 1.5533431768417358, |
|
"learning_rate": 6.511697970194024e-06, |
|
"loss": 0.2, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 2.455357142857143, |
|
"grad_norm": 1.3386541604995728, |
|
"learning_rate": 6.500436858606501e-06, |
|
"loss": 0.1801, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.4598214285714284, |
|
"grad_norm": 1.4485145807266235, |
|
"learning_rate": 6.4891673781023975e-06, |
|
"loss": 0.1829, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 2.4642857142857144, |
|
"grad_norm": 1.59889554977417, |
|
"learning_rate": 6.477889591559926e-06, |
|
"loss": 0.2162, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 2.46875, |
|
"grad_norm": 1.4210249185562134, |
|
"learning_rate": 6.466603561903633e-06, |
|
"loss": 0.1716, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 2.4732142857142856, |
|
"grad_norm": 1.406024694442749, |
|
"learning_rate": 6.455309352104065e-06, |
|
"loss": 0.1924, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 2.4776785714285716, |
|
"grad_norm": 1.4636434316635132, |
|
"learning_rate": 6.444007025177407e-06, |
|
"loss": 0.1902, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 2.482142857142857, |
|
"grad_norm": 1.5032724142074585, |
|
"learning_rate": 6.4326966441851355e-06, |
|
"loss": 0.1878, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 2.486607142857143, |
|
"grad_norm": 1.3488855361938477, |
|
"learning_rate": 6.4213782722336625e-06, |
|
"loss": 0.1592, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 2.4910714285714284, |
|
"grad_norm": 1.7310540676116943, |
|
"learning_rate": 6.4100519724739875e-06, |
|
"loss": 0.2106, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 2.4955357142857144, |
|
"grad_norm": 1.4981244802474976, |
|
"learning_rate": 6.3987178081013446e-06, |
|
"loss": 0.1887, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.3576077222824097, |
|
"learning_rate": 6.387375842354843e-06, |
|
"loss": 0.1877, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.5044642857142856, |
|
"grad_norm": 1.4860981702804565, |
|
"learning_rate": 6.376026138517125e-06, |
|
"loss": 0.1788, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 2.508928571428571, |
|
"grad_norm": 1.5492916107177734, |
|
"learning_rate": 6.364668759914005e-06, |
|
"loss": 0.1964, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 2.513392857142857, |
|
"grad_norm": 1.440119743347168, |
|
"learning_rate": 6.353303769914121e-06, |
|
"loss": 0.2057, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 2.517857142857143, |
|
"grad_norm": 1.3989627361297607, |
|
"learning_rate": 6.341931231928577e-06, |
|
"loss": 0.1675, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 2.522321428571429, |
|
"grad_norm": 1.6142938137054443, |
|
"learning_rate": 6.330551209410593e-06, |
|
"loss": 0.1856, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 2.5267857142857144, |
|
"grad_norm": 1.6988365650177002, |
|
"learning_rate": 6.319163765855146e-06, |
|
"loss": 0.2122, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 2.53125, |
|
"grad_norm": 1.641639232635498, |
|
"learning_rate": 6.307768964798623e-06, |
|
"loss": 0.1836, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 2.5357142857142856, |
|
"grad_norm": 1.6539298295974731, |
|
"learning_rate": 6.296366869818458e-06, |
|
"loss": 0.203, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 2.540178571428571, |
|
"grad_norm": 1.6499872207641602, |
|
"learning_rate": 6.284957544532783e-06, |
|
"loss": 0.2115, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 2.544642857142857, |
|
"grad_norm": 1.6138975620269775, |
|
"learning_rate": 6.273541052600074e-06, |
|
"loss": 0.2228, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.544642857142857, |
|
"eval_loss": 0.6881652474403381, |
|
"eval_runtime": 3.0998, |
|
"eval_samples_per_second": 19.356, |
|
"eval_steps_per_second": 1.29, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.549107142857143, |
|
"grad_norm": 1.3579055070877075, |
|
"learning_rate": 6.2621174577187895e-06, |
|
"loss": 0.1622, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 2.553571428571429, |
|
"grad_norm": 1.519002079963684, |
|
"learning_rate": 6.250686823627022e-06, |
|
"loss": 0.1875, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 2.5580357142857144, |
|
"grad_norm": 1.2916154861450195, |
|
"learning_rate": 6.239249214102139e-06, |
|
"loss": 0.1616, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 2.5625, |
|
"grad_norm": 1.4468685388565063, |
|
"learning_rate": 6.2278046929604265e-06, |
|
"loss": 0.2001, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 2.5669642857142856, |
|
"grad_norm": 1.4290978908538818, |
|
"learning_rate": 6.216353324056732e-06, |
|
"loss": 0.1818, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 2.571428571428571, |
|
"grad_norm": 1.3633805513381958, |
|
"learning_rate": 6.204895171284115e-06, |
|
"loss": 0.1724, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 2.575892857142857, |
|
"grad_norm": 1.4299393892288208, |
|
"learning_rate": 6.193430298573481e-06, |
|
"loss": 0.2044, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 2.580357142857143, |
|
"grad_norm": 1.428096890449524, |
|
"learning_rate": 6.181958769893234e-06, |
|
"loss": 0.2038, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 2.584821428571429, |
|
"grad_norm": 1.4805070161819458, |
|
"learning_rate": 6.17048064924891e-06, |
|
"loss": 0.2023, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 2.5892857142857144, |
|
"grad_norm": 1.5858566761016846, |
|
"learning_rate": 6.158996000682829e-06, |
|
"loss": 0.1874, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.59375, |
|
"grad_norm": 1.547018051147461, |
|
"learning_rate": 6.147504888273736e-06, |
|
"loss": 0.1938, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 2.5982142857142856, |
|
"grad_norm": 1.5140594244003296, |
|
"learning_rate": 6.136007376136429e-06, |
|
"loss": 0.1989, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 2.602678571428571, |
|
"grad_norm": 1.2419382333755493, |
|
"learning_rate": 6.124503528421429e-06, |
|
"loss": 0.1524, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 2.607142857142857, |
|
"grad_norm": 1.5366337299346924, |
|
"learning_rate": 6.112993409314594e-06, |
|
"loss": 0.1853, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 2.611607142857143, |
|
"grad_norm": 1.4070125818252563, |
|
"learning_rate": 6.101477083036783e-06, |
|
"loss": 0.1767, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 2.616071428571429, |
|
"grad_norm": 1.5408363342285156, |
|
"learning_rate": 6.0899546138434785e-06, |
|
"loss": 0.2016, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 2.6205357142857144, |
|
"grad_norm": 1.5800507068634033, |
|
"learning_rate": 6.0784260660244475e-06, |
|
"loss": 0.1665, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"grad_norm": 1.4633119106292725, |
|
"learning_rate": 6.066891503903363e-06, |
|
"loss": 0.1846, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 2.6294642857142856, |
|
"grad_norm": 1.5278315544128418, |
|
"learning_rate": 6.0553509918374635e-06, |
|
"loss": 0.1761, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 2.633928571428571, |
|
"grad_norm": 1.4787242412567139, |
|
"learning_rate": 6.0438045942171775e-06, |
|
"loss": 0.1836, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.638392857142857, |
|
"grad_norm": 1.4805656671524048, |
|
"learning_rate": 6.032252375465778e-06, |
|
"loss": 0.1823, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 2.642857142857143, |
|
"grad_norm": 1.4597275257110596, |
|
"learning_rate": 6.020694400039017e-06, |
|
"loss": 0.1751, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 2.647321428571429, |
|
"grad_norm": 1.5701682567596436, |
|
"learning_rate": 6.009130732424758e-06, |
|
"loss": 0.1856, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 2.6517857142857144, |
|
"grad_norm": 1.5376973152160645, |
|
"learning_rate": 5.997561437142636e-06, |
|
"loss": 0.2012, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"grad_norm": 1.3718751668930054, |
|
"learning_rate": 5.985986578743676e-06, |
|
"loss": 0.1746, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 2.6607142857142856, |
|
"grad_norm": 1.4416507482528687, |
|
"learning_rate": 5.974406221809949e-06, |
|
"loss": 0.1865, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 2.665178571428571, |
|
"grad_norm": 1.628366470336914, |
|
"learning_rate": 5.962820430954198e-06, |
|
"loss": 0.2131, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 2.669642857142857, |
|
"grad_norm": 1.4710111618041992, |
|
"learning_rate": 5.951229270819494e-06, |
|
"loss": 0.197, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 2.674107142857143, |
|
"grad_norm": 1.3397061824798584, |
|
"learning_rate": 5.9396328060788576e-06, |
|
"loss": 0.1876, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 2.678571428571429, |
|
"grad_norm": 1.5016379356384277, |
|
"learning_rate": 5.928031101434908e-06, |
|
"loss": 0.1825, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.6830357142857144, |
|
"grad_norm": 1.597392201423645, |
|
"learning_rate": 5.916424221619507e-06, |
|
"loss": 0.2096, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 2.6875, |
|
"grad_norm": 1.4211905002593994, |
|
"learning_rate": 5.904812231393383e-06, |
|
"loss": 0.1961, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 2.6919642857142856, |
|
"grad_norm": 1.3317689895629883, |
|
"learning_rate": 5.893195195545784e-06, |
|
"loss": 0.1727, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 2.696428571428571, |
|
"grad_norm": 1.5371907949447632, |
|
"learning_rate": 5.8815731788941064e-06, |
|
"loss": 0.1877, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 2.700892857142857, |
|
"grad_norm": 1.4972670078277588, |
|
"learning_rate": 5.86994624628354e-06, |
|
"loss": 0.1853, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 2.705357142857143, |
|
"grad_norm": 1.468634843826294, |
|
"learning_rate": 5.858314462586697e-06, |
|
"loss": 0.1787, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 2.709821428571429, |
|
"grad_norm": 1.6285043954849243, |
|
"learning_rate": 5.846677892703268e-06, |
|
"loss": 0.2082, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 2.7142857142857144, |
|
"grad_norm": 1.4436808824539185, |
|
"learning_rate": 5.835036601559634e-06, |
|
"loss": 0.1811, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 2.71875, |
|
"grad_norm": 1.5260608196258545, |
|
"learning_rate": 5.82339065410853e-06, |
|
"loss": 0.1872, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 2.7232142857142856, |
|
"grad_norm": 1.5837960243225098, |
|
"learning_rate": 5.811740115328665e-06, |
|
"loss": 0.1817, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.727678571428571, |
|
"grad_norm": 1.5583548545837402, |
|
"learning_rate": 5.800085050224367e-06, |
|
"loss": 0.1993, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 2.732142857142857, |
|
"grad_norm": 1.5718779563903809, |
|
"learning_rate": 5.7884255238252175e-06, |
|
"loss": 0.1955, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 2.736607142857143, |
|
"grad_norm": 1.5729767084121704, |
|
"learning_rate": 5.776761601185692e-06, |
|
"loss": 0.1868, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 2.741071428571429, |
|
"grad_norm": 1.5369676351547241, |
|
"learning_rate": 5.765093347384793e-06, |
|
"loss": 0.1974, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 2.7455357142857144, |
|
"grad_norm": 1.5321385860443115, |
|
"learning_rate": 5.753420827525691e-06, |
|
"loss": 0.1991, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 1.6197004318237305, |
|
"learning_rate": 5.741744106735354e-06, |
|
"loss": 0.1967, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 2.7544642857142856, |
|
"grad_norm": 1.4618470668792725, |
|
"learning_rate": 5.730063250164196e-06, |
|
"loss": 0.1778, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 2.758928571428571, |
|
"grad_norm": 1.6011563539505005, |
|
"learning_rate": 5.718378322985702e-06, |
|
"loss": 0.1986, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 2.763392857142857, |
|
"grad_norm": 1.3535871505737305, |
|
"learning_rate": 5.70668939039607e-06, |
|
"loss": 0.179, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 2.767857142857143, |
|
"grad_norm": 1.5589332580566406, |
|
"learning_rate": 5.694996517613847e-06, |
|
"loss": 0.2123, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.772321428571429, |
|
"grad_norm": 1.483581304550171, |
|
"learning_rate": 5.683299769879562e-06, |
|
"loss": 0.189, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 2.7767857142857144, |
|
"grad_norm": 1.5656222105026245, |
|
"learning_rate": 5.6715992124553685e-06, |
|
"loss": 0.1915, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 2.78125, |
|
"grad_norm": 1.5247652530670166, |
|
"learning_rate": 5.6598949106246734e-06, |
|
"loss": 0.2024, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 2.7857142857142856, |
|
"grad_norm": 1.7072973251342773, |
|
"learning_rate": 5.648186929691776e-06, |
|
"loss": 0.2113, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 2.790178571428571, |
|
"grad_norm": 1.5222718715667725, |
|
"learning_rate": 5.6364753349815035e-06, |
|
"loss": 0.2002, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 2.794642857142857, |
|
"grad_norm": 1.42404305934906, |
|
"learning_rate": 5.624760191838845e-06, |
|
"loss": 0.1804, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 2.799107142857143, |
|
"grad_norm": 1.5567388534545898, |
|
"learning_rate": 5.61304156562859e-06, |
|
"loss": 0.2147, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 2.799107142857143, |
|
"eval_loss": 0.6883992552757263, |
|
"eval_runtime": 3.1165, |
|
"eval_samples_per_second": 19.252, |
|
"eval_steps_per_second": 1.283, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 2.803571428571429, |
|
"grad_norm": 1.4494566917419434, |
|
"learning_rate": 5.60131952173496e-06, |
|
"loss": 0.1921, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 2.8080357142857144, |
|
"grad_norm": 1.530175805091858, |
|
"learning_rate": 5.589594125561246e-06, |
|
"loss": 0.1979, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"grad_norm": 1.4815398454666138, |
|
"learning_rate": 5.577865442529447e-06, |
|
"loss": 0.1999, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.8169642857142856, |
|
"grad_norm": 1.4520913362503052, |
|
"learning_rate": 5.566133538079893e-06, |
|
"loss": 0.1783, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 2.821428571428571, |
|
"grad_norm": 1.5236479043960571, |
|
"learning_rate": 5.554398477670895e-06, |
|
"loss": 0.1875, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 2.825892857142857, |
|
"grad_norm": 1.3968005180358887, |
|
"learning_rate": 5.54266032677837e-06, |
|
"loss": 0.1919, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 2.830357142857143, |
|
"grad_norm": 1.4452468156814575, |
|
"learning_rate": 5.53091915089548e-06, |
|
"loss": 0.1802, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 2.834821428571429, |
|
"grad_norm": 1.512412190437317, |
|
"learning_rate": 5.5191750155322595e-06, |
|
"loss": 0.195, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 2.8392857142857144, |
|
"grad_norm": 1.4154279232025146, |
|
"learning_rate": 5.507427986215265e-06, |
|
"loss": 0.1846, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 2.84375, |
|
"grad_norm": 1.6457233428955078, |
|
"learning_rate": 5.49567812848719e-06, |
|
"loss": 0.1981, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 2.8482142857142856, |
|
"grad_norm": 1.6645605564117432, |
|
"learning_rate": 5.483925507906514e-06, |
|
"loss": 0.1992, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 2.852678571428571, |
|
"grad_norm": 1.491758942604065, |
|
"learning_rate": 5.4721701900471335e-06, |
|
"loss": 0.188, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 1.5240974426269531, |
|
"learning_rate": 5.460412240497993e-06, |
|
"loss": 0.1966, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.861607142857143, |
|
"grad_norm": 1.5371315479278564, |
|
"learning_rate": 5.448651724862716e-06, |
|
"loss": 0.1846, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 2.866071428571429, |
|
"grad_norm": 1.4583534002304077, |
|
"learning_rate": 5.436888708759253e-06, |
|
"loss": 0.1906, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 2.8705357142857144, |
|
"grad_norm": 1.4962140321731567, |
|
"learning_rate": 5.425123257819494e-06, |
|
"loss": 0.1965, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"grad_norm": 1.5177336931228638, |
|
"learning_rate": 5.413355437688926e-06, |
|
"loss": 0.1751, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 2.8794642857142856, |
|
"grad_norm": 1.5701833963394165, |
|
"learning_rate": 5.401585314026248e-06, |
|
"loss": 0.1948, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 2.883928571428571, |
|
"grad_norm": 1.6026036739349365, |
|
"learning_rate": 5.3898129525030105e-06, |
|
"loss": 0.2127, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 2.888392857142857, |
|
"grad_norm": 1.6437232494354248, |
|
"learning_rate": 5.378038418803256e-06, |
|
"loss": 0.1991, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 2.892857142857143, |
|
"grad_norm": 1.5540730953216553, |
|
"learning_rate": 5.366261778623143e-06, |
|
"loss": 0.2023, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 2.897321428571429, |
|
"grad_norm": 1.5963166952133179, |
|
"learning_rate": 5.354483097670584e-06, |
|
"loss": 0.1991, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 2.9017857142857144, |
|
"grad_norm": 1.7462104558944702, |
|
"learning_rate": 5.342702441664875e-06, |
|
"loss": 0.2079, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.90625, |
|
"grad_norm": 1.6072155237197876, |
|
"learning_rate": 5.3309198763363345e-06, |
|
"loss": 0.207, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 2.9107142857142856, |
|
"grad_norm": 1.5229613780975342, |
|
"learning_rate": 5.319135467425937e-06, |
|
"loss": 0.1961, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 2.915178571428571, |
|
"grad_norm": 1.3907989263534546, |
|
"learning_rate": 5.3073492806849405e-06, |
|
"loss": 0.1754, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 2.919642857142857, |
|
"grad_norm": 1.3996022939682007, |
|
"learning_rate": 5.295561381874518e-06, |
|
"loss": 0.1684, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 2.924107142857143, |
|
"grad_norm": 1.448462724685669, |
|
"learning_rate": 5.2837718367654036e-06, |
|
"loss": 0.1932, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 2.928571428571429, |
|
"grad_norm": 1.5644668340682983, |
|
"learning_rate": 5.2719807111375096e-06, |
|
"loss": 0.1985, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 2.9330357142857144, |
|
"grad_norm": 1.6308317184448242, |
|
"learning_rate": 5.260188070779573e-06, |
|
"loss": 0.1931, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 2.9375, |
|
"grad_norm": 1.47319757938385, |
|
"learning_rate": 5.248393981488777e-06, |
|
"loss": 0.1859, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 2.9419642857142856, |
|
"grad_norm": 1.4166940450668335, |
|
"learning_rate": 5.236598509070389e-06, |
|
"loss": 0.175, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 2.946428571428571, |
|
"grad_norm": 1.5478429794311523, |
|
"learning_rate": 5.2248017193374e-06, |
|
"loss": 0.2117, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.950892857142857, |
|
"grad_norm": 1.5836637020111084, |
|
"learning_rate": 5.2130036781101455e-06, |
|
"loss": 0.1859, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 2.955357142857143, |
|
"grad_norm": 1.584566593170166, |
|
"learning_rate": 5.201204451215945e-06, |
|
"loss": 0.1871, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 2.959821428571429, |
|
"grad_norm": 1.43748939037323, |
|
"learning_rate": 5.18940410448873e-06, |
|
"loss": 0.1822, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 2.9642857142857144, |
|
"grad_norm": 1.4014517068862915, |
|
"learning_rate": 5.1776027037686895e-06, |
|
"loss": 0.1731, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 2.96875, |
|
"grad_norm": 1.5529756546020508, |
|
"learning_rate": 5.165800314901883e-06, |
|
"loss": 0.1923, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 2.9732142857142856, |
|
"grad_norm": 1.4991168975830078, |
|
"learning_rate": 5.15399700373989e-06, |
|
"loss": 0.1984, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 2.977678571428571, |
|
"grad_norm": 1.5158687829971313, |
|
"learning_rate": 5.142192836139432e-06, |
|
"loss": 0.1903, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 2.982142857142857, |
|
"grad_norm": 1.5199565887451172, |
|
"learning_rate": 5.130387877962012e-06, |
|
"loss": 0.1709, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 2.986607142857143, |
|
"grad_norm": 1.5208170413970947, |
|
"learning_rate": 5.118582195073542e-06, |
|
"loss": 0.1964, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 2.991071428571429, |
|
"grad_norm": 1.45652437210083, |
|
"learning_rate": 5.1067758533439804e-06, |
|
"loss": 0.1957, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.9955357142857144, |
|
"grad_norm": 1.3445968627929688, |
|
"learning_rate": 5.094968918646954e-06, |
|
"loss": 0.1752, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.3375955820083618, |
|
"learning_rate": 5.0831614568594105e-06, |
|
"loss": 0.1758, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 3.0044642857142856, |
|
"grad_norm": 1.089407205581665, |
|
"learning_rate": 5.071353533861225e-06, |
|
"loss": 0.1099, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 3.0089285714285716, |
|
"grad_norm": 1.0388957262039185, |
|
"learning_rate": 5.059545215534859e-06, |
|
"loss": 0.1054, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 3.013392857142857, |
|
"grad_norm": 1.2963316440582275, |
|
"learning_rate": 5.047736567764967e-06, |
|
"loss": 0.114, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 3.017857142857143, |
|
"grad_norm": 1.0460807085037231, |
|
"learning_rate": 5.0359276564380514e-06, |
|
"loss": 0.0957, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 3.0223214285714284, |
|
"grad_norm": 1.1701343059539795, |
|
"learning_rate": 5.024118547442083e-06, |
|
"loss": 0.0955, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 3.0267857142857144, |
|
"grad_norm": 1.2869516611099243, |
|
"learning_rate": 5.012309306666129e-06, |
|
"loss": 0.1006, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 3.03125, |
|
"grad_norm": 1.0782946348190308, |
|
"learning_rate": 5.000500000000001e-06, |
|
"loss": 0.0765, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 3.0357142857142856, |
|
"grad_norm": 1.4111815690994263, |
|
"learning_rate": 4.988690693333873e-06, |
|
"loss": 0.0861, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 3.0401785714285716, |
|
"grad_norm": 1.5609370470046997, |
|
"learning_rate": 4.97688145255792e-06, |
|
"loss": 0.1035, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 3.044642857142857, |
|
"grad_norm": 1.3300373554229736, |
|
"learning_rate": 4.965072343561948e-06, |
|
"loss": 0.0937, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 3.049107142857143, |
|
"grad_norm": 1.6226136684417725, |
|
"learning_rate": 4.953263432235034e-06, |
|
"loss": 0.0875, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 3.0535714285714284, |
|
"grad_norm": 1.5775179862976074, |
|
"learning_rate": 4.941454784465144e-06, |
|
"loss": 0.0839, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 3.0535714285714284, |
|
"eval_loss": 0.8500283360481262, |
|
"eval_runtime": 3.0932, |
|
"eval_samples_per_second": 19.397, |
|
"eval_steps_per_second": 1.293, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 3.0580357142857144, |
|
"grad_norm": 1.7753103971481323, |
|
"learning_rate": 4.929646466138777e-06, |
|
"loss": 0.0929, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 3.0625, |
|
"grad_norm": 1.6143715381622314, |
|
"learning_rate": 4.917838543140591e-06, |
|
"loss": 0.0753, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 3.0669642857142856, |
|
"grad_norm": 1.5157322883605957, |
|
"learning_rate": 4.906031081353047e-06, |
|
"loss": 0.0869, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 3.0714285714285716, |
|
"grad_norm": 1.7490545511245728, |
|
"learning_rate": 4.8942241466560226e-06, |
|
"loss": 0.0908, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 3.075892857142857, |
|
"grad_norm": 1.8048044443130493, |
|
"learning_rate": 4.882417804926457e-06, |
|
"loss": 0.0893, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 3.080357142857143, |
|
"grad_norm": 1.587409496307373, |
|
"learning_rate": 4.870612122037989e-06, |
|
"loss": 0.0918, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 3.0848214285714284, |
|
"grad_norm": 1.6927273273468018, |
|
"learning_rate": 4.858807163860569e-06, |
|
"loss": 0.0969, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 3.0892857142857144, |
|
"grad_norm": 1.8378304243087769, |
|
"learning_rate": 4.847002996260113e-06, |
|
"loss": 0.0923, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 3.09375, |
|
"grad_norm": 1.6607656478881836, |
|
"learning_rate": 4.835199685098117e-06, |
|
"loss": 0.1028, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 3.0982142857142856, |
|
"grad_norm": 1.640812635421753, |
|
"learning_rate": 4.823397296231313e-06, |
|
"loss": 0.0849, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 3.1026785714285716, |
|
"grad_norm": 1.707801103591919, |
|
"learning_rate": 4.8115958955112715e-06, |
|
"loss": 0.0944, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 3.107142857142857, |
|
"grad_norm": 1.6389654874801636, |
|
"learning_rate": 4.799795548784058e-06, |
|
"loss": 0.0872, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 3.111607142857143, |
|
"grad_norm": 1.723669409751892, |
|
"learning_rate": 4.787996321889856e-06, |
|
"loss": 0.0956, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 3.1160714285714284, |
|
"grad_norm": 1.560467004776001, |
|
"learning_rate": 4.7761982806626015e-06, |
|
"loss": 0.1029, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 3.1205357142857144, |
|
"grad_norm": 1.4397633075714111, |
|
"learning_rate": 4.764401490929613e-06, |
|
"loss": 0.087, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 1.480446457862854, |
|
"learning_rate": 4.752606018511225e-06, |
|
"loss": 0.0883, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.1294642857142856, |
|
"grad_norm": 1.4872808456420898, |
|
"learning_rate": 4.740811929220429e-06, |
|
"loss": 0.1094, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 3.1339285714285716, |
|
"grad_norm": 1.4935928583145142, |
|
"learning_rate": 4.729019288862492e-06, |
|
"loss": 0.0976, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 3.138392857142857, |
|
"grad_norm": 1.3575384616851807, |
|
"learning_rate": 4.717228163234599e-06, |
|
"loss": 0.0837, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 3.142857142857143, |
|
"grad_norm": 1.112143874168396, |
|
"learning_rate": 4.705438618125482e-06, |
|
"loss": 0.0696, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 3.1473214285714284, |
|
"grad_norm": 1.4292476177215576, |
|
"learning_rate": 4.693650719315062e-06, |
|
"loss": 0.1014, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 3.1517857142857144, |
|
"grad_norm": 1.388718605041504, |
|
"learning_rate": 4.681864532574064e-06, |
|
"loss": 0.1, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 3.15625, |
|
"grad_norm": 1.3625158071517944, |
|
"learning_rate": 4.670080123663668e-06, |
|
"loss": 0.1026, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 3.1607142857142856, |
|
"grad_norm": 1.1807421445846558, |
|
"learning_rate": 4.658297558335127e-06, |
|
"loss": 0.0884, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 3.1651785714285716, |
|
"grad_norm": 1.36719810962677, |
|
"learning_rate": 4.64651690232942e-06, |
|
"loss": 0.0948, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 3.169642857142857, |
|
"grad_norm": 1.239861011505127, |
|
"learning_rate": 4.634738221376858e-06, |
|
"loss": 0.0783, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 3.174107142857143, |
|
"grad_norm": 1.1686639785766602, |
|
"learning_rate": 4.622961581196743e-06, |
|
"loss": 0.0785, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 3.1785714285714284, |
|
"grad_norm": 1.2871395349502563, |
|
"learning_rate": 4.611187047496989e-06, |
|
"loss": 0.0838, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 3.1830357142857144, |
|
"grad_norm": 1.3209705352783203, |
|
"learning_rate": 4.599414685973754e-06, |
|
"loss": 0.0802, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 3.1875, |
|
"grad_norm": 1.2725917100906372, |
|
"learning_rate": 4.587644562311077e-06, |
|
"loss": 0.0856, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 3.1919642857142856, |
|
"grad_norm": 1.506381630897522, |
|
"learning_rate": 4.575876742180506e-06, |
|
"loss": 0.0974, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 3.1964285714285716, |
|
"grad_norm": 1.2153488397598267, |
|
"learning_rate": 4.56411129124075e-06, |
|
"loss": 0.085, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 3.200892857142857, |
|
"grad_norm": 1.4479278326034546, |
|
"learning_rate": 4.552348275137285e-06, |
|
"loss": 0.1026, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 3.205357142857143, |
|
"grad_norm": 1.657431960105896, |
|
"learning_rate": 4.54058775950201e-06, |
|
"loss": 0.1017, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 3.2098214285714284, |
|
"grad_norm": 1.4893465042114258, |
|
"learning_rate": 4.528829809952867e-06, |
|
"loss": 0.0917, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 3.2142857142857144, |
|
"grad_norm": 1.6063541173934937, |
|
"learning_rate": 4.517074492093487e-06, |
|
"loss": 0.1133, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 3.21875, |
|
"grad_norm": 1.5053054094314575, |
|
"learning_rate": 4.505321871512813e-06, |
|
"loss": 0.1028, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 3.2232142857142856, |
|
"grad_norm": 1.6949694156646729, |
|
"learning_rate": 4.493572013784737e-06, |
|
"loss": 0.1063, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 3.2276785714285716, |
|
"grad_norm": 1.3530110120773315, |
|
"learning_rate": 4.481824984467742e-06, |
|
"loss": 0.0957, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 3.232142857142857, |
|
"grad_norm": 1.4270166158676147, |
|
"learning_rate": 4.470080849104521e-06, |
|
"loss": 0.0959, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 3.236607142857143, |
|
"grad_norm": 1.7574278116226196, |
|
"learning_rate": 4.458339673221631e-06, |
|
"loss": 0.1066, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 3.2410714285714284, |
|
"grad_norm": 1.531517744064331, |
|
"learning_rate": 4.446601522329105e-06, |
|
"loss": 0.086, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 3.2455357142857144, |
|
"grad_norm": 1.428818702697754, |
|
"learning_rate": 4.434866461920108e-06, |
|
"loss": 0.0859, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 1.7895373106002808, |
|
"learning_rate": 4.4231345574705555e-06, |
|
"loss": 0.1118, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 3.2544642857142856, |
|
"grad_norm": 1.335732102394104, |
|
"learning_rate": 4.4114058744387535e-06, |
|
"loss": 0.0875, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 3.2589285714285716, |
|
"grad_norm": 1.547389268875122, |
|
"learning_rate": 4.399680478265042e-06, |
|
"loss": 0.1004, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 3.263392857142857, |
|
"grad_norm": 1.5613607168197632, |
|
"learning_rate": 4.387958434371413e-06, |
|
"loss": 0.0989, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 3.267857142857143, |
|
"grad_norm": 1.3108863830566406, |
|
"learning_rate": 4.376239808161157e-06, |
|
"loss": 0.0791, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 3.2723214285714284, |
|
"grad_norm": 1.4128496646881104, |
|
"learning_rate": 4.364524665018496e-06, |
|
"loss": 0.088, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 3.2767857142857144, |
|
"grad_norm": 1.280126690864563, |
|
"learning_rate": 4.3528130703082245e-06, |
|
"loss": 0.0838, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 3.28125, |
|
"grad_norm": 1.370133399963379, |
|
"learning_rate": 4.341105089375328e-06, |
|
"loss": 0.0833, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 3.2857142857142856, |
|
"grad_norm": 1.5750563144683838, |
|
"learning_rate": 4.329400787544633e-06, |
|
"loss": 0.0924, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 3.2901785714285716, |
|
"grad_norm": 1.218309760093689, |
|
"learning_rate": 4.317700230120438e-06, |
|
"loss": 0.0807, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 3.294642857142857, |
|
"grad_norm": 1.4388004541397095, |
|
"learning_rate": 4.306003482386156e-06, |
|
"loss": 0.1089, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 3.299107142857143, |
|
"grad_norm": 1.3800266981124878, |
|
"learning_rate": 4.2943106096039315e-06, |
|
"loss": 0.0916, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 3.3035714285714284, |
|
"grad_norm": 1.3160336017608643, |
|
"learning_rate": 4.282621677014299e-06, |
|
"loss": 0.0877, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.3080357142857144, |
|
"grad_norm": 1.3781185150146484, |
|
"learning_rate": 4.270936749835805e-06, |
|
"loss": 0.0984, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 3.3080357142857144, |
|
"eval_loss": 0.8194601535797119, |
|
"eval_runtime": 3.0964, |
|
"eval_samples_per_second": 19.378, |
|
"eval_steps_per_second": 1.292, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 3.3125, |
|
"grad_norm": 1.310025691986084, |
|
"learning_rate": 4.259255893264647e-06, |
|
"loss": 0.0893, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 3.3169642857142856, |
|
"grad_norm": 1.4620391130447388, |
|
"learning_rate": 4.247579172474312e-06, |
|
"loss": 0.1009, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 3.3214285714285716, |
|
"grad_norm": 1.4170101881027222, |
|
"learning_rate": 4.235906652615207e-06, |
|
"loss": 0.1037, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 3.325892857142857, |
|
"grad_norm": 1.8959122896194458, |
|
"learning_rate": 4.224238398814309e-06, |
|
"loss": 0.0969, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 3.330357142857143, |
|
"grad_norm": 1.463104248046875, |
|
"learning_rate": 4.212574476174784e-06, |
|
"loss": 0.0902, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 3.3348214285714284, |
|
"grad_norm": 1.4386403560638428, |
|
"learning_rate": 4.2009149497756355e-06, |
|
"loss": 0.0984, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 3.3392857142857144, |
|
"grad_norm": 1.3516470193862915, |
|
"learning_rate": 4.189259884671336e-06, |
|
"loss": 0.0875, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 3.34375, |
|
"grad_norm": 1.3817871809005737, |
|
"learning_rate": 4.177609345891472e-06, |
|
"loss": 0.0881, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 3.3482142857142856, |
|
"grad_norm": 1.302051305770874, |
|
"learning_rate": 4.165963398440368e-06, |
|
"loss": 0.0976, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.3526785714285716, |
|
"grad_norm": 1.4859888553619385, |
|
"learning_rate": 4.1543221072967334e-06, |
|
"loss": 0.0977, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 3.357142857142857, |
|
"grad_norm": 1.3460721969604492, |
|
"learning_rate": 4.142685537413303e-06, |
|
"loss": 0.086, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 3.361607142857143, |
|
"grad_norm": 1.467089056968689, |
|
"learning_rate": 4.1310537537164615e-06, |
|
"loss": 0.0999, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 3.3660714285714284, |
|
"grad_norm": 1.6154288053512573, |
|
"learning_rate": 4.119426821105895e-06, |
|
"loss": 0.0937, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 3.3705357142857144, |
|
"grad_norm": 1.610485315322876, |
|
"learning_rate": 4.107804804454215e-06, |
|
"loss": 0.0977, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"grad_norm": 1.5039902925491333, |
|
"learning_rate": 4.096187768606617e-06, |
|
"loss": 0.0934, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 3.3794642857142856, |
|
"grad_norm": 1.3222577571868896, |
|
"learning_rate": 4.084575778380495e-06, |
|
"loss": 0.0777, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 3.3839285714285716, |
|
"grad_norm": 1.5685445070266724, |
|
"learning_rate": 4.072968898565094e-06, |
|
"loss": 0.0969, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 3.388392857142857, |
|
"grad_norm": 1.5284277200698853, |
|
"learning_rate": 4.061367193921145e-06, |
|
"loss": 0.1104, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 3.392857142857143, |
|
"grad_norm": 1.5759763717651367, |
|
"learning_rate": 4.049770729180508e-06, |
|
"loss": 0.0991, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 3.3973214285714284, |
|
"grad_norm": 1.490787386894226, |
|
"learning_rate": 4.038179569045803e-06, |
|
"loss": 0.0934, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 3.4017857142857144, |
|
"grad_norm": 1.490193486213684, |
|
"learning_rate": 4.026593778190052e-06, |
|
"loss": 0.1003, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 3.40625, |
|
"grad_norm": 1.3332754373550415, |
|
"learning_rate": 4.015013421256324e-06, |
|
"loss": 0.0855, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 3.4107142857142856, |
|
"grad_norm": 1.195845127105713, |
|
"learning_rate": 4.0034385628573655e-06, |
|
"loss": 0.0811, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 3.4151785714285716, |
|
"grad_norm": 1.3731515407562256, |
|
"learning_rate": 3.991869267575243e-06, |
|
"loss": 0.0848, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 3.419642857142857, |
|
"grad_norm": 1.6035957336425781, |
|
"learning_rate": 3.9803055999609855e-06, |
|
"loss": 0.0925, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 3.424107142857143, |
|
"grad_norm": 1.4509934186935425, |
|
"learning_rate": 3.9687476245342234e-06, |
|
"loss": 0.0963, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 3.4285714285714284, |
|
"grad_norm": 1.679079532623291, |
|
"learning_rate": 3.957195405782824e-06, |
|
"loss": 0.0976, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 3.4330357142857144, |
|
"grad_norm": 1.6150476932525635, |
|
"learning_rate": 3.9456490081625396e-06, |
|
"loss": 0.101, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"grad_norm": 1.3960528373718262, |
|
"learning_rate": 3.934108496096638e-06, |
|
"loss": 0.0918, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 3.4419642857142856, |
|
"grad_norm": 1.4752354621887207, |
|
"learning_rate": 3.922573933975555e-06, |
|
"loss": 0.0957, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 3.4464285714285716, |
|
"grad_norm": 1.4538674354553223, |
|
"learning_rate": 3.911045386156523e-06, |
|
"loss": 0.0991, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 3.450892857142857, |
|
"grad_norm": 1.5427879095077515, |
|
"learning_rate": 3.899522916963219e-06, |
|
"loss": 0.0964, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 3.455357142857143, |
|
"grad_norm": 1.506783366203308, |
|
"learning_rate": 3.888006590685407e-06, |
|
"loss": 0.0907, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 3.4598214285714284, |
|
"grad_norm": 1.3656387329101562, |
|
"learning_rate": 3.876496471578572e-06, |
|
"loss": 0.0939, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 3.4642857142857144, |
|
"grad_norm": 1.509194254875183, |
|
"learning_rate": 3.864992623863572e-06, |
|
"loss": 0.0962, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 3.46875, |
|
"grad_norm": 1.249863862991333, |
|
"learning_rate": 3.853495111726265e-06, |
|
"loss": 0.0884, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 3.4732142857142856, |
|
"grad_norm": 1.534104585647583, |
|
"learning_rate": 3.84200399931717e-06, |
|
"loss": 0.0983, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 3.4776785714285716, |
|
"grad_norm": 1.4144586324691772, |
|
"learning_rate": 3.8305193507510905e-06, |
|
"loss": 0.0954, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 3.482142857142857, |
|
"grad_norm": 1.5498822927474976, |
|
"learning_rate": 3.819041230106768e-06, |
|
"loss": 0.1033, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.486607142857143, |
|
"grad_norm": 1.2330667972564697, |
|
"learning_rate": 3.807569701426519e-06, |
|
"loss": 0.0825, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 3.4910714285714284, |
|
"grad_norm": 1.3459640741348267, |
|
"learning_rate": 3.7961048287158865e-06, |
|
"loss": 0.0847, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 3.4955357142857144, |
|
"grad_norm": 1.4080768823623657, |
|
"learning_rate": 3.784646675943269e-06, |
|
"loss": 0.0901, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 1.5240070819854736, |
|
"learning_rate": 3.773195307039574e-06, |
|
"loss": 0.0994, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 3.5044642857142856, |
|
"grad_norm": 1.3279098272323608, |
|
"learning_rate": 3.7617507858978615e-06, |
|
"loss": 0.089, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 3.508928571428571, |
|
"grad_norm": 1.2906996011734009, |
|
"learning_rate": 3.7503131763729785e-06, |
|
"loss": 0.0892, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 3.513392857142857, |
|
"grad_norm": 1.472264051437378, |
|
"learning_rate": 3.738882542281212e-06, |
|
"loss": 0.0889, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 3.517857142857143, |
|
"grad_norm": 1.4592229127883911, |
|
"learning_rate": 3.727458947399927e-06, |
|
"loss": 0.0923, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 3.522321428571429, |
|
"grad_norm": 1.5087579488754272, |
|
"learning_rate": 3.7160424554672187e-06, |
|
"loss": 0.0954, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 3.5267857142857144, |
|
"grad_norm": 1.4825618267059326, |
|
"learning_rate": 3.7046331301815435e-06, |
|
"loss": 0.0937, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 3.53125, |
|
"grad_norm": 1.3035829067230225, |
|
"learning_rate": 3.6932310352013796e-06, |
|
"loss": 0.0827, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 3.5357142857142856, |
|
"grad_norm": 1.418879508972168, |
|
"learning_rate": 3.6818362341448545e-06, |
|
"loss": 0.0957, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 3.540178571428571, |
|
"grad_norm": 1.6021920442581177, |
|
"learning_rate": 3.670448790589408e-06, |
|
"loss": 0.0964, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 3.544642857142857, |
|
"grad_norm": 1.3729946613311768, |
|
"learning_rate": 3.659068768071425e-06, |
|
"loss": 0.085, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 3.549107142857143, |
|
"grad_norm": 1.3656620979309082, |
|
"learning_rate": 3.6476962300858793e-06, |
|
"loss": 0.0946, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 3.553571428571429, |
|
"grad_norm": 1.2834999561309814, |
|
"learning_rate": 3.6363312400859963e-06, |
|
"loss": 0.084, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 3.5580357142857144, |
|
"grad_norm": 1.388552188873291, |
|
"learning_rate": 3.6249738614828765e-06, |
|
"loss": 0.0903, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 3.5625, |
|
"grad_norm": 1.4588837623596191, |
|
"learning_rate": 3.613624157645159e-06, |
|
"loss": 0.0961, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 3.5625, |
|
"eval_loss": 0.8479318618774414, |
|
"eval_runtime": 3.1035, |
|
"eval_samples_per_second": 19.333, |
|
"eval_steps_per_second": 1.289, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 3.5669642857142856, |
|
"grad_norm": 1.5527163743972778, |
|
"learning_rate": 3.6022821918986563e-06, |
|
"loss": 0.0891, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 3.571428571428571, |
|
"grad_norm": 1.3039300441741943, |
|
"learning_rate": 3.590948027526012e-06, |
|
"loss": 0.0961, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.575892857142857, |
|
"grad_norm": 1.2164970636367798, |
|
"learning_rate": 3.579621727766339e-06, |
|
"loss": 0.0763, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 3.580357142857143, |
|
"grad_norm": 1.3291783332824707, |
|
"learning_rate": 3.568303355814867e-06, |
|
"loss": 0.0914, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 3.584821428571429, |
|
"grad_norm": 1.267117977142334, |
|
"learning_rate": 3.5569929748225945e-06, |
|
"loss": 0.086, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 3.5892857142857144, |
|
"grad_norm": 1.430953860282898, |
|
"learning_rate": 3.5456906478959367e-06, |
|
"loss": 0.1026, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 3.59375, |
|
"grad_norm": 1.5599972009658813, |
|
"learning_rate": 3.534396438096369e-06, |
|
"loss": 0.0978, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 3.5982142857142856, |
|
"grad_norm": 1.3565053939819336, |
|
"learning_rate": 3.5231104084400745e-06, |
|
"loss": 0.0893, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 3.602678571428571, |
|
"grad_norm": 1.3777987957000732, |
|
"learning_rate": 3.5118326218976013e-06, |
|
"loss": 0.091, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 3.607142857142857, |
|
"grad_norm": 1.642444133758545, |
|
"learning_rate": 3.5005631413935006e-06, |
|
"loss": 0.0943, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 3.611607142857143, |
|
"grad_norm": 1.4887057542800903, |
|
"learning_rate": 3.4893020298059784e-06, |
|
"loss": 0.0829, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 3.616071428571429, |
|
"grad_norm": 1.5320430994033813, |
|
"learning_rate": 3.4780493499665478e-06, |
|
"loss": 0.1012, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 3.6205357142857144, |
|
"grad_norm": 1.5760090351104736, |
|
"learning_rate": 3.4668051646596825e-06, |
|
"loss": 0.101, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 3.625, |
|
"grad_norm": 1.5013108253479004, |
|
"learning_rate": 3.455569536622451e-06, |
|
"loss": 0.0926, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 3.6294642857142856, |
|
"grad_norm": 1.4752159118652344, |
|
"learning_rate": 3.4443425285441847e-06, |
|
"loss": 0.0981, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 3.633928571428571, |
|
"grad_norm": 1.3213896751403809, |
|
"learning_rate": 3.433124203066113e-06, |
|
"loss": 0.092, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 3.638392857142857, |
|
"grad_norm": 1.4325315952301025, |
|
"learning_rate": 3.421914622781028e-06, |
|
"loss": 0.0994, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 3.642857142857143, |
|
"grad_norm": 1.538521647453308, |
|
"learning_rate": 3.4107138502329225e-06, |
|
"loss": 0.0989, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 3.647321428571429, |
|
"grad_norm": 1.4363764524459839, |
|
"learning_rate": 3.399521947916646e-06, |
|
"loss": 0.1054, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 3.6517857142857144, |
|
"grad_norm": 1.3556541204452515, |
|
"learning_rate": 3.3883389782775604e-06, |
|
"loss": 0.0883, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 3.65625, |
|
"grad_norm": 1.4465206861495972, |
|
"learning_rate": 3.377165003711185e-06, |
|
"loss": 0.0826, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 3.6607142857142856, |
|
"grad_norm": 1.4532499313354492, |
|
"learning_rate": 3.3660000865628523e-06, |
|
"loss": 0.0808, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 3.665178571428571, |
|
"grad_norm": 1.5245791673660278, |
|
"learning_rate": 3.3548442891273553e-06, |
|
"loss": 0.0914, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 3.669642857142857, |
|
"grad_norm": 1.1719828844070435, |
|
"learning_rate": 3.343697673648611e-06, |
|
"loss": 0.0849, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 3.674107142857143, |
|
"grad_norm": 1.5081385374069214, |
|
"learning_rate": 3.332560302319297e-06, |
|
"loss": 0.1125, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 3.678571428571429, |
|
"grad_norm": 1.331972599029541, |
|
"learning_rate": 3.321432237280518e-06, |
|
"loss": 0.0891, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 3.6830357142857144, |
|
"grad_norm": 1.2731318473815918, |
|
"learning_rate": 3.3103135406214506e-06, |
|
"loss": 0.0855, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 3.6875, |
|
"grad_norm": 1.3402756452560425, |
|
"learning_rate": 3.2992042743790055e-06, |
|
"loss": 0.0852, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 3.6919642857142856, |
|
"grad_norm": 1.49905264377594, |
|
"learning_rate": 3.2881045005374747e-06, |
|
"loss": 0.0901, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 3.696428571428571, |
|
"grad_norm": 1.3129043579101562, |
|
"learning_rate": 3.277014281028181e-06, |
|
"loss": 0.0823, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 3.700892857142857, |
|
"grad_norm": 1.5265233516693115, |
|
"learning_rate": 3.2659336777291497e-06, |
|
"loss": 0.1079, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 3.705357142857143, |
|
"grad_norm": 1.3291493654251099, |
|
"learning_rate": 3.254862752464743e-06, |
|
"loss": 0.0854, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 3.709821428571429, |
|
"grad_norm": 1.509037733078003, |
|
"learning_rate": 3.243801567005329e-06, |
|
"loss": 0.1013, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 3.7142857142857144, |
|
"grad_norm": 1.7585151195526123, |
|
"learning_rate": 3.232750183066928e-06, |
|
"loss": 0.0994, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 3.71875, |
|
"grad_norm": 1.5366458892822266, |
|
"learning_rate": 3.2217086623108796e-06, |
|
"loss": 0.1006, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 3.7232142857142856, |
|
"grad_norm": 1.3573994636535645, |
|
"learning_rate": 3.2106770663434867e-06, |
|
"loss": 0.0855, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 3.727678571428571, |
|
"grad_norm": 1.3388910293579102, |
|
"learning_rate": 3.1996554567156774e-06, |
|
"loss": 0.0859, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 3.732142857142857, |
|
"grad_norm": 1.2037252187728882, |
|
"learning_rate": 3.18864389492266e-06, |
|
"loss": 0.0857, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 3.736607142857143, |
|
"grad_norm": 1.4536150693893433, |
|
"learning_rate": 3.1776424424035857e-06, |
|
"loss": 0.0983, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 3.741071428571429, |
|
"grad_norm": 1.406284213066101, |
|
"learning_rate": 3.1666511605411947e-06, |
|
"loss": 0.0899, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 3.7455357142857144, |
|
"grad_norm": 1.4425653219223022, |
|
"learning_rate": 3.155670110661482e-06, |
|
"loss": 0.0889, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 1.3152062892913818, |
|
"learning_rate": 3.144699354033356e-06, |
|
"loss": 0.0859, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 3.7544642857142856, |
|
"grad_norm": 1.456082820892334, |
|
"learning_rate": 3.1337389518682894e-06, |
|
"loss": 0.0937, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 3.758928571428571, |
|
"grad_norm": 1.3939170837402344, |
|
"learning_rate": 3.122788965319985e-06, |
|
"loss": 0.0802, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 3.763392857142857, |
|
"grad_norm": 1.2668402194976807, |
|
"learning_rate": 3.1118494554840284e-06, |
|
"loss": 0.0849, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 3.767857142857143, |
|
"grad_norm": 1.369589924812317, |
|
"learning_rate": 3.100920483397556e-06, |
|
"loss": 0.076, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 3.772321428571429, |
|
"grad_norm": 1.4516035318374634, |
|
"learning_rate": 3.090002110038903e-06, |
|
"loss": 0.0892, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 3.7767857142857144, |
|
"grad_norm": 1.5191524028778076, |
|
"learning_rate": 3.0790943963272697e-06, |
|
"loss": 0.0962, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 3.78125, |
|
"grad_norm": 1.4074844121932983, |
|
"learning_rate": 3.0681974031223854e-06, |
|
"loss": 0.0942, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 3.7857142857142856, |
|
"grad_norm": 1.3058712482452393, |
|
"learning_rate": 3.0573111912241575e-06, |
|
"loss": 0.085, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 3.790178571428571, |
|
"grad_norm": 1.3393832445144653, |
|
"learning_rate": 3.0464358213723455e-06, |
|
"loss": 0.0894, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 3.794642857142857, |
|
"grad_norm": 1.4606190919876099, |
|
"learning_rate": 3.0355713542462086e-06, |
|
"loss": 0.1035, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 3.799107142857143, |
|
"grad_norm": 1.4510114192962646, |
|
"learning_rate": 3.024717850464181e-06, |
|
"loss": 0.0965, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 3.803571428571429, |
|
"grad_norm": 1.7563194036483765, |
|
"learning_rate": 3.0138753705835234e-06, |
|
"loss": 0.0993, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 3.8080357142857144, |
|
"grad_norm": 1.5832990407943726, |
|
"learning_rate": 3.003043975099988e-06, |
|
"loss": 0.094, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 3.8125, |
|
"grad_norm": 1.3764246702194214, |
|
"learning_rate": 2.99222372444748e-06, |
|
"loss": 0.0797, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 3.8169642857142856, |
|
"grad_norm": 1.3575708866119385, |
|
"learning_rate": 2.9814146789977278e-06, |
|
"loss": 0.0884, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 3.8169642857142856, |
|
"eval_loss": 0.8522133231163025, |
|
"eval_runtime": 3.118, |
|
"eval_samples_per_second": 19.243, |
|
"eval_steps_per_second": 1.283, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 3.821428571428571, |
|
"grad_norm": 1.6227210760116577, |
|
"learning_rate": 2.970616899059934e-06, |
|
"loss": 0.0896, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 3.825892857142857, |
|
"grad_norm": 1.3933652639389038, |
|
"learning_rate": 2.959830444880447e-06, |
|
"loss": 0.089, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 3.830357142857143, |
|
"grad_norm": 1.4572827816009521, |
|
"learning_rate": 2.949055376642428e-06, |
|
"loss": 0.0906, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 3.834821428571429, |
|
"grad_norm": 1.3022706508636475, |
|
"learning_rate": 2.9382917544655025e-06, |
|
"loss": 0.0836, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 3.8392857142857144, |
|
"grad_norm": 1.3461490869522095, |
|
"learning_rate": 2.9275396384054373e-06, |
|
"loss": 0.0843, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 3.84375, |
|
"grad_norm": 1.421582579612732, |
|
"learning_rate": 2.9167990884537943e-06, |
|
"loss": 0.0972, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 3.8482142857142856, |
|
"grad_norm": 1.499457597732544, |
|
"learning_rate": 2.906070164537616e-06, |
|
"loss": 0.1037, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 3.852678571428571, |
|
"grad_norm": 1.422713279724121, |
|
"learning_rate": 2.8953529265190618e-06, |
|
"loss": 0.0898, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 3.857142857142857, |
|
"grad_norm": 1.5362988710403442, |
|
"learning_rate": 2.884647434195099e-06, |
|
"loss": 0.0992, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 3.861607142857143, |
|
"grad_norm": 1.393685221672058, |
|
"learning_rate": 2.873953747297153e-06, |
|
"loss": 0.0786, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 3.866071428571429, |
|
"grad_norm": 1.4596489667892456, |
|
"learning_rate": 2.863271925490791e-06, |
|
"loss": 0.1016, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 3.8705357142857144, |
|
"grad_norm": 1.3821113109588623, |
|
"learning_rate": 2.8526020283753658e-06, |
|
"loss": 0.0918, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 3.875, |
|
"grad_norm": 1.502488613128662, |
|
"learning_rate": 2.841944115483703e-06, |
|
"loss": 0.0933, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 3.8794642857142856, |
|
"grad_norm": 1.3570479154586792, |
|
"learning_rate": 2.8312982462817686e-06, |
|
"loss": 0.0929, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 3.883928571428571, |
|
"grad_norm": 1.5335980653762817, |
|
"learning_rate": 2.820664480168317e-06, |
|
"loss": 0.1003, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 3.888392857142857, |
|
"grad_norm": 1.459232211112976, |
|
"learning_rate": 2.810042876474584e-06, |
|
"loss": 0.1046, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 3.892857142857143, |
|
"grad_norm": 1.4594147205352783, |
|
"learning_rate": 2.799433494463935e-06, |
|
"loss": 0.1012, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 3.897321428571429, |
|
"grad_norm": 1.5626919269561768, |
|
"learning_rate": 2.7888363933315593e-06, |
|
"loss": 0.1023, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 3.9017857142857144, |
|
"grad_norm": 1.3069941997528076, |
|
"learning_rate": 2.7782516322041087e-06, |
|
"loss": 0.0817, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 3.90625, |
|
"grad_norm": 1.4590051174163818, |
|
"learning_rate": 2.767679270139394e-06, |
|
"loss": 0.0885, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 3.9107142857142856, |
|
"grad_norm": 1.47640860080719, |
|
"learning_rate": 2.7571193661260427e-06, |
|
"loss": 0.092, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 3.915178571428571, |
|
"grad_norm": 1.5121464729309082, |
|
"learning_rate": 2.746571979083172e-06, |
|
"loss": 0.0954, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 3.919642857142857, |
|
"grad_norm": 1.3360768556594849, |
|
"learning_rate": 2.736037167860061e-06, |
|
"loss": 0.0983, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 3.924107142857143, |
|
"grad_norm": 1.395384669303894, |
|
"learning_rate": 2.725514991235818e-06, |
|
"loss": 0.0953, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 3.928571428571429, |
|
"grad_norm": 1.3785486221313477, |
|
"learning_rate": 2.7150055079190663e-06, |
|
"loss": 0.0825, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 3.9330357142857144, |
|
"grad_norm": 1.4748543500900269, |
|
"learning_rate": 2.704508776547595e-06, |
|
"loss": 0.0903, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 3.9375, |
|
"grad_norm": 1.3787004947662354, |
|
"learning_rate": 2.6940248556880512e-06, |
|
"loss": 0.0912, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 3.9419642857142856, |
|
"grad_norm": 1.3161265850067139, |
|
"learning_rate": 2.6835538038356017e-06, |
|
"loss": 0.08, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 3.946428571428571, |
|
"grad_norm": 1.4215610027313232, |
|
"learning_rate": 2.6730956794136138e-06, |
|
"loss": 0.088, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 3.950892857142857, |
|
"grad_norm": 1.396683692932129, |
|
"learning_rate": 2.6626505407733255e-06, |
|
"loss": 0.0925, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 3.955357142857143, |
|
"grad_norm": 1.4125168323516846, |
|
"learning_rate": 2.6522184461935153e-06, |
|
"loss": 0.0888, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 3.959821428571429, |
|
"grad_norm": 1.6444463729858398, |
|
"learning_rate": 2.6417994538801882e-06, |
|
"loss": 0.1116, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 3.9642857142857144, |
|
"grad_norm": 1.562724232673645, |
|
"learning_rate": 2.6313936219662435e-06, |
|
"loss": 0.0936, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 3.96875, |
|
"grad_norm": 1.3996388912200928, |
|
"learning_rate": 2.6210010085111507e-06, |
|
"loss": 0.0949, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 3.9732142857142856, |
|
"grad_norm": 1.3662428855895996, |
|
"learning_rate": 2.6106216715006282e-06, |
|
"loss": 0.0883, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 3.977678571428571, |
|
"grad_norm": 1.5796185731887817, |
|
"learning_rate": 2.600255668846316e-06, |
|
"loss": 0.0824, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 3.982142857142857, |
|
"grad_norm": 1.5451363325119019, |
|
"learning_rate": 2.58990305838546e-06, |
|
"loss": 0.1012, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 3.986607142857143, |
|
"grad_norm": 1.4925421476364136, |
|
"learning_rate": 2.5795638978805755e-06, |
|
"loss": 0.0859, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 3.991071428571429, |
|
"grad_norm": 1.4481643438339233, |
|
"learning_rate": 2.5692382450191404e-06, |
|
"loss": 0.1046, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 3.9955357142857144, |
|
"grad_norm": 1.4745389223098755, |
|
"learning_rate": 2.558926157413266e-06, |
|
"loss": 0.1084, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.4540438652038574, |
|
"learning_rate": 2.5486276925993746e-06, |
|
"loss": 0.0899, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 4.004464285714286, |
|
"grad_norm": 0.8183535933494568, |
|
"learning_rate": 2.5383429080378807e-06, |
|
"loss": 0.0459, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 4.008928571428571, |
|
"grad_norm": 0.8469629287719727, |
|
"learning_rate": 2.5280718611128657e-06, |
|
"loss": 0.0473, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 4.013392857142857, |
|
"grad_norm": 0.7638592720031738, |
|
"learning_rate": 2.517814609131772e-06, |
|
"loss": 0.0451, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 4.017857142857143, |
|
"grad_norm": 0.8340522646903992, |
|
"learning_rate": 2.507571209325058e-06, |
|
"loss": 0.0452, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 4.022321428571429, |
|
"grad_norm": 0.9095687866210938, |
|
"learning_rate": 2.4973417188459074e-06, |
|
"loss": 0.0472, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 4.026785714285714, |
|
"grad_norm": 0.8626049160957336, |
|
"learning_rate": 2.4871261947698892e-06, |
|
"loss": 0.0485, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 4.03125, |
|
"grad_norm": 1.161464810371399, |
|
"learning_rate": 2.4769246940946487e-06, |
|
"loss": 0.0539, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 4.035714285714286, |
|
"grad_norm": 0.7767446637153625, |
|
"learning_rate": 2.4667372737395894e-06, |
|
"loss": 0.0431, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 4.040178571428571, |
|
"grad_norm": 0.8548541069030762, |
|
"learning_rate": 2.4565639905455455e-06, |
|
"loss": 0.0429, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 4.044642857142857, |
|
"grad_norm": 1.0133311748504639, |
|
"learning_rate": 2.446404901274486e-06, |
|
"loss": 0.0457, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 4.049107142857143, |
|
"grad_norm": 0.8490503430366516, |
|
"learning_rate": 2.436260062609173e-06, |
|
"loss": 0.0424, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 4.053571428571429, |
|
"grad_norm": 0.9454727172851562, |
|
"learning_rate": 2.4261295311528632e-06, |
|
"loss": 0.038, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 4.058035714285714, |
|
"grad_norm": 0.9559707641601562, |
|
"learning_rate": 2.4160133634289804e-06, |
|
"loss": 0.0424, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 4.0625, |
|
"grad_norm": 1.1564035415649414, |
|
"learning_rate": 2.4059116158808147e-06, |
|
"loss": 0.0504, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 4.066964285714286, |
|
"grad_norm": 1.1515920162200928, |
|
"learning_rate": 2.395824344871193e-06, |
|
"loss": 0.0468, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 4.071428571428571, |
|
"grad_norm": 1.1842188835144043, |
|
"learning_rate": 2.385751606682167e-06, |
|
"loss": 0.0474, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 4.071428571428571, |
|
"eval_loss": 1.0089935064315796, |
|
"eval_runtime": 3.4591, |
|
"eval_samples_per_second": 17.346, |
|
"eval_steps_per_second": 1.156, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 4.075892857142857, |
|
"grad_norm": 0.9655628800392151, |
|
"learning_rate": 2.3756934575147117e-06, |
|
"loss": 0.0399, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 4.080357142857143, |
|
"grad_norm": 1.0869725942611694, |
|
"learning_rate": 2.365649953488393e-06, |
|
"loss": 0.0383, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 4.084821428571429, |
|
"grad_norm": 1.3358078002929688, |
|
"learning_rate": 2.3556211506410708e-06, |
|
"loss": 0.0382, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 4.089285714285714, |
|
"grad_norm": 1.2573343515396118, |
|
"learning_rate": 2.3456071049285717e-06, |
|
"loss": 0.0459, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 4.09375, |
|
"grad_norm": 1.1886146068572998, |
|
"learning_rate": 2.3356078722243963e-06, |
|
"loss": 0.0407, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 4.098214285714286, |
|
"grad_norm": 1.2139469385147095, |
|
"learning_rate": 2.325623508319385e-06, |
|
"loss": 0.047, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 4.102678571428571, |
|
"grad_norm": 0.9977007508277893, |
|
"learning_rate": 2.3156540689214227e-06, |
|
"loss": 0.0339, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 4.107142857142857, |
|
"grad_norm": 1.0386406183242798, |
|
"learning_rate": 2.3056996096551228e-06, |
|
"loss": 0.0383, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 4.111607142857143, |
|
"grad_norm": 1.2945815324783325, |
|
"learning_rate": 2.2957601860615152e-06, |
|
"loss": 0.0442, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 4.116071428571429, |
|
"grad_norm": 1.2519358396530151, |
|
"learning_rate": 2.285835853597742e-06, |
|
"loss": 0.0429, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 4.120535714285714, |
|
"grad_norm": 1.4103801250457764, |
|
"learning_rate": 2.2759266676367345e-06, |
|
"loss": 0.0538, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 4.125, |
|
"grad_norm": 1.0328574180603027, |
|
"learning_rate": 2.266032683466928e-06, |
|
"loss": 0.0382, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 4.129464285714286, |
|
"grad_norm": 1.3296153545379639, |
|
"learning_rate": 2.2561539562919265e-06, |
|
"loss": 0.0408, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 4.133928571428571, |
|
"grad_norm": 1.1139670610427856, |
|
"learning_rate": 2.246290541230214e-06, |
|
"loss": 0.0433, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 4.138392857142857, |
|
"grad_norm": 1.2616791725158691, |
|
"learning_rate": 2.236442493314839e-06, |
|
"loss": 0.0412, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 4.142857142857143, |
|
"grad_norm": 1.1220238208770752, |
|
"learning_rate": 2.2266098674931094e-06, |
|
"loss": 0.0414, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 4.147321428571429, |
|
"grad_norm": 1.056321620941162, |
|
"learning_rate": 2.216792718626286e-06, |
|
"loss": 0.0457, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 4.151785714285714, |
|
"grad_norm": 1.0423365831375122, |
|
"learning_rate": 2.2069911014892712e-06, |
|
"loss": 0.0386, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 4.15625, |
|
"grad_norm": 0.9723503589630127, |
|
"learning_rate": 2.197205070770313e-06, |
|
"loss": 0.0316, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 4.160714285714286, |
|
"grad_norm": 1.3372763395309448, |
|
"learning_rate": 2.1874346810706925e-06, |
|
"loss": 0.043, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 4.165178571428571, |
|
"grad_norm": 0.9659892916679382, |
|
"learning_rate": 2.177679986904422e-06, |
|
"loss": 0.0361, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 4.169642857142857, |
|
"grad_norm": 1.2238346338272095, |
|
"learning_rate": 2.1679410426979412e-06, |
|
"loss": 0.0546, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 4.174107142857143, |
|
"grad_norm": 1.1462628841400146, |
|
"learning_rate": 2.1582179027898102e-06, |
|
"loss": 0.0485, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 4.178571428571429, |
|
"grad_norm": 1.2392868995666504, |
|
"learning_rate": 2.148510621430414e-06, |
|
"loss": 0.0453, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 4.183035714285714, |
|
"grad_norm": 0.8607609272003174, |
|
"learning_rate": 2.1388192527816472e-06, |
|
"loss": 0.032, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 4.1875, |
|
"grad_norm": 1.1780694723129272, |
|
"learning_rate": 2.1291438509166236e-06, |
|
"loss": 0.0465, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 4.191964285714286, |
|
"grad_norm": 1.0053000450134277, |
|
"learning_rate": 2.119484469819371e-06, |
|
"loss": 0.0456, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 4.196428571428571, |
|
"grad_norm": 0.9943235516548157, |
|
"learning_rate": 2.109841163384528e-06, |
|
"loss": 0.0408, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 4.200892857142857, |
|
"grad_norm": 1.1572273969650269, |
|
"learning_rate": 2.100213985417045e-06, |
|
"loss": 0.041, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 4.205357142857143, |
|
"grad_norm": 0.9679527878761292, |
|
"learning_rate": 2.090602989631878e-06, |
|
"loss": 0.0394, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 4.209821428571429, |
|
"grad_norm": 0.9852113723754883, |
|
"learning_rate": 2.081008229653706e-06, |
|
"loss": 0.041, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 4.214285714285714, |
|
"grad_norm": 1.0761950016021729, |
|
"learning_rate": 2.071429759016607e-06, |
|
"loss": 0.0426, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 4.21875, |
|
"grad_norm": 1.483879804611206, |
|
"learning_rate": 2.061867631163781e-06, |
|
"loss": 0.0506, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 4.223214285714286, |
|
"grad_norm": 1.103798270225525, |
|
"learning_rate": 2.0523218994472408e-06, |
|
"loss": 0.0352, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 4.227678571428571, |
|
"grad_norm": 1.1449997425079346, |
|
"learning_rate": 2.0427926171275157e-06, |
|
"loss": 0.0416, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 4.232142857142857, |
|
"grad_norm": 1.024032711982727, |
|
"learning_rate": 2.033279837373359e-06, |
|
"loss": 0.0421, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 4.236607142857143, |
|
"grad_norm": 0.9040204882621765, |
|
"learning_rate": 2.023783613261439e-06, |
|
"loss": 0.0357, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 4.241071428571429, |
|
"grad_norm": 1.1004376411437988, |
|
"learning_rate": 2.0143039977760663e-06, |
|
"loss": 0.0468, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 4.245535714285714, |
|
"grad_norm": 1.3727236986160278, |
|
"learning_rate": 2.0048410438088675e-06, |
|
"loss": 0.058, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 1.0734888315200806, |
|
"learning_rate": 1.995394804158516e-06, |
|
"loss": 0.045, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 4.254464285714286, |
|
"grad_norm": 1.0533781051635742, |
|
"learning_rate": 1.9859653315304254e-06, |
|
"loss": 0.0432, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 4.258928571428571, |
|
"grad_norm": 0.9554777145385742, |
|
"learning_rate": 1.976552678536456e-06, |
|
"loss": 0.0429, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 4.263392857142857, |
|
"grad_norm": 1.208379864692688, |
|
"learning_rate": 1.9671568976946257e-06, |
|
"loss": 0.0548, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 4.267857142857143, |
|
"grad_norm": 1.1880334615707397, |
|
"learning_rate": 1.9577780414288066e-06, |
|
"loss": 0.0449, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 4.272321428571429, |
|
"grad_norm": 1.1660935878753662, |
|
"learning_rate": 1.9484161620684524e-06, |
|
"loss": 0.0474, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 4.276785714285714, |
|
"grad_norm": 1.281361699104309, |
|
"learning_rate": 1.939071311848282e-06, |
|
"loss": 0.0468, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 4.28125, |
|
"grad_norm": 1.1080694198608398, |
|
"learning_rate": 1.9297435429080076e-06, |
|
"loss": 0.0491, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 4.285714285714286, |
|
"grad_norm": 0.9572265148162842, |
|
"learning_rate": 1.9204329072920285e-06, |
|
"loss": 0.035, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 4.290178571428571, |
|
"grad_norm": 1.2457669973373413, |
|
"learning_rate": 1.911139456949158e-06, |
|
"loss": 0.049, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 4.294642857142857, |
|
"grad_norm": 1.1637389659881592, |
|
"learning_rate": 1.9018632437323198e-06, |
|
"loss": 0.0481, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 4.299107142857143, |
|
"grad_norm": 1.0944490432739258, |
|
"learning_rate": 1.892604319398259e-06, |
|
"loss": 0.0516, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 4.303571428571429, |
|
"grad_norm": 1.129134178161621, |
|
"learning_rate": 1.883362735607262e-06, |
|
"loss": 0.0405, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 4.308035714285714, |
|
"grad_norm": 1.2214771509170532, |
|
"learning_rate": 1.8741385439228616e-06, |
|
"loss": 0.0455, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 4.3125, |
|
"grad_norm": 1.0837950706481934, |
|
"learning_rate": 1.8649317958115534e-06, |
|
"loss": 0.0445, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 4.316964285714286, |
|
"grad_norm": 1.048578143119812, |
|
"learning_rate": 1.8557425426424989e-06, |
|
"loss": 0.0488, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 4.321428571428571, |
|
"grad_norm": 0.9222402572631836, |
|
"learning_rate": 1.8465708356872592e-06, |
|
"loss": 0.0336, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 4.325892857142857, |
|
"grad_norm": 1.0440882444381714, |
|
"learning_rate": 1.8374167261194826e-06, |
|
"loss": 0.046, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 4.325892857142857, |
|
"eval_loss": 1.0081344842910767, |
|
"eval_runtime": 3.1161, |
|
"eval_samples_per_second": 19.255, |
|
"eval_steps_per_second": 1.284, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 4.330357142857143, |
|
"grad_norm": 1.0037401914596558, |
|
"learning_rate": 1.8282802650146408e-06, |
|
"loss": 0.0423, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 4.334821428571429, |
|
"grad_norm": 1.0888147354125977, |
|
"learning_rate": 1.8191615033497345e-06, |
|
"loss": 0.0426, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 4.339285714285714, |
|
"grad_norm": 1.0715805292129517, |
|
"learning_rate": 1.810060492003008e-06, |
|
"loss": 0.0444, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 4.34375, |
|
"grad_norm": 1.0277068614959717, |
|
"learning_rate": 1.800977281753671e-06, |
|
"loss": 0.0428, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 4.348214285714286, |
|
"grad_norm": 0.999724268913269, |
|
"learning_rate": 1.7919119232816049e-06, |
|
"loss": 0.0387, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 4.352678571428571, |
|
"grad_norm": 0.9766676425933838, |
|
"learning_rate": 1.7828644671670943e-06, |
|
"loss": 0.0393, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 4.357142857142857, |
|
"grad_norm": 1.0714170932769775, |
|
"learning_rate": 1.773834963890534e-06, |
|
"loss": 0.0458, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 4.361607142857143, |
|
"grad_norm": 0.8471158742904663, |
|
"learning_rate": 1.764823463832151e-06, |
|
"loss": 0.0364, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 4.366071428571429, |
|
"grad_norm": 1.0079231262207031, |
|
"learning_rate": 1.7558300172717234e-06, |
|
"loss": 0.0364, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 4.370535714285714, |
|
"grad_norm": 1.1584757566452026, |
|
"learning_rate": 1.7468546743882997e-06, |
|
"loss": 0.0442, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 4.375, |
|
"grad_norm": 1.0798187255859375, |
|
"learning_rate": 1.7378974852599203e-06, |
|
"loss": 0.0366, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 4.379464285714286, |
|
"grad_norm": 1.2951469421386719, |
|
"learning_rate": 1.7289584998633307e-06, |
|
"loss": 0.0501, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 4.383928571428571, |
|
"grad_norm": 1.370492696762085, |
|
"learning_rate": 1.7200377680737148e-06, |
|
"loss": 0.0511, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 4.388392857142857, |
|
"grad_norm": 1.24025297164917, |
|
"learning_rate": 1.7111353396644071e-06, |
|
"loss": 0.0433, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 4.392857142857143, |
|
"grad_norm": 1.1378741264343262, |
|
"learning_rate": 1.7022512643066196e-06, |
|
"loss": 0.0417, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 4.397321428571429, |
|
"grad_norm": 1.17164146900177, |
|
"learning_rate": 1.6933855915691622e-06, |
|
"loss": 0.0464, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 4.401785714285714, |
|
"grad_norm": 1.1803674697875977, |
|
"learning_rate": 1.6845383709181676e-06, |
|
"loss": 0.0404, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 4.40625, |
|
"grad_norm": 1.1585767269134521, |
|
"learning_rate": 1.675709651716817e-06, |
|
"loss": 0.0417, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 4.410714285714286, |
|
"grad_norm": 1.1010210514068604, |
|
"learning_rate": 1.6668994832250556e-06, |
|
"loss": 0.0429, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 4.415178571428571, |
|
"grad_norm": 1.0791648626327515, |
|
"learning_rate": 1.6581079145993323e-06, |
|
"loss": 0.0457, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 4.419642857142857, |
|
"grad_norm": 1.114737868309021, |
|
"learning_rate": 1.649334994892314e-06, |
|
"loss": 0.0502, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 4.424107142857143, |
|
"grad_norm": 1.1722322702407837, |
|
"learning_rate": 1.640580773052618e-06, |
|
"loss": 0.0444, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 4.428571428571429, |
|
"grad_norm": 1.1338437795639038, |
|
"learning_rate": 1.6318452979245355e-06, |
|
"loss": 0.0462, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 4.433035714285714, |
|
"grad_norm": 1.1225347518920898, |
|
"learning_rate": 1.6231286182477555e-06, |
|
"loss": 0.049, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 4.4375, |
|
"grad_norm": 1.1152468919754028, |
|
"learning_rate": 1.6144307826571085e-06, |
|
"loss": 0.0474, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 4.441964285714286, |
|
"grad_norm": 0.970764696598053, |
|
"learning_rate": 1.6057518396822724e-06, |
|
"loss": 0.0474, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 4.446428571428571, |
|
"grad_norm": 1.2339911460876465, |
|
"learning_rate": 1.5970918377475208e-06, |
|
"loss": 0.0497, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 4.450892857142857, |
|
"grad_norm": 1.1019359827041626, |
|
"learning_rate": 1.5884508251714436e-06, |
|
"loss": 0.0419, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 4.455357142857143, |
|
"grad_norm": 1.013564109802246, |
|
"learning_rate": 1.5798288501666793e-06, |
|
"loss": 0.0405, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 4.459821428571429, |
|
"grad_norm": 1.2404049634933472, |
|
"learning_rate": 1.5712259608396462e-06, |
|
"loss": 0.0437, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 4.464285714285714, |
|
"grad_norm": 0.8601024746894836, |
|
"learning_rate": 1.5626422051902709e-06, |
|
"loss": 0.0374, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.46875, |
|
"grad_norm": 1.117348074913025, |
|
"learning_rate": 1.5540776311117304e-06, |
|
"loss": 0.042, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 4.473214285714286, |
|
"grad_norm": 1.1788359880447388, |
|
"learning_rate": 1.5455322863901704e-06, |
|
"loss": 0.0436, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 4.477678571428571, |
|
"grad_norm": 1.2720320224761963, |
|
"learning_rate": 1.5370062187044502e-06, |
|
"loss": 0.0491, |
|
"step": 1003 |
|
}, |
|
{ |
|
"epoch": 4.482142857142857, |
|
"grad_norm": 1.1377935409545898, |
|
"learning_rate": 1.5284994756258718e-06, |
|
"loss": 0.0447, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 4.486607142857143, |
|
"grad_norm": 1.1015208959579468, |
|
"learning_rate": 1.5200121046179151e-06, |
|
"loss": 0.0441, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 4.491071428571429, |
|
"grad_norm": 1.0011322498321533, |
|
"learning_rate": 1.511544153035975e-06, |
|
"loss": 0.0407, |
|
"step": 1006 |
|
}, |
|
{ |
|
"epoch": 4.495535714285714, |
|
"grad_norm": 1.0262796878814697, |
|
"learning_rate": 1.5030956681270903e-06, |
|
"loss": 0.0368, |
|
"step": 1007 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.9674072861671448, |
|
"learning_rate": 1.4946666970296932e-06, |
|
"loss": 0.0366, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 4.504464285714286, |
|
"grad_norm": 1.2644139528274536, |
|
"learning_rate": 1.486257286773331e-06, |
|
"loss": 0.0472, |
|
"step": 1009 |
|
}, |
|
{ |
|
"epoch": 4.508928571428571, |
|
"grad_norm": 1.3127970695495605, |
|
"learning_rate": 1.4778674842784168e-06, |
|
"loss": 0.0517, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 4.513392857142857, |
|
"grad_norm": 0.9656462669372559, |
|
"learning_rate": 1.4694973363559539e-06, |
|
"loss": 0.0347, |
|
"step": 1011 |
|
}, |
|
{ |
|
"epoch": 4.517857142857143, |
|
"grad_norm": 1.1587086915969849, |
|
"learning_rate": 1.4611468897072933e-06, |
|
"loss": 0.0403, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 4.522321428571429, |
|
"grad_norm": 1.2970781326293945, |
|
"learning_rate": 1.4528161909238569e-06, |
|
"loss": 0.0361, |
|
"step": 1013 |
|
}, |
|
{ |
|
"epoch": 4.526785714285714, |
|
"grad_norm": 1.1997908353805542, |
|
"learning_rate": 1.44450528648688e-06, |
|
"loss": 0.0469, |
|
"step": 1014 |
|
}, |
|
{ |
|
"epoch": 4.53125, |
|
"grad_norm": 1.2376347780227661, |
|
"learning_rate": 1.4362142227671607e-06, |
|
"loss": 0.0453, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 4.535714285714286, |
|
"grad_norm": 1.1609876155853271, |
|
"learning_rate": 1.427943046024793e-06, |
|
"loss": 0.0404, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 4.540178571428571, |
|
"grad_norm": 1.313492774963379, |
|
"learning_rate": 1.4196918024089133e-06, |
|
"loss": 0.0503, |
|
"step": 1017 |
|
}, |
|
{ |
|
"epoch": 4.544642857142857, |
|
"grad_norm": 1.1651771068572998, |
|
"learning_rate": 1.4114605379574342e-06, |
|
"loss": 0.0426, |
|
"step": 1018 |
|
}, |
|
{ |
|
"epoch": 4.549107142857143, |
|
"grad_norm": 1.1554261445999146, |
|
"learning_rate": 1.4032492985968057e-06, |
|
"loss": 0.0516, |
|
"step": 1019 |
|
}, |
|
{ |
|
"epoch": 4.553571428571429, |
|
"grad_norm": 1.2204504013061523, |
|
"learning_rate": 1.3950581301417365e-06, |
|
"loss": 0.0426, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 4.558035714285714, |
|
"grad_norm": 1.2661011219024658, |
|
"learning_rate": 1.3868870782949565e-06, |
|
"loss": 0.0505, |
|
"step": 1021 |
|
}, |
|
{ |
|
"epoch": 4.5625, |
|
"grad_norm": 0.979673445224762, |
|
"learning_rate": 1.3787361886469509e-06, |
|
"loss": 0.0422, |
|
"step": 1022 |
|
}, |
|
{ |
|
"epoch": 4.566964285714286, |
|
"grad_norm": 1.139962911605835, |
|
"learning_rate": 1.3706055066757116e-06, |
|
"loss": 0.0474, |
|
"step": 1023 |
|
}, |
|
{ |
|
"epoch": 4.571428571428571, |
|
"grad_norm": 1.2725175619125366, |
|
"learning_rate": 1.3624950777464828e-06, |
|
"loss": 0.0467, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 4.575892857142857, |
|
"grad_norm": 1.189316987991333, |
|
"learning_rate": 1.3544049471115017e-06, |
|
"loss": 0.0438, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 4.580357142857143, |
|
"grad_norm": 0.9822104573249817, |
|
"learning_rate": 1.3463351599097552e-06, |
|
"loss": 0.0386, |
|
"step": 1026 |
|
}, |
|
{ |
|
"epoch": 4.580357142857143, |
|
"eval_loss": 1.028843641281128, |
|
"eval_runtime": 3.1044, |
|
"eval_samples_per_second": 19.327, |
|
"eval_steps_per_second": 1.288, |
|
"step": 1026 |
|
}, |
    {
      "epoch": 4.584821428571429,
      "grad_norm": 1.140833854675293,
      "learning_rate": 1.3382857611667233e-06,
      "loss": 0.0433,
      "step": 1027
    },
    {
      "epoch": 4.589285714285714,
      "grad_norm": 0.9858164191246033,
      "learning_rate": 1.3302567957941265e-06,
      "loss": 0.0394,
      "step": 1028
    },
    {
      "epoch": 4.59375,
      "grad_norm": 1.0154980421066284,
      "learning_rate": 1.3222483085896786e-06,
      "loss": 0.0412,
      "step": 1029
    },
    {
      "epoch": 4.598214285714286,
      "grad_norm": 1.2253535985946655,
      "learning_rate": 1.3142603442368346e-06,
      "loss": 0.0508,
      "step": 1030
    },
    {
      "epoch": 4.602678571428571,
      "grad_norm": 1.0019500255584717,
      "learning_rate": 1.3062929473045442e-06,
      "loss": 0.041,
      "step": 1031
    },
    {
      "epoch": 4.607142857142857,
      "grad_norm": 1.0669821500778198,
      "learning_rate": 1.2983461622469953e-06,
      "loss": 0.0456,
      "step": 1032
    },
    {
      "epoch": 4.611607142857143,
      "grad_norm": 1.2003577947616577,
      "learning_rate": 1.290420033403377e-06,
      "loss": 0.0423,
      "step": 1033
    },
    {
      "epoch": 4.616071428571429,
      "grad_norm": 1.1979588270187378,
      "learning_rate": 1.2825146049976244e-06,
      "loss": 0.0426,
      "step": 1034
    },
    {
      "epoch": 4.620535714285714,
      "grad_norm": 1.0805418491363525,
      "learning_rate": 1.2746299211381755e-06,
      "loss": 0.0453,
      "step": 1035
    },
    {
      "epoch": 4.625,
      "grad_norm": 0.9814077615737915,
      "learning_rate": 1.2667660258177241e-06,
      "loss": 0.0398,
      "step": 1036
    },
    {
      "epoch": 4.629464285714286,
      "grad_norm": 0.8897956013679504,
      "learning_rate": 1.258922962912969e-06,
      "loss": 0.0334,
      "step": 1037
    },
    {
      "epoch": 4.633928571428571,
      "grad_norm": 1.0321533679962158,
      "learning_rate": 1.2511007761843839e-06,
      "loss": 0.0467,
      "step": 1038
    },
    {
      "epoch": 4.638392857142857,
      "grad_norm": 1.1807042360305786,
      "learning_rate": 1.2432995092759538e-06,
      "loss": 0.0459,
      "step": 1039
    },
    {
      "epoch": 4.642857142857143,
      "grad_norm": 1.181643009185791,
      "learning_rate": 1.2355192057149477e-06,
      "loss": 0.043,
      "step": 1040
    },
    {
      "epoch": 4.647321428571429,
      "grad_norm": 1.0699180364608765,
      "learning_rate": 1.2277599089116662e-06,
      "loss": 0.043,
      "step": 1041
    },
    {
      "epoch": 4.651785714285714,
      "grad_norm": 1.0686554908752441,
      "learning_rate": 1.220021662159204e-06,
      "loss": 0.0533,
      "step": 1042
    },
    {
      "epoch": 4.65625,
      "grad_norm": 1.014110803604126,
      "learning_rate": 1.2123045086332076e-06,
      "loss": 0.0417,
      "step": 1043
    },
    {
      "epoch": 4.660714285714286,
      "grad_norm": 1.0047358274459839,
      "learning_rate": 1.204608491391628e-06,
      "loss": 0.0355,
      "step": 1044
    },
    {
      "epoch": 4.665178571428571,
      "grad_norm": 0.8971364498138428,
      "learning_rate": 1.1969336533744965e-06,
      "loss": 0.0361,
      "step": 1045
    },
    {
      "epoch": 4.669642857142857,
      "grad_norm": 1.230128288269043,
      "learning_rate": 1.1892800374036632e-06,
      "loss": 0.0462,
      "step": 1046
    },
    {
      "epoch": 4.674107142857143,
      "grad_norm": 1.1491485834121704,
      "learning_rate": 1.181647686182576e-06,
      "loss": 0.0449,
      "step": 1047
    },
    {
      "epoch": 4.678571428571429,
      "grad_norm": 1.171077847480774,
      "learning_rate": 1.174036642296036e-06,
      "loss": 0.0382,
      "step": 1048
    },
    {
      "epoch": 4.683035714285714,
      "grad_norm": 1.138710856437683,
      "learning_rate": 1.166446948209957e-06,
      "loss": 0.0472,
      "step": 1049
    },
    {
      "epoch": 4.6875,
      "grad_norm": 0.9967400431632996,
      "learning_rate": 1.1588786462711347e-06,
      "loss": 0.0438,
      "step": 1050
    },
    {
      "epoch": 4.691964285714286,
      "grad_norm": 0.9416896104812622,
      "learning_rate": 1.1513317787070002e-06,
      "loss": 0.0326,
      "step": 1051
    },
    {
      "epoch": 4.696428571428571,
      "grad_norm": 0.9042790532112122,
      "learning_rate": 1.1438063876254025e-06,
      "loss": 0.0376,
      "step": 1052
    },
    {
      "epoch": 4.700892857142857,
      "grad_norm": 1.1666338443756104,
      "learning_rate": 1.1363025150143508e-06,
      "loss": 0.0467,
      "step": 1053
    },
    {
      "epoch": 4.705357142857143,
      "grad_norm": 1.1811097860336304,
      "learning_rate": 1.1288202027417996e-06,
      "loss": 0.0456,
      "step": 1054
    },
    {
      "epoch": 4.709821428571429,
      "grad_norm": 1.1350547075271606,
      "learning_rate": 1.1213594925554027e-06,
      "loss": 0.0482,
      "step": 1055
    },
    {
      "epoch": 4.714285714285714,
      "grad_norm": 1.0363948345184326,
      "learning_rate": 1.1139204260822874e-06,
      "loss": 0.0516,
      "step": 1056
    },
    {
      "epoch": 4.71875,
      "grad_norm": 1.008671760559082,
      "learning_rate": 1.1065030448288196e-06,
      "loss": 0.0436,
      "step": 1057
    },
    {
      "epoch": 4.723214285714286,
      "grad_norm": 1.1995580196380615,
      "learning_rate": 1.0991073901803692e-06,
      "loss": 0.0464,
      "step": 1058
    },
    {
      "epoch": 4.727678571428571,
      "grad_norm": 1.1144418716430664,
      "learning_rate": 1.091733503401085e-06,
      "loss": 0.0439,
      "step": 1059
    },
    {
      "epoch": 4.732142857142857,
      "grad_norm": 1.1510818004608154,
      "learning_rate": 1.0843814256336622e-06,
      "loss": 0.0443,
      "step": 1060
    },
    {
      "epoch": 4.736607142857143,
      "grad_norm": 0.9981248378753662,
      "learning_rate": 1.0770511978991116e-06,
      "loss": 0.0414,
      "step": 1061
    },
    {
      "epoch": 4.741071428571429,
      "grad_norm": 1.0740482807159424,
      "learning_rate": 1.0697428610965275e-06,
      "loss": 0.0475,
      "step": 1062
    },
    {
      "epoch": 4.745535714285714,
      "grad_norm": 1.2632689476013184,
      "learning_rate": 1.0624564560028723e-06,
      "loss": 0.0464,
      "step": 1063
    },
    {
      "epoch": 4.75,
      "grad_norm": 0.9502501487731934,
      "learning_rate": 1.0551920232727309e-06,
      "loss": 0.0399,
      "step": 1064
    },
    {
      "epoch": 4.754464285714286,
      "grad_norm": 1.2378613948822021,
      "learning_rate": 1.0479496034380988e-06,
      "loss": 0.0504,
      "step": 1065
    },
    {
      "epoch": 4.758928571428571,
      "grad_norm": 1.0768165588378906,
      "learning_rate": 1.0407292369081479e-06,
      "loss": 0.0459,
      "step": 1066
    },
    {
      "epoch": 4.763392857142857,
      "grad_norm": 1.039918065071106,
      "learning_rate": 1.0335309639690041e-06,
      "loss": 0.0452,
      "step": 1067
    },
    {
      "epoch": 4.767857142857143,
      "grad_norm": 0.9375714659690857,
      "learning_rate": 1.0263548247835246e-06,
      "loss": 0.0419,
      "step": 1068
    },
    {
      "epoch": 4.772321428571429,
      "grad_norm": 0.9457955360412598,
      "learning_rate": 1.0192008593910643e-06,
      "loss": 0.0444,
      "step": 1069
    },
    {
      "epoch": 4.776785714285714,
      "grad_norm": 1.225278377532959,
      "learning_rate": 1.0120691077072643e-06,
      "loss": 0.0433,
      "step": 1070
    },
    {
      "epoch": 4.78125,
      "grad_norm": 1.281240463256836,
      "learning_rate": 1.004959609523824e-06,
      "loss": 0.0497,
      "step": 1071
    },
    {
      "epoch": 4.785714285714286,
      "grad_norm": 1.1898356676101685,
      "learning_rate": 9.978724045082772e-07,
      "loss": 0.0401,
      "step": 1072
    },
    {
      "epoch": 4.790178571428571,
      "grad_norm": 1.2565916776657104,
      "learning_rate": 9.908075322037738e-07,
      "loss": 0.0442,
      "step": 1073
    },
    {
      "epoch": 4.794642857142857,
      "grad_norm": 0.9693619012832642,
      "learning_rate": 9.83765032028858e-07,
      "loss": 0.0408,
      "step": 1074
    },
    {
      "epoch": 4.799107142857143,
      "grad_norm": 1.2235596179962158,
      "learning_rate": 9.767449432772485e-07,
      "loss": 0.0423,
      "step": 1075
    },
    {
      "epoch": 4.803571428571429,
      "grad_norm": 1.3905706405639648,
      "learning_rate": 9.697473051176173e-07,
      "loss": 0.0506,
      "step": 1076
    },
    {
      "epoch": 4.808035714285714,
      "grad_norm": 1.111341953277588,
      "learning_rate": 9.627721565933749e-07,
      "loss": 0.0409,
      "step": 1077
    },
    {
      "epoch": 4.8125,
      "grad_norm": 1.2336980104446411,
      "learning_rate": 9.558195366224508e-07,
      "loss": 0.0487,
      "step": 1078
    },
    {
      "epoch": 4.816964285714286,
      "grad_norm": 1.066314458847046,
      "learning_rate": 9.488894839970758e-07,
      "loss": 0.04,
      "step": 1079
    },
    {
      "epoch": 4.821428571428571,
      "grad_norm": 0.9892422556877136,
      "learning_rate": 9.419820373835668e-07,
      "loss": 0.0417,
      "step": 1080
    },
    {
      "epoch": 4.825892857142857,
      "grad_norm": 1.1529948711395264,
      "learning_rate": 9.350972353221052e-07,
      "loss": 0.0433,
      "step": 1081
    },
    {
      "epoch": 4.830357142857143,
      "grad_norm": 1.2176202535629272,
      "learning_rate": 9.282351162265363e-07,
      "loss": 0.0456,
      "step": 1082
    },
    {
      "epoch": 4.834821428571429,
      "grad_norm": 1.2151131629943848,
      "learning_rate": 9.213957183841355e-07,
      "loss": 0.0425,
      "step": 1083
    },
    {
      "epoch": 4.834821428571429,
      "eval_loss": 1.0552717447280884,
      "eval_runtime": 3.0996,
      "eval_samples_per_second": 19.357,
      "eval_steps_per_second": 1.29,
      "step": 1083
    },
    {
      "epoch": 4.839285714285714,
      "grad_norm": 1.2349426746368408,
      "learning_rate": 9.145790799554101e-07,
      "loss": 0.0467,
      "step": 1084
    },
    {
      "epoch": 4.84375,
      "grad_norm": 1.0061850547790527,
      "learning_rate": 9.077852389738817e-07,
      "loss": 0.0407,
      "step": 1085
    },
    {
      "epoch": 4.848214285714286,
      "grad_norm": 1.1742695569992065,
      "learning_rate": 9.010142333458698e-07,
      "loss": 0.0371,
      "step": 1086
    },
    {
      "epoch": 4.852678571428571,
      "grad_norm": 1.044801950454712,
      "learning_rate": 8.942661008502875e-07,
      "loss": 0.0409,
      "step": 1087
    },
    {
      "epoch": 4.857142857142857,
      "grad_norm": 1.2313170433044434,
      "learning_rate": 8.87540879138421e-07,
      "loss": 0.0474,
      "step": 1088
    },
    {
      "epoch": 4.861607142857143,
      "grad_norm": 1.2384284734725952,
      "learning_rate": 8.808386057337353e-07,
      "loss": 0.0502,
      "step": 1089
    },
    {
      "epoch": 4.866071428571429,
      "grad_norm": 1.2501604557037354,
      "learning_rate": 8.741593180316439e-07,
      "loss": 0.046,
      "step": 1090
    },
    {
      "epoch": 4.870535714285714,
      "grad_norm": 1.151362419128418,
      "learning_rate": 8.675030532993193e-07,
      "loss": 0.0411,
      "step": 1091
    },
    {
      "epoch": 4.875,
      "grad_norm": 1.1054540872573853,
      "learning_rate": 8.608698486754739e-07,
      "loss": 0.0453,
      "step": 1092
    },
    {
      "epoch": 4.879464285714286,
      "grad_norm": 1.1711599826812744,
      "learning_rate": 8.542597411701563e-07,
      "loss": 0.043,
      "step": 1093
    },
    {
      "epoch": 4.883928571428571,
      "grad_norm": 1.048905372619629,
      "learning_rate": 8.476727676645453e-07,
      "loss": 0.0417,
      "step": 1094
    },
    {
      "epoch": 4.888392857142857,
      "grad_norm": 0.9673648476600647,
      "learning_rate": 8.411089649107396e-07,
      "loss": 0.0388,
      "step": 1095
    },
    {
      "epoch": 4.892857142857143,
      "grad_norm": 1.09621000289917,
      "learning_rate": 8.345683695315633e-07,
      "loss": 0.0392,
      "step": 1096
    },
    {
      "epoch": 4.897321428571429,
      "grad_norm": 1.307114601135254,
      "learning_rate": 8.280510180203476e-07,
      "loss": 0.0491,
      "step": 1097
    },
    {
      "epoch": 4.901785714285714,
      "grad_norm": 1.013657569885254,
      "learning_rate": 8.215569467407386e-07,
      "loss": 0.0463,
      "step": 1098
    },
    {
      "epoch": 4.90625,
      "grad_norm": 1.0433632135391235,
      "learning_rate": 8.150861919264887e-07,
      "loss": 0.0415,
      "step": 1099
    },
    {
      "epoch": 4.910714285714286,
      "grad_norm": 1.2945899963378906,
      "learning_rate": 8.086387896812546e-07,
      "loss": 0.0503,
      "step": 1100
    },
    {
      "epoch": 4.915178571428571,
      "grad_norm": 1.0583516359329224,
      "learning_rate": 8.022147759784016e-07,
      "loss": 0.0426,
      "step": 1101
    },
    {
      "epoch": 4.919642857142857,
      "grad_norm": 1.0814223289489746,
      "learning_rate": 7.958141866607897e-07,
      "loss": 0.0438,
      "step": 1102
    },
    {
      "epoch": 4.924107142857143,
      "grad_norm": 1.4671058654785156,
      "learning_rate": 7.894370574405928e-07,
      "loss": 0.0489,
      "step": 1103
    },
    {
      "epoch": 4.928571428571429,
      "grad_norm": 0.9236575961112976,
      "learning_rate": 7.830834238990803e-07,
      "loss": 0.0416,
      "step": 1104
    },
    {
      "epoch": 4.933035714285714,
      "grad_norm": 1.0398495197296143,
      "learning_rate": 7.767533214864331e-07,
      "loss": 0.046,
      "step": 1105
    },
    {
      "epoch": 4.9375,
      "grad_norm": 1.139332890510559,
      "learning_rate": 7.70446785521533e-07,
      "loss": 0.0411,
      "step": 1106
    },
    {
      "epoch": 4.941964285714286,
      "grad_norm": 1.0621843338012695,
      "learning_rate": 7.641638511917806e-07,
      "loss": 0.0424,
      "step": 1107
    },
    {
      "epoch": 4.946428571428571,
      "grad_norm": 1.08333420753479,
      "learning_rate": 7.579045535528878e-07,
      "loss": 0.042,
      "step": 1108
    },
    {
      "epoch": 4.950892857142857,
      "grad_norm": 0.8833259344100952,
      "learning_rate": 7.516689275286813e-07,
      "loss": 0.0381,
      "step": 1109
    },
    {
      "epoch": 4.955357142857143,
      "grad_norm": 1.1298121213912964,
      "learning_rate": 7.454570079109164e-07,
      "loss": 0.0437,
      "step": 1110
    },
    {
      "epoch": 4.959821428571429,
      "grad_norm": 0.9060654044151306,
      "learning_rate": 7.392688293590767e-07,
      "loss": 0.037,
      "step": 1111
    },
    {
      "epoch": 4.964285714285714,
      "grad_norm": 0.8614628314971924,
      "learning_rate": 7.331044264001836e-07,
      "loss": 0.0341,
      "step": 1112
    },
    {
      "epoch": 4.96875,
      "grad_norm": 1.1477363109588623,
      "learning_rate": 7.269638334285973e-07,
      "loss": 0.0379,
      "step": 1113
    },
    {
      "epoch": 4.973214285714286,
      "grad_norm": 1.0552114248275757,
      "learning_rate": 7.208470847058387e-07,
      "loss": 0.052,
      "step": 1114
    },
    {
      "epoch": 4.977678571428571,
      "grad_norm": 1.0197445154190063,
      "learning_rate": 7.147542143603806e-07,
      "loss": 0.0381,
      "step": 1115
    },
    {
      "epoch": 4.982142857142857,
      "grad_norm": 1.0818407535552979,
      "learning_rate": 7.086852563874705e-07,
      "loss": 0.0451,
      "step": 1116
    },
    {
      "epoch": 4.986607142857143,
      "grad_norm": 1.2270269393920898,
      "learning_rate": 7.026402446489368e-07,
      "loss": 0.0405,
      "step": 1117
    },
    {
      "epoch": 4.991071428571429,
      "grad_norm": 1.078790307044983,
      "learning_rate": 6.966192128729969e-07,
      "loss": 0.0383,
      "step": 1118
    },
    {
      "epoch": 4.995535714285714,
      "grad_norm": 1.2608201503753662,
      "learning_rate": 6.906221946540765e-07,
      "loss": 0.0359,
      "step": 1119
    },
    {
      "epoch": 5.0,
      "grad_norm": 1.2368184328079224,
      "learning_rate": 6.846492234526105e-07,
      "loss": 0.0482,
      "step": 1120
    },
    {
      "epoch": 5.004464285714286,
      "grad_norm": 0.767040491104126,
      "learning_rate": 6.787003325948681e-07,
      "loss": 0.0239,
      "step": 1121
    },
    {
      "epoch": 5.008928571428571,
      "grad_norm": 0.5072701573371887,
      "learning_rate": 6.727755552727618e-07,
      "loss": 0.0208,
      "step": 1122
    },
    {
      "epoch": 5.013392857142857,
      "grad_norm": 0.5942046642303467,
      "learning_rate": 6.668749245436603e-07,
      "loss": 0.0231,
      "step": 1123
    },
    {
      "epoch": 5.017857142857143,
      "grad_norm": 0.6232619285583496,
      "learning_rate": 6.609984733302069e-07,
      "loss": 0.0291,
      "step": 1124
    },
    {
      "epoch": 5.022321428571429,
      "grad_norm": 0.7308699488639832,
      "learning_rate": 6.551462344201356e-07,
      "loss": 0.0275,
      "step": 1125
    },
    {
      "epoch": 5.026785714285714,
      "grad_norm": 0.6961386799812317,
      "learning_rate": 6.493182404660884e-07,
      "loss": 0.0287,
      "step": 1126
    },
    {
      "epoch": 5.03125,
      "grad_norm": 0.6047128438949585,
      "learning_rate": 6.435145239854279e-07,
      "loss": 0.0264,
      "step": 1127
    },
    {
      "epoch": 5.035714285714286,
      "grad_norm": 0.6986120939254761,
      "learning_rate": 6.377351173600649e-07,
      "loss": 0.0282,
      "step": 1128
    },
    {
      "epoch": 5.040178571428571,
      "grad_norm": 0.56222003698349,
      "learning_rate": 6.319800528362713e-07,
      "loss": 0.0205,
      "step": 1129
    },
    {
      "epoch": 5.044642857142857,
      "grad_norm": 0.6198891997337341,
      "learning_rate": 6.262493625245017e-07,
      "loss": 0.0212,
      "step": 1130
    },
    {
      "epoch": 5.049107142857143,
      "grad_norm": 0.7689740657806396,
      "learning_rate": 6.205430783992163e-07,
      "loss": 0.032,
      "step": 1131
    },
    {
      "epoch": 5.053571428571429,
      "grad_norm": 0.5175718665122986,
      "learning_rate": 6.148612322986963e-07,
      "loss": 0.021,
      "step": 1132
    },
    {
      "epoch": 5.058035714285714,
      "grad_norm": 0.551143229007721,
      "learning_rate": 6.092038559248772e-07,
      "loss": 0.0236,
      "step": 1133
    },
    {
      "epoch": 5.0625,
      "grad_norm": 0.5913689136505127,
      "learning_rate": 6.035709808431585e-07,
      "loss": 0.0207,
      "step": 1134
    },
    {
      "epoch": 5.066964285714286,
      "grad_norm": 0.6526609063148499,
      "learning_rate": 5.979626384822384e-07,
      "loss": 0.0267,
      "step": 1135
    },
    {
      "epoch": 5.071428571428571,
      "grad_norm": 0.5639797449111938,
      "learning_rate": 5.923788601339348e-07,
      "loss": 0.0195,
      "step": 1136
    },
    {
      "epoch": 5.075892857142857,
      "grad_norm": 0.5921444892883301,
      "learning_rate": 5.868196769530085e-07,
      "loss": 0.0211,
      "step": 1137
    },
    {
      "epoch": 5.080357142857143,
      "grad_norm": 0.6509416103363037,
      "learning_rate": 5.81285119956993e-07,
      "loss": 0.0211,
      "step": 1138
    },
    {
      "epoch": 5.084821428571429,
      "grad_norm": 0.7350341081619263,
      "learning_rate": 5.757752200260156e-07,
      "loss": 0.0256,
      "step": 1139
    },
    {
      "epoch": 5.089285714285714,
      "grad_norm": 0.581320583820343,
      "learning_rate": 5.702900079026365e-07,
      "loss": 0.0211,
      "step": 1140
    },
    {
      "epoch": 5.089285714285714,
      "eval_loss": 1.1169615983963013,
      "eval_runtime": 3.0991,
      "eval_samples_per_second": 19.361,
      "eval_steps_per_second": 1.291,
      "step": 1140
    },
    {
      "epoch": 5.09375,
      "grad_norm": 0.5776538252830505,
      "learning_rate": 5.648295141916629e-07,
      "loss": 0.021,
      "step": 1141
    },
    {
      "epoch": 5.098214285714286,
      "grad_norm": 0.6957246661186218,
      "learning_rate": 5.593937693599892e-07,
      "loss": 0.0226,
      "step": 1142
    },
    {
      "epoch": 5.102678571428571,
      "grad_norm": 0.6897474527359009,
      "learning_rate": 5.539828037364222e-07,
      "loss": 0.0239,
      "step": 1143
    },
    {
      "epoch": 5.107142857142857,
      "grad_norm": 0.6816636323928833,
      "learning_rate": 5.485966475115118e-07,
      "loss": 0.0254,
      "step": 1144
    },
    {
      "epoch": 5.111607142857143,
      "grad_norm": 0.7038552761077881,
      "learning_rate": 5.432353307373865e-07,
      "loss": 0.0248,
      "step": 1145
    },
    {
      "epoch": 5.116071428571429,
      "grad_norm": 0.8172852993011475,
      "learning_rate": 5.378988833275772e-07,
      "loss": 0.0304,
      "step": 1146
    },
    {
      "epoch": 5.120535714285714,
      "grad_norm": 0.6974570155143738,
      "learning_rate": 5.325873350568641e-07,
      "loss": 0.0236,
      "step": 1147
    },
    {
      "epoch": 5.125,
      "grad_norm": 0.7102988362312317,
      "learning_rate": 5.27300715561093e-07,
      "loss": 0.0264,
      "step": 1148
    },
    {
      "epoch": 5.129464285714286,
      "grad_norm": 0.7948593497276306,
      "learning_rate": 5.220390543370269e-07,
      "loss": 0.0272,
      "step": 1149
    },
    {
      "epoch": 5.133928571428571,
      "grad_norm": 0.659605085849762,
      "learning_rate": 5.16802380742169e-07,
      "loss": 0.0222,
      "step": 1150
    },
    {
      "epoch": 5.138392857142857,
      "grad_norm": 0.6647667288780212,
      "learning_rate": 5.115907239946071e-07,
      "loss": 0.0212,
      "step": 1151
    },
    {
      "epoch": 5.142857142857143,
      "grad_norm": 0.6717812418937683,
      "learning_rate": 5.064041131728456e-07,
      "loss": 0.0218,
      "step": 1152
    },
    {
      "epoch": 5.147321428571429,
      "grad_norm": 0.6710494756698608,
      "learning_rate": 5.012425772156433e-07,
      "loss": 0.0245,
      "step": 1153
    },
    {
      "epoch": 5.151785714285714,
      "grad_norm": 0.7254083752632141,
      "learning_rate": 4.961061449218561e-07,
      "loss": 0.0187,
      "step": 1154
    },
    {
      "epoch": 5.15625,
      "grad_norm": 0.7306315898895264,
      "learning_rate": 4.90994844950272e-07,
      "loss": 0.0235,
      "step": 1155
    },
    {
      "epoch": 5.160714285714286,
      "grad_norm": 0.8266925811767578,
      "learning_rate": 4.859087058194547e-07,
      "loss": 0.0237,
      "step": 1156
    },
    {
      "epoch": 5.165178571428571,
      "grad_norm": 0.9365939497947693,
      "learning_rate": 4.808477559075791e-07,
      "loss": 0.0265,
      "step": 1157
    },
    {
      "epoch": 5.169642857142857,
      "grad_norm": 0.7625580430030823,
      "learning_rate": 4.758120234522819e-07,
      "loss": 0.0249,
      "step": 1158
    },
    {
      "epoch": 5.174107142857143,
      "grad_norm": 0.7768296599388123,
      "learning_rate": 4.708015365504947e-07,
      "loss": 0.0272,
      "step": 1159
    },
    {
      "epoch": 5.178571428571429,
      "grad_norm": 0.6355034112930298,
      "learning_rate": 4.658163231582916e-07,
      "loss": 0.0195,
      "step": 1160
    },
    {
      "epoch": 5.183035714285714,
      "grad_norm": 0.7550672888755798,
      "learning_rate": 4.6085641109073313e-07,
      "loss": 0.0243,
      "step": 1161
    },
    {
      "epoch": 5.1875,
      "grad_norm": 0.8513064980506897,
      "learning_rate": 4.559218280217121e-07,
      "loss": 0.0268,
      "step": 1162
    },
    {
      "epoch": 5.191964285714286,
      "grad_norm": 0.6934846639633179,
      "learning_rate": 4.5101260148379735e-07,
      "loss": 0.0215,
      "step": 1163
    },
    {
      "epoch": 5.196428571428571,
      "grad_norm": 0.8758344054222107,
      "learning_rate": 4.461287588680783e-07,
      "loss": 0.0304,
      "step": 1164
    },
    {
      "epoch": 5.200892857142857,
      "grad_norm": 0.7218351364135742,
      "learning_rate": 4.4127032742401697e-07,
      "loss": 0.0252,
      "step": 1165
    },
    {
      "epoch": 5.205357142857143,
      "grad_norm": 0.7134067416191101,
      "learning_rate": 4.364373342592935e-07,
      "loss": 0.0256,
      "step": 1166
    },
    {
      "epoch": 5.209821428571429,
      "grad_norm": 0.7380569577217102,
      "learning_rate": 4.316298063396534e-07,
      "loss": 0.0213,
      "step": 1167
    },
    {
      "epoch": 5.214285714285714,
      "grad_norm": 0.7990237474441528,
      "learning_rate": 4.2684777048875913e-07,
      "loss": 0.0301,
      "step": 1168
    },
    {
      "epoch": 5.21875,
      "grad_norm": 0.8833650946617126,
      "learning_rate": 4.2209125338804007e-07,
      "loss": 0.0273,
      "step": 1169
    },
    {
      "epoch": 5.223214285714286,
      "grad_norm": 0.7409700155258179,
      "learning_rate": 4.173602815765447e-07,
      "loss": 0.0245,
      "step": 1170
    },
    {
      "epoch": 5.227678571428571,
      "grad_norm": 0.7785325050354004,
      "learning_rate": 4.126548814507876e-07,
      "loss": 0.0209,
      "step": 1171
    },
    {
      "epoch": 5.232142857142857,
      "grad_norm": 0.7405259609222412,
      "learning_rate": 4.079750792646085e-07,
      "loss": 0.0229,
      "step": 1172
    },
    {
      "epoch": 5.236607142857143,
      "grad_norm": 0.865561306476593,
      "learning_rate": 4.0332090112902294e-07,
      "loss": 0.0283,
      "step": 1173
    },
    {
      "epoch": 5.241071428571429,
      "grad_norm": 0.7885621786117554,
      "learning_rate": 3.98692373012076e-07,
      "loss": 0.0255,
      "step": 1174
    },
    {
      "epoch": 5.245535714285714,
      "grad_norm": 0.6892197728157043,
      "learning_rate": 3.940895207387007e-07,
      "loss": 0.021,
      "step": 1175
    },
    {
      "epoch": 5.25,
      "grad_norm": 0.819604754447937,
      "learning_rate": 3.89512369990565e-07,
      "loss": 0.0255,
      "step": 1176
    },
    {
      "epoch": 5.254464285714286,
      "grad_norm": 0.6747646331787109,
      "learning_rate": 3.849609463059437e-07,
      "loss": 0.02,
      "step": 1177
    },
    {
      "epoch": 5.258928571428571,
      "grad_norm": 0.7342539429664612,
      "learning_rate": 3.8043527507955926e-07,
      "loss": 0.0241,
      "step": 1178
    },
    {
      "epoch": 5.263392857142857,
      "grad_norm": 0.847237765789032,
      "learning_rate": 3.759353815624526e-07,
      "loss": 0.0251,
      "step": 1179
    },
    {
      "epoch": 5.267857142857143,
      "grad_norm": 0.9026116132736206,
      "learning_rate": 3.7146129086183547e-07,
      "loss": 0.0242,
      "step": 1180
    },
    {
      "epoch": 5.272321428571429,
      "grad_norm": 0.7088863253593445,
      "learning_rate": 3.6701302794095416e-07,
      "loss": 0.0208,
      "step": 1181
    },
    {
      "epoch": 5.276785714285714,
      "grad_norm": 0.8555396199226379,
      "learning_rate": 3.625906176189484e-07,
      "loss": 0.0264,
      "step": 1182
    },
    {
      "epoch": 5.28125,
      "grad_norm": 0.8181936740875244,
      "learning_rate": 3.5819408457070893e-07,
      "loss": 0.0233,
      "step": 1183
    },
    {
      "epoch": 5.285714285714286,
      "grad_norm": 0.7480681538581848,
      "learning_rate": 3.5382345332675154e-07,
      "loss": 0.0243,
      "step": 1184
    },
    {
      "epoch": 5.290178571428571,
      "grad_norm": 0.7232365012168884,
      "learning_rate": 3.494787482730647e-07,
      "loss": 0.0199,
      "step": 1185
    },
    {
      "epoch": 5.294642857142857,
      "grad_norm": 0.7661169171333313,
      "learning_rate": 3.4515999365098667e-07,
      "loss": 0.0208,
      "step": 1186
    },
    {
      "epoch": 5.299107142857143,
      "grad_norm": 0.8898442983627319,
      "learning_rate": 3.4086721355706303e-07,
      "loss": 0.0242,
      "step": 1187
    },
    {
      "epoch": 5.303571428571429,
      "grad_norm": 0.8835738897323608,
      "learning_rate": 3.366004319429139e-07,
      "loss": 0.0301,
      "step": 1188
    },
    {
      "epoch": 5.308035714285714,
      "grad_norm": 0.7845296859741211,
      "learning_rate": 3.323596726151021e-07,
      "loss": 0.0235,
      "step": 1189
    },
    {
      "epoch": 5.3125,
      "grad_norm": 0.723148763179779,
      "learning_rate": 3.2814495923499496e-07,
      "loss": 0.0185,
      "step": 1190
    },
    {
      "epoch": 5.316964285714286,
      "grad_norm": 0.8334882855415344,
      "learning_rate": 3.239563153186429e-07,
      "loss": 0.0211,
      "step": 1191
    },
    {
      "epoch": 5.321428571428571,
      "grad_norm": 1.186741828918457,
      "learning_rate": 3.197937642366339e-07,
      "loss": 0.0291,
      "step": 1192
    },
    {
      "epoch": 5.325892857142857,
      "grad_norm": 0.7373731136322021,
      "learning_rate": 3.1565732921397583e-07,
      "loss": 0.0232,
      "step": 1193
    },
    {
      "epoch": 5.330357142857143,
      "grad_norm": 0.7730535268783569,
      "learning_rate": 3.115470333299593e-07,
      "loss": 0.0247,
      "step": 1194
    },
    {
      "epoch": 5.334821428571429,
      "grad_norm": 0.7440225481987,
      "learning_rate": 3.0746289951803197e-07,
      "loss": 0.0231,
      "step": 1195
    },
    {
      "epoch": 5.339285714285714,
      "grad_norm": 0.8436987996101379,
      "learning_rate": 3.03404950565671e-07,
      "loss": 0.0259,
      "step": 1196
    },
    {
      "epoch": 5.34375,
      "grad_norm": 0.8651729822158813,
      "learning_rate": 2.9937320911425226e-07,
      "loss": 0.024,
      "step": 1197
    },
    {
      "epoch": 5.34375,
      "eval_loss": 1.1776899099349976,
      "eval_runtime": 3.1142,
      "eval_samples_per_second": 19.266,
      "eval_steps_per_second": 1.284,
      "step": 1197
    },
    {
      "epoch": 5.348214285714286,
      "grad_norm": 0.7915188074111938,
      "learning_rate": 2.953676976589278e-07,
      "loss": 0.0217,
      "step": 1198
    },
    {
      "epoch": 5.352678571428571,
      "grad_norm": 0.7155839204788208,
      "learning_rate": 2.9138843854849964e-07,
      "loss": 0.0218,
      "step": 1199
    },
    {
      "epoch": 5.357142857142857,
      "grad_norm": 0.7552268505096436,
      "learning_rate": 2.8743545398529436e-07,
      "loss": 0.0205,
      "step": 1200
    },
    {
      "epoch": 5.361607142857143,
      "grad_norm": 0.7332569360733032,
      "learning_rate": 2.8350876602503893e-07,
      "loss": 0.0242,
      "step": 1201
    },
    {
      "epoch": 5.366071428571429,
      "grad_norm": 0.7247368693351746,
      "learning_rate": 2.7960839657673844e-07,
      "loss": 0.0247,
      "step": 1202
    },
    {
      "epoch": 5.370535714285714,
      "grad_norm": 0.7671637535095215,
      "learning_rate": 2.7573436740255337e-07,
      "loss": 0.0262,
      "step": 1203
    },
    {
      "epoch": 5.375,
      "grad_norm": 0.8328778147697449,
      "learning_rate": 2.7188670011767715e-07,
      "loss": 0.0294,
      "step": 1204
    },
    {
      "epoch": 5.379464285714286,
      "grad_norm": 0.6904054284095764,
      "learning_rate": 2.680654161902189e-07,
      "loss": 0.0207,
      "step": 1205
    },
    {
      "epoch": 5.383928571428571,
      "grad_norm": 0.9550811648368835,
      "learning_rate": 2.6427053694107966e-07,
      "loss": 0.0179,
      "step": 1206
    },
    {
      "epoch": 5.388392857142857,
      "grad_norm": 0.6718007922172546,
      "learning_rate": 2.605020835438375e-07,
      "loss": 0.0201,
      "step": 1207
    },
    {
      "epoch": 5.392857142857143,
      "grad_norm": 0.6450316905975342,
      "learning_rate": 2.567600770246237e-07,
      "loss": 0.0208,
      "step": 1208
    },
    {
      "epoch": 5.397321428571429,
      "grad_norm": 0.669978678226471,
      "learning_rate": 2.5304453826201084e-07,
      "loss": 0.0205,
      "step": 1209
    },
    {
      "epoch": 5.401785714285714,
      "grad_norm": 0.7878643870353699,
      "learning_rate": 2.493554879868958e-07,
      "loss": 0.0211,
      "step": 1210
    },
    {
      "epoch": 5.40625,
      "grad_norm": 0.7882503271102905,
      "learning_rate": 2.4569294678237995e-07,
      "loss": 0.0258,
      "step": 1211
    },
    {
      "epoch": 5.410714285714286,
      "grad_norm": 0.6949124932289124,
      "learning_rate": 2.42056935083658e-07,
      "loss": 0.02,
      "step": 1212
    },
    {
      "epoch": 5.415178571428571,
      "grad_norm": 0.7191565632820129,
      "learning_rate": 2.38447473177903e-07,
      "loss": 0.0187,
      "step": 1213
    },
    {
      "epoch": 5.419642857142857,
      "grad_norm": 0.8001049757003784,
      "learning_rate": 2.3486458120415415e-07,
      "loss": 0.0228,
      "step": 1214
    },
    {
      "epoch": 5.424107142857143,
      "grad_norm": 0.6764441728591919,
      "learning_rate": 2.3130827915320015e-07,
      "loss": 0.0186,
      "step": 1215
    },
    {
      "epoch": 5.428571428571429,
      "grad_norm": 0.7706072330474854,
      "learning_rate": 2.2777858686747495e-07,
      "loss": 0.0268,
      "step": 1216
    },
    {
      "epoch": 5.433035714285714,
      "grad_norm": 0.6611673831939697,
      "learning_rate": 2.242755240409399e-07,
      "loss": 0.0196,
      "step": 1217
    },
    {
      "epoch": 5.4375,
      "grad_norm": 0.7625905275344849,
      "learning_rate": 2.20799110218979e-07,
      "loss": 0.0249,
      "step": 1218
    },
    {
      "epoch": 5.441964285714286,
      "grad_norm": 0.8830248713493347,
      "learning_rate": 2.173493647982873e-07,
      "loss": 0.0228,
      "step": 1219
    },
    {
      "epoch": 5.446428571428571,
      "grad_norm": 0.8279975652694702,
      "learning_rate": 2.139263070267605e-07,
      "loss": 0.0256,
      "step": 1220
    },
    {
      "epoch": 5.450892857142857,
      "grad_norm": 0.7940500378608704,
      "learning_rate": 2.105299560033954e-07,
      "loss": 0.0241,
      "step": 1221
    },
    {
      "epoch": 5.455357142857143,
      "grad_norm": 0.6114321947097778,
      "learning_rate": 2.0716033067817308e-07,
      "loss": 0.0211,
      "step": 1222
    },
    {
      "epoch": 5.459821428571429,
      "grad_norm": 0.724319577217102,
      "learning_rate": 2.03817449851962e-07,
      "loss": 0.0202,
      "step": 1223
    },
    {
      "epoch": 5.464285714285714,
      "grad_norm": 0.6841122508049011,
      "learning_rate": 2.0050133217640672e-07,
      "loss": 0.0227,
      "step": 1224
    },
    {
      "epoch": 5.46875,
      "grad_norm": 0.6772453784942627,
      "learning_rate": 1.972119961538277e-07,
      "loss": 0.0237,
      "step": 1225
    },
    {
      "epoch": 5.473214285714286,
      "grad_norm": 0.7682874798774719,
      "learning_rate": 1.9394946013711787e-07,
      "loss": 0.03,
      "step": 1226
    },
    {
      "epoch": 5.477678571428571,
      "grad_norm": 0.8899271488189697,
      "learning_rate": 1.9071374232963564e-07,
      "loss": 0.0255,
      "step": 1227
    },
    {
      "epoch": 5.482142857142857,
      "grad_norm": 0.759722888469696,
      "learning_rate": 1.8750486078511206e-07,
      "loss": 0.0266,
      "step": 1228
    },
    {
      "epoch": 5.486607142857143,
      "grad_norm": 0.8016492128372192,
      "learning_rate": 1.8432283340754e-07,
      "loss": 0.0246,
      "step": 1229
    },
    {
      "epoch": 5.491071428571429,
      "grad_norm": 0.7532429695129395,
      "learning_rate": 1.8116767795108175e-07,
      "loss": 0.0245,
      "step": 1230
    },
    {
      "epoch": 5.495535714285714,
      "grad_norm": 0.7854920029640198,
      "learning_rate": 1.780394120199672e-07,
      "loss": 0.0241,
      "step": 1231
    },
    {
      "epoch": 5.5,
      "grad_norm": 0.7643800377845764,
      "learning_rate": 1.7493805306839532e-07,
      "loss": 0.0297,
      "step": 1232
    },
    {
      "epoch": 5.504464285714286,
      "grad_norm": 0.7853821516036987,
      "learning_rate": 1.718636184004378e-07,
      "loss": 0.0245,
      "step": 1233
    },
    {
      "epoch": 5.508928571428571,
      "grad_norm": 0.7025668025016785,
      "learning_rate": 1.688161251699405e-07,
      "loss": 0.0209,
      "step": 1234
    },
    {
      "epoch": 5.513392857142857,
      "grad_norm": 0.7605273723602295,
      "learning_rate": 1.6579559038043186e-07,
      "loss": 0.024,
      "step": 1235
    },
    {
      "epoch": 5.517857142857143,
      "grad_norm": 0.6690748333930969,
      "learning_rate": 1.6280203088502275e-07,
      "loss": 0.0214,
      "step": 1236
    },
    {
      "epoch": 5.522321428571429,
      "grad_norm": 0.6744450926780701,
      "learning_rate": 1.5983546338631578e-07,
      "loss": 0.0194,
      "step": 1237
    },
    {
      "epoch": 5.526785714285714,
      "grad_norm": 0.6696710586547852,
      "learning_rate": 1.5689590443631178e-07,
      "loss": 0.0216,
      "step": 1238
    },
    {
      "epoch": 5.53125,
      "grad_norm": 0.9168437123298645,
      "learning_rate": 1.5398337043631723e-07,
      "loss": 0.0214,
      "step": 1239
    },
    {
      "epoch": 5.535714285714286,
      "grad_norm": 0.7635628581047058,
      "learning_rate": 1.5109787763685323e-07,
      "loss": 0.0251,
      "step": 1240
    },
    {
      "epoch": 5.540178571428571,
      "grad_norm": 0.6954474449157715,
      "learning_rate": 1.4823944213756056e-07,
      "loss": 0.0218,
      "step": 1241
    },
    {
      "epoch": 5.544642857142857,
      "grad_norm": 0.7508897185325623,
      "learning_rate": 1.4540807988711857e-07,
      "loss": 0.0231,
      "step": 1242
    },
    {
      "epoch": 5.549107142857143,
      "grad_norm": 0.8188397288322449,
      "learning_rate": 1.4260380668314764e-07,
      "loss": 0.0205,
      "step": 1243
    },
    {
      "epoch": 5.553571428571429,
      "grad_norm": 0.7487647533416748,
      "learning_rate": 1.3982663817212475e-07,
      "loss": 0.025,
      "step": 1244
    },
    {
      "epoch": 5.558035714285714,
      "grad_norm": 0.7735338807106018,
      "learning_rate": 1.3707658984929738e-07,
      "loss": 0.0228,
      "step": 1245
    },
    {
      "epoch": 5.5625,
      "grad_norm": 0.8641656041145325,
      "learning_rate": 1.3435367705859475e-07,
      "loss": 0.0266,
      "step": 1246
    },
    {
      "epoch": 5.566964285714286,
      "grad_norm": 0.7950751781463623,
      "learning_rate": 1.3165791499254294e-07,
      "loss": 0.0269,
      "step": 1247
    },
    {
      "epoch": 5.571428571428571,
      "grad_norm": 0.9540092945098877,
      "learning_rate": 1.2898931869218046e-07,
      "loss": 0.0239,
      "step": 1248
    },
    {
      "epoch": 5.575892857142857,
      "grad_norm": 0.8364283442497253,
      "learning_rate": 1.2634790304697283e-07,
      "loss": 0.0247,
      "step": 1249
    },
    {
      "epoch": 5.580357142857143,
      "grad_norm": 0.7901514768600464,
      "learning_rate": 1.2373368279473364e-07,
      "loss": 0.0228,
      "step": 1250
    },
    {
      "epoch": 5.584821428571429,
      "grad_norm": 0.8159101605415344,
      "learning_rate": 1.2114667252153644e-07,
      "loss": 0.0263,
      "step": 1251
    },
    {
      "epoch": 5.589285714285714,
      "grad_norm": 0.6788504719734192,
      "learning_rate": 1.1858688666163752e-07,
      "loss": 0.0186,
      "step": 1252
    },
    {
      "epoch": 5.59375,
      "grad_norm": 0.8900226354598999,
      "learning_rate": 1.1605433949739546e-07,
      "loss": 0.0286,
      "step": 1253
    },
    {
      "epoch": 5.598214285714286,
      "grad_norm": 0.7049673795700073,
      "learning_rate": 1.1354904515918834e-07,
      "loss": 0.0244,
      "step": 1254
    },
    {
      "epoch": 5.598214285714286,
      "eval_loss": 1.183043122291565,
      "eval_runtime": 3.1011,
      "eval_samples_per_second": 19.348,
      "eval_steps_per_second": 1.29,
      "step": 1254
    },
    {
      "epoch": 5.602678571428571,
      "grad_norm": 0.7123156785964966,
      "learning_rate": 1.1107101762533725e-07,
      "loss": 0.0236,
      "step": 1255
    },
    {
      "epoch": 5.607142857142857,
      "grad_norm": 0.6948925256729126,
      "learning_rate": 1.0862027072202796e-07,
      "loss": 0.02,
      "step": 1256
    },
    {
      "epoch": 5.611607142857143,
      "grad_norm": 0.8849415183067322,
      "learning_rate": 1.0619681812323437e-07,
      "loss": 0.0278,
      "step": 1257
    },
    {
      "epoch": 5.616071428571429,
      "grad_norm": 0.7653726935386658,
      "learning_rate": 1.0380067335064019e-07,
      "loss": 0.0256,
      "step": 1258
    },
    {
      "epoch": 5.620535714285714,
      "grad_norm": 0.7977951765060425,
      "learning_rate": 1.0143184977356513e-07,
      "loss": 0.0227,
      "step": 1259
    },
    {
      "epoch": 5.625,
      "grad_norm": 0.7583563327789307,
      "learning_rate": 9.909036060889063e-08,
      "loss": 0.0237,
      "step": 1260
    },
    {
      "epoch": 5.629464285714286,
      "grad_norm": 0.7033082842826843,
      "learning_rate": 9.677621892098471e-08,
      "loss": 0.0215,
      "step": 1261
    },
    {
      "epoch": 5.633928571428571,
      "grad_norm": 0.7182819843292236,
      "learning_rate": 9.448943762163063e-08,
      "loss": 0.0225,
      "step": 1262
    },
    {
      "epoch": 5.638392857142857,
      "grad_norm": 0.7455835938453674,
      "learning_rate": 9.223002946995223e-08,
      "loss": 0.0232,
      "step": 1263
    },
    {
      "epoch": 5.642857142857143,
      "grad_norm": 0.7556766271591187,
      "learning_rate": 8.999800707234651e-08,
      "loss": 0.0216,
      "step": 1264
    },
    {
      "epoch": 5.647321428571429,
      "grad_norm": 0.7347747087478638,
      "learning_rate": 8.77933828824112e-08,
      "loss": 0.0192,
      "step": 1265
    },
    {
      "epoch": 5.651785714285714,
      "grad_norm": 0.6410439610481262,
      "learning_rate": 8.561616920087338e-08,
      "loss": 0.0171,
      "step": 1266
    },
    {
      "epoch": 5.65625,
      "grad_norm": 0.8440278768539429,
      "learning_rate": 8.346637817552435e-08,
      "loss": 0.0298,
      "step": 1267
    },
    {
      "epoch": 5.660714285714286,
      "grad_norm": 0.8332878351211548,
      "learning_rate": 8.134402180115097e-08,
      "loss": 0.0267,
      "step": 1268
    },
    {
      "epoch": 5.665178571428571,
      "grad_norm": 0.7442850470542908,
      "learning_rate": 7.924911191946728e-08,
      "loss": 0.0291,
      "step": 1269
    },
    {
      "epoch": 5.669642857142857,
      "grad_norm": 0.7406393885612488,
      "learning_rate": 7.718166021904903e-08,
      "loss": 0.0234,
      "step": 1270
    },
    {
      "epoch": 5.674107142857143,
      "grad_norm": 0.7578692436218262,
      "learning_rate": 7.514167823526817e-08,
      "loss": 0.0273,
      "step": 1271
    },
    {
      "epoch": 5.678571428571429,
      "grad_norm": 0.6509707570075989,
      "learning_rate": 7.31291773502313e-08,
      "loss": 0.0193,
      "step": 1272
    },
    {
      "epoch": 5.683035714285714,
      "grad_norm": 0.7130553126335144,
      "learning_rate": 7.114416879271188e-08,
      "loss": 0.0223,
      "step": 1273
    },
    {
      "epoch": 5.6875,
      "grad_norm": 0.6997455954551697,
      "learning_rate": 6.918666363808976e-08,
      "loss": 0.0196,
      "step": 1274
    },
    {
      "epoch": 5.691964285714286,
      "grad_norm": 0.7897157073020935,
      "learning_rate": 6.725667280828959e-08,
      "loss": 0.0238,
      "step": 1275
    },
    {
      "epoch": 5.696428571428571,
      "grad_norm": 0.7304858565330505,
      "learning_rate": 6.535420707172025e-08,
      "loss": 0.0248,
      "step": 1276
    },
    {
      "epoch": 5.700892857142857,
      "grad_norm": 0.6769036650657654,
      "learning_rate": 6.347927704321335e-08,
      "loss": 0.0193,
      "step": 1277
    },
    {
      "epoch": 5.705357142857143,
      "grad_norm": 0.713193416595459,
      "learning_rate": 6.163189318396261e-08,
      "loss": 0.0222,
      "step": 1278
    },
    {
      "epoch": 5.709821428571429,
      "grad_norm": 0.798915684223175,
      "learning_rate": 5.981206580147232e-08,
      "loss": 0.0202,
      "step": 1279
    },
    {
      "epoch": 5.714285714285714,
      "grad_norm": 0.865748941898346,
      "learning_rate": 5.8019805049490143e-08,
      "loss": 0.0263,
      "step": 1280
    },
    {
      "epoch": 5.71875,
      "grad_norm": 0.7367980480194092,
      "learning_rate": 5.625512092795828e-08,
      "loss": 0.0193,
      "step": 1281
    },
    {
      "epoch": 5.723214285714286,
      "grad_norm": 0.6353578567504883,
      "learning_rate": 5.451802328295408e-08,
      "loss": 0.0201,
      "step": 1282
    },
    {
      "epoch": 5.727678571428571,
      "grad_norm": 0.8302894830703735,
      "learning_rate": 5.2808521806635646e-08,
      "loss": 0.03,
      "step": 1283
    },
    {
      "epoch": 5.732142857142857,
      "grad_norm": 0.746477484703064,
      "learning_rate": 5.1126626037188537e-08,
      "loss": 0.0216,
      "step": 1284
    },
    {
      "epoch": 5.736607142857143,
      "grad_norm": 0.655388593673706,
      "learning_rate": 4.9472345358769714e-08,
      "loss": 0.0197,
      "step": 1285
    },
    {
      "epoch": 5.741071428571429,
      "grad_norm": 1.365317702293396,
      "learning_rate": 4.784568900146095e-08,
      "loss": 0.0213,
      "step": 1286
    },
    {
      "epoch": 5.745535714285714,
      "grad_norm": 0.6960539221763611,
      "learning_rate": 4.624666604121047e-08,
      "loss": 0.019,
      "step": 1287
    },
    {
      "epoch": 5.75,
      "grad_norm": 0.6388968229293823,
      "learning_rate": 4.4675285399787523e-08,
      "loss": 0.0195,
      "step": 1288
    },
    {
      "epoch": 5.754464285714286,
      "grad_norm": 0.8190224170684814,
      "learning_rate": 4.3131555844730135e-08,
      "loss": 0.0231,
      "step": 1289
    },
    {
      "epoch": 5.758928571428571,
      "grad_norm": 0.7648060917854309,
      "learning_rate": 4.161548598929689e-08,
      "loss": 0.0219,
      "step": 1290
    },
    {
      "epoch": 5.763392857142857,
      "grad_norm": 0.6431726217269897,
      "learning_rate": 4.01270842924191e-08,
      "loss": 0.0215,
      "step": 1291
    },
    {
      "epoch": 5.767857142857143,
      "grad_norm": 0.6700044274330139,
      "learning_rate": 3.8666359058652064e-08,
      "loss": 0.0189,
      "step": 1292
    },
    {
      "epoch": 5.772321428571429,
      "grad_norm": 0.6487048268318176,
      "learning_rate": 3.7233318438130064e-08,
      "loss": 0.0211,
      "step": 1293
    },
    {
      "epoch": 5.776785714285714,
      "grad_norm": 0.7264946103096008,
      "learning_rate": 3.582797042652248e-08,
      "loss": 0.0243,
      "step": 1294
    },
    {
      "epoch": 5.78125,
      "grad_norm": 1.036159873008728,
      "learning_rate": 3.4450322864986106e-08,
      "loss": 0.0301,
      "step": 1295
    },
    {
      "epoch": 5.785714285714286,
      "grad_norm": 0.7695539593696594,
      "learning_rate": 3.310038344012184e-08,
      "loss": 0.0235,
      "step": 1296
    },
    {
      "epoch": 5.790178571428571,
      "grad_norm": 0.655296266078949,
      "learning_rate": 3.17781596839347e-08,
      "loss": 0.0183,
      "step": 1297
    },
    {
      "epoch": 5.794642857142857,
      "grad_norm": 0.8248305916786194,
      "learning_rate": 3.0483658973788894e-08,
      "loss": 0.0212,
      "step": 1298
    },
    {
      "epoch": 5.799107142857143,
      "grad_norm": 0.7949267029762268,
      "learning_rate": 2.92168885323662e-08,
      "loss": 0.0177,
      "step": 1299
    },
    {
      "epoch": 5.803571428571429,
      "grad_norm": 0.7954432368278503,
      "learning_rate": 2.797785542762927e-08,
      "loss": 0.0232,
      "step": 1300
    },
    {
      "epoch": 5.808035714285714,
      "grad_norm": 0.8179627060890198,
      "learning_rate": 2.67665665727784e-08,
      "loss": 0.026,
      "step": 1301
    },
    {
      "epoch": 5.8125,
      "grad_norm": 0.7565423846244812,
      "learning_rate": 2.5583028726215427e-08,
      "loss": 0.0193,
      "step": 1302
    },
    {
      "epoch": 5.816964285714286,
      "grad_norm": 0.9201167225837708,
      "learning_rate": 2.442724849150487e-08,
      "loss": 0.0229,
      "step": 1303
    },
    {
      "epoch": 5.821428571428571,
      "grad_norm": 0.7921238541603088,
      "learning_rate": 2.3299232317335643e-08,
      "loss": 0.024,
      "step": 1304
    },
    {
      "epoch": 5.825892857142857,
      "grad_norm": 0.8329811692237854,
      "learning_rate": 2.2198986497489963e-08,
      "loss": 0.0238,
      "step": 1305
    },
    {
      "epoch": 5.830357142857143,
      "grad_norm": 0.6987408399581909,
      "learning_rate": 2.112651717080228e-08,
      "loss": 0.0191,
      "step": 1306
    },
    {
      "epoch": 5.834821428571429,
      "grad_norm": 0.8431302309036255,
      "learning_rate": 2.0081830321129298e-08,
      "loss": 0.0251,
      "step": 1307
    },
    {
      "epoch": 5.839285714285714,
      "grad_norm": 0.7029537558555603,
      "learning_rate": 1.906493177731391e-08,
      "loss": 0.0193,
      "step": 1308
    },
    {
      "epoch": 5.84375,
      "grad_norm": 0.7598433494567871,
      "learning_rate": 1.80758272131541e-08,
      "loss": 0.0224,
      "step": 1309
    },
    {
      "epoch": 5.848214285714286,
      "grad_norm": 3.8866190910339355,
      "learning_rate": 1.711452214737187e-08,
      "loss": 0.0216,
      "step": 1310
    },
    {
      "epoch": 5.852678571428571,
      "grad_norm": 0.7216530442237854,
      "learning_rate": 1.6181021943580477e-08,
      "loss": 0.0219,
      "step": 1311
    },
    {
      "epoch": 5.852678571428571,
      "eval_loss": 1.1876667737960815,
      "eval_runtime": 3.1035,
      "eval_samples_per_second": 19.333,
      "eval_steps_per_second": 1.289,
      "step": 1311
    },
    {
      "epoch": 5.857142857142857,
      "grad_norm": 0.7401444911956787,
      "learning_rate": 1.5275331810256708e-08,
      "loss": 0.0217,
      "step": 1312
    },
    {
      "epoch": 5.861607142857143,
      "grad_norm": 0.7781946659088135,
      "learning_rate": 1.439745680070921e-08,
      "loss": 0.0263,
      "step": 1313
    },
    {
      "epoch": 5.866071428571429,
      "grad_norm": 0.6831724047660828,
      "learning_rate": 1.3547401813053533e-08,
      "loss": 0.0232,
      "step": 1314
    },
    {
      "epoch": 5.870535714285714,
      "grad_norm": 0.7941478490829468,
      "learning_rate": 1.2725171590181043e-08,
      "loss": 0.0196,
      "step": 1315
    },
    {
      "epoch": 5.875,
      "grad_norm": 0.7862687706947327,
      "learning_rate": 1.1930770719736715e-08,
      "loss": 0.0258,
      "step": 1316
    },
    {
      "epoch": 5.879464285714286,
      "grad_norm": 0.5605817437171936,
      "learning_rate": 1.1164203634089718e-08,
      "loss": 0.0164,
      "step": 1317
    },
    {
      "epoch": 5.883928571428571,
      "grad_norm": 0.7842698097229004,
      "learning_rate": 1.0425474610310654e-08,
      "loss": 0.0265,
      "step": 1318
    },
    {
      "epoch": 5.888392857142857,
      "grad_norm": 0.6489875912666321,
      "learning_rate": 9.714587770147148e-09,
      "loss": 0.0196,
      "step": 1319
    },
    {
      "epoch": 5.892857142857143,
      "grad_norm": 0.6310600638389587,
      "learning_rate": 9.031547080002185e-09,
      "loss": 0.0188,
      "step": 1320
    },
    {
      "epoch": 5.897321428571429,
      "grad_norm": 0.6858588457107544,
      "learning_rate": 8.376356350909694e-09,
      "loss": 0.0211,
      "step": 1321
    },
    {
      "epoch": 5.901785714285714,
      "grad_norm": 0.8310582637786865,
      "learning_rate": 7.749019238513461e-09,
      "loss": 0.0305,
      "step": 1322
    },
    {
      "epoch": 5.90625,
      "grad_norm": 0.8480810523033142,
      "learning_rate": 7.149539243050468e-09,
      "loss": 0.0241,
      "step": 1323
    },
    {
      "epoch": 5.910714285714286,
      "grad_norm": 0.7383941411972046,
      "learning_rate": 6.577919709325367e-09,
      "loss": 0.0216,
      "step": 1324
    },
    {
      "epoch": 5.915178571428571,
      "grad_norm": 0.6655860543251038,
      "learning_rate": 6.034163826697711e-09,
      "loss": 0.0204,
      "step": 1325
    },
    {
      "epoch": 5.919642857142857,
      "grad_norm": 0.7747806906700134,
      "learning_rate": 5.518274629059199e-09,
      "loss": 0.0229,
      "step": 1326
    },
    {
      "epoch": 5.924107142857143,
      "grad_norm": 0.7398366928100586,
      "learning_rate": 5.030254994820907e-09,
      "loss": 0.0228,
      "step": 1327
    },
    {
      "epoch": 5.928571428571429,
      "grad_norm": 0.7458826303482056,
      "learning_rate": 4.570107646894414e-09,
      "loss": 0.023,
      "step": 1328
    },
    {
      "epoch": 5.933035714285714,
      "grad_norm": 0.7212875485420227,
      "learning_rate": 4.137835152677938e-09,
      "loss": 0.02,
      "step": 1329
    },
    {
      "epoch": 5.9375,
      "grad_norm": 0.714472770690918,
      "learning_rate": 3.7334399240402185e-09,
      "loss": 0.0203,
      "step": 1330
    },
    {
      "epoch": 5.941964285714286,
      "grad_norm": 0.7454721331596375,
      "learning_rate": 3.356924217310546e-09,
      "loss": 0.0264,
      "step": 1331
    },
    {
      "epoch": 5.946428571428571,
      "grad_norm": 0.6192952990531921,
      "learning_rate": 3.008290133262653e-09,
      "loss": 0.0172,
      "step": 1332
    },
    {
      "epoch": 5.950892857142857,
      "grad_norm": 0.6783339381217957,
      "learning_rate": 2.687539617105282e-09,
      "loss": 0.0186,
      "step": 1333
    },
    {
      "epoch": 5.955357142857143,
      "grad_norm": 0.8283798098564148,
      "learning_rate": 2.3946744584688626e-09,
      "loss": 0.0252,
      "step": 1334
    },
    {
      "epoch": 5.959821428571429,
      "grad_norm": 0.5973673462867737,
      "learning_rate": 2.1296962913994105e-09,
      "loss": 0.0181,
      "step": 1335
    },
    {
      "epoch": 5.964285714285714,
      "grad_norm": 0.6673908233642578,
      "learning_rate": 1.892606594345199e-09,
      "loss": 0.0192,
      "step": 1336
    },
    {
      "epoch": 5.96875,
      "grad_norm": 0.7677484154701233,
      "learning_rate": 1.6834066901512136e-09,
      "loss": 0.0248,
      "step": 1337
    },
    {
      "epoch": 5.973214285714286,
      "grad_norm": 0.6924158334732056,
      "learning_rate": 1.5020977460513809e-09,
      "loss": 0.023,
      "step": 1338
    },
    {
      "epoch": 5.977678571428571,
      "grad_norm": 0.6726967096328735,
      "learning_rate": 1.3486807736613498e-09,
      "loss": 0.0192,
      "step": 1339
    },
    {
      "epoch": 5.982142857142857,
      "grad_norm": 0.6165555715560913,
      "learning_rate": 1.2231566289723888e-09,
      "loss": 0.0163,
      "step": 1340
    },
    {
      "epoch": 5.986607142857143,
      "grad_norm": 0.6672031283378601,
      "learning_rate": 1.1255260123486095e-09,
      "loss": 0.022,
      "step": 1341
    },
    {
      "epoch": 5.991071428571429,
      "grad_norm": 0.8877815008163452,
      "learning_rate": 1.0557894685208617e-09,
      "loss": 0.0245,
      "step": 1342
    },
    {
      "epoch": 5.995535714285714,
      "grad_norm": 0.7125616073608398,
      "learning_rate": 1.013947386585067e-09,
      "loss": 0.025,
      "step": 1343
    },
    {
      "epoch": 6.0,
      "grad_norm": 0.6855891942977905,
      "learning_rate": 1e-09,
      "loss": 0.0186,
      "step": 1344
    },
    {
      "epoch": 6.0,
      "step": 1344,
      "total_flos": 2.1132791208917074e+18,
      "train_loss": 0.19653707026043862,
      "train_runtime": 12053.73,
      "train_samples_per_second": 1.784,
      "train_steps_per_second": 0.112
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 1344,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 57,
  "total_flos": 2.1132791208917074e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}