{ "best_metric": 0.22853875160217285, "best_model_checkpoint": "./w2v-bert-2.0-hausa_579_450h/checkpoint-19000", "epoch": 7.337205747477835, "eval_steps": 1000, "global_step": 24000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003057169061449098, "grad_norm": 20.83271598815918, "learning_rate": 3.821169277799007e-09, "loss": 8.2851, "step": 1 }, { "epoch": 0.0006114338122898196, "grad_norm": 20.908430099487305, "learning_rate": 7.642338555598014e-09, "loss": 8.1164, "step": 2 }, { "epoch": 0.0009171507184347294, "grad_norm": 21.901620864868164, "learning_rate": 1.146350783339702e-08, "loss": 8.3153, "step": 3 }, { "epoch": 0.0012228676245796392, "grad_norm": 21.675174713134766, "learning_rate": 1.5284677111196027e-08, "loss": 8.208, "step": 4 }, { "epoch": 0.001528584530724549, "grad_norm": 21.793302536010742, "learning_rate": 1.9105846388995033e-08, "loss": 8.2758, "step": 5 }, { "epoch": 0.0018343014368694588, "grad_norm": 22.063997268676758, "learning_rate": 2.292701566679404e-08, "loss": 8.2566, "step": 6 }, { "epoch": 0.0021400183430143687, "grad_norm": 21.652164459228516, "learning_rate": 2.6748184944593046e-08, "loss": 8.1169, "step": 7 }, { "epoch": 0.0024457352491592784, "grad_norm": 21.013320922851562, "learning_rate": 3.0569354222392055e-08, "loss": 7.9707, "step": 8 }, { "epoch": 0.0027514521553041885, "grad_norm": 21.321041107177734, "learning_rate": 3.439052350019106e-08, "loss": 8.0038, "step": 9 }, { "epoch": 0.003057169061449098, "grad_norm": 21.157644271850586, "learning_rate": 3.821169277799007e-08, "loss": 7.963, "step": 10 }, { "epoch": 0.003362885967594008, "grad_norm": 20.705812454223633, "learning_rate": 4.203286205578907e-08, "loss": 7.9018, "step": 11 }, { "epoch": 0.0036686028737389176, "grad_norm": 20.542030334472656, "learning_rate": 4.585403133358808e-08, "loss": 7.795, "step": 12 }, { "epoch": 0.003974319779883827, "grad_norm": 20.973297119140625, "learning_rate": 4.967520061138709e-08, "loss": 7.87, "step": 13 }, { "epoch": 0.004280036686028737, "grad_norm": 20.95349884033203, "learning_rate": 5.349636988918609e-08, "loss": 7.8682, "step": 14 }, { "epoch": 0.0045857535921736475, "grad_norm": 20.954387664794922, "learning_rate": 5.73175391669851e-08, "loss": 7.8531, "step": 15 }, { "epoch": 0.004891470498318557, "grad_norm": 20.018869400024414, "learning_rate": 6.113870844478411e-08, "loss": 7.6589, "step": 16 }, { "epoch": 0.005197187404463467, "grad_norm": 20.128570556640625, "learning_rate": 6.49598777225831e-08, "loss": 7.6932, "step": 17 }, { "epoch": 0.005502904310608377, "grad_norm": 19.92058753967285, "learning_rate": 6.878104700038212e-08, "loss": 7.5662, "step": 18 }, { "epoch": 0.005808621216753286, "grad_norm": 19.663522720336914, "learning_rate": 7.260221627818112e-08, "loss": 7.5336, "step": 19 }, { "epoch": 0.006114338122898196, "grad_norm": 18.939983367919922, "learning_rate": 7.642338555598013e-08, "loss": 7.3143, "step": 20 }, { "epoch": 0.0064200550290431065, "grad_norm": 19.282129287719727, "learning_rate": 8.024455483377913e-08, "loss": 7.3474, "step": 21 }, { "epoch": 0.006725771935188016, "grad_norm": 19.209104537963867, "learning_rate": 8.406572411157814e-08, "loss": 7.2938, "step": 22 }, { "epoch": 0.007031488841332926, "grad_norm": 18.60006332397461, "learning_rate": 8.788689338937715e-08, "loss": 7.1286, "step": 23 }, { "epoch": 0.007337205747477835, "grad_norm": 18.60341453552246, "learning_rate": 9.170806266717616e-08, "loss": 7.0891, "step": 24 }, { "epoch": 0.007642922653622745, "grad_norm": 17.593242645263672, "learning_rate": 9.552923194497515e-08, "loss": 6.7028, "step": 25 }, { "epoch": 0.007948639559767655, "grad_norm": 21.129972457885742, "learning_rate": 9.935040122277418e-08, "loss": 8.3106, "step": 26 }, { "epoch": 0.008254356465912565, "grad_norm": 21.232126235961914, "learning_rate": 1.0317157050057317e-07, "loss": 8.1089, "step": 27 }, { "epoch": 0.008560073372057475, "grad_norm": 21.971832275390625, "learning_rate": 1.0699273977837218e-07, "loss": 8.2139, "step": 28 }, { "epoch": 0.008865790278202385, "grad_norm": 21.493450164794922, "learning_rate": 1.1081390905617119e-07, "loss": 8.0371, "step": 29 }, { "epoch": 0.009171507184347295, "grad_norm": 21.966949462890625, "learning_rate": 1.146350783339702e-07, "loss": 8.1334, "step": 30 }, { "epoch": 0.009477224090492203, "grad_norm": 22.135982513427734, "learning_rate": 1.184562476117692e-07, "loss": 8.1462, "step": 31 }, { "epoch": 0.009782940996637114, "grad_norm": 21.35399627685547, "learning_rate": 1.2227741688956822e-07, "loss": 7.9279, "step": 32 }, { "epoch": 0.010088657902782024, "grad_norm": 21.780601501464844, "learning_rate": 1.260985861673672e-07, "loss": 7.9846, "step": 33 }, { "epoch": 0.010394374808926934, "grad_norm": 21.5991268157959, "learning_rate": 1.299197554451662e-07, "loss": 7.9327, "step": 34 }, { "epoch": 0.010700091715071844, "grad_norm": 22.09117889404297, "learning_rate": 1.3374092472296525e-07, "loss": 7.9989, "step": 35 }, { "epoch": 0.011005808621216754, "grad_norm": 21.179874420166016, "learning_rate": 1.3756209400076423e-07, "loss": 7.822, "step": 36 }, { "epoch": 0.011311525527361662, "grad_norm": 21.744579315185547, "learning_rate": 1.4138326327856324e-07, "loss": 7.9289, "step": 37 }, { "epoch": 0.011617242433506572, "grad_norm": 21.675596237182617, "learning_rate": 1.4520443255636225e-07, "loss": 7.8422, "step": 38 }, { "epoch": 0.011922959339651483, "grad_norm": 20.448606491088867, "learning_rate": 1.4902560183416126e-07, "loss": 7.5889, "step": 39 }, { "epoch": 0.012228676245796393, "grad_norm": 21.15997886657715, "learning_rate": 1.5284677111196027e-07, "loss": 7.712, "step": 40 }, { "epoch": 0.012534393151941303, "grad_norm": 20.340505599975586, "learning_rate": 1.5666794038975928e-07, "loss": 7.5513, "step": 41 }, { "epoch": 0.012840110058086213, "grad_norm": 20.458667755126953, "learning_rate": 1.6048910966755826e-07, "loss": 7.5256, "step": 42 }, { "epoch": 0.013145826964231121, "grad_norm": 19.467540740966797, "learning_rate": 1.643102789453573e-07, "loss": 7.2767, "step": 43 }, { "epoch": 0.013451543870376031, "grad_norm": 19.30766487121582, "learning_rate": 1.6813144822315628e-07, "loss": 7.2598, "step": 44 }, { "epoch": 0.013757260776520942, "grad_norm": 20.065006256103516, "learning_rate": 1.719526175009553e-07, "loss": 7.3321, "step": 45 }, { "epoch": 0.014062977682665852, "grad_norm": 19.266765594482422, "learning_rate": 1.757737867787543e-07, "loss": 7.1171, "step": 46 }, { "epoch": 0.014368694588810762, "grad_norm": 18.72146224975586, "learning_rate": 1.795949560565533e-07, "loss": 7.0568, "step": 47 }, { "epoch": 0.01467441149495567, "grad_norm": 18.79753875732422, "learning_rate": 1.8341612533435232e-07, "loss": 7.0006, "step": 48 }, { "epoch": 0.01498012840110058, "grad_norm": 18.57426643371582, "learning_rate": 1.8723729461215133e-07, "loss": 6.8816, "step": 49 }, { "epoch": 0.01528584530724549, "grad_norm": 18.37809181213379, "learning_rate": 1.910584638899503e-07, "loss": 6.7234, "step": 50 }, { "epoch": 0.0155915622133904, "grad_norm": 22.101274490356445, "learning_rate": 1.9487963316774934e-07, "loss": 8.1972, "step": 51 }, { "epoch": 0.01589727911953531, "grad_norm": 22.215782165527344, "learning_rate": 1.9870080244554835e-07, "loss": 8.0322, "step": 52 }, { "epoch": 0.01620299602568022, "grad_norm": 22.176252365112305, "learning_rate": 2.0252197172334734e-07, "loss": 7.9079, "step": 53 }, { "epoch": 0.01650871293182513, "grad_norm": 22.701387405395508, "learning_rate": 2.0634314100114635e-07, "loss": 8.0159, "step": 54 }, { "epoch": 0.01681442983797004, "grad_norm": 22.526180267333984, "learning_rate": 2.1016431027894538e-07, "loss": 7.9018, "step": 55 }, { "epoch": 0.01712014674411495, "grad_norm": Infinity, "learning_rate": 2.1016431027894538e-07, "loss": 7.9717, "step": 56 }, { "epoch": 0.01742586365025986, "grad_norm": 22.894088745117188, "learning_rate": 2.1398547955674436e-07, "loss": 7.9217, "step": 57 }, { "epoch": 0.01773158055640477, "grad_norm": 22.222522735595703, "learning_rate": 2.1780664883454337e-07, "loss": 7.7581, "step": 58 }, { "epoch": 0.01803729746254968, "grad_norm": 22.9604434967041, "learning_rate": 2.2162781811234238e-07, "loss": 7.8329, "step": 59 }, { "epoch": 0.01834301436869459, "grad_norm": 22.268863677978516, "learning_rate": 2.254489873901414e-07, "loss": 7.7317, "step": 60 }, { "epoch": 0.0186487312748395, "grad_norm": 22.327556610107422, "learning_rate": 2.292701566679404e-07, "loss": 7.6866, "step": 61 }, { "epoch": 0.018954448180984407, "grad_norm": 21.665634155273438, "learning_rate": 2.3309132594573938e-07, "loss": 7.5398, "step": 62 }, { "epoch": 0.019260165087129317, "grad_norm": 21.92706871032715, "learning_rate": 2.369124952235384e-07, "loss": 7.552, "step": 63 }, { "epoch": 0.019565881993274227, "grad_norm": 22.52596664428711, "learning_rate": 2.4073366450133743e-07, "loss": 7.6391, "step": 64 }, { "epoch": 0.019871598899419137, "grad_norm": 22.010770797729492, "learning_rate": 2.4455483377913644e-07, "loss": 7.5744, "step": 65 }, { "epoch": 0.020177315805564047, "grad_norm": 21.1176815032959, "learning_rate": 2.4837600305693545e-07, "loss": 7.3114, "step": 66 }, { "epoch": 0.020483032711708957, "grad_norm": 20.638681411743164, "learning_rate": 2.521971723347344e-07, "loss": 7.2313, "step": 67 }, { "epoch": 0.020788749617853867, "grad_norm": 21.00934410095215, "learning_rate": 2.560183416125334e-07, "loss": 7.2519, "step": 68 }, { "epoch": 0.021094466523998778, "grad_norm": 20.51592254638672, "learning_rate": 2.598395108903324e-07, "loss": 7.1, "step": 69 }, { "epoch": 0.021400183430143688, "grad_norm": 20.412561416625977, "learning_rate": 2.636606801681315e-07, "loss": 7.0744, "step": 70 }, { "epoch": 0.021705900336288598, "grad_norm": 19.63575553894043, "learning_rate": 2.674818494459305e-07, "loss": 6.912, "step": 71 }, { "epoch": 0.022011617242433508, "grad_norm": 20.44173240661621, "learning_rate": 2.7130301872372945e-07, "loss": 7.0643, "step": 72 }, { "epoch": 0.022317334148578418, "grad_norm": 19.610523223876953, "learning_rate": 2.7512418800152846e-07, "loss": 6.7973, "step": 73 }, { "epoch": 0.022623051054723325, "grad_norm": 20.24230194091797, "learning_rate": 2.7894535727932747e-07, "loss": 6.8148, "step": 74 }, { "epoch": 0.022928767960868235, "grad_norm": 18.317285537719727, "learning_rate": 2.827665265571265e-07, "loss": 6.3542, "step": 75 }, { "epoch": 0.023234484867013145, "grad_norm": 23.29434585571289, "learning_rate": 2.865876958349255e-07, "loss": 7.8897, "step": 76 }, { "epoch": 0.023540201773158055, "grad_norm": 24.129138946533203, "learning_rate": 2.904088651127245e-07, "loss": 7.7976, "step": 77 }, { "epoch": 0.023845918679302965, "grad_norm": 23.71721076965332, "learning_rate": 2.942300343905235e-07, "loss": 7.6522, "step": 78 }, { "epoch": 0.024151635585447875, "grad_norm": 23.49587059020996, "learning_rate": 2.980512036683225e-07, "loss": 7.5838, "step": 79 }, { "epoch": 0.024457352491592785, "grad_norm": 23.677717208862305, "learning_rate": 3.018723729461215e-07, "loss": 7.5632, "step": 80 }, { "epoch": 0.024763069397737696, "grad_norm": 24.505083084106445, "learning_rate": 3.0569354222392054e-07, "loss": 7.652, "step": 81 }, { "epoch": 0.025068786303882606, "grad_norm": 23.516141891479492, "learning_rate": 3.0951471150171954e-07, "loss": 7.4479, "step": 82 }, { "epoch": 0.025374503210027516, "grad_norm": 23.419748306274414, "learning_rate": 3.1333588077951855e-07, "loss": 7.3801, "step": 83 }, { "epoch": 0.025680220116172426, "grad_norm": 24.187294006347656, "learning_rate": 3.171570500573175e-07, "loss": 7.4818, "step": 84 }, { "epoch": 0.025985937022317333, "grad_norm": 23.868162155151367, "learning_rate": 3.209782193351165e-07, "loss": 7.355, "step": 85 }, { "epoch": 0.026291653928462243, "grad_norm": 22.45135498046875, "learning_rate": 3.247993886129156e-07, "loss": 7.1132, "step": 86 }, { "epoch": 0.026597370834607153, "grad_norm": 23.71923065185547, "learning_rate": 3.286205578907146e-07, "loss": 7.3015, "step": 87 }, { "epoch": 0.026903087740752063, "grad_norm": 23.34016990661621, "learning_rate": 3.324417271685136e-07, "loss": 7.1894, "step": 88 }, { "epoch": 0.027208804646896973, "grad_norm": 22.59413719177246, "learning_rate": 3.3626289644631256e-07, "loss": 7.0706, "step": 89 }, { "epoch": 0.027514521553041883, "grad_norm": 22.837310791015625, "learning_rate": 3.4008406572411157e-07, "loss": 7.0546, "step": 90 }, { "epoch": 0.027820238459186793, "grad_norm": 22.15131187438965, "learning_rate": 3.439052350019106e-07, "loss": 6.9382, "step": 91 }, { "epoch": 0.028125955365331703, "grad_norm": 21.566699981689453, "learning_rate": 3.477264042797096e-07, "loss": 6.7568, "step": 92 }, { "epoch": 0.028431672271476614, "grad_norm": 22.444026947021484, "learning_rate": 3.515475735575086e-07, "loss": 6.9284, "step": 93 }, { "epoch": 0.028737389177621524, "grad_norm": 21.365964889526367, "learning_rate": 3.5536874283530766e-07, "loss": 6.6904, "step": 94 }, { "epoch": 0.029043106083766434, "grad_norm": 22.21048927307129, "learning_rate": 3.591899121131066e-07, "loss": 6.8102, "step": 95 }, { "epoch": 0.02934882298991134, "grad_norm": 21.176986694335938, "learning_rate": 3.630110813909056e-07, "loss": 6.5965, "step": 96 }, { "epoch": 0.02965453989605625, "grad_norm": 21.002107620239258, "learning_rate": 3.6683225066870463e-07, "loss": 6.4811, "step": 97 }, { "epoch": 0.02996025680220116, "grad_norm": 21.357481002807617, "learning_rate": 3.7065341994650364e-07, "loss": 6.579, "step": 98 }, { "epoch": 0.03026597370834607, "grad_norm": 20.7819766998291, "learning_rate": 3.7447458922430265e-07, "loss": 6.356, "step": 99 }, { "epoch": 0.03057169061449098, "grad_norm": 19.37672233581543, "learning_rate": 3.7829575850210166e-07, "loss": 5.9806, "step": 100 }, { "epoch": 0.03087740752063589, "grad_norm": 25.86378288269043, "learning_rate": 3.821169277799006e-07, "loss": 7.4979, "step": 101 }, { "epoch": 0.0311831244267808, "grad_norm": 25.07935905456543, "learning_rate": 3.859380970576997e-07, "loss": 7.1335, "step": 102 }, { "epoch": 0.03148884133292571, "grad_norm": 25.704378128051758, "learning_rate": 3.897592663354987e-07, "loss": 7.1674, "step": 103 }, { "epoch": 0.03179455823907062, "grad_norm": 25.522432327270508, "learning_rate": 3.935804356132977e-07, "loss": 7.0671, "step": 104 }, { "epoch": 0.03210027514521553, "grad_norm": 25.79082679748535, "learning_rate": 3.974016048910967e-07, "loss": 7.0784, "step": 105 }, { "epoch": 0.03240599205136044, "grad_norm": 26.136260986328125, "learning_rate": 4.0122277416889566e-07, "loss": 7.043, "step": 106 }, { "epoch": 0.03271170895750535, "grad_norm": 26.306779861450195, "learning_rate": 4.0504394344669467e-07, "loss": 7.0018, "step": 107 }, { "epoch": 0.03301742586365026, "grad_norm": 25.916648864746094, "learning_rate": 4.088651127244937e-07, "loss": 6.9363, "step": 108 }, { "epoch": 0.03332314276979517, "grad_norm": 25.371524810791016, "learning_rate": 4.126862820022927e-07, "loss": 6.823, "step": 109 }, { "epoch": 0.03362885967594008, "grad_norm": 26.303377151489258, "learning_rate": 4.1650745128009175e-07, "loss": 6.8514, "step": 110 }, { "epoch": 0.03393457658208499, "grad_norm": 25.99602508544922, "learning_rate": 4.2032862055789076e-07, "loss": 6.8182, "step": 111 }, { "epoch": 0.0342402934882299, "grad_norm": 25.688228607177734, "learning_rate": 4.241497898356897e-07, "loss": 6.7174, "step": 112 }, { "epoch": 0.034546010394374806, "grad_norm": 25.091541290283203, "learning_rate": 4.2797095911348873e-07, "loss": 6.5966, "step": 113 }, { "epoch": 0.03485172730051972, "grad_norm": 25.415782928466797, "learning_rate": 4.3179212839128774e-07, "loss": 6.6025, "step": 114 }, { "epoch": 0.035157444206664626, "grad_norm": 24.51012420654297, "learning_rate": 4.3561329766908675e-07, "loss": 6.4593, "step": 115 }, { "epoch": 0.03546316111280954, "grad_norm": 24.933269500732422, "learning_rate": 4.3943446694688576e-07, "loss": 6.4542, "step": 116 }, { "epoch": 0.035768878018954446, "grad_norm": 24.050718307495117, "learning_rate": 4.4325563622468477e-07, "loss": 6.3154, "step": 117 }, { "epoch": 0.03607459492509936, "grad_norm": 24.26476287841797, "learning_rate": 4.470768055024838e-07, "loss": 6.1785, "step": 118 }, { "epoch": 0.036380311831244266, "grad_norm": 22.763628005981445, "learning_rate": 4.508979747802828e-07, "loss": 6.0754, "step": 119 }, { "epoch": 0.03668602873738918, "grad_norm": 22.83565902709961, "learning_rate": 4.547191440580818e-07, "loss": 6.0542, "step": 120 }, { "epoch": 0.03699174564353409, "grad_norm": 22.363571166992188, "learning_rate": 4.585403133358808e-07, "loss": 5.9399, "step": 121 }, { "epoch": 0.037297462549679, "grad_norm": 23.11133575439453, "learning_rate": 4.623614826136798e-07, "loss": 5.8391, "step": 122 }, { "epoch": 0.03760317945582391, "grad_norm": 21.935550689697266, "learning_rate": 4.6618265189147877e-07, "loss": 5.7934, "step": 123 }, { "epoch": 0.037908896361968814, "grad_norm": 21.85487937927246, "learning_rate": 4.7000382116927783e-07, "loss": 5.7123, "step": 124 }, { "epoch": 0.03821461326811373, "grad_norm": 20.72574806213379, "learning_rate": 4.738249904470768e-07, "loss": 5.4847, "step": 125 }, { "epoch": 0.038520330174258634, "grad_norm": 28.811315536499023, "learning_rate": 4.776461597248758e-07, "loss": 6.6482, "step": 126 }, { "epoch": 0.03882604708040355, "grad_norm": 29.478004455566406, "learning_rate": 4.814673290026749e-07, "loss": 6.5336, "step": 127 }, { "epoch": 0.039131763986548454, "grad_norm": 29.13392448425293, "learning_rate": 4.852884982804738e-07, "loss": 6.4138, "step": 128 }, { "epoch": 0.03943748089269337, "grad_norm": 29.322067260742188, "learning_rate": 4.891096675582729e-07, "loss": 6.3758, "step": 129 }, { "epoch": 0.039743197798838274, "grad_norm": 29.335927963256836, "learning_rate": 4.929308368360718e-07, "loss": 6.2729, "step": 130 }, { "epoch": 0.04004891470498319, "grad_norm": 29.03062629699707, "learning_rate": 4.967520061138709e-07, "loss": 6.1525, "step": 131 }, { "epoch": 0.040354631611128094, "grad_norm": 29.667264938354492, "learning_rate": 5.005731753916699e-07, "loss": 6.1818, "step": 132 }, { "epoch": 0.04066034851727301, "grad_norm": 29.049808502197266, "learning_rate": 5.043943446694688e-07, "loss": 6.0194, "step": 133 }, { "epoch": 0.040966065423417915, "grad_norm": 28.47346305847168, "learning_rate": 5.082155139472679e-07, "loss": 5.9337, "step": 134 }, { "epoch": 0.04127178232956282, "grad_norm": 29.584733963012695, "learning_rate": 5.120366832250668e-07, "loss": 5.9656, "step": 135 }, { "epoch": 0.041577499235707735, "grad_norm": 29.759511947631836, "learning_rate": 5.158578525028659e-07, "loss": 5.9457, "step": 136 }, { "epoch": 0.04188321614185264, "grad_norm": 28.96985626220703, "learning_rate": 5.196790217806648e-07, "loss": 5.786, "step": 137 }, { "epoch": 0.042188933047997555, "grad_norm": 28.883241653442383, "learning_rate": 5.235001910584639e-07, "loss": 5.7358, "step": 138 }, { "epoch": 0.04249464995414246, "grad_norm": 28.061107635498047, "learning_rate": 5.27321360336263e-07, "loss": 5.6243, "step": 139 }, { "epoch": 0.042800366860287375, "grad_norm": 28.557910919189453, "learning_rate": 5.311425296140619e-07, "loss": 5.595, "step": 140 }, { "epoch": 0.04310608376643228, "grad_norm": 27.0810489654541, "learning_rate": 5.34963698891861e-07, "loss": 5.4564, "step": 141 }, { "epoch": 0.043411800672577196, "grad_norm": 27.113380432128906, "learning_rate": 5.387848681696599e-07, "loss": 5.4069, "step": 142 }, { "epoch": 0.0437175175787221, "grad_norm": 27.154207229614258, "learning_rate": 5.426060374474589e-07, "loss": 5.3479, "step": 143 }, { "epoch": 0.044023234484867016, "grad_norm": 24.9394474029541, "learning_rate": 5.46427206725258e-07, "loss": 5.1618, "step": 144 }, { "epoch": 0.04432895139101192, "grad_norm": 25.309959411621094, "learning_rate": 5.502483760030569e-07, "loss": 5.1594, "step": 145 }, { "epoch": 0.044634668297156836, "grad_norm": 25.39933204650879, "learning_rate": 5.54069545280856e-07, "loss": 5.0929, "step": 146 }, { "epoch": 0.04494038520330174, "grad_norm": 25.4710693359375, "learning_rate": 5.578907145586549e-07, "loss": 5.027, "step": 147 }, { "epoch": 0.04524610210944665, "grad_norm": 24.42040252685547, "learning_rate": 5.61711883836454e-07, "loss": 4.9338, "step": 148 }, { "epoch": 0.04555181901559156, "grad_norm": 22.97148895263672, "learning_rate": 5.65533053114253e-07, "loss": 4.769, "step": 149 }, { "epoch": 0.04585753592173647, "grad_norm": 20.79481315612793, "learning_rate": 5.693542223920519e-07, "loss": 4.6579, "step": 150 }, { "epoch": 0.04616325282788138, "grad_norm": Infinity, "learning_rate": 5.693542223920519e-07, "loss": 5.3634, "step": 151 }, { "epoch": 0.04646896973402629, "grad_norm": 32.50153350830078, "learning_rate": 5.73175391669851e-07, "loss": 5.2018, "step": 152 }, { "epoch": 0.046774686640171204, "grad_norm": 32.19245910644531, "learning_rate": 5.769965609476499e-07, "loss": 5.1173, "step": 153 }, { "epoch": 0.04708040354631611, "grad_norm": 31.8254337310791, "learning_rate": 5.80817730225449e-07, "loss": 5.0188, "step": 154 }, { "epoch": 0.047386120452461024, "grad_norm": 31.345176696777344, "learning_rate": 5.846388995032481e-07, "loss": 4.9354, "step": 155 }, { "epoch": 0.04769183735860593, "grad_norm": 30.8551082611084, "learning_rate": 5.88460068781047e-07, "loss": 4.8688, "step": 156 }, { "epoch": 0.047997554264750844, "grad_norm": 29.267337799072266, "learning_rate": 5.922812380588461e-07, "loss": 4.7249, "step": 157 }, { "epoch": 0.04830327117089575, "grad_norm": 29.33692169189453, "learning_rate": 5.96102407336645e-07, "loss": 4.686, "step": 158 }, { "epoch": 0.04860898807704066, "grad_norm": 28.15401268005371, "learning_rate": 5.999235766144441e-07, "loss": 4.6101, "step": 159 }, { "epoch": 0.04891470498318557, "grad_norm": 27.548696517944336, "learning_rate": 6.03744745892243e-07, "loss": 4.5363, "step": 160 }, { "epoch": 0.04922042188933048, "grad_norm": 26.75923728942871, "learning_rate": 6.07565915170042e-07, "loss": 4.4534, "step": 161 }, { "epoch": 0.04952613879547539, "grad_norm": 24.019573211669922, "learning_rate": 6.113870844478411e-07, "loss": 4.3505, "step": 162 }, { "epoch": 0.0498318557016203, "grad_norm": 23.76920509338379, "learning_rate": 6.1520825372564e-07, "loss": 4.3051, "step": 163 }, { "epoch": 0.05013757260776521, "grad_norm": 21.32398223876953, "learning_rate": 6.190294230034391e-07, "loss": 4.2441, "step": 164 }, { "epoch": 0.05044328951391012, "grad_norm": 19.767667770385742, "learning_rate": 6.22850592281238e-07, "loss": 4.1633, "step": 165 }, { "epoch": 0.05074900642005503, "grad_norm": 18.072601318359375, "learning_rate": 6.266717615590371e-07, "loss": 4.098, "step": 166 }, { "epoch": 0.05105472332619994, "grad_norm": 16.40079116821289, "learning_rate": 6.304929308368361e-07, "loss": 4.0733, "step": 167 }, { "epoch": 0.05136044023234485, "grad_norm": 14.797042846679688, "learning_rate": 6.34314100114635e-07, "loss": 4.0248, "step": 168 }, { "epoch": 0.05166615713848976, "grad_norm": 13.73922061920166, "learning_rate": 6.381352693924341e-07, "loss": 3.9938, "step": 169 }, { "epoch": 0.051971874044634665, "grad_norm": 12.408406257629395, "learning_rate": 6.41956438670233e-07, "loss": 3.9397, "step": 170 }, { "epoch": 0.05227759095077958, "grad_norm": 11.0806303024292, "learning_rate": 6.457776079480322e-07, "loss": 3.9039, "step": 171 }, { "epoch": 0.052583307856924486, "grad_norm": 10.287764549255371, "learning_rate": 6.495987772258312e-07, "loss": 3.8567, "step": 172 }, { "epoch": 0.0528890247630694, "grad_norm": 10.420894622802734, "learning_rate": 6.534199465036301e-07, "loss": 3.8572, "step": 173 }, { "epoch": 0.053194741669214306, "grad_norm": 11.034759521484375, "learning_rate": 6.572411157814292e-07, "loss": 3.8225, "step": 174 }, { "epoch": 0.05350045857535922, "grad_norm": 12.670732498168945, "learning_rate": 6.610622850592281e-07, "loss": 3.7767, "step": 175 }, { "epoch": 0.053806175481504126, "grad_norm": 9.92987060546875, "learning_rate": 6.648834543370272e-07, "loss": 3.7531, "step": 176 }, { "epoch": 0.05411189238764904, "grad_norm": 9.053284645080566, "learning_rate": 6.687046236148262e-07, "loss": 3.6744, "step": 177 }, { "epoch": 0.054417609293793946, "grad_norm": 8.813132286071777, "learning_rate": 6.725257928926251e-07, "loss": 3.6464, "step": 178 }, { "epoch": 0.05472332619993886, "grad_norm": 8.75460433959961, "learning_rate": 6.763469621704242e-07, "loss": 3.6191, "step": 179 }, { "epoch": 0.055029043106083766, "grad_norm": 8.578996658325195, "learning_rate": 6.801681314482231e-07, "loss": 3.5784, "step": 180 }, { "epoch": 0.05533476001222867, "grad_norm": 8.702179908752441, "learning_rate": 6.839893007260222e-07, "loss": 3.5504, "step": 181 }, { "epoch": 0.05564047691837359, "grad_norm": 7.9532999992370605, "learning_rate": 6.878104700038212e-07, "loss": 3.524, "step": 182 }, { "epoch": 0.05594619382451849, "grad_norm": 7.753958225250244, "learning_rate": 6.916316392816202e-07, "loss": 3.4855, "step": 183 }, { "epoch": 0.05625191073066341, "grad_norm": 7.2960686683654785, "learning_rate": 6.954528085594192e-07, "loss": 3.4897, "step": 184 }, { "epoch": 0.056557627636808314, "grad_norm": 6.889578819274902, "learning_rate": 6.992739778372181e-07, "loss": 3.4378, "step": 185 }, { "epoch": 0.05686334454295323, "grad_norm": 6.90155029296875, "learning_rate": 7.030951471150172e-07, "loss": 3.4377, "step": 186 }, { "epoch": 0.057169061449098134, "grad_norm": 6.828741550445557, "learning_rate": 7.069163163928163e-07, "loss": 3.3882, "step": 187 }, { "epoch": 0.05747477835524305, "grad_norm": 7.035349369049072, "learning_rate": 7.107374856706153e-07, "loss": 3.386, "step": 188 }, { "epoch": 0.057780495261387954, "grad_norm": 6.534512519836426, "learning_rate": 7.145586549484143e-07, "loss": 3.3903, "step": 189 }, { "epoch": 0.05808621216753287, "grad_norm": 7.165710926055908, "learning_rate": 7.183798242262132e-07, "loss": 3.3708, "step": 190 }, { "epoch": 0.058391929073677774, "grad_norm": 6.206090927124023, "learning_rate": 7.222009935040123e-07, "loss": 3.3402, "step": 191 }, { "epoch": 0.05869764597982268, "grad_norm": 5.940566539764404, "learning_rate": 7.260221627818112e-07, "loss": 3.3556, "step": 192 }, { "epoch": 0.059003362885967595, "grad_norm": 6.090426445007324, "learning_rate": 7.298433320596103e-07, "loss": 3.3237, "step": 193 }, { "epoch": 0.0593090797921125, "grad_norm": 5.814377784729004, "learning_rate": 7.336645013374093e-07, "loss": 3.3234, "step": 194 }, { "epoch": 0.059614796698257415, "grad_norm": 6.385340213775635, "learning_rate": 7.374856706152082e-07, "loss": 3.3093, "step": 195 }, { "epoch": 0.05992051360440232, "grad_norm": 5.415619373321533, "learning_rate": 7.413068398930073e-07, "loss": 3.2687, "step": 196 }, { "epoch": 0.060226230510547235, "grad_norm": 5.428919315338135, "learning_rate": 7.451280091708062e-07, "loss": 3.2456, "step": 197 }, { "epoch": 0.06053194741669214, "grad_norm": 5.200325965881348, "learning_rate": 7.489491784486053e-07, "loss": 3.2619, "step": 198 }, { "epoch": 0.060837664322837055, "grad_norm": 5.31981897354126, "learning_rate": 7.527703477264043e-07, "loss": 3.2184, "step": 199 }, { "epoch": 0.06114338122898196, "grad_norm": 5.07005500793457, "learning_rate": 7.565915170042033e-07, "loss": 3.1818, "step": 200 }, { "epoch": 0.061449098135126876, "grad_norm": 17.107723236083984, "learning_rate": 7.604126862820023e-07, "loss": 3.2727, "step": 201 }, { "epoch": 0.06175481504127178, "grad_norm": 15.640349388122559, "learning_rate": 7.642338555598012e-07, "loss": 3.2089, "step": 202 }, { "epoch": 0.06206053194741669, "grad_norm": 14.494150161743164, "learning_rate": 7.680550248376004e-07, "loss": 3.1685, "step": 203 }, { "epoch": 0.0623662488535616, "grad_norm": 12.784191131591797, "learning_rate": 7.718761941153994e-07, "loss": 3.1455, "step": 204 }, { "epoch": 0.06267196575970652, "grad_norm": 8.699557304382324, "learning_rate": 7.756973633931984e-07, "loss": 3.1119, "step": 205 }, { "epoch": 0.06297768266585142, "grad_norm": 6.457502841949463, "learning_rate": 7.795185326709974e-07, "loss": 3.0995, "step": 206 }, { "epoch": 0.06328339957199633, "grad_norm": 4.067775249481201, "learning_rate": 7.833397019487963e-07, "loss": 3.063, "step": 207 }, { "epoch": 0.06358911647814124, "grad_norm": 5.944173336029053, "learning_rate": 7.871608712265954e-07, "loss": 3.0597, "step": 208 }, { "epoch": 0.06389483338428616, "grad_norm": 6.701009750366211, "learning_rate": 7.909820405043944e-07, "loss": 3.0743, "step": 209 }, { "epoch": 0.06420055029043106, "grad_norm": 7.740096092224121, "learning_rate": 7.948032097821934e-07, "loss": 3.0625, "step": 210 }, { "epoch": 0.06450626719657597, "grad_norm": 8.296363830566406, "learning_rate": 7.986243790599924e-07, "loss": 3.0453, "step": 211 }, { "epoch": 0.06481198410272088, "grad_norm": 6.8833770751953125, "learning_rate": 8.024455483377913e-07, "loss": 3.0253, "step": 212 }, { "epoch": 0.0651177010088658, "grad_norm": 4.895036697387695, "learning_rate": 8.062667176155904e-07, "loss": 3.0221, "step": 213 }, { "epoch": 0.0654234179150107, "grad_norm": 3.5238819122314453, "learning_rate": 8.100878868933893e-07, "loss": 3.008, "step": 214 }, { "epoch": 0.06572913482115561, "grad_norm": 3.580324411392212, "learning_rate": 8.139090561711884e-07, "loss": 3.0103, "step": 215 }, { "epoch": 0.06603485172730052, "grad_norm": 4.652540683746338, "learning_rate": 8.177302254489874e-07, "loss": 3.0208, "step": 216 }, { "epoch": 0.06634056863344542, "grad_norm": 4.5278825759887695, "learning_rate": 8.215513947267864e-07, "loss": 3.0356, "step": 217 }, { "epoch": 0.06664628553959034, "grad_norm": 3.829627752304077, "learning_rate": 8.253725640045854e-07, "loss": 3.0282, "step": 218 }, { "epoch": 0.06695200244573525, "grad_norm": 3.477382183074951, "learning_rate": 8.291937332823844e-07, "loss": 3.0114, "step": 219 }, { "epoch": 0.06725771935188016, "grad_norm": 4.350729942321777, "learning_rate": 8.330149025601835e-07, "loss": 3.0317, "step": 220 }, { "epoch": 0.06756343625802506, "grad_norm": 3.895333766937256, "learning_rate": 8.368360718379825e-07, "loss": 3.0416, "step": 221 }, { "epoch": 0.06786915316416998, "grad_norm": 3.6035702228546143, "learning_rate": 8.406572411157815e-07, "loss": 2.9919, "step": 222 }, { "epoch": 0.06817487007031489, "grad_norm": 3.7856855392456055, "learning_rate": 8.444784103935805e-07, "loss": 3.0035, "step": 223 }, { "epoch": 0.0684805869764598, "grad_norm": 3.925621509552002, "learning_rate": 8.482995796713794e-07, "loss": 2.9999, "step": 224 }, { "epoch": 0.0687863038826047, "grad_norm": 7.144941806793213, "learning_rate": 8.521207489491785e-07, "loss": 2.9532, "step": 225 }, { "epoch": 0.06909202078874961, "grad_norm": 12.69289779663086, "learning_rate": 8.559419182269775e-07, "loss": 2.992, "step": 226 }, { "epoch": 0.06939773769489453, "grad_norm": 12.417204856872559, "learning_rate": 8.597630875047765e-07, "loss": 2.9521, "step": 227 }, { "epoch": 0.06970345460103944, "grad_norm": 12.220179557800293, "learning_rate": 8.635842567825755e-07, "loss": 2.9375, "step": 228 }, { "epoch": 0.07000917150718435, "grad_norm": 9.418977737426758, "learning_rate": 8.674054260603744e-07, "loss": 2.9194, "step": 229 }, { "epoch": 0.07031488841332925, "grad_norm": 7.437530040740967, "learning_rate": 8.712265953381735e-07, "loss": 2.9017, "step": 230 }, { "epoch": 0.07062060531947417, "grad_norm": 5.17976188659668, "learning_rate": 8.750477646159725e-07, "loss": 2.8791, "step": 231 }, { "epoch": 0.07092632222561908, "grad_norm": 5.491673946380615, "learning_rate": 8.788689338937715e-07, "loss": 2.8595, "step": 232 }, { "epoch": 0.07123203913176399, "grad_norm": 4.838670253753662, "learning_rate": 8.826901031715705e-07, "loss": 2.8916, "step": 233 }, { "epoch": 0.07153775603790889, "grad_norm": 6.157463550567627, "learning_rate": 8.865112724493695e-07, "loss": 2.8787, "step": 234 }, { "epoch": 0.07184347294405381, "grad_norm": 6.79338264465332, "learning_rate": 8.903324417271686e-07, "loss": 2.886, "step": 235 }, { "epoch": 0.07214918985019872, "grad_norm": 6.777259349822998, "learning_rate": 8.941536110049675e-07, "loss": 2.8888, "step": 236 }, { "epoch": 0.07245490675634363, "grad_norm": 5.309328079223633, "learning_rate": 8.979747802827666e-07, "loss": 2.8604, "step": 237 }, { "epoch": 0.07276062366248853, "grad_norm": 2.91855525970459, "learning_rate": 9.017959495605656e-07, "loss": 2.8492, "step": 238 }, { "epoch": 0.07306634056863344, "grad_norm": 2.6521034240722656, "learning_rate": 9.056171188383646e-07, "loss": 2.8714, "step": 239 }, { "epoch": 0.07337205747477836, "grad_norm": 4.107646942138672, "learning_rate": 9.094382881161636e-07, "loss": 2.878, "step": 240 }, { "epoch": 0.07367777438092327, "grad_norm": 3.249817371368408, "learning_rate": 9.132594573939625e-07, "loss": 2.8596, "step": 241 }, { "epoch": 0.07398349128706817, "grad_norm": 3.9444007873535156, "learning_rate": 9.170806266717616e-07, "loss": 2.8803, "step": 242 }, { "epoch": 0.07428920819321308, "grad_norm": 2.2641005516052246, "learning_rate": 9.209017959495606e-07, "loss": 2.8768, "step": 243 }, { "epoch": 0.074594925099358, "grad_norm": 2.0083391666412354, "learning_rate": 9.247229652273596e-07, "loss": 2.8827, "step": 244 }, { "epoch": 0.07490064200550291, "grad_norm": 2.1648073196411133, "learning_rate": 9.285441345051586e-07, "loss": 2.8879, "step": 245 }, { "epoch": 0.07520635891164781, "grad_norm": 2.881678819656372, "learning_rate": 9.323653037829575e-07, "loss": 2.8619, "step": 246 }, { "epoch": 0.07551207581779272, "grad_norm": 4.199202537536621, "learning_rate": 9.361864730607566e-07, "loss": 2.8829, "step": 247 }, { "epoch": 0.07581779272393763, "grad_norm": 2.215648889541626, "learning_rate": 9.400076423385557e-07, "loss": 2.8851, "step": 248 }, { "epoch": 0.07612350963008255, "grad_norm": 3.7611210346221924, "learning_rate": 9.438288116163546e-07, "loss": 2.894, "step": 249 }, { "epoch": 0.07642922653622745, "grad_norm": 5.182114124298096, "learning_rate": 9.476499808941536e-07, "loss": 2.8982, "step": 250 }, { "epoch": 0.07673494344237236, "grad_norm": 13.185416221618652, "learning_rate": 9.514711501719526e-07, "loss": 2.8846, "step": 251 }, { "epoch": 0.07704066034851727, "grad_norm": 12.94508171081543, "learning_rate": 9.552923194497516e-07, "loss": 2.8548, "step": 252 }, { "epoch": 0.07734637725466219, "grad_norm": 11.59104061126709, "learning_rate": 9.591134887275508e-07, "loss": 2.8339, "step": 253 }, { "epoch": 0.0776520941608071, "grad_norm": 10.375120162963867, "learning_rate": 9.629346580053497e-07, "loss": 2.8097, "step": 254 }, { "epoch": 0.077957811066952, "grad_norm": 7.854126930236816, "learning_rate": 9.667558272831487e-07, "loss": 2.8042, "step": 255 }, { "epoch": 0.07826352797309691, "grad_norm": 4.425750732421875, "learning_rate": 9.705769965609476e-07, "loss": 2.7813, "step": 256 }, { "epoch": 0.07856924487924183, "grad_norm": 1.9504581689834595, "learning_rate": 9.743981658387466e-07, "loss": 2.786, "step": 257 }, { "epoch": 0.07887496178538674, "grad_norm": 4.718151092529297, "learning_rate": 9.782193351165458e-07, "loss": 2.7969, "step": 258 }, { "epoch": 0.07918067869153164, "grad_norm": 6.841475486755371, "learning_rate": 9.820405043943447e-07, "loss": 2.7814, "step": 259 }, { "epoch": 0.07948639559767655, "grad_norm": 7.138665199279785, "learning_rate": 9.858616736721437e-07, "loss": 2.7943, "step": 260 }, { "epoch": 0.07979211250382146, "grad_norm": 7.370173454284668, "learning_rate": 9.896828429499426e-07, "loss": 2.7994, "step": 261 }, { "epoch": 0.08009782940996638, "grad_norm": 4.9088568687438965, "learning_rate": 9.935040122277418e-07, "loss": 2.7919, "step": 262 }, { "epoch": 0.08040354631611128, "grad_norm": 3.364069938659668, "learning_rate": 9.973251815055407e-07, "loss": 2.8051, "step": 263 }, { "epoch": 0.08070926322225619, "grad_norm": 2.158391237258911, "learning_rate": 1.0011463507833397e-06, "loss": 2.7853, "step": 264 }, { "epoch": 0.0810149801284011, "grad_norm": 3.541266441345215, "learning_rate": 1.0049675200611387e-06, "loss": 2.7999, "step": 265 }, { "epoch": 0.08132069703454602, "grad_norm": 4.9565205574035645, "learning_rate": 1.0087886893389376e-06, "loss": 2.7828, "step": 266 }, { "epoch": 0.08162641394069092, "grad_norm": 3.57356858253479, "learning_rate": 1.0126098586167368e-06, "loss": 2.7967, "step": 267 }, { "epoch": 0.08193213084683583, "grad_norm": 2.7359962463378906, "learning_rate": 1.0164310278945357e-06, "loss": 2.8053, "step": 268 }, { "epoch": 0.08223784775298074, "grad_norm": 1.7061967849731445, "learning_rate": 1.0202521971723347e-06, "loss": 2.8027, "step": 269 }, { "epoch": 0.08254356465912564, "grad_norm": 1.9617775678634644, "learning_rate": 1.0240733664501337e-06, "loss": 2.8403, "step": 270 }, { "epoch": 0.08284928156527056, "grad_norm": 2.1935503482818604, "learning_rate": 1.0278945357279326e-06, "loss": 2.8311, "step": 271 }, { "epoch": 0.08315499847141547, "grad_norm": 4.098847389221191, "learning_rate": 1.0317157050057318e-06, "loss": 2.816, "step": 272 }, { "epoch": 0.08346071537756038, "grad_norm": 3.851635217666626, "learning_rate": 1.0355368742835307e-06, "loss": 2.8061, "step": 273 }, { "epoch": 0.08376643228370528, "grad_norm": 4.073040962219238, "learning_rate": 1.0393580435613297e-06, "loss": 2.7834, "step": 274 }, { "epoch": 0.0840721491898502, "grad_norm": 6.517051696777344, "learning_rate": 1.0431792128391289e-06, "loss": 2.8439, "step": 275 }, { "epoch": 0.08437786609599511, "grad_norm": 12.814970970153809, "learning_rate": 1.0470003821169278e-06, "loss": 2.8143, "step": 276 }, { "epoch": 0.08468358300214002, "grad_norm": 11.48625373840332, "learning_rate": 1.050821551394727e-06, "loss": 2.7891, "step": 277 }, { "epoch": 0.08498929990828492, "grad_norm": 13.583642959594727, "learning_rate": 1.054642720672526e-06, "loss": 2.7897, "step": 278 }, { "epoch": 0.08529501681442984, "grad_norm": 10.428862571716309, "learning_rate": 1.058463889950325e-06, "loss": 2.7626, "step": 279 }, { "epoch": 0.08560073372057475, "grad_norm": 8.284067153930664, "learning_rate": 1.0622850592281239e-06, "loss": 2.7625, "step": 280 }, { "epoch": 0.08590645062671966, "grad_norm": 4.828396320343018, "learning_rate": 1.0661062285059228e-06, "loss": 2.7306, "step": 281 }, { "epoch": 0.08621216753286456, "grad_norm": 1.4717888832092285, "learning_rate": 1.069927397783722e-06, "loss": 2.728, "step": 282 }, { "epoch": 0.08651788443900947, "grad_norm": 4.078621864318848, "learning_rate": 1.073748567061521e-06, "loss": 2.7308, "step": 283 }, { "epoch": 0.08682360134515439, "grad_norm": 5.824599266052246, "learning_rate": 1.0775697363393199e-06, "loss": 2.729, "step": 284 }, { "epoch": 0.0871293182512993, "grad_norm": 6.548873424530029, "learning_rate": 1.0813909056171188e-06, "loss": 2.736, "step": 285 }, { "epoch": 0.0874350351574442, "grad_norm": 5.897242069244385, "learning_rate": 1.0852120748949178e-06, "loss": 2.7371, "step": 286 }, { "epoch": 0.08774075206358911, "grad_norm": 3.4619359970092773, "learning_rate": 1.089033244172717e-06, "loss": 2.7297, "step": 287 }, { "epoch": 0.08804646896973403, "grad_norm": 1.3983134031295776, "learning_rate": 1.092854413450516e-06, "loss": 2.7278, "step": 288 }, { "epoch": 0.08835218587587894, "grad_norm": 3.104123115539551, "learning_rate": 1.0966755827283149e-06, "loss": 2.7404, "step": 289 }, { "epoch": 0.08865790278202385, "grad_norm": 4.219205379486084, "learning_rate": 1.1004967520061138e-06, "loss": 2.7331, "step": 290 }, { "epoch": 0.08896361968816875, "grad_norm": 2.462705135345459, "learning_rate": 1.1043179212839128e-06, "loss": 2.742, "step": 291 }, { "epoch": 0.08926933659431367, "grad_norm": 2.9489552974700928, "learning_rate": 1.108139090561712e-06, "loss": 2.7501, "step": 292 }, { "epoch": 0.08957505350045858, "grad_norm": 2.010669469833374, "learning_rate": 1.111960259839511e-06, "loss": 2.7391, "step": 293 }, { "epoch": 0.08988077040660349, "grad_norm": 3.1552932262420654, "learning_rate": 1.1157814291173099e-06, "loss": 2.7645, "step": 294 }, { "epoch": 0.09018648731274839, "grad_norm": 3.743298292160034, "learning_rate": 1.1196025983951088e-06, "loss": 2.77, "step": 295 }, { "epoch": 0.0904922042188933, "grad_norm": 2.441256284713745, "learning_rate": 1.123423767672908e-06, "loss": 2.7567, "step": 296 }, { "epoch": 0.09079792112503822, "grad_norm": 2.0671463012695312, "learning_rate": 1.127244936950707e-06, "loss": 2.7643, "step": 297 }, { "epoch": 0.09110363803118313, "grad_norm": 2.008615493774414, "learning_rate": 1.131066106228506e-06, "loss": 2.7704, "step": 298 }, { "epoch": 0.09140935493732803, "grad_norm": 2.534212827682495, "learning_rate": 1.1348872755063049e-06, "loss": 2.7672, "step": 299 }, { "epoch": 0.09171507184347294, "grad_norm": 3.4729485511779785, "learning_rate": 1.1387084447841038e-06, "loss": 2.7623, "step": 300 }, { "epoch": 0.09202078874961786, "grad_norm": 11.738871574401855, "learning_rate": 1.142529614061903e-06, "loss": 2.7769, "step": 301 }, { "epoch": 0.09232650565576277, "grad_norm": 10.210897445678711, "learning_rate": 1.146350783339702e-06, "loss": 2.7369, "step": 302 }, { "epoch": 0.09263222256190767, "grad_norm": 8.165843963623047, "learning_rate": 1.150171952617501e-06, "loss": 2.7076, "step": 303 }, { "epoch": 0.09293793946805258, "grad_norm": 6.514739513397217, "learning_rate": 1.1539931218952999e-06, "loss": 2.6999, "step": 304 }, { "epoch": 0.09324365637419749, "grad_norm": 2.4918906688690186, "learning_rate": 1.1578142911730988e-06, "loss": 2.685, "step": 305 }, { "epoch": 0.09354937328034241, "grad_norm": 3.355121374130249, "learning_rate": 1.161635460450898e-06, "loss": 2.6819, "step": 306 }, { "epoch": 0.09385509018648731, "grad_norm": 6.0606369972229, "learning_rate": 1.1654566297286972e-06, "loss": 2.6941, "step": 307 }, { "epoch": 0.09416080709263222, "grad_norm": 6.000965595245361, "learning_rate": 1.1692777990064961e-06, "loss": 2.6851, "step": 308 }, { "epoch": 0.09446652399877713, "grad_norm": 5.907788276672363, "learning_rate": 1.173098968284295e-06, "loss": 2.6898, "step": 309 }, { "epoch": 0.09477224090492205, "grad_norm": 3.0512032508850098, "learning_rate": 1.176920137562094e-06, "loss": 2.6837, "step": 310 }, { "epoch": 0.09507795781106695, "grad_norm": 1.813821792602539, "learning_rate": 1.1807413068398932e-06, "loss": 2.68, "step": 311 }, { "epoch": 0.09538367471721186, "grad_norm": 4.216711521148682, "learning_rate": 1.1845624761176922e-06, "loss": 2.6916, "step": 312 }, { "epoch": 0.09568939162335677, "grad_norm": 4.127336502075195, "learning_rate": 1.1883836453954911e-06, "loss": 2.6644, "step": 313 }, { "epoch": 0.09599510852950169, "grad_norm": 3.062204122543335, "learning_rate": 1.19220481467329e-06, "loss": 2.6881, "step": 314 }, { "epoch": 0.0963008254356466, "grad_norm": 2.7297487258911133, "learning_rate": 1.196025983951089e-06, "loss": 2.6726, "step": 315 }, { "epoch": 0.0966065423417915, "grad_norm": 1.652383804321289, "learning_rate": 1.1998471532288882e-06, "loss": 2.6611, "step": 316 }, { "epoch": 0.09691225924793641, "grad_norm": 2.2906124591827393, "learning_rate": 1.2036683225066871e-06, "loss": 2.711, "step": 317 }, { "epoch": 0.09721797615408131, "grad_norm": 4.368490219116211, "learning_rate": 1.207489491784486e-06, "loss": 2.7048, "step": 318 }, { "epoch": 0.09752369306022624, "grad_norm": 3.133607864379883, "learning_rate": 1.211310661062285e-06, "loss": 2.6887, "step": 319 }, { "epoch": 0.09782940996637114, "grad_norm": 1.8967219591140747, "learning_rate": 1.215131830340084e-06, "loss": 2.6745, "step": 320 }, { "epoch": 0.09813512687251605, "grad_norm": 3.3614561557769775, "learning_rate": 1.2189529996178832e-06, "loss": 2.683, "step": 321 }, { "epoch": 0.09844084377866096, "grad_norm": 2.8327786922454834, "learning_rate": 1.2227741688956821e-06, "loss": 2.6836, "step": 322 }, { "epoch": 0.09874656068480588, "grad_norm": 2.965330123901367, "learning_rate": 1.226595338173481e-06, "loss": 2.6925, "step": 323 }, { "epoch": 0.09905227759095078, "grad_norm": 1.953649878501892, "learning_rate": 1.23041650745128e-06, "loss": 2.6866, "step": 324 }, { "epoch": 0.09935799449709569, "grad_norm": 3.590707778930664, "learning_rate": 1.234237676729079e-06, "loss": 2.6345, "step": 325 }, { "epoch": 0.0996637114032406, "grad_norm": 8.426238059997559, "learning_rate": 1.2380588460068782e-06, "loss": 2.6733, "step": 326 }, { "epoch": 0.0999694283093855, "grad_norm": 5.990464210510254, "learning_rate": 1.2418800152846771e-06, "loss": 2.6307, "step": 327 }, { "epoch": 0.10027514521553042, "grad_norm": 4.106024742126465, "learning_rate": 1.245701184562476e-06, "loss": 2.6132, "step": 328 }, { "epoch": 0.10058086212167533, "grad_norm": 2.2279388904571533, "learning_rate": 1.249522353840275e-06, "loss": 2.5835, "step": 329 }, { "epoch": 0.10088657902782024, "grad_norm": 2.269000768661499, "learning_rate": 1.2533435231180742e-06, "loss": 2.5801, "step": 330 }, { "epoch": 0.10119229593396514, "grad_norm": 3.3939566612243652, "learning_rate": 1.2571646923958732e-06, "loss": 2.5733, "step": 331 }, { "epoch": 0.10149801284011006, "grad_norm": 3.9757797718048096, "learning_rate": 1.2609858616736721e-06, "loss": 2.5773, "step": 332 }, { "epoch": 0.10180372974625497, "grad_norm": 3.1792802810668945, "learning_rate": 1.264807030951471e-06, "loss": 2.546, "step": 333 }, { "epoch": 0.10210944665239988, "grad_norm": 2.417346477508545, "learning_rate": 1.26862820022927e-06, "loss": 2.5496, "step": 334 }, { "epoch": 0.10241516355854478, "grad_norm": 2.651249647140503, "learning_rate": 1.2724493695070692e-06, "loss": 2.565, "step": 335 }, { "epoch": 0.1027208804646897, "grad_norm": 3.017545461654663, "learning_rate": 1.2762705387848682e-06, "loss": 2.5409, "step": 336 }, { "epoch": 0.10302659737083461, "grad_norm": 2.1479692459106445, "learning_rate": 1.2800917080626671e-06, "loss": 2.5227, "step": 337 }, { "epoch": 0.10333231427697952, "grad_norm": 2.1394119262695312, "learning_rate": 1.283912877340466e-06, "loss": 2.5196, "step": 338 }, { "epoch": 0.10363803118312442, "grad_norm": 1.9752380847930908, "learning_rate": 1.2877340466182652e-06, "loss": 2.5292, "step": 339 }, { "epoch": 0.10394374808926933, "grad_norm": 2.3703057765960693, "learning_rate": 1.2915552158960644e-06, "loss": 2.5247, "step": 340 }, { "epoch": 0.10424946499541425, "grad_norm": 2.328279972076416, "learning_rate": 1.2953763851738634e-06, "loss": 2.5214, "step": 341 }, { "epoch": 0.10455518190155916, "grad_norm": 2.393890142440796, "learning_rate": 1.2991975544516623e-06, "loss": 2.5172, "step": 342 }, { "epoch": 0.10486089880770406, "grad_norm": 2.300849437713623, "learning_rate": 1.3030187237294613e-06, "loss": 2.5127, "step": 343 }, { "epoch": 0.10516661571384897, "grad_norm": 2.9283103942871094, "learning_rate": 1.3068398930072602e-06, "loss": 2.5007, "step": 344 }, { "epoch": 0.10547233261999389, "grad_norm": 3.046299934387207, "learning_rate": 1.3106610622850594e-06, "loss": 2.5139, "step": 345 }, { "epoch": 0.1057780495261388, "grad_norm": 2.42344331741333, "learning_rate": 1.3144822315628584e-06, "loss": 2.5236, "step": 346 }, { "epoch": 0.1060837664322837, "grad_norm": 2.2297654151916504, "learning_rate": 1.3183034008406573e-06, "loss": 2.4708, "step": 347 }, { "epoch": 0.10638948333842861, "grad_norm": 4.538825511932373, "learning_rate": 1.3221245701184563e-06, "loss": 2.4602, "step": 348 }, { "epoch": 0.10669520024457352, "grad_norm": 4.105545520782471, "learning_rate": 1.3259457393962552e-06, "loss": 2.5098, "step": 349 }, { "epoch": 0.10700091715071844, "grad_norm": 5.038228511810303, "learning_rate": 1.3297669086740544e-06, "loss": 2.4997, "step": 350 }, { "epoch": 0.10730663405686335, "grad_norm": 7.075524806976318, "learning_rate": 1.3335880779518534e-06, "loss": 2.4473, "step": 351 }, { "epoch": 0.10761235096300825, "grad_norm": 4.9096999168396, "learning_rate": 1.3374092472296523e-06, "loss": 2.3623, "step": 352 }, { "epoch": 0.10791806786915316, "grad_norm": 4.376562118530273, "learning_rate": 1.3412304165074513e-06, "loss": 2.3458, "step": 353 }, { "epoch": 0.10822378477529808, "grad_norm": 3.0417819023132324, "learning_rate": 1.3450515857852502e-06, "loss": 2.323, "step": 354 }, { "epoch": 0.10852950168144299, "grad_norm": 2.4273712635040283, "learning_rate": 1.3488727550630494e-06, "loss": 2.2842, "step": 355 }, { "epoch": 0.10883521858758789, "grad_norm": 2.5372025966644287, "learning_rate": 1.3526939243408484e-06, "loss": 2.2924, "step": 356 }, { "epoch": 0.1091409354937328, "grad_norm": 3.219088554382324, "learning_rate": 1.3565150936186473e-06, "loss": 2.2402, "step": 357 }, { "epoch": 0.10944665239987772, "grad_norm": 2.9789183139801025, "learning_rate": 1.3603362628964463e-06, "loss": 2.2583, "step": 358 }, { "epoch": 0.10975236930602263, "grad_norm": 2.6980502605438232, "learning_rate": 1.3641574321742452e-06, "loss": 2.2439, "step": 359 }, { "epoch": 0.11005808621216753, "grad_norm": 4.2234296798706055, "learning_rate": 1.3679786014520444e-06, "loss": 2.2246, "step": 360 }, { "epoch": 0.11036380311831244, "grad_norm": 3.4755866527557373, "learning_rate": 1.3717997707298433e-06, "loss": 2.2323, "step": 361 }, { "epoch": 0.11066952002445735, "grad_norm": 2.8702659606933594, "learning_rate": 1.3756209400076423e-06, "loss": 2.2209, "step": 362 }, { "epoch": 0.11097523693060227, "grad_norm": 2.676515817642212, "learning_rate": 1.3794421092854413e-06, "loss": 2.1912, "step": 363 }, { "epoch": 0.11128095383674717, "grad_norm": 2.8005123138427734, "learning_rate": 1.3832632785632404e-06, "loss": 2.1734, "step": 364 }, { "epoch": 0.11158667074289208, "grad_norm": 3.2698559761047363, "learning_rate": 1.3870844478410394e-06, "loss": 2.1984, "step": 365 }, { "epoch": 0.11189238764903699, "grad_norm": 3.2368435859680176, "learning_rate": 1.3909056171188383e-06, "loss": 2.1974, "step": 366 }, { "epoch": 0.11219810455518191, "grad_norm": 4.320165634155273, "learning_rate": 1.3947267863966373e-06, "loss": 2.1898, "step": 367 }, { "epoch": 0.11250382146132681, "grad_norm": 3.3868277072906494, "learning_rate": 1.3985479556744363e-06, "loss": 2.2032, "step": 368 }, { "epoch": 0.11280953836747172, "grad_norm": 2.524139404296875, "learning_rate": 1.4023691249522354e-06, "loss": 2.1826, "step": 369 }, { "epoch": 0.11311525527361663, "grad_norm": 4.037295818328857, "learning_rate": 1.4061902942300344e-06, "loss": 2.2046, "step": 370 }, { "epoch": 0.11342097217976153, "grad_norm": 2.84753155708313, "learning_rate": 1.4100114635078335e-06, "loss": 2.1787, "step": 371 }, { "epoch": 0.11372668908590645, "grad_norm": 5.823403358459473, "learning_rate": 1.4138326327856325e-06, "loss": 2.1906, "step": 372 }, { "epoch": 0.11403240599205136, "grad_norm": 3.1884052753448486, "learning_rate": 1.4176538020634315e-06, "loss": 2.1904, "step": 373 }, { "epoch": 0.11433812289819627, "grad_norm": 6.810904502868652, "learning_rate": 1.4214749713412306e-06, "loss": 2.2293, "step": 374 }, { "epoch": 0.11464383980434117, "grad_norm": 12.195791244506836, "learning_rate": 1.4252961406190296e-06, "loss": 2.2659, "step": 375 }, { "epoch": 0.1149495567104861, "grad_norm": 5.211302757263184, "learning_rate": 1.4291173098968285e-06, "loss": 2.0571, "step": 376 }, { "epoch": 0.115255273616631, "grad_norm": 4.886347770690918, "learning_rate": 1.4329384791746275e-06, "loss": 2.0003, "step": 377 }, { "epoch": 0.11556099052277591, "grad_norm": 2.849539279937744, "learning_rate": 1.4367596484524265e-06, "loss": 1.9535, "step": 378 }, { "epoch": 0.11586670742892081, "grad_norm": 2.738243579864502, "learning_rate": 1.4405808177302256e-06, "loss": 1.9327, "step": 379 }, { "epoch": 0.11617242433506574, "grad_norm": 2.6285016536712646, "learning_rate": 1.4444019870080246e-06, "loss": 1.8734, "step": 380 }, { "epoch": 0.11647814124121064, "grad_norm": 2.7497611045837402, "learning_rate": 1.4482231562858235e-06, "loss": 1.8533, "step": 381 }, { "epoch": 0.11678385814735555, "grad_norm": 3.66969633102417, "learning_rate": 1.4520443255636225e-06, "loss": 1.8661, "step": 382 }, { "epoch": 0.11708957505350046, "grad_norm": 2.8011255264282227, "learning_rate": 1.4558654948414214e-06, "loss": 1.8619, "step": 383 }, { "epoch": 0.11739529195964536, "grad_norm": 4.164844036102295, "learning_rate": 1.4596866641192206e-06, "loss": 1.8178, "step": 384 }, { "epoch": 0.11770100886579028, "grad_norm": 2.5509207248687744, "learning_rate": 1.4635078333970196e-06, "loss": 1.8197, "step": 385 }, { "epoch": 0.11800672577193519, "grad_norm": 3.240171432495117, "learning_rate": 1.4673290026748185e-06, "loss": 1.8331, "step": 386 }, { "epoch": 0.1183124426780801, "grad_norm": 2.3113067150115967, "learning_rate": 1.4711501719526175e-06, "loss": 1.7633, "step": 387 }, { "epoch": 0.118618159584225, "grad_norm": 3.651369571685791, "learning_rate": 1.4749713412304164e-06, "loss": 1.7714, "step": 388 }, { "epoch": 0.11892387649036992, "grad_norm": 3.2712314128875732, "learning_rate": 1.4787925105082156e-06, "loss": 1.7975, "step": 389 }, { "epoch": 0.11922959339651483, "grad_norm": 3.1503427028656006, "learning_rate": 1.4826136797860146e-06, "loss": 1.8103, "step": 390 }, { "epoch": 0.11953531030265974, "grad_norm": 2.8008873462677, "learning_rate": 1.4864348490638135e-06, "loss": 1.7841, "step": 391 }, { "epoch": 0.11984102720880464, "grad_norm": 5.765246391296387, "learning_rate": 1.4902560183416125e-06, "loss": 1.7447, "step": 392 }, { "epoch": 0.12014674411494956, "grad_norm": 3.355313301086426, "learning_rate": 1.4940771876194114e-06, "loss": 1.8219, "step": 393 }, { "epoch": 0.12045246102109447, "grad_norm": 13.86959457397461, "learning_rate": 1.4978983568972106e-06, "loss": 1.8249, "step": 394 }, { "epoch": 0.12075817792723938, "grad_norm": 3.5347213745117188, "learning_rate": 1.5017195261750096e-06, "loss": 1.8005, "step": 395 }, { "epoch": 0.12106389483338428, "grad_norm": 3.2886812686920166, "learning_rate": 1.5055406954528085e-06, "loss": 1.7947, "step": 396 }, { "epoch": 0.12136961173952919, "grad_norm": 4.42784309387207, "learning_rate": 1.5093618647306075e-06, "loss": 1.8414, "step": 397 }, { "epoch": 0.12167532864567411, "grad_norm": 3.717240810394287, "learning_rate": 1.5131830340084066e-06, "loss": 1.8203, "step": 398 }, { "epoch": 0.12198104555181902, "grad_norm": 4.020057678222656, "learning_rate": 1.5170042032862056e-06, "loss": 1.8637, "step": 399 }, { "epoch": 0.12228676245796392, "grad_norm": 5.128223896026611, "learning_rate": 1.5208253725640046e-06, "loss": 1.9258, "step": 400 }, { "epoch": 0.12259247936410883, "grad_norm": 6.060498237609863, "learning_rate": 1.5246465418418035e-06, "loss": 1.6823, "step": 401 }, { "epoch": 0.12289819627025375, "grad_norm": 8.363202095031738, "learning_rate": 1.5284677111196025e-06, "loss": 1.5348, "step": 402 }, { "epoch": 0.12320391317639866, "grad_norm": 3.834468126296997, "learning_rate": 1.5322888803974018e-06, "loss": 1.5099, "step": 403 }, { "epoch": 0.12350963008254356, "grad_norm": 3.746401786804199, "learning_rate": 1.5361100496752008e-06, "loss": 1.4764, "step": 404 }, { "epoch": 0.12381534698868847, "grad_norm": 2.6676409244537354, "learning_rate": 1.5399312189529998e-06, "loss": 1.4221, "step": 405 }, { "epoch": 0.12412106389483338, "grad_norm": 2.571153402328491, "learning_rate": 1.5437523882307987e-06, "loss": 1.4456, "step": 406 }, { "epoch": 0.1244267808009783, "grad_norm": 4.898181438446045, "learning_rate": 1.5475735575085977e-06, "loss": 1.4019, "step": 407 }, { "epoch": 0.1247324977071232, "grad_norm": 2.530176877975464, "learning_rate": 1.5513947267863968e-06, "loss": 1.4338, "step": 408 }, { "epoch": 0.1250382146132681, "grad_norm": 2.52177095413208, "learning_rate": 1.5552158960641958e-06, "loss": 1.3811, "step": 409 }, { "epoch": 0.12534393151941303, "grad_norm": 2.576986074447632, "learning_rate": 1.5590370653419948e-06, "loss": 1.3315, "step": 410 }, { "epoch": 0.12564964842555792, "grad_norm": 3.0845768451690674, "learning_rate": 1.5628582346197937e-06, "loss": 1.3935, "step": 411 }, { "epoch": 0.12595536533170285, "grad_norm": 2.6538684368133545, "learning_rate": 1.5666794038975927e-06, "loss": 1.3452, "step": 412 }, { "epoch": 0.12626108223784777, "grad_norm": 3.6006276607513428, "learning_rate": 1.5705005731753918e-06, "loss": 1.3864, "step": 413 }, { "epoch": 0.12656679914399266, "grad_norm": 3.4683022499084473, "learning_rate": 1.5743217424531908e-06, "loss": 1.3863, "step": 414 }, { "epoch": 0.12687251605013758, "grad_norm": 2.454369068145752, "learning_rate": 1.5781429117309897e-06, "loss": 1.4164, "step": 415 }, { "epoch": 0.12717823295628247, "grad_norm": 5.0718994140625, "learning_rate": 1.5819640810087887e-06, "loss": 1.3805, "step": 416 }, { "epoch": 0.1274839498624274, "grad_norm": 4.366430282592773, "learning_rate": 1.5857852502865877e-06, "loss": 1.4056, "step": 417 }, { "epoch": 0.1277896667685723, "grad_norm": 4.872661590576172, "learning_rate": 1.5896064195643868e-06, "loss": 1.4218, "step": 418 }, { "epoch": 0.1280953836747172, "grad_norm": 5.092097759246826, "learning_rate": 1.5934275888421858e-06, "loss": 1.4598, "step": 419 }, { "epoch": 0.12840110058086213, "grad_norm": 3.9631106853485107, "learning_rate": 1.5972487581199847e-06, "loss": 1.4403, "step": 420 }, { "epoch": 0.12870681748700702, "grad_norm": 4.17078971862793, "learning_rate": 1.6010699273977837e-06, "loss": 1.4372, "step": 421 }, { "epoch": 0.12901253439315194, "grad_norm": 3.2774176597595215, "learning_rate": 1.6048910966755827e-06, "loss": 1.47, "step": 422 }, { "epoch": 0.12931825129929686, "grad_norm": 4.034589767456055, "learning_rate": 1.6087122659533818e-06, "loss": 1.4868, "step": 423 }, { "epoch": 0.12962396820544175, "grad_norm": 9.104544639587402, "learning_rate": 1.6125334352311808e-06, "loss": 1.5501, "step": 424 }, { "epoch": 0.12992968511158667, "grad_norm": 4.515106678009033, "learning_rate": 1.6163546045089797e-06, "loss": 1.6433, "step": 425 }, { "epoch": 0.1302354020177316, "grad_norm": 4.018584728240967, "learning_rate": 1.6201757737867787e-06, "loss": 1.2616, "step": 426 }, { "epoch": 0.1305411189238765, "grad_norm": 3.031351089477539, "learning_rate": 1.6239969430645776e-06, "loss": 1.1645, "step": 427 }, { "epoch": 0.1308468358300214, "grad_norm": 2.3623054027557373, "learning_rate": 1.6278181123423768e-06, "loss": 1.0995, "step": 428 }, { "epoch": 0.1311525527361663, "grad_norm": 2.514901638031006, "learning_rate": 1.6316392816201758e-06, "loss": 1.0672, "step": 429 }, { "epoch": 0.13145826964231122, "grad_norm": 2.921835422515869, "learning_rate": 1.6354604508979747e-06, "loss": 1.0925, "step": 430 }, { "epoch": 0.13176398654845614, "grad_norm": 4.234851360321045, "learning_rate": 1.6392816201757737e-06, "loss": 1.0489, "step": 431 }, { "epoch": 0.13206970345460103, "grad_norm": 2.4434449672698975, "learning_rate": 1.6431027894535729e-06, "loss": 1.0556, "step": 432 }, { "epoch": 0.13237542036074595, "grad_norm": 3.417098045349121, "learning_rate": 1.6469239587313718e-06, "loss": 1.0521, "step": 433 }, { "epoch": 0.13268113726689085, "grad_norm": 2.177273988723755, "learning_rate": 1.6507451280091708e-06, "loss": 1.0196, "step": 434 }, { "epoch": 0.13298685417303577, "grad_norm": 3.3506534099578857, "learning_rate": 1.65456629728697e-06, "loss": 1.0267, "step": 435 }, { "epoch": 0.1332925710791807, "grad_norm": 2.9204158782958984, "learning_rate": 1.6583874665647689e-06, "loss": 1.0487, "step": 436 }, { "epoch": 0.13359828798532558, "grad_norm": 2.0325400829315186, "learning_rate": 1.662208635842568e-06, "loss": 0.9976, "step": 437 }, { "epoch": 0.1339040048914705, "grad_norm": 2.835183620452881, "learning_rate": 1.666029805120367e-06, "loss": 1.021, "step": 438 }, { "epoch": 0.1342097217976154, "grad_norm": 3.622365951538086, "learning_rate": 1.669850974398166e-06, "loss": 1.0855, "step": 439 }, { "epoch": 0.13451543870376031, "grad_norm": 5.125313758850098, "learning_rate": 1.673672143675965e-06, "loss": 1.0037, "step": 440 }, { "epoch": 0.13482115560990524, "grad_norm": 4.361060619354248, "learning_rate": 1.6774933129537639e-06, "loss": 1.1451, "step": 441 }, { "epoch": 0.13512687251605013, "grad_norm": 3.4043185710906982, "learning_rate": 1.681314482231563e-06, "loss": 1.1106, "step": 442 }, { "epoch": 0.13543258942219505, "grad_norm": 6.439428806304932, "learning_rate": 1.685135651509362e-06, "loss": 1.1322, "step": 443 }, { "epoch": 0.13573830632833997, "grad_norm": 12.970124244689941, "learning_rate": 1.688956820787161e-06, "loss": 1.1814, "step": 444 }, { "epoch": 0.13604402323448486, "grad_norm": 3.0335333347320557, "learning_rate": 1.69277799006496e-06, "loss": 1.1389, "step": 445 }, { "epoch": 0.13634974014062978, "grad_norm": 3.0099613666534424, "learning_rate": 1.6965991593427589e-06, "loss": 1.2176, "step": 446 }, { "epoch": 0.13665545704677468, "grad_norm": 4.334911346435547, "learning_rate": 1.700420328620558e-06, "loss": 1.2042, "step": 447 }, { "epoch": 0.1369611739529196, "grad_norm": 4.309381484985352, "learning_rate": 1.704241497898357e-06, "loss": 1.2655, "step": 448 }, { "epoch": 0.13726689085906452, "grad_norm": 4.769018173217773, "learning_rate": 1.708062667176156e-06, "loss": 1.2392, "step": 449 }, { "epoch": 0.1375726077652094, "grad_norm": 5.442220687866211, "learning_rate": 1.711883836453955e-06, "loss": 1.3634, "step": 450 }, { "epoch": 0.13787832467135433, "grad_norm": 3.48395037651062, "learning_rate": 1.7157050057317539e-06, "loss": 0.9715, "step": 451 }, { "epoch": 0.13818404157749922, "grad_norm": 4.614691734313965, "learning_rate": 1.719526175009553e-06, "loss": 0.8387, "step": 452 }, { "epoch": 0.13848975848364414, "grad_norm": 2.3957905769348145, "learning_rate": 1.723347344287352e-06, "loss": 0.8367, "step": 453 }, { "epoch": 0.13879547538978906, "grad_norm": 1.9913091659545898, "learning_rate": 1.727168513565151e-06, "loss": 0.8262, "step": 454 }, { "epoch": 0.13910119229593396, "grad_norm": 1.7294175624847412, "learning_rate": 1.73098968284295e-06, "loss": 0.7905, "step": 455 }, { "epoch": 0.13940690920207888, "grad_norm": 2.1419146060943604, "learning_rate": 1.7348108521207489e-06, "loss": 0.7588, "step": 456 }, { "epoch": 0.1397126261082238, "grad_norm": 12.753339767456055, "learning_rate": 1.738632021398548e-06, "loss": 0.7792, "step": 457 }, { "epoch": 0.1400183430143687, "grad_norm": 2.461897373199463, "learning_rate": 1.742453190676347e-06, "loss": 0.7945, "step": 458 }, { "epoch": 0.1403240599205136, "grad_norm": 2.7683169841766357, "learning_rate": 1.746274359954146e-06, "loss": 0.7969, "step": 459 }, { "epoch": 0.1406297768266585, "grad_norm": 2.86381459236145, "learning_rate": 1.750095529231945e-06, "loss": 0.7642, "step": 460 }, { "epoch": 0.14093549373280342, "grad_norm": 2.6318113803863525, "learning_rate": 1.7539166985097439e-06, "loss": 0.8136, "step": 461 }, { "epoch": 0.14124121063894834, "grad_norm": 2.953939199447632, "learning_rate": 1.757737867787543e-06, "loss": 0.7834, "step": 462 }, { "epoch": 0.14154692754509324, "grad_norm": 2.338334321975708, "learning_rate": 1.761559037065342e-06, "loss": 0.7944, "step": 463 }, { "epoch": 0.14185264445123816, "grad_norm": 2.588392496109009, "learning_rate": 1.765380206343141e-06, "loss": 0.8573, "step": 464 }, { "epoch": 0.14215836135738305, "grad_norm": 3.032151937484741, "learning_rate": 1.7692013756209399e-06, "loss": 0.8768, "step": 465 }, { "epoch": 0.14246407826352797, "grad_norm": 9.500019073486328, "learning_rate": 1.773022544898739e-06, "loss": 0.8856, "step": 466 }, { "epoch": 0.1427697951696729, "grad_norm": 3.2756781578063965, "learning_rate": 1.7768437141765382e-06, "loss": 0.9228, "step": 467 }, { "epoch": 0.14307551207581778, "grad_norm": 5.580215930938721, "learning_rate": 1.7806648834543372e-06, "loss": 0.9459, "step": 468 }, { "epoch": 0.1433812289819627, "grad_norm": 2.782317876815796, "learning_rate": 1.7844860527321361e-06, "loss": 0.9201, "step": 469 }, { "epoch": 0.14368694588810763, "grad_norm": 13.165220260620117, "learning_rate": 1.788307222009935e-06, "loss": 0.9625, "step": 470 }, { "epoch": 0.14399266279425252, "grad_norm": 5.8131256103515625, "learning_rate": 1.7921283912877343e-06, "loss": 0.9699, "step": 471 }, { "epoch": 0.14429837970039744, "grad_norm": 3.5020856857299805, "learning_rate": 1.7959495605655332e-06, "loss": 0.9813, "step": 472 }, { "epoch": 0.14460409660654233, "grad_norm": 3.2566182613372803, "learning_rate": 1.7997707298433322e-06, "loss": 1.0794, "step": 473 }, { "epoch": 0.14490981351268725, "grad_norm": 4.53338623046875, "learning_rate": 1.8035918991211311e-06, "loss": 1.0848, "step": 474 }, { "epoch": 0.14521553041883217, "grad_norm": 4.8176422119140625, "learning_rate": 1.80741306839893e-06, "loss": 1.1413, "step": 475 }, { "epoch": 0.14552124732497707, "grad_norm": 2.3498904705047607, "learning_rate": 1.8112342376767293e-06, "loss": 0.7812, "step": 476 }, { "epoch": 0.14582696423112199, "grad_norm": 2.3315963745117188, "learning_rate": 1.8150554069545282e-06, "loss": 0.6597, "step": 477 }, { "epoch": 0.14613268113726688, "grad_norm": 1.8880013227462769, "learning_rate": 1.8188765762323272e-06, "loss": 0.6077, "step": 478 }, { "epoch": 0.1464383980434118, "grad_norm": 2.152780055999756, "learning_rate": 1.8226977455101261e-06, "loss": 0.6482, "step": 479 }, { "epoch": 0.14674411494955672, "grad_norm": 2.2284717559814453, "learning_rate": 1.826518914787925e-06, "loss": 0.5988, "step": 480 }, { "epoch": 0.1470498318557016, "grad_norm": 2.35667085647583, "learning_rate": 1.8303400840657243e-06, "loss": 0.608, "step": 481 }, { "epoch": 0.14735554876184653, "grad_norm": 1.737558364868164, "learning_rate": 1.8341612533435232e-06, "loss": 0.6213, "step": 482 }, { "epoch": 0.14766126566799145, "grad_norm": 3.0432419776916504, "learning_rate": 1.8379824226213222e-06, "loss": 0.5796, "step": 483 }, { "epoch": 0.14796698257413635, "grad_norm": 2.378157138824463, "learning_rate": 1.8418035918991211e-06, "loss": 0.6452, "step": 484 }, { "epoch": 0.14827269948028127, "grad_norm": 2.2500953674316406, "learning_rate": 1.84562476117692e-06, "loss": 0.6331, "step": 485 }, { "epoch": 0.14857841638642616, "grad_norm": 1.9005255699157715, "learning_rate": 1.8494459304547192e-06, "loss": 0.6667, "step": 486 }, { "epoch": 0.14888413329257108, "grad_norm": 2.0529346466064453, "learning_rate": 1.8532670997325182e-06, "loss": 0.6164, "step": 487 }, { "epoch": 0.149189850198716, "grad_norm": 2.85892915725708, "learning_rate": 1.8570882690103172e-06, "loss": 0.6351, "step": 488 }, { "epoch": 0.1494955671048609, "grad_norm": 3.760477066040039, "learning_rate": 1.8609094382881161e-06, "loss": 0.7176, "step": 489 }, { "epoch": 0.14980128401100581, "grad_norm": 2.8358240127563477, "learning_rate": 1.864730607565915e-06, "loss": 0.696, "step": 490 }, { "epoch": 0.1501070009171507, "grad_norm": 2.1552517414093018, "learning_rate": 1.8685517768437142e-06, "loss": 0.734, "step": 491 }, { "epoch": 0.15041271782329563, "grad_norm": 3.0512280464172363, "learning_rate": 1.8723729461215132e-06, "loss": 0.7408, "step": 492 }, { "epoch": 0.15071843472944055, "grad_norm": 2.4025492668151855, "learning_rate": 1.8761941153993124e-06, "loss": 0.8121, "step": 493 }, { "epoch": 0.15102415163558544, "grad_norm": 3.2039177417755127, "learning_rate": 1.8800152846771113e-06, "loss": 0.8352, "step": 494 }, { "epoch": 0.15132986854173036, "grad_norm": 3.33308482170105, "learning_rate": 1.8838364539549103e-06, "loss": 0.7791, "step": 495 }, { "epoch": 0.15163558544787525, "grad_norm": 5.387060642242432, "learning_rate": 1.8876576232327092e-06, "loss": 0.8732, "step": 496 }, { "epoch": 0.15194130235402017, "grad_norm": 12.293320655822754, "learning_rate": 1.8914787925105082e-06, "loss": 0.8497, "step": 497 }, { "epoch": 0.1522470192601651, "grad_norm": 6.623059272766113, "learning_rate": 1.8952999617883071e-06, "loss": 0.8985, "step": 498 }, { "epoch": 0.15255273616631, "grad_norm": 3.829218864440918, "learning_rate": 1.8991211310661063e-06, "loss": 0.9528, "step": 499 }, { "epoch": 0.1528584530724549, "grad_norm": 5.141953945159912, "learning_rate": 1.9029423003439053e-06, "loss": 1.1586, "step": 500 }, { "epoch": 0.15316416997859983, "grad_norm": 2.9247562885284424, "learning_rate": 1.9067634696217042e-06, "loss": 0.6662, "step": 501 }, { "epoch": 0.15346988688474472, "grad_norm": 1.8352329730987549, "learning_rate": 1.910584638899503e-06, "loss": 0.541, "step": 502 }, { "epoch": 0.15377560379088964, "grad_norm": 2.0667295455932617, "learning_rate": 1.9144058081773024e-06, "loss": 0.518, "step": 503 }, { "epoch": 0.15408132069703454, "grad_norm": 2.013383388519287, "learning_rate": 1.9182269774551015e-06, "loss": 0.4779, "step": 504 }, { "epoch": 0.15438703760317946, "grad_norm": 1.5878100395202637, "learning_rate": 1.9220481467329003e-06, "loss": 0.486, "step": 505 }, { "epoch": 0.15469275450932438, "grad_norm": 2.509739875793457, "learning_rate": 1.9258693160106994e-06, "loss": 0.5001, "step": 506 }, { "epoch": 0.15499847141546927, "grad_norm": 1.5657154321670532, "learning_rate": 1.929690485288498e-06, "loss": 0.4727, "step": 507 }, { "epoch": 0.1553041883216142, "grad_norm": 2.223322868347168, "learning_rate": 1.9335116545662973e-06, "loss": 0.4528, "step": 508 }, { "epoch": 0.15560990522775908, "grad_norm": 1.8643583059310913, "learning_rate": 1.937332823844096e-06, "loss": 0.5225, "step": 509 }, { "epoch": 0.155915622133904, "grad_norm": 1.4416460990905762, "learning_rate": 1.9411539931218953e-06, "loss": 0.5406, "step": 510 }, { "epoch": 0.15622133904004892, "grad_norm": 1.5860602855682373, "learning_rate": 1.944975162399694e-06, "loss": 0.5087, "step": 511 }, { "epoch": 0.15652705594619382, "grad_norm": 1.9665247201919556, "learning_rate": 1.948796331677493e-06, "loss": 0.4941, "step": 512 }, { "epoch": 0.15683277285233874, "grad_norm": 2.475602865219116, "learning_rate": 1.9526175009552923e-06, "loss": 0.5711, "step": 513 }, { "epoch": 0.15713848975848366, "grad_norm": 2.0139565467834473, "learning_rate": 1.9564386702330915e-06, "loss": 0.5102, "step": 514 }, { "epoch": 0.15744420666462855, "grad_norm": 2.325047731399536, "learning_rate": 1.9602598395108907e-06, "loss": 0.6104, "step": 515 }, { "epoch": 0.15774992357077347, "grad_norm": 7.147446155548096, "learning_rate": 1.9640810087886894e-06, "loss": 0.5935, "step": 516 }, { "epoch": 0.15805564047691836, "grad_norm": 2.743516206741333, "learning_rate": 1.9679021780664886e-06, "loss": 0.6561, "step": 517 }, { "epoch": 0.15836135738306328, "grad_norm": 2.4997544288635254, "learning_rate": 1.9717233473442873e-06, "loss": 0.6801, "step": 518 }, { "epoch": 0.1586670742892082, "grad_norm": 2.784456968307495, "learning_rate": 1.9755445166220865e-06, "loss": 0.7012, "step": 519 }, { "epoch": 0.1589727911953531, "grad_norm": 2.4825854301452637, "learning_rate": 1.9793656858998852e-06, "loss": 0.7193, "step": 520 }, { "epoch": 0.15927850810149802, "grad_norm": 3.431854724884033, "learning_rate": 1.9831868551776844e-06, "loss": 0.7379, "step": 521 }, { "epoch": 0.1595842250076429, "grad_norm": 2.3170371055603027, "learning_rate": 1.9870080244554836e-06, "loss": 0.7685, "step": 522 }, { "epoch": 0.15988994191378783, "grad_norm": 3.6807382106781006, "learning_rate": 1.9908291937332828e-06, "loss": 0.8337, "step": 523 }, { "epoch": 0.16019565881993275, "grad_norm": 5.5456929206848145, "learning_rate": 1.9946503630110815e-06, "loss": 0.8645, "step": 524 }, { "epoch": 0.16050137572607764, "grad_norm": 6.413071632385254, "learning_rate": 1.9984715322888807e-06, "loss": 1.0078, "step": 525 }, { "epoch": 0.16080709263222256, "grad_norm": 2.0091118812561035, "learning_rate": 2.0022927015666794e-06, "loss": 0.5575, "step": 526 }, { "epoch": 0.16111280953836749, "grad_norm": 1.6980489492416382, "learning_rate": 2.0061138708444786e-06, "loss": 0.4564, "step": 527 }, { "epoch": 0.16141852644451238, "grad_norm": 2.7025721073150635, "learning_rate": 2.0099350401222773e-06, "loss": 0.407, "step": 528 }, { "epoch": 0.1617242433506573, "grad_norm": 1.7798800468444824, "learning_rate": 2.0137562094000765e-06, "loss": 0.4167, "step": 529 }, { "epoch": 0.1620299602568022, "grad_norm": 1.4504377841949463, "learning_rate": 2.0175773786778752e-06, "loss": 0.4156, "step": 530 }, { "epoch": 0.1623356771629471, "grad_norm": 1.8627259731292725, "learning_rate": 2.0213985479556744e-06, "loss": 0.4037, "step": 531 }, { "epoch": 0.16264139406909203, "grad_norm": 1.6564728021621704, "learning_rate": 2.0252197172334736e-06, "loss": 0.3831, "step": 532 }, { "epoch": 0.16294711097523693, "grad_norm": 1.4642672538757324, "learning_rate": 2.0290408865112727e-06, "loss": 0.4393, "step": 533 }, { "epoch": 0.16325282788138185, "grad_norm": 1.8814715147018433, "learning_rate": 2.0328620557890715e-06, "loss": 0.415, "step": 534 }, { "epoch": 0.16355854478752674, "grad_norm": 1.587811827659607, "learning_rate": 2.0366832250668707e-06, "loss": 0.411, "step": 535 }, { "epoch": 0.16386426169367166, "grad_norm": 1.9181551933288574, "learning_rate": 2.0405043943446694e-06, "loss": 0.4554, "step": 536 }, { "epoch": 0.16416997859981658, "grad_norm": 1.884787917137146, "learning_rate": 2.0443255636224686e-06, "loss": 0.4431, "step": 537 }, { "epoch": 0.16447569550596147, "grad_norm": 1.845542311668396, "learning_rate": 2.0481467329002673e-06, "loss": 0.443, "step": 538 }, { "epoch": 0.1647814124121064, "grad_norm": 1.990646481513977, "learning_rate": 2.0519679021780665e-06, "loss": 0.5149, "step": 539 }, { "epoch": 0.16508712931825129, "grad_norm": 2.2489404678344727, "learning_rate": 2.0557890714558652e-06, "loss": 0.5302, "step": 540 }, { "epoch": 0.1653928462243962, "grad_norm": 2.2587578296661377, "learning_rate": 2.0596102407336644e-06, "loss": 0.5574, "step": 541 }, { "epoch": 0.16569856313054113, "grad_norm": 2.3649537563323975, "learning_rate": 2.0634314100114636e-06, "loss": 0.5689, "step": 542 }, { "epoch": 0.16600428003668602, "grad_norm": 2.313070058822632, "learning_rate": 2.0672525792892627e-06, "loss": 0.5753, "step": 543 }, { "epoch": 0.16630999694283094, "grad_norm": 3.4409608840942383, "learning_rate": 2.0710737485670615e-06, "loss": 0.61, "step": 544 }, { "epoch": 0.16661571384897586, "grad_norm": 3.74385404586792, "learning_rate": 2.0748949178448606e-06, "loss": 0.6313, "step": 545 }, { "epoch": 0.16692143075512075, "grad_norm": 3.0134334564208984, "learning_rate": 2.0787160871226594e-06, "loss": 0.6367, "step": 546 }, { "epoch": 0.16722714766126567, "grad_norm": 3.9642484188079834, "learning_rate": 2.0825372564004586e-06, "loss": 0.6752, "step": 547 }, { "epoch": 0.16753286456741057, "grad_norm": 3.2685201168060303, "learning_rate": 2.0863584256782577e-06, "loss": 0.7621, "step": 548 }, { "epoch": 0.1678385814735555, "grad_norm": 3.7088401317596436, "learning_rate": 2.0901795949560565e-06, "loss": 0.824, "step": 549 }, { "epoch": 0.1681442983797004, "grad_norm": 5.210728645324707, "learning_rate": 2.0940007642338556e-06, "loss": 1.0039, "step": 550 }, { "epoch": 0.1684500152858453, "grad_norm": 2.1009469032287598, "learning_rate": 2.097821933511655e-06, "loss": 0.5267, "step": 551 }, { "epoch": 0.16875573219199022, "grad_norm": 1.359464168548584, "learning_rate": 2.101643102789454e-06, "loss": 0.4076, "step": 552 }, { "epoch": 0.1690614490981351, "grad_norm": 2.794334650039673, "learning_rate": 2.1054642720672527e-06, "loss": 0.3406, "step": 553 }, { "epoch": 0.16936716600428003, "grad_norm": 1.4243654012680054, "learning_rate": 2.109285441345052e-06, "loss": 0.3674, "step": 554 }, { "epoch": 0.16967288291042495, "grad_norm": 1.7794015407562256, "learning_rate": 2.1131066106228506e-06, "loss": 0.3626, "step": 555 }, { "epoch": 0.16997859981656985, "grad_norm": 1.3493459224700928, "learning_rate": 2.11692777990065e-06, "loss": 0.3507, "step": 556 }, { "epoch": 0.17028431672271477, "grad_norm": 3.3774242401123047, "learning_rate": 2.1207489491784485e-06, "loss": 0.3771, "step": 557 }, { "epoch": 0.1705900336288597, "grad_norm": 1.6613430976867676, "learning_rate": 2.1245701184562477e-06, "loss": 0.3543, "step": 558 }, { "epoch": 0.17089575053500458, "grad_norm": 1.4288980960845947, "learning_rate": 2.1283912877340465e-06, "loss": 0.3558, "step": 559 }, { "epoch": 0.1712014674411495, "grad_norm": 1.64877188205719, "learning_rate": 2.1322124570118456e-06, "loss": 0.3698, "step": 560 }, { "epoch": 0.1715071843472944, "grad_norm": 1.612062692642212, "learning_rate": 2.136033626289645e-06, "loss": 0.4758, "step": 561 }, { "epoch": 0.17181290125343932, "grad_norm": 5.2402119636535645, "learning_rate": 2.139854795567444e-06, "loss": 0.4161, "step": 562 }, { "epoch": 0.17211861815958424, "grad_norm": 1.6275651454925537, "learning_rate": 2.1436759648452427e-06, "loss": 0.4302, "step": 563 }, { "epoch": 0.17242433506572913, "grad_norm": 1.86667001247406, "learning_rate": 2.147497134123042e-06, "loss": 0.4154, "step": 564 }, { "epoch": 0.17273005197187405, "grad_norm": 1.8689332008361816, "learning_rate": 2.1513183034008406e-06, "loss": 0.4791, "step": 565 }, { "epoch": 0.17303576887801894, "grad_norm": 2.0869057178497314, "learning_rate": 2.1551394726786398e-06, "loss": 0.4901, "step": 566 }, { "epoch": 0.17334148578416386, "grad_norm": 2.356966018676758, "learning_rate": 2.1589606419564385e-06, "loss": 0.5142, "step": 567 }, { "epoch": 0.17364720269030878, "grad_norm": 2.961998701095581, "learning_rate": 2.1627818112342377e-06, "loss": 0.5735, "step": 568 }, { "epoch": 0.17395291959645368, "grad_norm": 2.626810073852539, "learning_rate": 2.1666029805120364e-06, "loss": 0.6058, "step": 569 }, { "epoch": 0.1742586365025986, "grad_norm": 2.779517412185669, "learning_rate": 2.1704241497898356e-06, "loss": 0.6305, "step": 570 }, { "epoch": 0.17456435340874352, "grad_norm": 3.350560188293457, "learning_rate": 2.1742453190676348e-06, "loss": 0.6686, "step": 571 }, { "epoch": 0.1748700703148884, "grad_norm": 6.4714131355285645, "learning_rate": 2.178066488345434e-06, "loss": 0.6323, "step": 572 }, { "epoch": 0.17517578722103333, "grad_norm": 4.758286476135254, "learning_rate": 2.1818876576232327e-06, "loss": 0.7519, "step": 573 }, { "epoch": 0.17548150412717822, "grad_norm": 7.642584323883057, "learning_rate": 2.185708826901032e-06, "loss": 0.77, "step": 574 }, { "epoch": 0.17578722103332314, "grad_norm": 3.984527826309204, "learning_rate": 2.1895299961788306e-06, "loss": 0.941, "step": 575 }, { "epoch": 0.17609293793946806, "grad_norm": 2.2247626781463623, "learning_rate": 2.1933511654566298e-06, "loss": 0.487, "step": 576 }, { "epoch": 0.17639865484561296, "grad_norm": 1.7136754989624023, "learning_rate": 2.1971723347344285e-06, "loss": 0.3795, "step": 577 }, { "epoch": 0.17670437175175788, "grad_norm": 1.4240525960922241, "learning_rate": 2.2009935040122277e-06, "loss": 0.3549, "step": 578 }, { "epoch": 0.17701008865790277, "grad_norm": 1.2393327951431274, "learning_rate": 2.204814673290027e-06, "loss": 0.3147, "step": 579 }, { "epoch": 0.1773158055640477, "grad_norm": 1.2773252725601196, "learning_rate": 2.2086358425678256e-06, "loss": 0.3305, "step": 580 }, { "epoch": 0.1776215224701926, "grad_norm": 1.4127362966537476, "learning_rate": 2.212457011845625e-06, "loss": 0.3059, "step": 581 }, { "epoch": 0.1779272393763375, "grad_norm": 1.2819136381149292, "learning_rate": 2.216278181123424e-06, "loss": 0.2971, "step": 582 }, { "epoch": 0.17823295628248242, "grad_norm": 1.5068365335464478, "learning_rate": 2.220099350401223e-06, "loss": 0.3257, "step": 583 }, { "epoch": 0.17853867318862734, "grad_norm": 1.537855625152588, "learning_rate": 2.223920519679022e-06, "loss": 0.35, "step": 584 }, { "epoch": 0.17884439009477224, "grad_norm": 1.1823945045471191, "learning_rate": 2.227741688956821e-06, "loss": 0.315, "step": 585 }, { "epoch": 0.17915010700091716, "grad_norm": 1.1116009950637817, "learning_rate": 2.2315628582346198e-06, "loss": 0.3727, "step": 586 }, { "epoch": 0.17945582390706205, "grad_norm": 1.6083769798278809, "learning_rate": 2.235384027512419e-06, "loss": 0.3956, "step": 587 }, { "epoch": 0.17976154081320697, "grad_norm": 1.6152915954589844, "learning_rate": 2.2392051967902177e-06, "loss": 0.3687, "step": 588 }, { "epoch": 0.1800672577193519, "grad_norm": 1.6674989461898804, "learning_rate": 2.243026366068017e-06, "loss": 0.4004, "step": 589 }, { "epoch": 0.18037297462549678, "grad_norm": 2.49977970123291, "learning_rate": 2.246847535345816e-06, "loss": 0.4449, "step": 590 }, { "epoch": 0.1806786915316417, "grad_norm": 1.623542308807373, "learning_rate": 2.250668704623615e-06, "loss": 0.4651, "step": 591 }, { "epoch": 0.1809844084377866, "grad_norm": 2.207078456878662, "learning_rate": 2.254489873901414e-06, "loss": 0.5553, "step": 592 }, { "epoch": 0.18129012534393152, "grad_norm": 2.674281358718872, "learning_rate": 2.258311043179213e-06, "loss": 0.529, "step": 593 }, { "epoch": 0.18159584225007644, "grad_norm": 2.441256046295166, "learning_rate": 2.262132212457012e-06, "loss": 0.5767, "step": 594 }, { "epoch": 0.18190155915622133, "grad_norm": 2.961146593093872, "learning_rate": 2.265953381734811e-06, "loss": 0.5787, "step": 595 }, { "epoch": 0.18220727606236625, "grad_norm": 3.3737189769744873, "learning_rate": 2.2697745510126097e-06, "loss": 0.6015, "step": 596 }, { "epoch": 0.18251299296851115, "grad_norm": 4.567743301391602, "learning_rate": 2.273595720290409e-06, "loss": 0.6207, "step": 597 }, { "epoch": 0.18281870987465607, "grad_norm": 2.571335554122925, "learning_rate": 2.2774168895682077e-06, "loss": 0.6985, "step": 598 }, { "epoch": 0.183124426780801, "grad_norm": 5.7726826667785645, "learning_rate": 2.281238058846007e-06, "loss": 0.7639, "step": 599 }, { "epoch": 0.18343014368694588, "grad_norm": 10.315164566040039, "learning_rate": 2.285059228123806e-06, "loss": 0.9754, "step": 600 }, { "epoch": 0.1837358605930908, "grad_norm": 2.9848074913024902, "learning_rate": 2.288880397401605e-06, "loss": 0.4545, "step": 601 }, { "epoch": 0.18404157749923572, "grad_norm": 1.6752336025238037, "learning_rate": 2.292701566679404e-06, "loss": 0.3241, "step": 602 }, { "epoch": 0.1843472944053806, "grad_norm": 1.8096387386322021, "learning_rate": 2.296522735957203e-06, "loss": 0.3076, "step": 603 }, { "epoch": 0.18465301131152553, "grad_norm": 2.0160815715789795, "learning_rate": 2.300343905235002e-06, "loss": 0.3031, "step": 604 }, { "epoch": 0.18495872821767043, "grad_norm": 1.157773733139038, "learning_rate": 2.304165074512801e-06, "loss": 0.2886, "step": 605 }, { "epoch": 0.18526444512381535, "grad_norm": 1.167496681213379, "learning_rate": 2.3079862437905997e-06, "loss": 0.2856, "step": 606 }, { "epoch": 0.18557016202996027, "grad_norm": 1.3030118942260742, "learning_rate": 2.311807413068399e-06, "loss": 0.3177, "step": 607 }, { "epoch": 0.18587587893610516, "grad_norm": 1.2056314945220947, "learning_rate": 2.3156285823461976e-06, "loss": 0.3095, "step": 608 }, { "epoch": 0.18618159584225008, "grad_norm": 1.601122260093689, "learning_rate": 2.319449751623997e-06, "loss": 0.3264, "step": 609 }, { "epoch": 0.18648731274839497, "grad_norm": 1.2699966430664062, "learning_rate": 2.323270920901796e-06, "loss": 0.3246, "step": 610 }, { "epoch": 0.1867930296545399, "grad_norm": 1.8906351327896118, "learning_rate": 2.327092090179595e-06, "loss": 0.3116, "step": 611 }, { "epoch": 0.18709874656068481, "grad_norm": 1.6584614515304565, "learning_rate": 2.3309132594573943e-06, "loss": 0.3127, "step": 612 }, { "epoch": 0.1874044634668297, "grad_norm": 2.021218776702881, "learning_rate": 2.334734428735193e-06, "loss": 0.4041, "step": 613 }, { "epoch": 0.18771018037297463, "grad_norm": 1.3904544115066528, "learning_rate": 2.3385555980129922e-06, "loss": 0.4238, "step": 614 }, { "epoch": 0.18801589727911955, "grad_norm": 1.9503837823867798, "learning_rate": 2.342376767290791e-06, "loss": 0.4283, "step": 615 }, { "epoch": 0.18832161418526444, "grad_norm": 7.3907599449157715, "learning_rate": 2.34619793656859e-06, "loss": 0.4476, "step": 616 }, { "epoch": 0.18862733109140936, "grad_norm": 1.5735561847686768, "learning_rate": 2.350019105846389e-06, "loss": 0.4592, "step": 617 }, { "epoch": 0.18893304799755425, "grad_norm": 3.255760431289673, "learning_rate": 2.353840275124188e-06, "loss": 0.5428, "step": 618 }, { "epoch": 0.18923876490369917, "grad_norm": 8.075132369995117, "learning_rate": 2.357661444401987e-06, "loss": 0.5292, "step": 619 }, { "epoch": 0.1895444818098441, "grad_norm": 2.5360372066497803, "learning_rate": 2.3614826136797864e-06, "loss": 0.547, "step": 620 }, { "epoch": 0.189850198715989, "grad_norm": 6.8622918128967285, "learning_rate": 2.365303782957585e-06, "loss": 0.5392, "step": 621 }, { "epoch": 0.1901559156221339, "grad_norm": 3.9884440898895264, "learning_rate": 2.3691249522353843e-06, "loss": 0.5655, "step": 622 }, { "epoch": 0.1904616325282788, "grad_norm": 3.568575143814087, "learning_rate": 2.372946121513183e-06, "loss": 0.6158, "step": 623 }, { "epoch": 0.19076734943442372, "grad_norm": 3.530162811279297, "learning_rate": 2.3767672907909822e-06, "loss": 0.7271, "step": 624 }, { "epoch": 0.19107306634056864, "grad_norm": 6.587818145751953, "learning_rate": 2.380588460068781e-06, "loss": 0.8417, "step": 625 }, { "epoch": 0.19137878324671354, "grad_norm": 1.8634132146835327, "learning_rate": 2.38440962934658e-06, "loss": 0.4252, "step": 626 }, { "epoch": 0.19168450015285846, "grad_norm": 2.477269172668457, "learning_rate": 2.388230798624379e-06, "loss": 0.3399, "step": 627 }, { "epoch": 0.19199021705900338, "grad_norm": 1.0835822820663452, "learning_rate": 2.392051967902178e-06, "loss": 0.2774, "step": 628 }, { "epoch": 0.19229593396514827, "grad_norm": 1.6555373668670654, "learning_rate": 2.3958731371799772e-06, "loss": 0.2753, "step": 629 }, { "epoch": 0.1926016508712932, "grad_norm": 1.3458442687988281, "learning_rate": 2.3996943064577764e-06, "loss": 0.28, "step": 630 }, { "epoch": 0.19290736777743808, "grad_norm": 1.3522217273712158, "learning_rate": 2.403515475735575e-06, "loss": 0.2995, "step": 631 }, { "epoch": 0.193213084683583, "grad_norm": 1.6339012384414673, "learning_rate": 2.4073366450133743e-06, "loss": 0.2598, "step": 632 }, { "epoch": 0.19351880158972792, "grad_norm": 1.7244874238967896, "learning_rate": 2.411157814291173e-06, "loss": 0.2902, "step": 633 }, { "epoch": 0.19382451849587282, "grad_norm": 2.2811813354492188, "learning_rate": 2.414978983568972e-06, "loss": 0.3323, "step": 634 }, { "epoch": 0.19413023540201774, "grad_norm": 1.639678716659546, "learning_rate": 2.418800152846771e-06, "loss": 0.301, "step": 635 }, { "epoch": 0.19443595230816263, "grad_norm": 1.488454818725586, "learning_rate": 2.42262132212457e-06, "loss": 0.3653, "step": 636 }, { "epoch": 0.19474166921430755, "grad_norm": 2.8326029777526855, "learning_rate": 2.426442491402369e-06, "loss": 0.3253, "step": 637 }, { "epoch": 0.19504738612045247, "grad_norm": 1.7051372528076172, "learning_rate": 2.430263660680168e-06, "loss": 0.3185, "step": 638 }, { "epoch": 0.19535310302659736, "grad_norm": 1.6777942180633545, "learning_rate": 2.434084829957967e-06, "loss": 0.4091, "step": 639 }, { "epoch": 0.19565881993274228, "grad_norm": 2.9844322204589844, "learning_rate": 2.4379059992357664e-06, "loss": 0.3603, "step": 640 }, { "epoch": 0.19596453683888718, "grad_norm": 1.8816248178482056, "learning_rate": 2.441727168513565e-06, "loss": 0.4222, "step": 641 }, { "epoch": 0.1962702537450321, "grad_norm": 1.4618343114852905, "learning_rate": 2.4455483377913643e-06, "loss": 0.4648, "step": 642 }, { "epoch": 0.19657597065117702, "grad_norm": 3.0512123107910156, "learning_rate": 2.4493695070691634e-06, "loss": 0.5242, "step": 643 }, { "epoch": 0.1968816875573219, "grad_norm": 2.6113593578338623, "learning_rate": 2.453190676346962e-06, "loss": 0.5195, "step": 644 }, { "epoch": 0.19718740446346683, "grad_norm": 2.579439401626587, "learning_rate": 2.4570118456247614e-06, "loss": 0.5015, "step": 645 }, { "epoch": 0.19749312136961175, "grad_norm": 7.6709208488464355, "learning_rate": 2.46083301490256e-06, "loss": 0.6044, "step": 646 }, { "epoch": 0.19779883827575664, "grad_norm": 2.471949338912964, "learning_rate": 2.4646541841803593e-06, "loss": 0.6066, "step": 647 }, { "epoch": 0.19810455518190156, "grad_norm": 5.208921909332275, "learning_rate": 2.468475353458158e-06, "loss": 0.6613, "step": 648 }, { "epoch": 0.19841027208804646, "grad_norm": 3.0910837650299072, "learning_rate": 2.4722965227359576e-06, "loss": 0.7095, "step": 649 }, { "epoch": 0.19871598899419138, "grad_norm": 4.622714996337891, "learning_rate": 2.4761176920137564e-06, "loss": 0.8402, "step": 650 }, { "epoch": 0.1990217059003363, "grad_norm": 1.7911807298660278, "learning_rate": 2.4799388612915555e-06, "loss": 0.4351, "step": 651 }, { "epoch": 0.1993274228064812, "grad_norm": 1.1847429275512695, "learning_rate": 2.4837600305693543e-06, "loss": 0.2957, "step": 652 }, { "epoch": 0.1996331397126261, "grad_norm": 1.7359099388122559, "learning_rate": 2.4875811998471534e-06, "loss": 0.3238, "step": 653 }, { "epoch": 0.199938856618771, "grad_norm": 1.1558010578155518, "learning_rate": 2.491402369124952e-06, "loss": 0.275, "step": 654 }, { "epoch": 0.20024457352491593, "grad_norm": 1.2988994121551514, "learning_rate": 2.4952235384027514e-06, "loss": 0.2475, "step": 655 }, { "epoch": 0.20055029043106085, "grad_norm": 1.3303642272949219, "learning_rate": 2.49904470768055e-06, "loss": 0.2547, "step": 656 }, { "epoch": 0.20085600733720574, "grad_norm": 0.9915971159934998, "learning_rate": 2.5028658769583493e-06, "loss": 0.2404, "step": 657 }, { "epoch": 0.20116172424335066, "grad_norm": 2.1064414978027344, "learning_rate": 2.5066870462361484e-06, "loss": 0.3418, "step": 658 }, { "epoch": 0.20146744114949558, "grad_norm": 1.326471209526062, "learning_rate": 2.5105082155139476e-06, "loss": 0.2658, "step": 659 }, { "epoch": 0.20177315805564047, "grad_norm": 1.1975065469741821, "learning_rate": 2.5143293847917463e-06, "loss": 0.2489, "step": 660 }, { "epoch": 0.2020788749617854, "grad_norm": 2.133723258972168, "learning_rate": 2.5181505540695455e-06, "loss": 0.3059, "step": 661 }, { "epoch": 0.20238459186793029, "grad_norm": 1.5578323602676392, "learning_rate": 2.5219717233473443e-06, "loss": 0.2941, "step": 662 }, { "epoch": 0.2026903087740752, "grad_norm": 1.9208160638809204, "learning_rate": 2.5257928926251434e-06, "loss": 0.3256, "step": 663 }, { "epoch": 0.20299602568022013, "grad_norm": 1.7661373615264893, "learning_rate": 2.529614061902942e-06, "loss": 0.343, "step": 664 }, { "epoch": 0.20330174258636502, "grad_norm": 1.6304936408996582, "learning_rate": 2.5334352311807413e-06, "loss": 0.3971, "step": 665 }, { "epoch": 0.20360745949250994, "grad_norm": 2.799717664718628, "learning_rate": 2.53725640045854e-06, "loss": 0.4091, "step": 666 }, { "epoch": 0.20391317639865483, "grad_norm": 1.9289597272872925, "learning_rate": 2.5410775697363393e-06, "loss": 0.4538, "step": 667 }, { "epoch": 0.20421889330479975, "grad_norm": 1.7867443561553955, "learning_rate": 2.5448987390141384e-06, "loss": 0.488, "step": 668 }, { "epoch": 0.20452461021094467, "grad_norm": 1.996587872505188, "learning_rate": 2.5487199082919376e-06, "loss": 0.5239, "step": 669 }, { "epoch": 0.20483032711708957, "grad_norm": 2.9546732902526855, "learning_rate": 2.5525410775697363e-06, "loss": 0.5155, "step": 670 }, { "epoch": 0.2051360440232345, "grad_norm": 3.387373208999634, "learning_rate": 2.5563622468475355e-06, "loss": 0.4765, "step": 671 }, { "epoch": 0.2054417609293794, "grad_norm": 2.0132648944854736, "learning_rate": 2.5601834161253342e-06, "loss": 0.5886, "step": 672 }, { "epoch": 0.2057474778355243, "grad_norm": 3.351811170578003, "learning_rate": 2.5640045854031334e-06, "loss": 0.6429, "step": 673 }, { "epoch": 0.20605319474166922, "grad_norm": 5.944129467010498, "learning_rate": 2.567825754680932e-06, "loss": 0.7054, "step": 674 }, { "epoch": 0.20635891164781411, "grad_norm": 4.535555839538574, "learning_rate": 2.5716469239587313e-06, "loss": 0.9446, "step": 675 }, { "epoch": 0.20666462855395903, "grad_norm": 1.882334589958191, "learning_rate": 2.5754680932365305e-06, "loss": 0.4221, "step": 676 }, { "epoch": 0.20697034546010395, "grad_norm": 1.1755021810531616, "learning_rate": 2.5792892625143292e-06, "loss": 0.2823, "step": 677 }, { "epoch": 0.20727606236624885, "grad_norm": 1.0582528114318848, "learning_rate": 2.583110431792129e-06, "loss": 0.2511, "step": 678 }, { "epoch": 0.20758177927239377, "grad_norm": 1.8595941066741943, "learning_rate": 2.5869316010699276e-06, "loss": 0.2129, "step": 679 }, { "epoch": 0.20788749617853866, "grad_norm": 0.8721693158149719, "learning_rate": 2.5907527703477267e-06, "loss": 0.2448, "step": 680 }, { "epoch": 0.20819321308468358, "grad_norm": 2.302337408065796, "learning_rate": 2.5945739396255255e-06, "loss": 0.2918, "step": 681 }, { "epoch": 0.2084989299908285, "grad_norm": 1.240998387336731, "learning_rate": 2.5983951089033247e-06, "loss": 0.2426, "step": 682 }, { "epoch": 0.2088046468969734, "grad_norm": 1.0954853296279907, "learning_rate": 2.6022162781811234e-06, "loss": 0.2786, "step": 683 }, { "epoch": 0.20911036380311832, "grad_norm": 2.5269856452941895, "learning_rate": 2.6060374474589226e-06, "loss": 0.2779, "step": 684 }, { "epoch": 0.20941608070926324, "grad_norm": 1.0602166652679443, "learning_rate": 2.6098586167367213e-06, "loss": 0.2605, "step": 685 }, { "epoch": 0.20972179761540813, "grad_norm": 2.0247888565063477, "learning_rate": 2.6136797860145205e-06, "loss": 0.3267, "step": 686 }, { "epoch": 0.21002751452155305, "grad_norm": 1.1914857625961304, "learning_rate": 2.6175009552923192e-06, "loss": 0.2977, "step": 687 }, { "epoch": 0.21033323142769794, "grad_norm": 1.5091577768325806, "learning_rate": 2.621322124570119e-06, "loss": 0.3015, "step": 688 }, { "epoch": 0.21063894833384286, "grad_norm": 2.038410186767578, "learning_rate": 2.6251432938479176e-06, "loss": 0.3421, "step": 689 }, { "epoch": 0.21094466523998778, "grad_norm": 2.0064024925231934, "learning_rate": 2.6289644631257167e-06, "loss": 0.4153, "step": 690 }, { "epoch": 0.21125038214613268, "grad_norm": 6.313762664794922, "learning_rate": 2.6327856324035155e-06, "loss": 0.4071, "step": 691 }, { "epoch": 0.2115560990522776, "grad_norm": 2.073700189590454, "learning_rate": 2.6366068016813146e-06, "loss": 0.4479, "step": 692 }, { "epoch": 0.2118618159584225, "grad_norm": 1.42826247215271, "learning_rate": 2.6404279709591134e-06, "loss": 0.4684, "step": 693 }, { "epoch": 0.2121675328645674, "grad_norm": 2.2967960834503174, "learning_rate": 2.6442491402369126e-06, "loss": 0.4552, "step": 694 }, { "epoch": 0.21247324977071233, "grad_norm": 2.804847240447998, "learning_rate": 2.6480703095147113e-06, "loss": 0.4889, "step": 695 }, { "epoch": 0.21277896667685722, "grad_norm": 3.5497236251831055, "learning_rate": 2.6518914787925105e-06, "loss": 0.5096, "step": 696 }, { "epoch": 0.21308468358300214, "grad_norm": 2.795166015625, "learning_rate": 2.6557126480703096e-06, "loss": 0.5215, "step": 697 }, { "epoch": 0.21339040048914704, "grad_norm": 4.121243000030518, "learning_rate": 2.659533817348109e-06, "loss": 0.5962, "step": 698 }, { "epoch": 0.21369611739529196, "grad_norm": 3.8289809226989746, "learning_rate": 2.6633549866259075e-06, "loss": 0.647, "step": 699 }, { "epoch": 0.21400183430143688, "grad_norm": Infinity, "learning_rate": 2.6633549866259075e-06, "loss": 0.8404, "step": 700 }, { "epoch": 0.21430755120758177, "grad_norm": 1.7922110557556152, "learning_rate": 2.6671761559037067e-06, "loss": 0.4417, "step": 701 }, { "epoch": 0.2146132681137267, "grad_norm": 0.8783388733863831, "learning_rate": 2.6709973251815055e-06, "loss": 0.2671, "step": 702 }, { "epoch": 0.2149189850198716, "grad_norm": 0.9789925217628479, "learning_rate": 2.6748184944593046e-06, "loss": 0.2656, "step": 703 }, { "epoch": 0.2152247019260165, "grad_norm": 1.0348937511444092, "learning_rate": 2.6786396637371034e-06, "loss": 0.2363, "step": 704 }, { "epoch": 0.21553041883216142, "grad_norm": 1.1928491592407227, "learning_rate": 2.6824608330149025e-06, "loss": 0.2237, "step": 705 }, { "epoch": 0.21583613573830632, "grad_norm": 1.178456425666809, "learning_rate": 2.6862820022927013e-06, "loss": 0.2485, "step": 706 }, { "epoch": 0.21614185264445124, "grad_norm": 1.1639955043792725, "learning_rate": 2.6901031715705005e-06, "loss": 0.227, "step": 707 }, { "epoch": 0.21644756955059616, "grad_norm": 1.12504243850708, "learning_rate": 2.6939243408482996e-06, "loss": 0.2727, "step": 708 }, { "epoch": 0.21675328645674105, "grad_norm": 1.0727709531784058, "learning_rate": 2.697745510126099e-06, "loss": 0.2661, "step": 709 }, { "epoch": 0.21705900336288597, "grad_norm": 1.1062134504318237, "learning_rate": 2.701566679403898e-06, "loss": 0.2643, "step": 710 }, { "epoch": 0.21736472026903086, "grad_norm": 1.1997114419937134, "learning_rate": 2.7053878486816967e-06, "loss": 0.2987, "step": 711 }, { "epoch": 0.21767043717517578, "grad_norm": 2.1866345405578613, "learning_rate": 2.709209017959496e-06, "loss": 0.2862, "step": 712 }, { "epoch": 0.2179761540813207, "grad_norm": 1.0748859643936157, "learning_rate": 2.7130301872372946e-06, "loss": 0.2796, "step": 713 }, { "epoch": 0.2182818709874656, "grad_norm": 1.1325124502182007, "learning_rate": 2.7168513565150938e-06, "loss": 0.3571, "step": 714 }, { "epoch": 0.21858758789361052, "grad_norm": 1.4994715452194214, "learning_rate": 2.7206725257928925e-06, "loss": 0.3683, "step": 715 }, { "epoch": 0.21889330479975544, "grad_norm": 2.099163770675659, "learning_rate": 2.7244936950706917e-06, "loss": 0.3818, "step": 716 }, { "epoch": 0.21919902170590033, "grad_norm": 3.8756957054138184, "learning_rate": 2.7283148643484904e-06, "loss": 0.4218, "step": 717 }, { "epoch": 0.21950473861204525, "grad_norm": 2.2356796264648438, "learning_rate": 2.73213603362629e-06, "loss": 0.4721, "step": 718 }, { "epoch": 0.21981045551819015, "grad_norm": 2.6031112670898438, "learning_rate": 2.7359572029040888e-06, "loss": 0.4625, "step": 719 }, { "epoch": 0.22011617242433507, "grad_norm": 2.5132453441619873, "learning_rate": 2.739778372181888e-06, "loss": 0.4919, "step": 720 }, { "epoch": 0.22042188933048, "grad_norm": 2.1530442237854004, "learning_rate": 2.7435995414596867e-06, "loss": 0.4692, "step": 721 }, { "epoch": 0.22072760623662488, "grad_norm": 4.225505352020264, "learning_rate": 2.747420710737486e-06, "loss": 0.4946, "step": 722 }, { "epoch": 0.2210333231427698, "grad_norm": 2.351405143737793, "learning_rate": 2.7512418800152846e-06, "loss": 0.5925, "step": 723 }, { "epoch": 0.2213390400489147, "grad_norm": 2.9702484607696533, "learning_rate": 2.7550630492930838e-06, "loss": 0.661, "step": 724 }, { "epoch": 0.2216447569550596, "grad_norm": 4.436789035797119, "learning_rate": 2.7588842185708825e-06, "loss": 0.8358, "step": 725 }, { "epoch": 0.22195047386120453, "grad_norm": 1.2003217935562134, "learning_rate": 2.7627053878486817e-06, "loss": 0.338, "step": 726 }, { "epoch": 0.22225619076734943, "grad_norm": 1.6521906852722168, "learning_rate": 2.766526557126481e-06, "loss": 0.2516, "step": 727 }, { "epoch": 0.22256190767349435, "grad_norm": 3.649739980697632, "learning_rate": 2.77034772640428e-06, "loss": 0.2662, "step": 728 }, { "epoch": 0.22286762457963927, "grad_norm": 0.9027721881866455, "learning_rate": 2.7741688956820788e-06, "loss": 0.217, "step": 729 }, { "epoch": 0.22317334148578416, "grad_norm": 0.9324570894241333, "learning_rate": 2.777990064959878e-06, "loss": 0.2572, "step": 730 }, { "epoch": 0.22347905839192908, "grad_norm": 0.8034723997116089, "learning_rate": 2.7818112342376767e-06, "loss": 0.2306, "step": 731 }, { "epoch": 0.22378477529807397, "grad_norm": 0.9497804045677185, "learning_rate": 2.785632403515476e-06, "loss": 0.1994, "step": 732 }, { "epoch": 0.2240904922042189, "grad_norm": 1.146822214126587, "learning_rate": 2.7894535727932746e-06, "loss": 0.2184, "step": 733 }, { "epoch": 0.22439620911036381, "grad_norm": 0.9643145799636841, "learning_rate": 2.7932747420710738e-06, "loss": 0.2525, "step": 734 }, { "epoch": 0.2247019260165087, "grad_norm": 0.8484119176864624, "learning_rate": 2.7970959113488725e-06, "loss": 0.2325, "step": 735 }, { "epoch": 0.22500764292265363, "grad_norm": 0.9125089049339294, "learning_rate": 2.8009170806266717e-06, "loss": 0.2601, "step": 736 }, { "epoch": 0.22531335982879852, "grad_norm": 1.248022437095642, "learning_rate": 2.804738249904471e-06, "loss": 0.2516, "step": 737 }, { "epoch": 0.22561907673494344, "grad_norm": 1.6245940923690796, "learning_rate": 2.80855941918227e-06, "loss": 0.297, "step": 738 }, { "epoch": 0.22592479364108836, "grad_norm": 1.0951716899871826, "learning_rate": 2.8123805884600688e-06, "loss": 0.3085, "step": 739 }, { "epoch": 0.22623051054723325, "grad_norm": 1.2818330526351929, "learning_rate": 2.816201757737868e-06, "loss": 0.3149, "step": 740 }, { "epoch": 0.22653622745337818, "grad_norm": 2.9516377449035645, "learning_rate": 2.820022927015667e-06, "loss": 0.3913, "step": 741 }, { "epoch": 0.22684194435952307, "grad_norm": 1.7961523532867432, "learning_rate": 2.823844096293466e-06, "loss": 0.3933, "step": 742 }, { "epoch": 0.227147661265668, "grad_norm": 9.704188346862793, "learning_rate": 2.827665265571265e-06, "loss": 0.4256, "step": 743 }, { "epoch": 0.2274533781718129, "grad_norm": 1.8876543045043945, "learning_rate": 2.8314864348490637e-06, "loss": 0.4294, "step": 744 }, { "epoch": 0.2277590950779578, "grad_norm": 2.8924856185913086, "learning_rate": 2.835307604126863e-06, "loss": 0.4702, "step": 745 }, { "epoch": 0.22806481198410272, "grad_norm": 3.0494930744171143, "learning_rate": 2.8391287734046617e-06, "loss": 0.4644, "step": 746 }, { "epoch": 0.22837052889024764, "grad_norm": 2.7149107456207275, "learning_rate": 2.8429499426824613e-06, "loss": 0.4766, "step": 747 }, { "epoch": 0.22867624579639254, "grad_norm": 4.382872581481934, "learning_rate": 2.84677111196026e-06, "loss": 0.5436, "step": 748 }, { "epoch": 0.22898196270253746, "grad_norm": 7.625521659851074, "learning_rate": 2.850592281238059e-06, "loss": 0.5952, "step": 749 }, { "epoch": 0.22928767960868235, "grad_norm": 3.692756175994873, "learning_rate": 2.854413450515858e-06, "loss": 0.8423, "step": 750 }, { "epoch": 0.22959339651482727, "grad_norm": 1.4606287479400635, "learning_rate": 2.858234619793657e-06, "loss": 0.3792, "step": 751 }, { "epoch": 0.2298991134209722, "grad_norm": 1.0110578536987305, "learning_rate": 2.862055789071456e-06, "loss": 0.2929, "step": 752 }, { "epoch": 0.23020483032711708, "grad_norm": 1.2443649768829346, "learning_rate": 2.865876958349255e-06, "loss": 0.2514, "step": 753 }, { "epoch": 0.230510547233262, "grad_norm": 1.3578917980194092, "learning_rate": 2.8696981276270537e-06, "loss": 0.2782, "step": 754 }, { "epoch": 0.2308162641394069, "grad_norm": 1.2922049760818481, "learning_rate": 2.873519296904853e-06, "loss": 0.2223, "step": 755 }, { "epoch": 0.23112198104555182, "grad_norm": 0.8305726647377014, "learning_rate": 2.8773404661826516e-06, "loss": 0.2171, "step": 756 }, { "epoch": 0.23142769795169674, "grad_norm": 1.1240423917770386, "learning_rate": 2.8811616354604512e-06, "loss": 0.2003, "step": 757 }, { "epoch": 0.23173341485784163, "grad_norm": 1.0584266185760498, "learning_rate": 2.88498280473825e-06, "loss": 0.2092, "step": 758 }, { "epoch": 0.23203913176398655, "grad_norm": 0.9399870038032532, "learning_rate": 2.888803974016049e-06, "loss": 0.2477, "step": 759 }, { "epoch": 0.23234484867013147, "grad_norm": 1.3306925296783447, "learning_rate": 2.892625143293848e-06, "loss": 0.2405, "step": 760 }, { "epoch": 0.23265056557627636, "grad_norm": 1.019371509552002, "learning_rate": 2.896446312571647e-06, "loss": 0.2655, "step": 761 }, { "epoch": 0.23295628248242128, "grad_norm": 1.0196822881698608, "learning_rate": 2.900267481849446e-06, "loss": 0.2307, "step": 762 }, { "epoch": 0.23326199938856618, "grad_norm": 1.033806562423706, "learning_rate": 2.904088651127245e-06, "loss": 0.2457, "step": 763 }, { "epoch": 0.2335677162947111, "grad_norm": 1.9663867950439453, "learning_rate": 2.9079098204050437e-06, "loss": 0.3396, "step": 764 }, { "epoch": 0.23387343320085602, "grad_norm": 1.5990153551101685, "learning_rate": 2.911730989682843e-06, "loss": 0.3769, "step": 765 }, { "epoch": 0.2341791501070009, "grad_norm": 1.925100564956665, "learning_rate": 2.915552158960642e-06, "loss": 0.4052, "step": 766 }, { "epoch": 0.23448486701314583, "grad_norm": 1.712884545326233, "learning_rate": 2.9193733282384412e-06, "loss": 0.4064, "step": 767 }, { "epoch": 0.23479058391929072, "grad_norm": 1.8831430673599243, "learning_rate": 2.92319449751624e-06, "loss": 0.4364, "step": 768 }, { "epoch": 0.23509630082543564, "grad_norm": 1.581552267074585, "learning_rate": 2.927015666794039e-06, "loss": 0.45, "step": 769 }, { "epoch": 0.23540201773158057, "grad_norm": 2.1549465656280518, "learning_rate": 2.930836836071838e-06, "loss": 0.511, "step": 770 }, { "epoch": 0.23570773463772546, "grad_norm": 2.248976469039917, "learning_rate": 2.934658005349637e-06, "loss": 0.4978, "step": 771 }, { "epoch": 0.23601345154387038, "grad_norm": 2.12103271484375, "learning_rate": 2.9384791746274362e-06, "loss": 0.5254, "step": 772 }, { "epoch": 0.2363191684500153, "grad_norm": 2.358510971069336, "learning_rate": 2.942300343905235e-06, "loss": 0.596, "step": 773 }, { "epoch": 0.2366248853561602, "grad_norm": 2.7500617504119873, "learning_rate": 2.946121513183034e-06, "loss": 0.6137, "step": 774 }, { "epoch": 0.2369306022623051, "grad_norm": 7.139145374298096, "learning_rate": 2.949942682460833e-06, "loss": 0.852, "step": 775 }, { "epoch": 0.23723631916845, "grad_norm": 1.4672679901123047, "learning_rate": 2.9537638517386325e-06, "loss": 0.3558, "step": 776 }, { "epoch": 0.23754203607459493, "grad_norm": 0.8170671463012695, "learning_rate": 2.9575850210164312e-06, "loss": 0.2693, "step": 777 }, { "epoch": 0.23784775298073985, "grad_norm": 0.7714478969573975, "learning_rate": 2.9614061902942304e-06, "loss": 0.2311, "step": 778 }, { "epoch": 0.23815346988688474, "grad_norm": 0.9282224178314209, "learning_rate": 2.965227359572029e-06, "loss": 0.2207, "step": 779 }, { "epoch": 0.23845918679302966, "grad_norm": 1.0686646699905396, "learning_rate": 2.9690485288498283e-06, "loss": 0.2219, "step": 780 }, { "epoch": 0.23876490369917455, "grad_norm": 1.107957363128662, "learning_rate": 2.972869698127627e-06, "loss": 0.2068, "step": 781 }, { "epoch": 0.23907062060531947, "grad_norm": 0.8315656185150146, "learning_rate": 2.976690867405426e-06, "loss": 0.1731, "step": 782 }, { "epoch": 0.2393763375114644, "grad_norm": 1.289969801902771, "learning_rate": 2.980512036683225e-06, "loss": 0.2247, "step": 783 }, { "epoch": 0.23968205441760929, "grad_norm": 1.00977623462677, "learning_rate": 2.984333205961024e-06, "loss": 0.2597, "step": 784 }, { "epoch": 0.2399877713237542, "grad_norm": 2.032116413116455, "learning_rate": 2.988154375238823e-06, "loss": 0.2225, "step": 785 }, { "epoch": 0.24029348822989913, "grad_norm": 1.5789105892181396, "learning_rate": 2.9919755445166225e-06, "loss": 0.2964, "step": 786 }, { "epoch": 0.24059920513604402, "grad_norm": 1.3969404697418213, "learning_rate": 2.995796713794421e-06, "loss": 0.2736, "step": 787 }, { "epoch": 0.24090492204218894, "grad_norm": 0.9456064105033875, "learning_rate": 2.9996178830722204e-06, "loss": 0.2605, "step": 788 }, { "epoch": 0.24121063894833383, "grad_norm": 1.4224978685379028, "learning_rate": 3.003439052350019e-06, "loss": 0.3205, "step": 789 }, { "epoch": 0.24151635585447875, "grad_norm": 1.534359097480774, "learning_rate": 3.0072602216278183e-06, "loss": 0.3674, "step": 790 }, { "epoch": 0.24182207276062367, "grad_norm": 2.2560229301452637, "learning_rate": 3.011081390905617e-06, "loss": 0.348, "step": 791 }, { "epoch": 0.24212778966676857, "grad_norm": 1.7111784219741821, "learning_rate": 3.014902560183416e-06, "loss": 0.4247, "step": 792 }, { "epoch": 0.2424335065729135, "grad_norm": 1.5935587882995605, "learning_rate": 3.018723729461215e-06, "loss": 0.4276, "step": 793 }, { "epoch": 0.24273922347905838, "grad_norm": 2.479442596435547, "learning_rate": 3.022544898739014e-06, "loss": 0.4284, "step": 794 }, { "epoch": 0.2430449403852033, "grad_norm": 2.3076531887054443, "learning_rate": 3.0263660680168133e-06, "loss": 0.478, "step": 795 }, { "epoch": 0.24335065729134822, "grad_norm": 2.6625897884368896, "learning_rate": 3.0301872372946124e-06, "loss": 0.4892, "step": 796 }, { "epoch": 0.24365637419749311, "grad_norm": 4.701833724975586, "learning_rate": 3.034008406572411e-06, "loss": 0.5336, "step": 797 }, { "epoch": 0.24396209110363803, "grad_norm": 2.6905274391174316, "learning_rate": 3.0378295758502104e-06, "loss": 0.604, "step": 798 }, { "epoch": 0.24426780800978293, "grad_norm": 7.340642929077148, "learning_rate": 3.041650745128009e-06, "loss": 0.5587, "step": 799 }, { "epoch": 0.24457352491592785, "grad_norm": 3.5030980110168457, "learning_rate": 3.0454719144058083e-06, "loss": 0.8292, "step": 800 }, { "epoch": 0.24487924182207277, "grad_norm": 1.3179982900619507, "learning_rate": 3.049293083683607e-06, "loss": 0.364, "step": 801 }, { "epoch": 0.24518495872821766, "grad_norm": 0.7543867230415344, "learning_rate": 3.053114252961406e-06, "loss": 0.217, "step": 802 }, { "epoch": 0.24549067563436258, "grad_norm": 1.153267502784729, "learning_rate": 3.056935422239205e-06, "loss": 0.2064, "step": 803 }, { "epoch": 0.2457963925405075, "grad_norm": 0.7060242891311646, "learning_rate": 3.060756591517004e-06, "loss": 0.2234, "step": 804 }, { "epoch": 0.2461021094466524, "grad_norm": 1.5104851722717285, "learning_rate": 3.0645777607948037e-06, "loss": 0.1877, "step": 805 }, { "epoch": 0.24640782635279732, "grad_norm": 1.4368271827697754, "learning_rate": 3.0683989300726024e-06, "loss": 0.2213, "step": 806 }, { "epoch": 0.2467135432589422, "grad_norm": 0.9648993015289307, "learning_rate": 3.0722200993504016e-06, "loss": 0.1944, "step": 807 }, { "epoch": 0.24701926016508713, "grad_norm": 0.8218436241149902, "learning_rate": 3.0760412686282003e-06, "loss": 0.217, "step": 808 }, { "epoch": 0.24732497707123205, "grad_norm": 0.8687562942504883, "learning_rate": 3.0798624379059995e-06, "loss": 0.1939, "step": 809 }, { "epoch": 0.24763069397737694, "grad_norm": 1.039297342300415, "learning_rate": 3.0836836071837983e-06, "loss": 0.2352, "step": 810 }, { "epoch": 0.24793641088352186, "grad_norm": 0.8904309868812561, "learning_rate": 3.0875047764615974e-06, "loss": 0.2073, "step": 811 }, { "epoch": 0.24824212778966676, "grad_norm": 1.0772004127502441, "learning_rate": 3.091325945739396e-06, "loss": 0.2428, "step": 812 }, { "epoch": 0.24854784469581168, "grad_norm": 1.0597343444824219, "learning_rate": 3.0951471150171953e-06, "loss": 0.2647, "step": 813 }, { "epoch": 0.2488535616019566, "grad_norm": 1.2086689472198486, "learning_rate": 3.098968284294994e-06, "loss": 0.311, "step": 814 }, { "epoch": 0.2491592785081015, "grad_norm": 1.0789388418197632, "learning_rate": 3.1027894535727937e-06, "loss": 0.3497, "step": 815 }, { "epoch": 0.2494649954142464, "grad_norm": 2.0154805183410645, "learning_rate": 3.1066106228505924e-06, "loss": 0.3455, "step": 816 }, { "epoch": 0.24977071232039133, "grad_norm": 1.539566159248352, "learning_rate": 3.1104317921283916e-06, "loss": 0.3783, "step": 817 }, { "epoch": 0.2500764292265362, "grad_norm": 1.6037242412567139, "learning_rate": 3.1142529614061903e-06, "loss": 0.3805, "step": 818 }, { "epoch": 0.2503821461326811, "grad_norm": 3.7115602493286133, "learning_rate": 3.1180741306839895e-06, "loss": 0.4398, "step": 819 }, { "epoch": 0.25068786303882606, "grad_norm": 2.4537336826324463, "learning_rate": 3.1218952999617882e-06, "loss": 0.3988, "step": 820 }, { "epoch": 0.25099357994497096, "grad_norm": 1.8688207864761353, "learning_rate": 3.1257164692395874e-06, "loss": 0.4966, "step": 821 }, { "epoch": 0.25129929685111585, "grad_norm": 3.0661752223968506, "learning_rate": 3.129537638517386e-06, "loss": 0.4865, "step": 822 }, { "epoch": 0.2516050137572608, "grad_norm": 2.264585256576538, "learning_rate": 3.1333588077951853e-06, "loss": 0.5205, "step": 823 }, { "epoch": 0.2519107306634057, "grad_norm": 4.2344651222229, "learning_rate": 3.137179977072984e-06, "loss": 0.595, "step": 824 }, { "epoch": 0.2522164475695506, "grad_norm": 8.463536262512207, "learning_rate": 3.1410011463507837e-06, "loss": 0.8322, "step": 825 }, { "epoch": 0.25252216447569553, "grad_norm": 1.8538283109664917, "learning_rate": 3.1448223156285824e-06, "loss": 0.3507, "step": 826 }, { "epoch": 0.2528278813818404, "grad_norm": 1.0638113021850586, "learning_rate": 3.1486434849063816e-06, "loss": 0.2179, "step": 827 }, { "epoch": 0.2531335982879853, "grad_norm": 0.7938324213027954, "learning_rate": 3.1524646541841803e-06, "loss": 0.2242, "step": 828 }, { "epoch": 0.2534393151941302, "grad_norm": 0.7650423645973206, "learning_rate": 3.1562858234619795e-06, "loss": 0.2132, "step": 829 }, { "epoch": 0.25374503210027516, "grad_norm": 1.0319920778274536, "learning_rate": 3.1601069927397782e-06, "loss": 0.1634, "step": 830 }, { "epoch": 0.25405074900642005, "grad_norm": 0.7111336588859558, "learning_rate": 3.1639281620175774e-06, "loss": 0.2156, "step": 831 }, { "epoch": 0.25435646591256494, "grad_norm": 2.026247024536133, "learning_rate": 3.167749331295376e-06, "loss": 0.1762, "step": 832 }, { "epoch": 0.2546621828187099, "grad_norm": 0.9108959436416626, "learning_rate": 3.1715705005731753e-06, "loss": 0.2227, "step": 833 }, { "epoch": 0.2549678997248548, "grad_norm": 0.8213369846343994, "learning_rate": 3.1753916698509745e-06, "loss": 0.1921, "step": 834 }, { "epoch": 0.2552736166309997, "grad_norm": 2.320335626602173, "learning_rate": 3.1792128391287737e-06, "loss": 0.2207, "step": 835 }, { "epoch": 0.2555793335371446, "grad_norm": 1.2573775053024292, "learning_rate": 3.1830340084065724e-06, "loss": 0.2443, "step": 836 }, { "epoch": 0.2558850504432895, "grad_norm": 1.0866658687591553, "learning_rate": 3.1868551776843716e-06, "loss": 0.2318, "step": 837 }, { "epoch": 0.2561907673494344, "grad_norm": 0.9015803933143616, "learning_rate": 3.1906763469621707e-06, "loss": 0.2299, "step": 838 }, { "epoch": 0.25649648425557936, "grad_norm": 1.0368118286132812, "learning_rate": 3.1944975162399695e-06, "loss": 0.2925, "step": 839 }, { "epoch": 0.25680220116172425, "grad_norm": 1.1617902517318726, "learning_rate": 3.1983186855177686e-06, "loss": 0.3041, "step": 840 }, { "epoch": 0.25710791806786915, "grad_norm": 2.747490167617798, "learning_rate": 3.2021398547955674e-06, "loss": 0.3221, "step": 841 }, { "epoch": 0.25741363497401404, "grad_norm": 1.3473057746887207, "learning_rate": 3.2059610240733666e-06, "loss": 0.3348, "step": 842 }, { "epoch": 0.257719351880159, "grad_norm": 1.6611056327819824, "learning_rate": 3.2097821933511653e-06, "loss": 0.4155, "step": 843 }, { "epoch": 0.2580250687863039, "grad_norm": 1.9074817895889282, "learning_rate": 3.213603362628965e-06, "loss": 0.4365, "step": 844 }, { "epoch": 0.25833078569244877, "grad_norm": 2.7206919193267822, "learning_rate": 3.2174245319067636e-06, "loss": 0.4364, "step": 845 }, { "epoch": 0.2586365025985937, "grad_norm": 1.6449638605117798, "learning_rate": 3.221245701184563e-06, "loss": 0.4516, "step": 846 }, { "epoch": 0.2589422195047386, "grad_norm": 2.88694167137146, "learning_rate": 3.2250668704623616e-06, "loss": 0.4904, "step": 847 }, { "epoch": 0.2592479364108835, "grad_norm": 3.1289596557617188, "learning_rate": 3.2288880397401607e-06, "loss": 0.5572, "step": 848 }, { "epoch": 0.25955365331702845, "grad_norm": 6.177978992462158, "learning_rate": 3.2327092090179595e-06, "loss": 0.5783, "step": 849 }, { "epoch": 0.25985937022317335, "grad_norm": 3.8244118690490723, "learning_rate": 3.2365303782957586e-06, "loss": 0.68, "step": 850 }, { "epoch": 0.26016508712931824, "grad_norm": 0.9537702798843384, "learning_rate": 3.2403515475735574e-06, "loss": 0.2944, "step": 851 }, { "epoch": 0.2604708040354632, "grad_norm": 1.4670441150665283, "learning_rate": 3.2441727168513565e-06, "loss": 0.2568, "step": 852 }, { "epoch": 0.2607765209416081, "grad_norm": 0.696590781211853, "learning_rate": 3.2479938861291553e-06, "loss": 0.1983, "step": 853 }, { "epoch": 0.261082237847753, "grad_norm": 0.7362313866615295, "learning_rate": 3.251815055406955e-06, "loss": 0.1664, "step": 854 }, { "epoch": 0.26138795475389787, "grad_norm": 0.8481813669204712, "learning_rate": 3.2556362246847536e-06, "loss": 0.1834, "step": 855 }, { "epoch": 0.2616936716600428, "grad_norm": 1.6507830619812012, "learning_rate": 3.259457393962553e-06, "loss": 0.173, "step": 856 }, { "epoch": 0.2619993885661877, "grad_norm": 0.78886878490448, "learning_rate": 3.2632785632403515e-06, "loss": 0.1679, "step": 857 }, { "epoch": 0.2623051054723326, "grad_norm": 1.0186578035354614, "learning_rate": 3.2670997325181507e-06, "loss": 0.1896, "step": 858 }, { "epoch": 0.26261082237847755, "grad_norm": 0.9392130970954895, "learning_rate": 3.2709209017959495e-06, "loss": 0.2128, "step": 859 }, { "epoch": 0.26291653928462244, "grad_norm": 0.9970370531082153, "learning_rate": 3.2747420710737486e-06, "loss": 0.2193, "step": 860 }, { "epoch": 0.26322225619076733, "grad_norm": 1.0239818096160889, "learning_rate": 3.2785632403515474e-06, "loss": 0.247, "step": 861 }, { "epoch": 0.2635279730969123, "grad_norm": 1.439717411994934, "learning_rate": 3.2823844096293465e-06, "loss": 0.213, "step": 862 }, { "epoch": 0.2638336900030572, "grad_norm": 1.3641846179962158, "learning_rate": 3.2862055789071457e-06, "loss": 0.2579, "step": 863 }, { "epoch": 0.26413940690920207, "grad_norm": 1.1911258697509766, "learning_rate": 3.290026748184945e-06, "loss": 0.2629, "step": 864 }, { "epoch": 0.264445123815347, "grad_norm": 1.9084525108337402, "learning_rate": 3.2938479174627436e-06, "loss": 0.3565, "step": 865 }, { "epoch": 0.2647508407214919, "grad_norm": 1.4821032285690308, "learning_rate": 3.2976690867405428e-06, "loss": 0.3464, "step": 866 }, { "epoch": 0.2650565576276368, "grad_norm": 1.6723273992538452, "learning_rate": 3.3014902560183415e-06, "loss": 0.3148, "step": 867 }, { "epoch": 0.2653622745337817, "grad_norm": 2.078939914703369, "learning_rate": 3.3053114252961407e-06, "loss": 0.4308, "step": 868 }, { "epoch": 0.26566799143992664, "grad_norm": 1.4272770881652832, "learning_rate": 3.30913259457394e-06, "loss": 0.4124, "step": 869 }, { "epoch": 0.26597370834607154, "grad_norm": 1.9923712015151978, "learning_rate": 3.3129537638517386e-06, "loss": 0.434, "step": 870 }, { "epoch": 0.26627942525221643, "grad_norm": 2.1373660564422607, "learning_rate": 3.3167749331295378e-06, "loss": 0.4464, "step": 871 }, { "epoch": 0.2665851421583614, "grad_norm": 3.2901558876037598, "learning_rate": 3.3205961024073365e-06, "loss": 0.4647, "step": 872 }, { "epoch": 0.26689085906450627, "grad_norm": 2.7022202014923096, "learning_rate": 3.324417271685136e-06, "loss": 0.5263, "step": 873 }, { "epoch": 0.26719657597065116, "grad_norm": 2.460059642791748, "learning_rate": 3.328238440962935e-06, "loss": 0.6329, "step": 874 }, { "epoch": 0.2675022928767961, "grad_norm": 3.3876047134399414, "learning_rate": 3.332059610240734e-06, "loss": 0.7567, "step": 875 }, { "epoch": 0.267808009782941, "grad_norm": 2.0188682079315186, "learning_rate": 3.3358807795185328e-06, "loss": 0.325, "step": 876 }, { "epoch": 0.2681137266890859, "grad_norm": 1.1262519359588623, "learning_rate": 3.339701948796332e-06, "loss": 0.2071, "step": 877 }, { "epoch": 0.2684194435952308, "grad_norm": 0.7505500912666321, "learning_rate": 3.3435231180741307e-06, "loss": 0.2045, "step": 878 }, { "epoch": 0.26872516050137574, "grad_norm": 1.2032933235168457, "learning_rate": 3.34734428735193e-06, "loss": 0.2084, "step": 879 }, { "epoch": 0.26903087740752063, "grad_norm": 0.9491597414016724, "learning_rate": 3.3511654566297286e-06, "loss": 0.2033, "step": 880 }, { "epoch": 0.2693365943136655, "grad_norm": 0.8610473871231079, "learning_rate": 3.3549866259075278e-06, "loss": 0.2052, "step": 881 }, { "epoch": 0.26964231121981047, "grad_norm": 0.822860062122345, "learning_rate": 3.3588077951853265e-06, "loss": 0.2134, "step": 882 }, { "epoch": 0.26994802812595536, "grad_norm": 0.7616949081420898, "learning_rate": 3.362628964463126e-06, "loss": 0.1746, "step": 883 }, { "epoch": 0.27025374503210026, "grad_norm": 1.551196575164795, "learning_rate": 3.366450133740925e-06, "loss": 0.2254, "step": 884 }, { "epoch": 0.2705594619382452, "grad_norm": 0.863467812538147, "learning_rate": 3.370271303018724e-06, "loss": 0.1686, "step": 885 }, { "epoch": 0.2708651788443901, "grad_norm": 1.057520866394043, "learning_rate": 3.3740924722965228e-06, "loss": 0.2352, "step": 886 }, { "epoch": 0.271170895750535, "grad_norm": 1.1597425937652588, "learning_rate": 3.377913641574322e-06, "loss": 0.2625, "step": 887 }, { "epoch": 0.27147661265667994, "grad_norm": 1.1816015243530273, "learning_rate": 3.3817348108521207e-06, "loss": 0.2388, "step": 888 }, { "epoch": 0.27178232956282483, "grad_norm": 1.0564309358596802, "learning_rate": 3.38555598012992e-06, "loss": 0.2969, "step": 889 }, { "epoch": 0.2720880464689697, "grad_norm": 1.3525733947753906, "learning_rate": 3.3893771494077186e-06, "loss": 0.3554, "step": 890 }, { "epoch": 0.2723937633751146, "grad_norm": 1.5584194660186768, "learning_rate": 3.3931983186855178e-06, "loss": 0.3603, "step": 891 }, { "epoch": 0.27269948028125957, "grad_norm": 1.3594835996627808, "learning_rate": 3.3970194879633165e-06, "loss": 0.3663, "step": 892 }, { "epoch": 0.27300519718740446, "grad_norm": 1.8705556392669678, "learning_rate": 3.400840657241116e-06, "loss": 0.4149, "step": 893 }, { "epoch": 0.27331091409354935, "grad_norm": 1.5554174184799194, "learning_rate": 3.404661826518915e-06, "loss": 0.4358, "step": 894 }, { "epoch": 0.2736166309996943, "grad_norm": 1.8985402584075928, "learning_rate": 3.408482995796714e-06, "loss": 0.383, "step": 895 }, { "epoch": 0.2739223479058392, "grad_norm": 1.7365281581878662, "learning_rate": 3.4123041650745127e-06, "loss": 0.4782, "step": 896 }, { "epoch": 0.2742280648119841, "grad_norm": 1.9615099430084229, "learning_rate": 3.416125334352312e-06, "loss": 0.4895, "step": 897 }, { "epoch": 0.27453378171812903, "grad_norm": 4.274942398071289, "learning_rate": 3.4199465036301107e-06, "loss": 0.5149, "step": 898 }, { "epoch": 0.2748394986242739, "grad_norm": 3.709667921066284, "learning_rate": 3.42376767290791e-06, "loss": 0.563, "step": 899 }, { "epoch": 0.2751452155304188, "grad_norm": 3.3592917919158936, "learning_rate": 3.427588842185709e-06, "loss": 0.5919, "step": 900 }, { "epoch": 0.27545093243656377, "grad_norm": 1.4470044374465942, "learning_rate": 3.4314100114635077e-06, "loss": 0.3236, "step": 901 }, { "epoch": 0.27575664934270866, "grad_norm": 2.599872589111328, "learning_rate": 3.4352311807413073e-06, "loss": 0.2133, "step": 902 }, { "epoch": 0.27606236624885355, "grad_norm": 0.6367613673210144, "learning_rate": 3.439052350019106e-06, "loss": 0.1789, "step": 903 }, { "epoch": 0.27636808315499845, "grad_norm": 0.6760447025299072, "learning_rate": 3.4428735192969052e-06, "loss": 0.1785, "step": 904 }, { "epoch": 0.2766738000611434, "grad_norm": 1.3422507047653198, "learning_rate": 3.446694688574704e-06, "loss": 0.1787, "step": 905 }, { "epoch": 0.2769795169672883, "grad_norm": 1.0021319389343262, "learning_rate": 3.450515857852503e-06, "loss": 0.1827, "step": 906 }, { "epoch": 0.2772852338734332, "grad_norm": 0.9093711376190186, "learning_rate": 3.454337027130302e-06, "loss": 0.1539, "step": 907 }, { "epoch": 0.2775909507795781, "grad_norm": 0.9904459714889526, "learning_rate": 3.458158196408101e-06, "loss": 0.1866, "step": 908 }, { "epoch": 0.277896667685723, "grad_norm": 1.0212785005569458, "learning_rate": 3.4619793656859e-06, "loss": 0.3031, "step": 909 }, { "epoch": 0.2782023845918679, "grad_norm": 1.0360158681869507, "learning_rate": 3.465800534963699e-06, "loss": 0.2428, "step": 910 }, { "epoch": 0.27850810149801286, "grad_norm": 0.8007262945175171, "learning_rate": 3.4696217042414977e-06, "loss": 0.2287, "step": 911 }, { "epoch": 0.27881381840415775, "grad_norm": 0.8959519863128662, "learning_rate": 3.4734428735192973e-06, "loss": 0.1924, "step": 912 }, { "epoch": 0.27911953531030265, "grad_norm": 1.0466686487197876, "learning_rate": 3.477264042797096e-06, "loss": 0.248, "step": 913 }, { "epoch": 0.2794252522164476, "grad_norm": 1.3770745992660522, "learning_rate": 3.4810852120748952e-06, "loss": 0.3001, "step": 914 }, { "epoch": 0.2797309691225925, "grad_norm": 1.1539194583892822, "learning_rate": 3.484906381352694e-06, "loss": 0.2705, "step": 915 }, { "epoch": 0.2800366860287374, "grad_norm": 2.576812267303467, "learning_rate": 3.488727550630493e-06, "loss": 0.3735, "step": 916 }, { "epoch": 0.2803424029348823, "grad_norm": 1.3034688234329224, "learning_rate": 3.492548719908292e-06, "loss": 0.3831, "step": 917 }, { "epoch": 0.2806481198410272, "grad_norm": 3.0693068504333496, "learning_rate": 3.496369889186091e-06, "loss": 0.3278, "step": 918 }, { "epoch": 0.2809538367471721, "grad_norm": 1.5301369428634644, "learning_rate": 3.50019105846389e-06, "loss": 0.4466, "step": 919 }, { "epoch": 0.281259553653317, "grad_norm": 1.3439732789993286, "learning_rate": 3.504012227741689e-06, "loss": 0.3631, "step": 920 }, { "epoch": 0.28156527055946196, "grad_norm": 1.6414856910705566, "learning_rate": 3.5078333970194877e-06, "loss": 0.372, "step": 921 }, { "epoch": 0.28187098746560685, "grad_norm": 1.836067795753479, "learning_rate": 3.5116545662972873e-06, "loss": 0.4684, "step": 922 }, { "epoch": 0.28217670437175174, "grad_norm": 1.9536476135253906, "learning_rate": 3.515475735575086e-06, "loss": 0.3979, "step": 923 }, { "epoch": 0.2824824212778967, "grad_norm": 2.660726547241211, "learning_rate": 3.5192969048528852e-06, "loss": 0.593, "step": 924 }, { "epoch": 0.2827881381840416, "grad_norm": 3.781505823135376, "learning_rate": 3.523118074130684e-06, "loss": 0.6554, "step": 925 }, { "epoch": 0.2830938550901865, "grad_norm": 1.0426303148269653, "learning_rate": 3.526939243408483e-06, "loss": 0.28, "step": 926 }, { "epoch": 0.2833995719963314, "grad_norm": 1.6242036819458008, "learning_rate": 3.530760412686282e-06, "loss": 0.1903, "step": 927 }, { "epoch": 0.2837052889024763, "grad_norm": 0.7854489088058472, "learning_rate": 3.534581581964081e-06, "loss": 0.2025, "step": 928 }, { "epoch": 0.2840110058086212, "grad_norm": 0.8480618596076965, "learning_rate": 3.5384027512418798e-06, "loss": 0.187, "step": 929 }, { "epoch": 0.2843167227147661, "grad_norm": 0.8296639323234558, "learning_rate": 3.542223920519679e-06, "loss": 0.1789, "step": 930 }, { "epoch": 0.28462243962091105, "grad_norm": 0.8910701274871826, "learning_rate": 3.546045089797478e-06, "loss": 0.1754, "step": 931 }, { "epoch": 0.28492815652705594, "grad_norm": 0.7361922860145569, "learning_rate": 3.5498662590752773e-06, "loss": 0.1465, "step": 932 }, { "epoch": 0.28523387343320084, "grad_norm": 0.8140453100204468, "learning_rate": 3.5536874283530765e-06, "loss": 0.1857, "step": 933 }, { "epoch": 0.2855395903393458, "grad_norm": 0.9322276711463928, "learning_rate": 3.557508597630875e-06, "loss": 0.2407, "step": 934 }, { "epoch": 0.2858453072454907, "grad_norm": 0.858069121837616, "learning_rate": 3.5613297669086744e-06, "loss": 0.1954, "step": 935 }, { "epoch": 0.28615102415163557, "grad_norm": 1.0602911710739136, "learning_rate": 3.565150936186473e-06, "loss": 0.2117, "step": 936 }, { "epoch": 0.2864567410577805, "grad_norm": 0.8842675685882568, "learning_rate": 3.5689721054642723e-06, "loss": 0.2228, "step": 937 }, { "epoch": 0.2867624579639254, "grad_norm": 1.0718669891357422, "learning_rate": 3.572793274742071e-06, "loss": 0.2641, "step": 938 }, { "epoch": 0.2870681748700703, "grad_norm": 0.9640434384346008, "learning_rate": 3.57661444401987e-06, "loss": 0.2372, "step": 939 }, { "epoch": 0.28737389177621525, "grad_norm": 1.245141625404358, "learning_rate": 3.580435613297669e-06, "loss": 0.3228, "step": 940 }, { "epoch": 0.28767960868236014, "grad_norm": 1.4815608263015747, "learning_rate": 3.5842567825754685e-06, "loss": 0.3199, "step": 941 }, { "epoch": 0.28798532558850504, "grad_norm": 1.6869502067565918, "learning_rate": 3.5880779518532673e-06, "loss": 0.3614, "step": 942 }, { "epoch": 0.28829104249464993, "grad_norm": 1.719228744506836, "learning_rate": 3.5918991211310664e-06, "loss": 0.3426, "step": 943 }, { "epoch": 0.2885967594007949, "grad_norm": 2.166821241378784, "learning_rate": 3.595720290408865e-06, "loss": 0.4099, "step": 944 }, { "epoch": 0.28890247630693977, "grad_norm": 3.5243821144104004, "learning_rate": 3.5995414596866644e-06, "loss": 0.4429, "step": 945 }, { "epoch": 0.28920819321308466, "grad_norm": 2.4826912879943848, "learning_rate": 3.603362628964463e-06, "loss": 0.4019, "step": 946 }, { "epoch": 0.2895139101192296, "grad_norm": 4.965357780456543, "learning_rate": 3.6071837982422623e-06, "loss": 0.4329, "step": 947 }, { "epoch": 0.2898196270253745, "grad_norm": 2.470799207687378, "learning_rate": 3.611004967520061e-06, "loss": 0.4903, "step": 948 }, { "epoch": 0.2901253439315194, "grad_norm": 4.475967884063721, "learning_rate": 3.61482613679786e-06, "loss": 0.572, "step": 949 }, { "epoch": 0.29043106083766435, "grad_norm": 3.687953233718872, "learning_rate": 3.618647306075659e-06, "loss": 0.6624, "step": 950 }, { "epoch": 0.29073677774380924, "grad_norm": 1.0951178073883057, "learning_rate": 3.6224684753534585e-06, "loss": 0.3031, "step": 951 }, { "epoch": 0.29104249464995413, "grad_norm": 0.6947671175003052, "learning_rate": 3.6262896446312573e-06, "loss": 0.2053, "step": 952 }, { "epoch": 0.2913482115560991, "grad_norm": 0.7736409306526184, "learning_rate": 3.6301108139090564e-06, "loss": 0.2202, "step": 953 }, { "epoch": 0.29165392846224397, "grad_norm": 0.7130172252655029, "learning_rate": 3.633931983186855e-06, "loss": 0.1581, "step": 954 }, { "epoch": 0.29195964536838886, "grad_norm": 0.7076172232627869, "learning_rate": 3.6377531524646543e-06, "loss": 0.1808, "step": 955 }, { "epoch": 0.29226536227453376, "grad_norm": 0.9558129906654358, "learning_rate": 3.641574321742453e-06, "loss": 0.2322, "step": 956 }, { "epoch": 0.2925710791806787, "grad_norm": 0.9007523059844971, "learning_rate": 3.6453954910202523e-06, "loss": 0.165, "step": 957 }, { "epoch": 0.2928767960868236, "grad_norm": 1.029890537261963, "learning_rate": 3.649216660298051e-06, "loss": 0.1669, "step": 958 }, { "epoch": 0.2931825129929685, "grad_norm": 1.2067070007324219, "learning_rate": 3.65303782957585e-06, "loss": 0.2099, "step": 959 }, { "epoch": 0.29348822989911344, "grad_norm": 0.9087214469909668, "learning_rate": 3.656858998853649e-06, "loss": 0.1891, "step": 960 }, { "epoch": 0.29379394680525833, "grad_norm": 0.8722643852233887, "learning_rate": 3.6606801681314485e-06, "loss": 0.2275, "step": 961 }, { "epoch": 0.2940996637114032, "grad_norm": 0.9560788869857788, "learning_rate": 3.6645013374092473e-06, "loss": 0.1976, "step": 962 }, { "epoch": 0.2944053806175482, "grad_norm": 1.1193222999572754, "learning_rate": 3.6683225066870464e-06, "loss": 0.2653, "step": 963 }, { "epoch": 0.29471109752369307, "grad_norm": 1.0764985084533691, "learning_rate": 3.672143675964845e-06, "loss": 0.2311, "step": 964 }, { "epoch": 0.29501681442983796, "grad_norm": 1.5377695560455322, "learning_rate": 3.6759648452426443e-06, "loss": 0.3012, "step": 965 }, { "epoch": 0.2953225313359829, "grad_norm": 1.3881505727767944, "learning_rate": 3.6797860145204435e-06, "loss": 0.359, "step": 966 }, { "epoch": 0.2956282482421278, "grad_norm": 1.147066593170166, "learning_rate": 3.6836071837982422e-06, "loss": 0.3436, "step": 967 }, { "epoch": 0.2959339651482727, "grad_norm": 1.2819370031356812, "learning_rate": 3.6874283530760414e-06, "loss": 0.3578, "step": 968 }, { "epoch": 0.2962396820544176, "grad_norm": 1.3058921098709106, "learning_rate": 3.69124952235384e-06, "loss": 0.3396, "step": 969 }, { "epoch": 0.29654539896056253, "grad_norm": 1.3253852128982544, "learning_rate": 3.6950706916316398e-06, "loss": 0.4165, "step": 970 }, { "epoch": 0.2968511158667074, "grad_norm": 4.134953022003174, "learning_rate": 3.6988918609094385e-06, "loss": 0.4577, "step": 971 }, { "epoch": 0.2971568327728523, "grad_norm": 1.5740094184875488, "learning_rate": 3.7027130301872377e-06, "loss": 0.4633, "step": 972 }, { "epoch": 0.29746254967899727, "grad_norm": 2.8621842861175537, "learning_rate": 3.7065341994650364e-06, "loss": 0.5124, "step": 973 }, { "epoch": 0.29776826658514216, "grad_norm": 3.1555027961730957, "learning_rate": 3.7103553687428356e-06, "loss": 0.5096, "step": 974 }, { "epoch": 0.29807398349128705, "grad_norm": 2.829953908920288, "learning_rate": 3.7141765380206343e-06, "loss": 0.6141, "step": 975 }, { "epoch": 0.298379700397432, "grad_norm": 1.1862764358520508, "learning_rate": 3.7179977072984335e-06, "loss": 0.3074, "step": 976 }, { "epoch": 0.2986854173035769, "grad_norm": 1.2319847345352173, "learning_rate": 3.7218188765762322e-06, "loss": 0.2151, "step": 977 }, { "epoch": 0.2989911342097218, "grad_norm": 0.78661048412323, "learning_rate": 3.7256400458540314e-06, "loss": 0.1812, "step": 978 }, { "epoch": 0.2992968511158667, "grad_norm": 1.3464089632034302, "learning_rate": 3.72946121513183e-06, "loss": 0.186, "step": 979 }, { "epoch": 0.29960256802201163, "grad_norm": 1.1940717697143555, "learning_rate": 3.7332823844096297e-06, "loss": 0.1971, "step": 980 }, { "epoch": 0.2999082849281565, "grad_norm": 0.952959418296814, "learning_rate": 3.7371035536874285e-06, "loss": 0.1605, "step": 981 }, { "epoch": 0.3002140018343014, "grad_norm": 0.8503403067588806, "learning_rate": 3.7409247229652277e-06, "loss": 0.1904, "step": 982 }, { "epoch": 0.30051971874044636, "grad_norm": 1.0747379064559937, "learning_rate": 3.7447458922430264e-06, "loss": 0.1692, "step": 983 }, { "epoch": 0.30082543564659125, "grad_norm": 1.0567278861999512, "learning_rate": 3.7485670615208256e-06, "loss": 0.2379, "step": 984 }, { "epoch": 0.30113115255273615, "grad_norm": 1.0569156408309937, "learning_rate": 3.7523882307986247e-06, "loss": 0.2409, "step": 985 }, { "epoch": 0.3014368694588811, "grad_norm": 0.9285370707511902, "learning_rate": 3.756209400076424e-06, "loss": 0.2104, "step": 986 }, { "epoch": 0.301742586365026, "grad_norm": 0.9066474437713623, "learning_rate": 3.7600305693542226e-06, "loss": 0.224, "step": 987 }, { "epoch": 0.3020483032711709, "grad_norm": 0.9189996123313904, "learning_rate": 3.763851738632022e-06, "loss": 0.2366, "step": 988 }, { "epoch": 0.30235402017731583, "grad_norm": 1.025918960571289, "learning_rate": 3.7676729079098206e-06, "loss": 0.2362, "step": 989 }, { "epoch": 0.3026597370834607, "grad_norm": 1.6051228046417236, "learning_rate": 3.7714940771876197e-06, "loss": 0.3118, "step": 990 }, { "epoch": 0.3029654539896056, "grad_norm": 1.5180524587631226, "learning_rate": 3.7753152464654185e-06, "loss": 0.3329, "step": 991 }, { "epoch": 0.3032711708957505, "grad_norm": 1.2812504768371582, "learning_rate": 3.7791364157432176e-06, "loss": 0.3804, "step": 992 }, { "epoch": 0.30357688780189546, "grad_norm": 2.386523723602295, "learning_rate": 3.7829575850210164e-06, "loss": 0.4223, "step": 993 }, { "epoch": 0.30388260470804035, "grad_norm": 1.4212864637374878, "learning_rate": 3.7867787542988156e-06, "loss": 0.3947, "step": 994 }, { "epoch": 0.30418832161418524, "grad_norm": 1.4915727376937866, "learning_rate": 3.7905999235766143e-06, "loss": 0.4138, "step": 995 }, { "epoch": 0.3044940385203302, "grad_norm": 2.1796538829803467, "learning_rate": 3.7944210928544135e-06, "loss": 0.3776, "step": 996 }, { "epoch": 0.3047997554264751, "grad_norm": 1.7348482608795166, "learning_rate": 3.7982422621322126e-06, "loss": 0.4081, "step": 997 }, { "epoch": 0.30510547233262, "grad_norm": 3.7839818000793457, "learning_rate": 3.8020634314100114e-06, "loss": 0.5387, "step": 998 }, { "epoch": 0.3054111892387649, "grad_norm": 2.3856942653656006, "learning_rate": 3.8058846006878105e-06, "loss": 0.5312, "step": 999 }, { "epoch": 0.3057169061449098, "grad_norm": 6.250284671783447, "learning_rate": 3.8097057699656093e-06, "loss": 0.6282, "step": 1000 }, { "epoch": 0.3057169061449098, "eval_cer": 0.21323360937448857, "eval_loss": 0.44768765568733215, "eval_runtime": 19.7618, "eval_samples_per_second": 229.634, "eval_steps_per_second": 0.759, "eval_wer": 0.42121728443339573, "step": 1000 }, { "epoch": 0.3060226230510547, "grad_norm": 0.8415254950523376, "learning_rate": 3.8135269392434085e-06, "loss": 0.2703, "step": 1001 }, { "epoch": 0.30632833995719966, "grad_norm": 1.0004171133041382, "learning_rate": 3.817348108521208e-06, "loss": 0.2348, "step": 1002 }, { "epoch": 0.30663405686334455, "grad_norm": 1.009069800376892, "learning_rate": 3.821169277799006e-06, "loss": 0.1856, "step": 1003 }, { "epoch": 0.30693977376948944, "grad_norm": 0.6723507642745972, "learning_rate": 3.824990447076806e-06, "loss": 0.1765, "step": 1004 }, { "epoch": 0.30724549067563434, "grad_norm": 0.8880953192710876, "learning_rate": 3.828811616354605e-06, "loss": 0.1811, "step": 1005 }, { "epoch": 0.3075512075817793, "grad_norm": 1.200845718383789, "learning_rate": 3.832632785632404e-06, "loss": 0.1552, "step": 1006 }, { "epoch": 0.3078569244879242, "grad_norm": 0.8896739482879639, "learning_rate": 3.836453954910203e-06, "loss": 0.1675, "step": 1007 }, { "epoch": 0.30816264139406907, "grad_norm": 0.7621726989746094, "learning_rate": 3.840275124188002e-06, "loss": 0.1995, "step": 1008 }, { "epoch": 0.308468358300214, "grad_norm": 0.83097243309021, "learning_rate": 3.8440962934658005e-06, "loss": 0.202, "step": 1009 }, { "epoch": 0.3087740752063589, "grad_norm": 0.8738337755203247, "learning_rate": 3.8479174627436e-06, "loss": 0.2153, "step": 1010 }, { "epoch": 0.3090797921125038, "grad_norm": 0.9159551858901978, "learning_rate": 3.851738632021399e-06, "loss": 0.1743, "step": 1011 }, { "epoch": 0.30938550901864875, "grad_norm": 1.198378324508667, "learning_rate": 3.855559801299198e-06, "loss": 0.2208, "step": 1012 }, { "epoch": 0.30969122592479364, "grad_norm": 1.0526041984558105, "learning_rate": 3.859380970576996e-06, "loss": 0.2187, "step": 1013 }, { "epoch": 0.30999694283093854, "grad_norm": 0.9403125643730164, "learning_rate": 3.863202139854796e-06, "loss": 0.2258, "step": 1014 }, { "epoch": 0.3103026597370835, "grad_norm": 0.9867441654205322, "learning_rate": 3.867023309132595e-06, "loss": 0.2674, "step": 1015 }, { "epoch": 0.3106083766432284, "grad_norm": 1.7308610677719116, "learning_rate": 3.8708444784103934e-06, "loss": 0.3138, "step": 1016 }, { "epoch": 0.31091409354937327, "grad_norm": 1.2558552026748657, "learning_rate": 3.874665647688192e-06, "loss": 0.3281, "step": 1017 }, { "epoch": 0.31121981045551816, "grad_norm": 1.2523506879806519, "learning_rate": 3.878486816965992e-06, "loss": 0.3603, "step": 1018 }, { "epoch": 0.3115255273616631, "grad_norm": 2.837400436401367, "learning_rate": 3.8823079862437905e-06, "loss": 0.3768, "step": 1019 }, { "epoch": 0.311831244267808, "grad_norm": 2.156273126602173, "learning_rate": 3.886129155521589e-06, "loss": 0.3512, "step": 1020 }, { "epoch": 0.3121369611739529, "grad_norm": 1.4955203533172607, "learning_rate": 3.889950324799388e-06, "loss": 0.4058, "step": 1021 }, { "epoch": 0.31244267808009785, "grad_norm": 1.6584020853042603, "learning_rate": 3.893771494077188e-06, "loss": 0.4265, "step": 1022 }, { "epoch": 0.31274839498624274, "grad_norm": 2.272995710372925, "learning_rate": 3.897592663354986e-06, "loss": 0.4069, "step": 1023 }, { "epoch": 0.31305411189238763, "grad_norm": 3.419945478439331, "learning_rate": 3.901413832632786e-06, "loss": 0.5492, "step": 1024 }, { "epoch": 0.3133598287985326, "grad_norm": 3.8543241024017334, "learning_rate": 3.905235001910585e-06, "loss": 0.5632, "step": 1025 }, { "epoch": 0.3136655457046775, "grad_norm": 1.2046235799789429, "learning_rate": 3.909056171188384e-06, "loss": 0.2975, "step": 1026 }, { "epoch": 0.31397126261082237, "grad_norm": 0.6379966139793396, "learning_rate": 3.912877340466183e-06, "loss": 0.1702, "step": 1027 }, { "epoch": 0.3142769795169673, "grad_norm": 0.6776479482650757, "learning_rate": 3.916698509743982e-06, "loss": 0.1699, "step": 1028 }, { "epoch": 0.3145826964231122, "grad_norm": 0.6807871460914612, "learning_rate": 3.920519679021781e-06, "loss": 0.1678, "step": 1029 }, { "epoch": 0.3148884133292571, "grad_norm": 0.6433473229408264, "learning_rate": 3.92434084829958e-06, "loss": 0.1539, "step": 1030 }, { "epoch": 0.315194130235402, "grad_norm": 0.7070204615592957, "learning_rate": 3.928162017577379e-06, "loss": 0.1331, "step": 1031 }, { "epoch": 0.31549984714154694, "grad_norm": 0.6519095301628113, "learning_rate": 3.931983186855178e-06, "loss": 0.1649, "step": 1032 }, { "epoch": 0.31580556404769183, "grad_norm": 0.69696444272995, "learning_rate": 3.935804356132977e-06, "loss": 0.1766, "step": 1033 }, { "epoch": 0.3161112809538367, "grad_norm": 1.4875261783599854, "learning_rate": 3.939625525410776e-06, "loss": 0.2003, "step": 1034 }, { "epoch": 0.3164169978599817, "grad_norm": 0.8749998211860657, "learning_rate": 3.943446694688575e-06, "loss": 0.19, "step": 1035 }, { "epoch": 0.31672271476612657, "grad_norm": 1.026098608970642, "learning_rate": 3.947267863966373e-06, "loss": 0.2043, "step": 1036 }, { "epoch": 0.31702843167227146, "grad_norm": 0.7759823203086853, "learning_rate": 3.951089033244173e-06, "loss": 0.1939, "step": 1037 }, { "epoch": 0.3173341485784164, "grad_norm": 1.0083603858947754, "learning_rate": 3.954910202521972e-06, "loss": 0.251, "step": 1038 }, { "epoch": 0.3176398654845613, "grad_norm": 1.5741424560546875, "learning_rate": 3.9587313717997705e-06, "loss": 0.2558, "step": 1039 }, { "epoch": 0.3179455823907062, "grad_norm": 1.0051624774932861, "learning_rate": 3.962552541077569e-06, "loss": 0.2559, "step": 1040 }, { "epoch": 0.31825129929685114, "grad_norm": 1.1902285814285278, "learning_rate": 3.966373710355369e-06, "loss": 0.2575, "step": 1041 }, { "epoch": 0.31855701620299604, "grad_norm": 1.4542866945266724, "learning_rate": 3.970194879633168e-06, "loss": 0.3412, "step": 1042 }, { "epoch": 0.31886273310914093, "grad_norm": 1.434388518333435, "learning_rate": 3.974016048910967e-06, "loss": 0.3705, "step": 1043 }, { "epoch": 0.3191684500152858, "grad_norm": 2.0379223823547363, "learning_rate": 3.977837218188766e-06, "loss": 0.3634, "step": 1044 }, { "epoch": 0.31947416692143077, "grad_norm": 2.016648054122925, "learning_rate": 3.9816583874665655e-06, "loss": 0.3759, "step": 1045 }, { "epoch": 0.31977988382757566, "grad_norm": 1.7251392602920532, "learning_rate": 3.985479556744364e-06, "loss": 0.4286, "step": 1046 }, { "epoch": 0.32008560073372055, "grad_norm": 4.037511348724365, "learning_rate": 3.989300726022163e-06, "loss": 0.4297, "step": 1047 }, { "epoch": 0.3203913176398655, "grad_norm": 1.8641585111618042, "learning_rate": 3.993121895299962e-06, "loss": 0.5292, "step": 1048 }, { "epoch": 0.3206970345460104, "grad_norm": 1.6908762454986572, "learning_rate": 3.996943064577761e-06, "loss": 0.4624, "step": 1049 }, { "epoch": 0.3210027514521553, "grad_norm": 3.1654205322265625, "learning_rate": 4.00076423385556e-06, "loss": 0.6478, "step": 1050 }, { "epoch": 0.32130846835830024, "grad_norm": 0.9773539304733276, "learning_rate": 4.004585403133359e-06, "loss": 0.3266, "step": 1051 }, { "epoch": 0.32161418526444513, "grad_norm": 0.6849073171615601, "learning_rate": 4.0084065724111576e-06, "loss": 0.1681, "step": 1052 }, { "epoch": 0.32191990217059, "grad_norm": 0.5629790425300598, "learning_rate": 4.012227741688957e-06, "loss": 0.1955, "step": 1053 }, { "epoch": 0.32222561907673497, "grad_norm": 0.726783812046051, "learning_rate": 4.016048910966756e-06, "loss": 0.2212, "step": 1054 }, { "epoch": 0.32253133598287986, "grad_norm": 0.7148677706718445, "learning_rate": 4.019870080244555e-06, "loss": 0.1518, "step": 1055 }, { "epoch": 0.32283705288902476, "grad_norm": 0.7914243936538696, "learning_rate": 4.023691249522353e-06, "loss": 0.1819, "step": 1056 }, { "epoch": 0.32314276979516965, "grad_norm": 0.8621677756309509, "learning_rate": 4.027512418800153e-06, "loss": 0.1755, "step": 1057 }, { "epoch": 0.3234484867013146, "grad_norm": 0.6665903329849243, "learning_rate": 4.031333588077952e-06, "loss": 0.1517, "step": 1058 }, { "epoch": 0.3237542036074595, "grad_norm": 0.8795692920684814, "learning_rate": 4.0351547573557505e-06, "loss": 0.1619, "step": 1059 }, { "epoch": 0.3240599205136044, "grad_norm": 0.7709496021270752, "learning_rate": 4.03897592663355e-06, "loss": 0.1688, "step": 1060 }, { "epoch": 0.32436563741974933, "grad_norm": 1.1102931499481201, "learning_rate": 4.042797095911349e-06, "loss": 0.2338, "step": 1061 }, { "epoch": 0.3246713543258942, "grad_norm": 0.8988953828811646, "learning_rate": 4.0466182651891476e-06, "loss": 0.224, "step": 1062 }, { "epoch": 0.3249770712320391, "grad_norm": 2.1740994453430176, "learning_rate": 4.050439434466947e-06, "loss": 0.233, "step": 1063 }, { "epoch": 0.32528278813818406, "grad_norm": 1.3498635292053223, "learning_rate": 4.054260603744747e-06, "loss": 0.2368, "step": 1064 }, { "epoch": 0.32558850504432896, "grad_norm": 2.5414867401123047, "learning_rate": 4.0580817730225455e-06, "loss": 0.2432, "step": 1065 }, { "epoch": 0.32589422195047385, "grad_norm": 1.7503288984298706, "learning_rate": 4.061902942300344e-06, "loss": 0.2999, "step": 1066 }, { "epoch": 0.3261999388566188, "grad_norm": 1.8839746713638306, "learning_rate": 4.065724111578143e-06, "loss": 0.3283, "step": 1067 }, { "epoch": 0.3265056557627637, "grad_norm": 1.4148657321929932, "learning_rate": 4.0695452808559426e-06, "loss": 0.3461, "step": 1068 }, { "epoch": 0.3268113726689086, "grad_norm": 1.5503294467926025, "learning_rate": 4.073366450133741e-06, "loss": 0.341, "step": 1069 }, { "epoch": 0.3271170895750535, "grad_norm": 1.3923650979995728, "learning_rate": 4.07718761941154e-06, "loss": 0.3582, "step": 1070 }, { "epoch": 0.3274228064811984, "grad_norm": 1.3900295495986938, "learning_rate": 4.081008788689339e-06, "loss": 0.3496, "step": 1071 }, { "epoch": 0.3277285233873433, "grad_norm": 2.051630735397339, "learning_rate": 4.084829957967138e-06, "loss": 0.4887, "step": 1072 }, { "epoch": 0.3280342402934882, "grad_norm": 1.7488216161727905, "learning_rate": 4.088651127244937e-06, "loss": 0.4455, "step": 1073 }, { "epoch": 0.32833995719963316, "grad_norm": 2.6405110359191895, "learning_rate": 4.092472296522736e-06, "loss": 0.4877, "step": 1074 }, { "epoch": 0.32864567410577805, "grad_norm": 3.600313425064087, "learning_rate": 4.096293465800535e-06, "loss": 0.591, "step": 1075 }, { "epoch": 0.32895139101192294, "grad_norm": 1.3394747972488403, "learning_rate": 4.100114635078334e-06, "loss": 0.3278, "step": 1076 }, { "epoch": 0.3292571079180679, "grad_norm": 1.4324204921722412, "learning_rate": 4.103935804356133e-06, "loss": 0.1751, "step": 1077 }, { "epoch": 0.3295628248242128, "grad_norm": 0.7233107089996338, "learning_rate": 4.107756973633932e-06, "loss": 0.1534, "step": 1078 }, { "epoch": 0.3298685417303577, "grad_norm": 0.9019874930381775, "learning_rate": 4.1115781429117304e-06, "loss": 0.1545, "step": 1079 }, { "epoch": 0.33017425863650257, "grad_norm": 0.5924026370048523, "learning_rate": 4.11539931218953e-06, "loss": 0.1597, "step": 1080 }, { "epoch": 0.3304799755426475, "grad_norm": 0.672450602054596, "learning_rate": 4.119220481467329e-06, "loss": 0.13, "step": 1081 }, { "epoch": 0.3307856924487924, "grad_norm": 2.211538553237915, "learning_rate": 4.123041650745128e-06, "loss": 0.183, "step": 1082 }, { "epoch": 0.3310914093549373, "grad_norm": 2.2014520168304443, "learning_rate": 4.126862820022927e-06, "loss": 0.1842, "step": 1083 }, { "epoch": 0.33139712626108225, "grad_norm": 0.7176119685173035, "learning_rate": 4.130683989300727e-06, "loss": 0.1588, "step": 1084 }, { "epoch": 0.33170284316722715, "grad_norm": 0.9973981976509094, "learning_rate": 4.1345051585785255e-06, "loss": 0.2206, "step": 1085 }, { "epoch": 0.33200856007337204, "grad_norm": 0.81184321641922, "learning_rate": 4.138326327856324e-06, "loss": 0.1835, "step": 1086 }, { "epoch": 0.332314276979517, "grad_norm": 1.1753523349761963, "learning_rate": 4.142147497134123e-06, "loss": 0.2241, "step": 1087 }, { "epoch": 0.3326199938856619, "grad_norm": 1.019192099571228, "learning_rate": 4.1459686664119225e-06, "loss": 0.2062, "step": 1088 }, { "epoch": 0.3329257107918068, "grad_norm": 0.9956881999969482, "learning_rate": 4.149789835689721e-06, "loss": 0.2578, "step": 1089 }, { "epoch": 0.3332314276979517, "grad_norm": 0.9867995977401733, "learning_rate": 4.15361100496752e-06, "loss": 0.2768, "step": 1090 }, { "epoch": 0.3335371446040966, "grad_norm": 2.218087673187256, "learning_rate": 4.157432174245319e-06, "loss": 0.2893, "step": 1091 }, { "epoch": 0.3338428615102415, "grad_norm": 1.3115246295928955, "learning_rate": 4.161253343523118e-06, "loss": 0.3277, "step": 1092 }, { "epoch": 0.3341485784163864, "grad_norm": 2.1156673431396484, "learning_rate": 4.165074512800917e-06, "loss": 0.3407, "step": 1093 }, { "epoch": 0.33445429532253135, "grad_norm": 1.7131067514419556, "learning_rate": 4.168895682078716e-06, "loss": 0.3004, "step": 1094 }, { "epoch": 0.33476001222867624, "grad_norm": 1.9218730926513672, "learning_rate": 4.1727168513565154e-06, "loss": 0.3867, "step": 1095 }, { "epoch": 0.33506572913482113, "grad_norm": 2.220580577850342, "learning_rate": 4.176538020634314e-06, "loss": 0.4201, "step": 1096 }, { "epoch": 0.3353714460409661, "grad_norm": 2.000875473022461, "learning_rate": 4.180359189912113e-06, "loss": 0.3985, "step": 1097 }, { "epoch": 0.335677162947111, "grad_norm": 1.9349753856658936, "learning_rate": 4.184180359189912e-06, "loss": 0.4526, "step": 1098 }, { "epoch": 0.33598287985325587, "grad_norm": 2.833981990814209, "learning_rate": 4.188001528467711e-06, "loss": 0.4895, "step": 1099 }, { "epoch": 0.3362885967594008, "grad_norm": 2.7497096061706543, "learning_rate": 4.19182269774551e-06, "loss": 0.5825, "step": 1100 }, { "epoch": 0.3365943136655457, "grad_norm": 1.1093322038650513, "learning_rate": 4.19564386702331e-06, "loss": 0.2779, "step": 1101 }, { "epoch": 0.3369000305716906, "grad_norm": 0.7151468396186829, "learning_rate": 4.199465036301108e-06, "loss": 0.2334, "step": 1102 }, { "epoch": 0.33720574747783555, "grad_norm": 0.9146250486373901, "learning_rate": 4.203286205578908e-06, "loss": 0.1542, "step": 1103 }, { "epoch": 0.33751146438398044, "grad_norm": 0.8436581492424011, "learning_rate": 4.207107374856707e-06, "loss": 0.1261, "step": 1104 }, { "epoch": 0.33781718129012533, "grad_norm": 0.695248007774353, "learning_rate": 4.2109285441345054e-06, "loss": 0.1424, "step": 1105 }, { "epoch": 0.3381228981962702, "grad_norm": 0.8848037123680115, "learning_rate": 4.214749713412304e-06, "loss": 0.164, "step": 1106 }, { "epoch": 0.3384286151024152, "grad_norm": 0.6781333684921265, "learning_rate": 4.218570882690104e-06, "loss": 0.1293, "step": 1107 }, { "epoch": 0.33873433200856007, "grad_norm": 0.8748390078544617, "learning_rate": 4.2223920519679025e-06, "loss": 0.1714, "step": 1108 }, { "epoch": 0.33904004891470496, "grad_norm": 0.8061756491661072, "learning_rate": 4.226213221245701e-06, "loss": 0.1569, "step": 1109 }, { "epoch": 0.3393457658208499, "grad_norm": 0.9454506039619446, "learning_rate": 4.2300343905235e-06, "loss": 0.2055, "step": 1110 }, { "epoch": 0.3396514827269948, "grad_norm": 0.847995400428772, "learning_rate": 4.2338555598013e-06, "loss": 0.1854, "step": 1111 }, { "epoch": 0.3399571996331397, "grad_norm": 0.6679621934890747, "learning_rate": 4.237676729079098e-06, "loss": 0.1578, "step": 1112 }, { "epoch": 0.34026291653928464, "grad_norm": 1.1085615158081055, "learning_rate": 4.241497898356897e-06, "loss": 0.2692, "step": 1113 }, { "epoch": 0.34056863344542954, "grad_norm": 2.481106758117676, "learning_rate": 4.245319067634696e-06, "loss": 0.2288, "step": 1114 }, { "epoch": 0.34087435035157443, "grad_norm": 1.167162299156189, "learning_rate": 4.249140236912495e-06, "loss": 0.2294, "step": 1115 }, { "epoch": 0.3411800672577194, "grad_norm": 1.6012628078460693, "learning_rate": 4.252961406190294e-06, "loss": 0.2898, "step": 1116 }, { "epoch": 0.34148578416386427, "grad_norm": 1.1955419778823853, "learning_rate": 4.256782575468093e-06, "loss": 0.2829, "step": 1117 }, { "epoch": 0.34179150107000916, "grad_norm": 1.400905728340149, "learning_rate": 4.260603744745892e-06, "loss": 0.3057, "step": 1118 }, { "epoch": 0.34209721797615406, "grad_norm": 1.4307470321655273, "learning_rate": 4.264424914023691e-06, "loss": 0.3267, "step": 1119 }, { "epoch": 0.342402934882299, "grad_norm": 2.791585683822632, "learning_rate": 4.26824608330149e-06, "loss": 0.3661, "step": 1120 }, { "epoch": 0.3427086517884439, "grad_norm": 9.107033729553223, "learning_rate": 4.27206725257929e-06, "loss": 0.4289, "step": 1121 }, { "epoch": 0.3430143686945888, "grad_norm": 4.044131278991699, "learning_rate": 4.275888421857088e-06, "loss": 0.4214, "step": 1122 }, { "epoch": 0.34332008560073374, "grad_norm": 1.9075394868850708, "learning_rate": 4.279709591134888e-06, "loss": 0.4047, "step": 1123 }, { "epoch": 0.34362580250687863, "grad_norm": 2.836144208908081, "learning_rate": 4.283530760412687e-06, "loss": 0.5156, "step": 1124 }, { "epoch": 0.3439315194130235, "grad_norm": 2.5184106826782227, "learning_rate": 4.287351929690485e-06, "loss": 0.5951, "step": 1125 }, { "epoch": 0.34423723631916847, "grad_norm": 0.9171217679977417, "learning_rate": 4.291173098968285e-06, "loss": 0.2705, "step": 1126 }, { "epoch": 0.34454295322531336, "grad_norm": 0.5947016477584839, "learning_rate": 4.294994268246084e-06, "loss": 0.1875, "step": 1127 }, { "epoch": 0.34484867013145826, "grad_norm": 0.7303478121757507, "learning_rate": 4.2988154375238825e-06, "loss": 0.1621, "step": 1128 }, { "epoch": 0.3451543870376032, "grad_norm": 0.6008920669555664, "learning_rate": 4.302636606801681e-06, "loss": 0.149, "step": 1129 }, { "epoch": 0.3454601039437481, "grad_norm": 0.8052994608879089, "learning_rate": 4.306457776079481e-06, "loss": 0.1959, "step": 1130 }, { "epoch": 0.345765820849893, "grad_norm": 0.6567323803901672, "learning_rate": 4.3102789453572796e-06, "loss": 0.1683, "step": 1131 }, { "epoch": 0.3460715377560379, "grad_norm": 1.3648227453231812, "learning_rate": 4.314100114635078e-06, "loss": 0.2163, "step": 1132 }, { "epoch": 0.34637725466218283, "grad_norm": 0.8216473460197449, "learning_rate": 4.317921283912877e-06, "loss": 0.164, "step": 1133 }, { "epoch": 0.3466829715683277, "grad_norm": 0.7333937883377075, "learning_rate": 4.321742453190677e-06, "loss": 0.1698, "step": 1134 }, { "epoch": 0.3469886884744726, "grad_norm": 0.7512550354003906, "learning_rate": 4.325563622468475e-06, "loss": 0.1711, "step": 1135 }, { "epoch": 0.34729440538061757, "grad_norm": 0.6834243535995483, "learning_rate": 4.329384791746274e-06, "loss": 0.162, "step": 1136 }, { "epoch": 0.34760012228676246, "grad_norm": 0.8810251951217651, "learning_rate": 4.333205961024073e-06, "loss": 0.21, "step": 1137 }, { "epoch": 0.34790583919290735, "grad_norm": 1.0049279928207397, "learning_rate": 4.3370271303018725e-06, "loss": 0.2293, "step": 1138 }, { "epoch": 0.3482115560990523, "grad_norm": 1.8311522006988525, "learning_rate": 4.340848299579671e-06, "loss": 0.2556, "step": 1139 }, { "epoch": 0.3485172730051972, "grad_norm": 1.1780295372009277, "learning_rate": 4.344669468857471e-06, "loss": 0.2703, "step": 1140 }, { "epoch": 0.3488229899113421, "grad_norm": 2.1878504753112793, "learning_rate": 4.3484906381352696e-06, "loss": 0.2708, "step": 1141 }, { "epoch": 0.34912870681748703, "grad_norm": 1.6044468879699707, "learning_rate": 4.352311807413069e-06, "loss": 0.3028, "step": 1142 }, { "epoch": 0.3494344237236319, "grad_norm": 1.3686617612838745, "learning_rate": 4.356132976690868e-06, "loss": 0.3491, "step": 1143 }, { "epoch": 0.3497401406297768, "grad_norm": 1.4625345468521118, "learning_rate": 4.359954145968667e-06, "loss": 0.3583, "step": 1144 }, { "epoch": 0.3500458575359217, "grad_norm": 1.625711441040039, "learning_rate": 4.363775315246465e-06, "loss": 0.3969, "step": 1145 }, { "epoch": 0.35035157444206666, "grad_norm": 1.6420135498046875, "learning_rate": 4.367596484524265e-06, "loss": 0.3894, "step": 1146 }, { "epoch": 0.35065729134821155, "grad_norm": 1.8871665000915527, "learning_rate": 4.371417653802064e-06, "loss": 0.4036, "step": 1147 }, { "epoch": 0.35096300825435645, "grad_norm": 3.2323544025421143, "learning_rate": 4.3752388230798625e-06, "loss": 0.4323, "step": 1148 }, { "epoch": 0.3512687251605014, "grad_norm": 2.3272202014923096, "learning_rate": 4.379059992357661e-06, "loss": 0.5179, "step": 1149 }, { "epoch": 0.3515744420666463, "grad_norm": 4.4944963455200195, "learning_rate": 4.382881161635461e-06, "loss": 0.6009, "step": 1150 }, { "epoch": 0.3518801589727912, "grad_norm": 1.019527554512024, "learning_rate": 4.3867023309132595e-06, "loss": 0.2907, "step": 1151 }, { "epoch": 0.3521858758789361, "grad_norm": 0.7130956649780273, "learning_rate": 4.390523500191058e-06, "loss": 0.1949, "step": 1152 }, { "epoch": 0.352491592785081, "grad_norm": 0.6597585082054138, "learning_rate": 4.394344669468857e-06, "loss": 0.1656, "step": 1153 }, { "epoch": 0.3527973096912259, "grad_norm": 0.5587711334228516, "learning_rate": 4.398165838746657e-06, "loss": 0.1528, "step": 1154 }, { "epoch": 0.35310302659737086, "grad_norm": 0.6974244713783264, "learning_rate": 4.401987008024455e-06, "loss": 0.1723, "step": 1155 }, { "epoch": 0.35340874350351575, "grad_norm": 2.3464181423187256, "learning_rate": 4.405808177302254e-06, "loss": 0.1411, "step": 1156 }, { "epoch": 0.35371446040966065, "grad_norm": 0.7518149018287659, "learning_rate": 4.409629346580054e-06, "loss": 0.1236, "step": 1157 }, { "epoch": 0.35402017731580554, "grad_norm": 0.9952051043510437, "learning_rate": 4.4134505158578524e-06, "loss": 0.2237, "step": 1158 }, { "epoch": 0.3543258942219505, "grad_norm": 0.9834770560264587, "learning_rate": 4.417271685135651e-06, "loss": 0.1668, "step": 1159 }, { "epoch": 0.3546316111280954, "grad_norm": 1.851924180984497, "learning_rate": 4.421092854413451e-06, "loss": 0.187, "step": 1160 }, { "epoch": 0.3549373280342403, "grad_norm": 0.8051882386207581, "learning_rate": 4.42491402369125e-06, "loss": 0.2206, "step": 1161 }, { "epoch": 0.3552430449403852, "grad_norm": 2.004366636276245, "learning_rate": 4.428735192969049e-06, "loss": 0.2111, "step": 1162 }, { "epoch": 0.3555487618465301, "grad_norm": 0.8372135162353516, "learning_rate": 4.432556362246848e-06, "loss": 0.2049, "step": 1163 }, { "epoch": 0.355854478752675, "grad_norm": 0.9269620180130005, "learning_rate": 4.436377531524647e-06, "loss": 0.2359, "step": 1164 }, { "epoch": 0.35616019565881996, "grad_norm": 0.8397481441497803, "learning_rate": 4.440198700802446e-06, "loss": 0.2334, "step": 1165 }, { "epoch": 0.35646591256496485, "grad_norm": 0.9950135350227356, "learning_rate": 4.444019870080245e-06, "loss": 0.2583, "step": 1166 }, { "epoch": 0.35677162947110974, "grad_norm": 1.252647876739502, "learning_rate": 4.447841039358044e-06, "loss": 0.3058, "step": 1167 }, { "epoch": 0.3570773463772547, "grad_norm": 1.8272062540054321, "learning_rate": 4.4516622086358424e-06, "loss": 0.313, "step": 1168 }, { "epoch": 0.3573830632833996, "grad_norm": 1.408332109451294, "learning_rate": 4.455483377913642e-06, "loss": 0.3409, "step": 1169 }, { "epoch": 0.3576887801895445, "grad_norm": 1.6306763887405396, "learning_rate": 4.459304547191441e-06, "loss": 0.3515, "step": 1170 }, { "epoch": 0.35799449709568937, "grad_norm": 2.2903521060943604, "learning_rate": 4.4631257164692395e-06, "loss": 0.4033, "step": 1171 }, { "epoch": 0.3583002140018343, "grad_norm": 1.9211477041244507, "learning_rate": 4.466946885747038e-06, "loss": 0.3847, "step": 1172 }, { "epoch": 0.3586059309079792, "grad_norm": 2.318091630935669, "learning_rate": 4.470768055024838e-06, "loss": 0.4541, "step": 1173 }, { "epoch": 0.3589116478141241, "grad_norm": 2.068183183670044, "learning_rate": 4.474589224302637e-06, "loss": 0.555, "step": 1174 }, { "epoch": 0.35921736472026905, "grad_norm": 3.0729846954345703, "learning_rate": 4.478410393580435e-06, "loss": 0.5953, "step": 1175 }, { "epoch": 0.35952308162641394, "grad_norm": 0.7593841552734375, "learning_rate": 4.482231562858234e-06, "loss": 0.2732, "step": 1176 }, { "epoch": 0.35982879853255884, "grad_norm": 0.5460600256919861, "learning_rate": 4.486052732136034e-06, "loss": 0.1496, "step": 1177 }, { "epoch": 0.3601345154387038, "grad_norm": 0.776069700717926, "learning_rate": 4.4898739014138324e-06, "loss": 0.1748, "step": 1178 }, { "epoch": 0.3604402323448487, "grad_norm": 0.6357946991920471, "learning_rate": 4.493695070691632e-06, "loss": 0.1481, "step": 1179 }, { "epoch": 0.36074594925099357, "grad_norm": 0.6793935894966125, "learning_rate": 4.497516239969431e-06, "loss": 0.1284, "step": 1180 }, { "epoch": 0.36105166615713846, "grad_norm": 0.7535185217857361, "learning_rate": 4.50133740924723e-06, "loss": 0.1293, "step": 1181 }, { "epoch": 0.3613573830632834, "grad_norm": 0.637328565120697, "learning_rate": 4.505158578525029e-06, "loss": 0.1343, "step": 1182 }, { "epoch": 0.3616630999694283, "grad_norm": 0.9011083841323853, "learning_rate": 4.508979747802828e-06, "loss": 0.1902, "step": 1183 }, { "epoch": 0.3619688168755732, "grad_norm": 0.7913046479225159, "learning_rate": 4.512800917080627e-06, "loss": 0.1781, "step": 1184 }, { "epoch": 0.36227453378171814, "grad_norm": 0.8933420777320862, "learning_rate": 4.516622086358426e-06, "loss": 0.1995, "step": 1185 }, { "epoch": 0.36258025068786304, "grad_norm": 0.7659948468208313, "learning_rate": 4.520443255636225e-06, "loss": 0.158, "step": 1186 }, { "epoch": 0.36288596759400793, "grad_norm": 1.0395658016204834, "learning_rate": 4.524264424914024e-06, "loss": 0.2086, "step": 1187 }, { "epoch": 0.3631916845001529, "grad_norm": 1.2591477632522583, "learning_rate": 4.528085594191822e-06, "loss": 0.2109, "step": 1188 }, { "epoch": 0.36349740140629777, "grad_norm": 1.366073489189148, "learning_rate": 4.531906763469622e-06, "loss": 0.2713, "step": 1189 }, { "epoch": 0.36380311831244266, "grad_norm": 1.0495946407318115, "learning_rate": 4.535727932747421e-06, "loss": 0.2742, "step": 1190 }, { "epoch": 0.3641088352185876, "grad_norm": 2.097355842590332, "learning_rate": 4.5395491020252195e-06, "loss": 0.2989, "step": 1191 }, { "epoch": 0.3644145521247325, "grad_norm": 4.305901527404785, "learning_rate": 4.543370271303019e-06, "loss": 0.3166, "step": 1192 }, { "epoch": 0.3647202690308774, "grad_norm": 1.4982064962387085, "learning_rate": 4.547191440580818e-06, "loss": 0.3717, "step": 1193 }, { "epoch": 0.3650259859370223, "grad_norm": 1.4732321500778198, "learning_rate": 4.5510126098586166e-06, "loss": 0.351, "step": 1194 }, { "epoch": 0.36533170284316724, "grad_norm": 1.4862436056137085, "learning_rate": 4.554833779136415e-06, "loss": 0.3883, "step": 1195 }, { "epoch": 0.36563741974931213, "grad_norm": 1.484092116355896, "learning_rate": 4.558654948414215e-06, "loss": 0.4332, "step": 1196 }, { "epoch": 0.365943136655457, "grad_norm": 1.7387186288833618, "learning_rate": 4.562476117692014e-06, "loss": 0.3552, "step": 1197 }, { "epoch": 0.366248853561602, "grad_norm": 2.1320910453796387, "learning_rate": 4.566297286969812e-06, "loss": 0.4179, "step": 1198 }, { "epoch": 0.36655457046774687, "grad_norm": 2.523535966873169, "learning_rate": 4.570118456247612e-06, "loss": 0.4558, "step": 1199 }, { "epoch": 0.36686028737389176, "grad_norm": 2.65391206741333, "learning_rate": 4.573939625525412e-06, "loss": 0.5766, "step": 1200 }, { "epoch": 0.3671660042800367, "grad_norm": 1.5112229585647583, "learning_rate": 4.57776079480321e-06, "loss": 0.2372, "step": 1201 }, { "epoch": 0.3674717211861816, "grad_norm": 0.6302282810211182, "learning_rate": 4.581581964081009e-06, "loss": 0.1654, "step": 1202 }, { "epoch": 0.3677774380923265, "grad_norm": 1.6004841327667236, "learning_rate": 4.585403133358808e-06, "loss": 0.1819, "step": 1203 }, { "epoch": 0.36808315499847144, "grad_norm": 1.3499172925949097, "learning_rate": 4.589224302636607e-06, "loss": 0.1512, "step": 1204 }, { "epoch": 0.36838887190461633, "grad_norm": 0.5445764064788818, "learning_rate": 4.593045471914406e-06, "loss": 0.1307, "step": 1205 }, { "epoch": 0.3686945888107612, "grad_norm": 0.7139901518821716, "learning_rate": 4.596866641192205e-06, "loss": 0.1537, "step": 1206 }, { "epoch": 0.3690003057169061, "grad_norm": 0.652698814868927, "learning_rate": 4.600687810470004e-06, "loss": 0.1317, "step": 1207 }, { "epoch": 0.36930602262305107, "grad_norm": 0.6621175408363342, "learning_rate": 4.604508979747803e-06, "loss": 0.1396, "step": 1208 }, { "epoch": 0.36961173952919596, "grad_norm": 2.3930506706237793, "learning_rate": 4.608330149025602e-06, "loss": 0.2034, "step": 1209 }, { "epoch": 0.36991745643534085, "grad_norm": 0.5879676342010498, "learning_rate": 4.612151318303401e-06, "loss": 0.179, "step": 1210 }, { "epoch": 0.3702231733414858, "grad_norm": 0.8301270008087158, "learning_rate": 4.6159724875811995e-06, "loss": 0.216, "step": 1211 }, { "epoch": 0.3705288902476307, "grad_norm": 0.9897774457931519, "learning_rate": 4.619793656858999e-06, "loss": 0.1584, "step": 1212 }, { "epoch": 0.3708346071537756, "grad_norm": 0.8671888113021851, "learning_rate": 4.623614826136798e-06, "loss": 0.2009, "step": 1213 }, { "epoch": 0.37114032405992053, "grad_norm": 1.2332406044006348, "learning_rate": 4.6274359954145965e-06, "loss": 0.2442, "step": 1214 }, { "epoch": 0.3714460409660654, "grad_norm": 1.0080281496047974, "learning_rate": 4.631257164692395e-06, "loss": 0.2362, "step": 1215 }, { "epoch": 0.3717517578722103, "grad_norm": 1.6309752464294434, "learning_rate": 4.635078333970195e-06, "loss": 0.2614, "step": 1216 }, { "epoch": 0.37205747477835527, "grad_norm": 2.336900472640991, "learning_rate": 4.638899503247994e-06, "loss": 0.299, "step": 1217 }, { "epoch": 0.37236319168450016, "grad_norm": 1.335769772529602, "learning_rate": 4.642720672525793e-06, "loss": 0.3456, "step": 1218 }, { "epoch": 0.37266890859064505, "grad_norm": 1.6250522136688232, "learning_rate": 4.646541841803592e-06, "loss": 0.316, "step": 1219 }, { "epoch": 0.37297462549678995, "grad_norm": 1.1951230764389038, "learning_rate": 4.6503630110813916e-06, "loss": 0.3646, "step": 1220 }, { "epoch": 0.3732803424029349, "grad_norm": 1.4477193355560303, "learning_rate": 4.65418418035919e-06, "loss": 0.3933, "step": 1221 }, { "epoch": 0.3735860593090798, "grad_norm": 2.3576576709747314, "learning_rate": 4.658005349636989e-06, "loss": 0.4151, "step": 1222 }, { "epoch": 0.3738917762152247, "grad_norm": 2.3455891609191895, "learning_rate": 4.661826518914789e-06, "loss": 0.4628, "step": 1223 }, { "epoch": 0.37419749312136963, "grad_norm": 3.121619939804077, "learning_rate": 4.665647688192587e-06, "loss": 0.451, "step": 1224 }, { "epoch": 0.3745032100275145, "grad_norm": 5.18320894241333, "learning_rate": 4.669468857470386e-06, "loss": 0.6167, "step": 1225 }, { "epoch": 0.3748089269336594, "grad_norm": 0.9331083297729492, "learning_rate": 4.673290026748185e-06, "loss": 0.2915, "step": 1226 }, { "epoch": 0.37511464383980436, "grad_norm": 0.6090571880340576, "learning_rate": 4.6771111960259845e-06, "loss": 0.1623, "step": 1227 }, { "epoch": 0.37542036074594926, "grad_norm": 0.5480424165725708, "learning_rate": 4.680932365303783e-06, "loss": 0.1677, "step": 1228 }, { "epoch": 0.37572607765209415, "grad_norm": 0.6573004722595215, "learning_rate": 4.684753534581582e-06, "loss": 0.1364, "step": 1229 }, { "epoch": 0.3760317945582391, "grad_norm": 0.573586642742157, "learning_rate": 4.688574703859381e-06, "loss": 0.1303, "step": 1230 }, { "epoch": 0.376337511464384, "grad_norm": 0.7404967546463013, "learning_rate": 4.69239587313718e-06, "loss": 0.1454, "step": 1231 }, { "epoch": 0.3766432283705289, "grad_norm": 0.6259741187095642, "learning_rate": 4.696217042414979e-06, "loss": 0.1344, "step": 1232 }, { "epoch": 0.3769489452766738, "grad_norm": 0.6448951363563538, "learning_rate": 4.700038211692778e-06, "loss": 0.1754, "step": 1233 }, { "epoch": 0.3772546621828187, "grad_norm": 0.6660234928131104, "learning_rate": 4.7038593809705765e-06, "loss": 0.1477, "step": 1234 }, { "epoch": 0.3775603790889636, "grad_norm": 0.7363557815551758, "learning_rate": 4.707680550248376e-06, "loss": 0.1984, "step": 1235 }, { "epoch": 0.3778660959951085, "grad_norm": 1.0360037088394165, "learning_rate": 4.711501719526175e-06, "loss": 0.1789, "step": 1236 }, { "epoch": 0.37817181290125346, "grad_norm": 0.7956397533416748, "learning_rate": 4.715322888803974e-06, "loss": 0.2025, "step": 1237 }, { "epoch": 0.37847752980739835, "grad_norm": 0.9116426110267639, "learning_rate": 4.719144058081773e-06, "loss": 0.2078, "step": 1238 }, { "epoch": 0.37878324671354324, "grad_norm": 1.4188889265060425, "learning_rate": 4.722965227359573e-06, "loss": 0.2724, "step": 1239 }, { "epoch": 0.3790889636196882, "grad_norm": 0.9818515181541443, "learning_rate": 4.7267863966373715e-06, "loss": 0.2228, "step": 1240 }, { "epoch": 0.3793946805258331, "grad_norm": 0.9690314531326294, "learning_rate": 4.73060756591517e-06, "loss": 0.2944, "step": 1241 }, { "epoch": 0.379700397431978, "grad_norm": 1.0667005777359009, "learning_rate": 4.734428735192969e-06, "loss": 0.2757, "step": 1242 }, { "epoch": 0.3800061143381229, "grad_norm": 1.805499792098999, "learning_rate": 4.738249904470769e-06, "loss": 0.3291, "step": 1243 }, { "epoch": 0.3803118312442678, "grad_norm": 2.141906976699829, "learning_rate": 4.742071073748567e-06, "loss": 0.3119, "step": 1244 }, { "epoch": 0.3806175481504127, "grad_norm": 1.2072532176971436, "learning_rate": 4.745892243026366e-06, "loss": 0.3759, "step": 1245 }, { "epoch": 0.3809232650565576, "grad_norm": 1.837446689605713, "learning_rate": 4.749713412304165e-06, "loss": 0.4293, "step": 1246 }, { "epoch": 0.38122898196270255, "grad_norm": 10.997689247131348, "learning_rate": 4.7535345815819644e-06, "loss": 0.4032, "step": 1247 }, { "epoch": 0.38153469886884744, "grad_norm": 2.047083854675293, "learning_rate": 4.757355750859763e-06, "loss": 0.4073, "step": 1248 }, { "epoch": 0.38184041577499234, "grad_norm": 1.693834662437439, "learning_rate": 4.761176920137562e-06, "loss": 0.4536, "step": 1249 }, { "epoch": 0.3821461326811373, "grad_norm": 3.0123660564422607, "learning_rate": 4.764998089415361e-06, "loss": 0.5747, "step": 1250 }, { "epoch": 0.3824518495872822, "grad_norm": 0.966464102268219, "learning_rate": 4.76881925869316e-06, "loss": 0.2484, "step": 1251 }, { "epoch": 0.38275756649342707, "grad_norm": 0.6172966957092285, "learning_rate": 4.772640427970959e-06, "loss": 0.1966, "step": 1252 }, { "epoch": 0.383063283399572, "grad_norm": 0.8758047223091125, "learning_rate": 4.776461597248758e-06, "loss": 0.1782, "step": 1253 }, { "epoch": 0.3833690003057169, "grad_norm": 0.6286633610725403, "learning_rate": 4.780282766526557e-06, "loss": 0.1519, "step": 1254 }, { "epoch": 0.3836747172118618, "grad_norm": 0.7264947891235352, "learning_rate": 4.784103935804356e-06, "loss": 0.165, "step": 1255 }, { "epoch": 0.38398043411800675, "grad_norm": 0.7599061727523804, "learning_rate": 4.787925105082155e-06, "loss": 0.1416, "step": 1256 }, { "epoch": 0.38428615102415165, "grad_norm": 0.4763408303260803, "learning_rate": 4.7917462743599544e-06, "loss": 0.1296, "step": 1257 }, { "epoch": 0.38459186793029654, "grad_norm": 0.9735826849937439, "learning_rate": 4.795567443637754e-06, "loss": 0.1672, "step": 1258 }, { "epoch": 0.38489758483644143, "grad_norm": 1.1784754991531372, "learning_rate": 4.799388612915553e-06, "loss": 0.2141, "step": 1259 }, { "epoch": 0.3852033017425864, "grad_norm": 0.5954272150993347, "learning_rate": 4.8032097821933515e-06, "loss": 0.1299, "step": 1260 }, { "epoch": 0.38550901864873127, "grad_norm": 0.9857587814331055, "learning_rate": 4.80703095147115e-06, "loss": 0.1962, "step": 1261 }, { "epoch": 0.38581473555487616, "grad_norm": 0.8069046139717102, "learning_rate": 4.81085212074895e-06, "loss": 0.1764, "step": 1262 }, { "epoch": 0.3861204524610211, "grad_norm": 0.7979682087898254, "learning_rate": 4.814673290026749e-06, "loss": 0.2051, "step": 1263 }, { "epoch": 0.386426169367166, "grad_norm": 0.8909856081008911, "learning_rate": 4.818494459304547e-06, "loss": 0.2215, "step": 1264 }, { "epoch": 0.3867318862733109, "grad_norm": 0.7755007147789001, "learning_rate": 4.822315628582346e-06, "loss": 0.2247, "step": 1265 }, { "epoch": 0.38703760317945585, "grad_norm": 1.0999784469604492, "learning_rate": 4.826136797860146e-06, "loss": 0.2763, "step": 1266 }, { "epoch": 0.38734332008560074, "grad_norm": 1.3482717275619507, "learning_rate": 4.829957967137944e-06, "loss": 0.3282, "step": 1267 }, { "epoch": 0.38764903699174563, "grad_norm": 1.530321717262268, "learning_rate": 4.833779136415743e-06, "loss": 0.3597, "step": 1268 }, { "epoch": 0.3879547538978906, "grad_norm": 1.3318259716033936, "learning_rate": 4.837600305693542e-06, "loss": 0.3259, "step": 1269 }, { "epoch": 0.3882604708040355, "grad_norm": 1.4329966306686401, "learning_rate": 4.8414214749713415e-06, "loss": 0.3545, "step": 1270 }, { "epoch": 0.38856618771018037, "grad_norm": 1.8052146434783936, "learning_rate": 4.84524264424914e-06, "loss": 0.3543, "step": 1271 }, { "epoch": 0.38887190461632526, "grad_norm": 1.7306065559387207, "learning_rate": 4.849063813526939e-06, "loss": 0.4046, "step": 1272 }, { "epoch": 0.3891776215224702, "grad_norm": 2.1445608139038086, "learning_rate": 4.852884982804738e-06, "loss": 0.4291, "step": 1273 }, { "epoch": 0.3894833384286151, "grad_norm": 3.239980459213257, "learning_rate": 4.856706152082537e-06, "loss": 0.4113, "step": 1274 }, { "epoch": 0.38978905533476, "grad_norm": 3.9969916343688965, "learning_rate": 4.860527321360336e-06, "loss": 0.5835, "step": 1275 }, { "epoch": 0.39009477224090494, "grad_norm": 0.7067741751670837, "learning_rate": 4.864348490638136e-06, "loss": 0.2788, "step": 1276 }, { "epoch": 0.39040048914704983, "grad_norm": 0.5166637301445007, "learning_rate": 4.868169659915934e-06, "loss": 0.1937, "step": 1277 }, { "epoch": 0.3907062060531947, "grad_norm": 0.7154192924499512, "learning_rate": 4.871990829193734e-06, "loss": 0.1739, "step": 1278 }, { "epoch": 0.3910119229593397, "grad_norm": 0.508371114730835, "learning_rate": 4.875811998471533e-06, "loss": 0.1428, "step": 1279 }, { "epoch": 0.39131763986548457, "grad_norm": 0.5431569218635559, "learning_rate": 4.8796331677493315e-06, "loss": 0.1288, "step": 1280 }, { "epoch": 0.39162335677162946, "grad_norm": 0.7764075398445129, "learning_rate": 4.88345433702713e-06, "loss": 0.1134, "step": 1281 }, { "epoch": 0.39192907367777435, "grad_norm": 0.5803810358047485, "learning_rate": 4.88727550630493e-06, "loss": 0.1523, "step": 1282 }, { "epoch": 0.3922347905839193, "grad_norm": 0.6589874029159546, "learning_rate": 4.8910966755827286e-06, "loss": 0.1612, "step": 1283 }, { "epoch": 0.3925405074900642, "grad_norm": 0.6391811370849609, "learning_rate": 4.894917844860527e-06, "loss": 0.1563, "step": 1284 }, { "epoch": 0.3928462243962091, "grad_norm": 0.6180522441864014, "learning_rate": 4.898739014138327e-06, "loss": 0.1472, "step": 1285 }, { "epoch": 0.39315194130235404, "grad_norm": 0.9768776297569275, "learning_rate": 4.902560183416126e-06, "loss": 0.2134, "step": 1286 }, { "epoch": 0.39345765820849893, "grad_norm": 0.8891527652740479, "learning_rate": 4.906381352693924e-06, "loss": 0.1895, "step": 1287 }, { "epoch": 0.3937633751146438, "grad_norm": 0.9281437993049622, "learning_rate": 4.910202521971723e-06, "loss": 0.2201, "step": 1288 }, { "epoch": 0.39406909202078877, "grad_norm": 1.1035382747650146, "learning_rate": 4.914023691249523e-06, "loss": 0.193, "step": 1289 }, { "epoch": 0.39437480892693366, "grad_norm": 0.9090976715087891, "learning_rate": 4.9178448605273215e-06, "loss": 0.2009, "step": 1290 }, { "epoch": 0.39468052583307855, "grad_norm": 0.8891276717185974, "learning_rate": 4.92166602980512e-06, "loss": 0.2479, "step": 1291 }, { "epoch": 0.3949862427392235, "grad_norm": 1.1348284482955933, "learning_rate": 4.925487199082919e-06, "loss": 0.2774, "step": 1292 }, { "epoch": 0.3952919596453684, "grad_norm": 1.1534793376922607, "learning_rate": 4.9293083683607186e-06, "loss": 0.3512, "step": 1293 }, { "epoch": 0.3955976765515133, "grad_norm": 1.1900779008865356, "learning_rate": 4.933129537638517e-06, "loss": 0.3463, "step": 1294 }, { "epoch": 0.3959033934576582, "grad_norm": 1.3482097387313843, "learning_rate": 4.936950706916316e-06, "loss": 0.3638, "step": 1295 }, { "epoch": 0.39620911036380313, "grad_norm": 1.7599655389785767, "learning_rate": 4.940771876194116e-06, "loss": 0.3569, "step": 1296 }, { "epoch": 0.396514827269948, "grad_norm": 2.334439754486084, "learning_rate": 4.944593045471915e-06, "loss": 0.4064, "step": 1297 }, { "epoch": 0.3968205441760929, "grad_norm": 1.5174139738082886, "learning_rate": 4.948414214749714e-06, "loss": 0.3995, "step": 1298 }, { "epoch": 0.39712626108223786, "grad_norm": 2.1678736209869385, "learning_rate": 4.952235384027513e-06, "loss": 0.4199, "step": 1299 }, { "epoch": 0.39743197798838276, "grad_norm": 4.567468643188477, "learning_rate": 4.9560565533053115e-06, "loss": 0.5467, "step": 1300 }, { "epoch": 0.39773769489452765, "grad_norm": 0.9034004807472229, "learning_rate": 4.959877722583111e-06, "loss": 0.2589, "step": 1301 }, { "epoch": 0.3980434118006726, "grad_norm": 0.7938328981399536, "learning_rate": 4.96369889186091e-06, "loss": 0.1992, "step": 1302 }, { "epoch": 0.3983491287068175, "grad_norm": 0.803612232208252, "learning_rate": 4.9675200611387085e-06, "loss": 0.141, "step": 1303 }, { "epoch": 0.3986548456129624, "grad_norm": 0.6698991656303406, "learning_rate": 4.971341230416507e-06, "loss": 0.1903, "step": 1304 }, { "epoch": 0.39896056251910733, "grad_norm": 0.7669826149940491, "learning_rate": 4.975162399694307e-06, "loss": 0.1443, "step": 1305 }, { "epoch": 0.3992662794252522, "grad_norm": 0.8387812376022339, "learning_rate": 4.978983568972106e-06, "loss": 0.1606, "step": 1306 }, { "epoch": 0.3995719963313971, "grad_norm": 0.5812118053436279, "learning_rate": 4.982804738249904e-06, "loss": 0.1355, "step": 1307 }, { "epoch": 0.399877713237542, "grad_norm": 1.2772188186645508, "learning_rate": 4.986625907527703e-06, "loss": 0.1772, "step": 1308 }, { "epoch": 0.40018343014368696, "grad_norm": 0.6703361868858337, "learning_rate": 4.990447076805503e-06, "loss": 0.1501, "step": 1309 }, { "epoch": 0.40048914704983185, "grad_norm": 0.6659746766090393, "learning_rate": 4.9942682460833014e-06, "loss": 0.1778, "step": 1310 }, { "epoch": 0.40079486395597674, "grad_norm": 0.8355326652526855, "learning_rate": 4.9980894153611e-06, "loss": 0.1686, "step": 1311 }, { "epoch": 0.4011005808621217, "grad_norm": 0.6912976503372192, "learning_rate": 5.001910584638899e-06, "loss": 0.1576, "step": 1312 }, { "epoch": 0.4014062977682666, "grad_norm": 0.9919653534889221, "learning_rate": 5.0057317539166985e-06, "loss": 0.2627, "step": 1313 }, { "epoch": 0.4017120146744115, "grad_norm": 0.7396920919418335, "learning_rate": 5.009552923194497e-06, "loss": 0.2185, "step": 1314 }, { "epoch": 0.4020177315805564, "grad_norm": 1.0119283199310303, "learning_rate": 5.013374092472297e-06, "loss": 0.2353, "step": 1315 }, { "epoch": 0.4023234484867013, "grad_norm": 4.674990653991699, "learning_rate": 5.017195261750096e-06, "loss": 0.277, "step": 1316 }, { "epoch": 0.4026291653928462, "grad_norm": 1.253829836845398, "learning_rate": 5.021016431027895e-06, "loss": 0.2979, "step": 1317 }, { "epoch": 0.40293488229899116, "grad_norm": 1.1452797651290894, "learning_rate": 5.024837600305694e-06, "loss": 0.2951, "step": 1318 }, { "epoch": 0.40324059920513605, "grad_norm": 1.277434229850769, "learning_rate": 5.028658769583493e-06, "loss": 0.3283, "step": 1319 }, { "epoch": 0.40354631611128094, "grad_norm": 2.8071792125701904, "learning_rate": 5.032479938861292e-06, "loss": 0.3366, "step": 1320 }, { "epoch": 0.40385203301742584, "grad_norm": 1.5146147012710571, "learning_rate": 5.036301108139091e-06, "loss": 0.3886, "step": 1321 }, { "epoch": 0.4041577499235708, "grad_norm": 2.0810978412628174, "learning_rate": 5.04012227741689e-06, "loss": 0.3978, "step": 1322 }, { "epoch": 0.4044634668297157, "grad_norm": 1.4489542245864868, "learning_rate": 5.0439434466946885e-06, "loss": 0.3682, "step": 1323 }, { "epoch": 0.40476918373586057, "grad_norm": 1.9398267269134521, "learning_rate": 5.047764615972488e-06, "loss": 0.3916, "step": 1324 }, { "epoch": 0.4050749006420055, "grad_norm": 2.455259323120117, "learning_rate": 5.051585785250287e-06, "loss": 0.5732, "step": 1325 }, { "epoch": 0.4053806175481504, "grad_norm": 0.9040418267250061, "learning_rate": 5.055406954528086e-06, "loss": 0.2848, "step": 1326 }, { "epoch": 0.4056863344542953, "grad_norm": 0.6162763237953186, "learning_rate": 5.059228123805884e-06, "loss": 0.2016, "step": 1327 }, { "epoch": 0.40599205136044025, "grad_norm": 0.6002427339553833, "learning_rate": 5.063049293083684e-06, "loss": 0.1413, "step": 1328 }, { "epoch": 0.40629776826658515, "grad_norm": 0.715265154838562, "learning_rate": 5.066870462361483e-06, "loss": 0.156, "step": 1329 }, { "epoch": 0.40660348517273004, "grad_norm": 0.5038378834724426, "learning_rate": 5.070691631639281e-06, "loss": 0.1101, "step": 1330 }, { "epoch": 0.406909202078875, "grad_norm": 1.0096873044967651, "learning_rate": 5.07451280091708e-06, "loss": 0.1412, "step": 1331 }, { "epoch": 0.4072149189850199, "grad_norm": 0.635530412197113, "learning_rate": 5.07833397019488e-06, "loss": 0.1477, "step": 1332 }, { "epoch": 0.4075206358911648, "grad_norm": 0.5414690375328064, "learning_rate": 5.0821551394726785e-06, "loss": 0.1436, "step": 1333 }, { "epoch": 0.40782635279730967, "grad_norm": 0.5466145873069763, "learning_rate": 5.085976308750477e-06, "loss": 0.1376, "step": 1334 }, { "epoch": 0.4081320697034546, "grad_norm": 0.5597972273826599, "learning_rate": 5.089797478028277e-06, "loss": 0.1436, "step": 1335 }, { "epoch": 0.4084377866095995, "grad_norm": 0.7341774106025696, "learning_rate": 5.0936186473060764e-06, "loss": 0.1836, "step": 1336 }, { "epoch": 0.4087435035157444, "grad_norm": 0.8261978030204773, "learning_rate": 5.097439816583875e-06, "loss": 0.1916, "step": 1337 }, { "epoch": 0.40904922042188935, "grad_norm": 0.7484690546989441, "learning_rate": 5.101260985861674e-06, "loss": 0.1742, "step": 1338 }, { "epoch": 0.40935493732803424, "grad_norm": 0.8227639198303223, "learning_rate": 5.105082155139473e-06, "loss": 0.2566, "step": 1339 }, { "epoch": 0.40966065423417913, "grad_norm": 0.7340384125709534, "learning_rate": 5.108903324417272e-06, "loss": 0.222, "step": 1340 }, { "epoch": 0.4099663711403241, "grad_norm": 1.3571431636810303, "learning_rate": 5.112724493695071e-06, "loss": 0.2607, "step": 1341 }, { "epoch": 0.410272088046469, "grad_norm": 1.7297300100326538, "learning_rate": 5.11654566297287e-06, "loss": 0.2953, "step": 1342 }, { "epoch": 0.41057780495261387, "grad_norm": 1.267896294593811, "learning_rate": 5.1203668322506685e-06, "loss": 0.3289, "step": 1343 }, { "epoch": 0.4108835218587588, "grad_norm": 1.213083028793335, "learning_rate": 5.124188001528468e-06, "loss": 0.3474, "step": 1344 }, { "epoch": 0.4111892387649037, "grad_norm": 1.2214168310165405, "learning_rate": 5.128009170806267e-06, "loss": 0.3517, "step": 1345 }, { "epoch": 0.4114949556710486, "grad_norm": 1.5511515140533447, "learning_rate": 5.1318303400840656e-06, "loss": 0.3737, "step": 1346 }, { "epoch": 0.4118006725771935, "grad_norm": 2.18957257270813, "learning_rate": 5.135651509361864e-06, "loss": 0.3323, "step": 1347 }, { "epoch": 0.41210638948333844, "grad_norm": 1.9654964208602905, "learning_rate": 5.139472678639664e-06, "loss": 0.3993, "step": 1348 }, { "epoch": 0.41241210638948334, "grad_norm": 1.6144508123397827, "learning_rate": 5.143293847917463e-06, "loss": 0.4265, "step": 1349 }, { "epoch": 0.41271782329562823, "grad_norm": 3.655642032623291, "learning_rate": 5.147115017195261e-06, "loss": 0.5924, "step": 1350 }, { "epoch": 0.4130235402017732, "grad_norm": 1.316231608390808, "learning_rate": 5.150936186473061e-06, "loss": 0.2883, "step": 1351 }, { "epoch": 0.41332925710791807, "grad_norm": 1.2389651536941528, "learning_rate": 5.15475735575086e-06, "loss": 0.1348, "step": 1352 }, { "epoch": 0.41363497401406296, "grad_norm": 0.5293911099433899, "learning_rate": 5.1585785250286585e-06, "loss": 0.1705, "step": 1353 }, { "epoch": 0.4139406909202079, "grad_norm": 0.6752898693084717, "learning_rate": 5.162399694306458e-06, "loss": 0.1618, "step": 1354 }, { "epoch": 0.4142464078263528, "grad_norm": 0.6094158887863159, "learning_rate": 5.166220863584258e-06, "loss": 0.1234, "step": 1355 }, { "epoch": 0.4145521247324977, "grad_norm": 0.602851152420044, "learning_rate": 5.170042032862056e-06, "loss": 0.1404, "step": 1356 }, { "epoch": 0.41485784163864264, "grad_norm": 0.5731081366539001, "learning_rate": 5.173863202139855e-06, "loss": 0.1038, "step": 1357 }, { "epoch": 0.41516355854478754, "grad_norm": 0.5587401390075684, "learning_rate": 5.177684371417654e-06, "loss": 0.1508, "step": 1358 }, { "epoch": 0.41546927545093243, "grad_norm": 0.7085039019584656, "learning_rate": 5.1815055406954535e-06, "loss": 0.1277, "step": 1359 }, { "epoch": 0.4157749923570773, "grad_norm": 0.6725518703460693, "learning_rate": 5.185326709973252e-06, "loss": 0.159, "step": 1360 }, { "epoch": 0.41608070926322227, "grad_norm": 0.6211150288581848, "learning_rate": 5.189147879251051e-06, "loss": 0.1325, "step": 1361 }, { "epoch": 0.41638642616936716, "grad_norm": 0.8007253408432007, "learning_rate": 5.19296904852885e-06, "loss": 0.1861, "step": 1362 }, { "epoch": 0.41669214307551206, "grad_norm": 0.8060648441314697, "learning_rate": 5.196790217806649e-06, "loss": 0.2073, "step": 1363 }, { "epoch": 0.416997859981657, "grad_norm": 1.1795018911361694, "learning_rate": 5.200611387084448e-06, "loss": 0.2059, "step": 1364 }, { "epoch": 0.4173035768878019, "grad_norm": 1.853468894958496, "learning_rate": 5.204432556362247e-06, "loss": 0.2874, "step": 1365 }, { "epoch": 0.4176092937939468, "grad_norm": 1.273930549621582, "learning_rate": 5.2082537256400455e-06, "loss": 0.2554, "step": 1366 }, { "epoch": 0.41791501070009174, "grad_norm": 2.221656322479248, "learning_rate": 5.212074894917845e-06, "loss": 0.3478, "step": 1367 }, { "epoch": 0.41822072760623663, "grad_norm": 1.0372730493545532, "learning_rate": 5.215896064195644e-06, "loss": 0.321, "step": 1368 }, { "epoch": 0.4185264445123815, "grad_norm": 2.164851665496826, "learning_rate": 5.219717233473443e-06, "loss": 0.3348, "step": 1369 }, { "epoch": 0.41883216141852647, "grad_norm": 1.6584869623184204, "learning_rate": 5.223538402751241e-06, "loss": 0.3234, "step": 1370 }, { "epoch": 0.41913787832467136, "grad_norm": 1.3357857465744019, "learning_rate": 5.227359572029041e-06, "loss": 0.3578, "step": 1371 }, { "epoch": 0.41944359523081626, "grad_norm": 3.0854299068450928, "learning_rate": 5.23118074130684e-06, "loss": 0.3588, "step": 1372 }, { "epoch": 0.41974931213696115, "grad_norm": 2.023376703262329, "learning_rate": 5.2350019105846385e-06, "loss": 0.3934, "step": 1373 }, { "epoch": 0.4200550290431061, "grad_norm": 5.353822708129883, "learning_rate": 5.238823079862438e-06, "loss": 0.4199, "step": 1374 }, { "epoch": 0.420360745949251, "grad_norm": 8.218305587768555, "learning_rate": 5.242644249140238e-06, "loss": 0.5676, "step": 1375 }, { "epoch": 0.4206664628553959, "grad_norm": 0.6208615899085999, "learning_rate": 5.246465418418036e-06, "loss": 0.2468, "step": 1376 }, { "epoch": 0.42097217976154083, "grad_norm": 0.5228369832038879, "learning_rate": 5.250286587695835e-06, "loss": 0.1748, "step": 1377 }, { "epoch": 0.4212778966676857, "grad_norm": 0.7981668710708618, "learning_rate": 5.254107756973634e-06, "loss": 0.172, "step": 1378 }, { "epoch": 0.4215836135738306, "grad_norm": 0.5856297612190247, "learning_rate": 5.2579289262514335e-06, "loss": 0.1359, "step": 1379 }, { "epoch": 0.42188933047997557, "grad_norm": 0.5179661512374878, "learning_rate": 5.261750095529232e-06, "loss": 0.1277, "step": 1380 }, { "epoch": 0.42219504738612046, "grad_norm": 0.5800349116325378, "learning_rate": 5.265571264807031e-06, "loss": 0.1218, "step": 1381 }, { "epoch": 0.42250076429226535, "grad_norm": 0.5725017786026001, "learning_rate": 5.2693924340848305e-06, "loss": 0.1345, "step": 1382 }, { "epoch": 0.42280648119841024, "grad_norm": 0.9929306507110596, "learning_rate": 5.273213603362629e-06, "loss": 0.161, "step": 1383 }, { "epoch": 0.4231121981045552, "grad_norm": 0.5800286531448364, "learning_rate": 5.277034772640428e-06, "loss": 0.1369, "step": 1384 }, { "epoch": 0.4234179150107001, "grad_norm": 0.7855810523033142, "learning_rate": 5.280855941918227e-06, "loss": 0.1677, "step": 1385 }, { "epoch": 0.423723631916845, "grad_norm": 1.3666610717773438, "learning_rate": 5.284677111196026e-06, "loss": 0.1791, "step": 1386 }, { "epoch": 0.4240293488229899, "grad_norm": 0.8579196333885193, "learning_rate": 5.288498280473825e-06, "loss": 0.1871, "step": 1387 }, { "epoch": 0.4243350657291348, "grad_norm": 0.7060865759849548, "learning_rate": 5.292319449751624e-06, "loss": 0.1772, "step": 1388 }, { "epoch": 0.4246407826352797, "grad_norm": 1.1772781610488892, "learning_rate": 5.296140619029423e-06, "loss": 0.2279, "step": 1389 }, { "epoch": 0.42494649954142466, "grad_norm": 1.0134859085083008, "learning_rate": 5.299961788307222e-06, "loss": 0.2232, "step": 1390 }, { "epoch": 0.42525221644756955, "grad_norm": 1.5585485696792603, "learning_rate": 5.303782957585021e-06, "loss": 0.2284, "step": 1391 }, { "epoch": 0.42555793335371445, "grad_norm": 1.302858591079712, "learning_rate": 5.30760412686282e-06, "loss": 0.3609, "step": 1392 }, { "epoch": 0.4258636502598594, "grad_norm": 0.9647319316864014, "learning_rate": 5.311425296140619e-06, "loss": 0.3036, "step": 1393 }, { "epoch": 0.4261693671660043, "grad_norm": 0.9970281720161438, "learning_rate": 5.315246465418419e-06, "loss": 0.289, "step": 1394 }, { "epoch": 0.4264750840721492, "grad_norm": 1.377230167388916, "learning_rate": 5.319067634696218e-06, "loss": 0.3206, "step": 1395 }, { "epoch": 0.4267808009782941, "grad_norm": 1.393159031867981, "learning_rate": 5.322888803974016e-06, "loss": 0.3324, "step": 1396 }, { "epoch": 0.427086517884439, "grad_norm": 1.5722496509552002, "learning_rate": 5.326709973251815e-06, "loss": 0.3662, "step": 1397 }, { "epoch": 0.4273922347905839, "grad_norm": 3.3081650733947754, "learning_rate": 5.330531142529615e-06, "loss": 0.3452, "step": 1398 }, { "epoch": 0.4276979516967288, "grad_norm": 2.036569118499756, "learning_rate": 5.3343523118074134e-06, "loss": 0.4308, "step": 1399 }, { "epoch": 0.42800366860287375, "grad_norm": 2.355095624923706, "learning_rate": 5.338173481085212e-06, "loss": 0.5594, "step": 1400 }, { "epoch": 0.42830938550901865, "grad_norm": 0.7370728850364685, "learning_rate": 5.341994650363011e-06, "loss": 0.2556, "step": 1401 }, { "epoch": 0.42861510241516354, "grad_norm": 0.5488453507423401, "learning_rate": 5.3458158196408105e-06, "loss": 0.1656, "step": 1402 }, { "epoch": 0.4289208193213085, "grad_norm": 0.7265148162841797, "learning_rate": 5.349636988918609e-06, "loss": 0.1442, "step": 1403 }, { "epoch": 0.4292265362274534, "grad_norm": 0.5736374258995056, "learning_rate": 5.353458158196408e-06, "loss": 0.1462, "step": 1404 }, { "epoch": 0.4295322531335983, "grad_norm": 0.5143766403198242, "learning_rate": 5.357279327474207e-06, "loss": 0.1515, "step": 1405 }, { "epoch": 0.4298379700397432, "grad_norm": 0.5869737863540649, "learning_rate": 5.361100496752006e-06, "loss": 0.1309, "step": 1406 }, { "epoch": 0.4301436869458881, "grad_norm": 0.5825177431106567, "learning_rate": 5.364921666029805e-06, "loss": 0.1246, "step": 1407 }, { "epoch": 0.430449403852033, "grad_norm": 0.9723330140113831, "learning_rate": 5.368742835307604e-06, "loss": 0.1985, "step": 1408 }, { "epoch": 0.4307551207581779, "grad_norm": 0.838407039642334, "learning_rate": 5.3725640045854026e-06, "loss": 0.1668, "step": 1409 }, { "epoch": 0.43106083766432285, "grad_norm": 0.7364253997802734, "learning_rate": 5.376385173863202e-06, "loss": 0.1655, "step": 1410 }, { "epoch": 0.43136655457046774, "grad_norm": 0.8068876266479492, "learning_rate": 5.380206343141001e-06, "loss": 0.1984, "step": 1411 }, { "epoch": 0.43167227147661263, "grad_norm": 0.8391739726066589, "learning_rate": 5.3840275124188005e-06, "loss": 0.159, "step": 1412 }, { "epoch": 0.4319779883827576, "grad_norm": 0.6882902383804321, "learning_rate": 5.387848681696599e-06, "loss": 0.1747, "step": 1413 }, { "epoch": 0.4322837052889025, "grad_norm": 0.8079756498336792, "learning_rate": 5.391669850974399e-06, "loss": 0.1911, "step": 1414 }, { "epoch": 0.43258942219504737, "grad_norm": 0.8331185579299927, "learning_rate": 5.395491020252198e-06, "loss": 0.2716, "step": 1415 }, { "epoch": 0.4328951391011923, "grad_norm": 1.1497169733047485, "learning_rate": 5.399312189529996e-06, "loss": 0.2349, "step": 1416 }, { "epoch": 0.4332008560073372, "grad_norm": 1.065093994140625, "learning_rate": 5.403133358807796e-06, "loss": 0.2846, "step": 1417 }, { "epoch": 0.4335065729134821, "grad_norm": 1.44892156124115, "learning_rate": 5.406954528085595e-06, "loss": 0.3136, "step": 1418 }, { "epoch": 0.43381228981962705, "grad_norm": 1.2225708961486816, "learning_rate": 5.410775697363393e-06, "loss": 0.2822, "step": 1419 }, { "epoch": 0.43411800672577194, "grad_norm": 1.2240264415740967, "learning_rate": 5.414596866641192e-06, "loss": 0.3719, "step": 1420 }, { "epoch": 0.43442372363191684, "grad_norm": 1.328163981437683, "learning_rate": 5.418418035918992e-06, "loss": 0.3533, "step": 1421 }, { "epoch": 0.43472944053806173, "grad_norm": 1.6499191522598267, "learning_rate": 5.4222392051967905e-06, "loss": 0.419, "step": 1422 }, { "epoch": 0.4350351574442067, "grad_norm": 1.7151870727539062, "learning_rate": 5.426060374474589e-06, "loss": 0.3761, "step": 1423 }, { "epoch": 0.43534087435035157, "grad_norm": 2.173807382583618, "learning_rate": 5.429881543752388e-06, "loss": 0.426, "step": 1424 }, { "epoch": 0.43564659125649646, "grad_norm": 2.887911558151245, "learning_rate": 5.4337027130301876e-06, "loss": 0.5084, "step": 1425 }, { "epoch": 0.4359523081626414, "grad_norm": 0.6654773354530334, "learning_rate": 5.437523882307986e-06, "loss": 0.2304, "step": 1426 }, { "epoch": 0.4362580250687863, "grad_norm": 0.7351674437522888, "learning_rate": 5.441345051585785e-06, "loss": 0.1422, "step": 1427 }, { "epoch": 0.4365637419749312, "grad_norm": 0.6390872597694397, "learning_rate": 5.445166220863584e-06, "loss": 0.142, "step": 1428 }, { "epoch": 0.43686945888107614, "grad_norm": 0.7088742852210999, "learning_rate": 5.448987390141383e-06, "loss": 0.1648, "step": 1429 }, { "epoch": 0.43717517578722104, "grad_norm": 0.48904478549957275, "learning_rate": 5.452808559419182e-06, "loss": 0.11, "step": 1430 }, { "epoch": 0.43748089269336593, "grad_norm": 0.6018951535224915, "learning_rate": 5.456629728696981e-06, "loss": 0.13, "step": 1431 }, { "epoch": 0.4377866095995109, "grad_norm": 0.5275538563728333, "learning_rate": 5.4604508979747805e-06, "loss": 0.1068, "step": 1432 }, { "epoch": 0.43809232650565577, "grad_norm": 0.9476163983345032, "learning_rate": 5.46427206725258e-06, "loss": 0.1436, "step": 1433 }, { "epoch": 0.43839804341180066, "grad_norm": 0.770512044429779, "learning_rate": 5.468093236530379e-06, "loss": 0.14, "step": 1434 }, { "epoch": 0.43870376031794556, "grad_norm": 0.5432831048965454, "learning_rate": 5.4719144058081776e-06, "loss": 0.1264, "step": 1435 }, { "epoch": 0.4390094772240905, "grad_norm": 0.6544167995452881, "learning_rate": 5.475735575085976e-06, "loss": 0.1696, "step": 1436 }, { "epoch": 0.4393151941302354, "grad_norm": 0.7097304463386536, "learning_rate": 5.479556744363776e-06, "loss": 0.1356, "step": 1437 }, { "epoch": 0.4396209110363803, "grad_norm": 0.7673826217651367, "learning_rate": 5.483377913641575e-06, "loss": 0.1921, "step": 1438 }, { "epoch": 0.43992662794252524, "grad_norm": 1.9408562183380127, "learning_rate": 5.487199082919373e-06, "loss": 0.2195, "step": 1439 }, { "epoch": 0.44023234484867013, "grad_norm": 0.6766736507415771, "learning_rate": 5.491020252197172e-06, "loss": 0.2187, "step": 1440 }, { "epoch": 0.440538061754815, "grad_norm": 0.9510131478309631, "learning_rate": 5.494841421474972e-06, "loss": 0.2626, "step": 1441 }, { "epoch": 0.44084377866096, "grad_norm": 0.9889276623725891, "learning_rate": 5.4986625907527705e-06, "loss": 0.2616, "step": 1442 }, { "epoch": 0.44114949556710487, "grad_norm": 1.3165086507797241, "learning_rate": 5.502483760030569e-06, "loss": 0.3341, "step": 1443 }, { "epoch": 0.44145521247324976, "grad_norm": 1.6684355735778809, "learning_rate": 5.506304929308368e-06, "loss": 0.3683, "step": 1444 }, { "epoch": 0.4417609293793947, "grad_norm": 1.9388741254806519, "learning_rate": 5.5101260985861675e-06, "loss": 0.3065, "step": 1445 }, { "epoch": 0.4420666462855396, "grad_norm": 3.941204309463501, "learning_rate": 5.513947267863966e-06, "loss": 0.3132, "step": 1446 }, { "epoch": 0.4423723631916845, "grad_norm": 2.1633684635162354, "learning_rate": 5.517768437141765e-06, "loss": 0.375, "step": 1447 }, { "epoch": 0.4426780800978294, "grad_norm": 1.605307936668396, "learning_rate": 5.521589606419565e-06, "loss": 0.4015, "step": 1448 }, { "epoch": 0.44298379700397433, "grad_norm": 1.779604196548462, "learning_rate": 5.525410775697363e-06, "loss": 0.4365, "step": 1449 }, { "epoch": 0.4432895139101192, "grad_norm": 3.9954564571380615, "learning_rate": 5.529231944975162e-06, "loss": 0.531, "step": 1450 }, { "epoch": 0.4435952308162641, "grad_norm": 0.6006679534912109, "learning_rate": 5.533053114252962e-06, "loss": 0.2426, "step": 1451 }, { "epoch": 0.44390094772240907, "grad_norm": 0.5728842616081238, "learning_rate": 5.536874283530761e-06, "loss": 0.1714, "step": 1452 }, { "epoch": 0.44420666462855396, "grad_norm": 0.6073495745658875, "learning_rate": 5.54069545280856e-06, "loss": 0.1377, "step": 1453 }, { "epoch": 0.44451238153469885, "grad_norm": 0.7928131222724915, "learning_rate": 5.544516622086359e-06, "loss": 0.1455, "step": 1454 }, { "epoch": 0.4448180984408438, "grad_norm": 0.6218541860580444, "learning_rate": 5.5483377913641575e-06, "loss": 0.1387, "step": 1455 }, { "epoch": 0.4451238153469887, "grad_norm": 0.5825249552726746, "learning_rate": 5.552158960641957e-06, "loss": 0.1558, "step": 1456 }, { "epoch": 0.4454295322531336, "grad_norm": 0.8664868474006653, "learning_rate": 5.555980129919756e-06, "loss": 0.1813, "step": 1457 }, { "epoch": 0.44573524915927853, "grad_norm": 0.5208842754364014, "learning_rate": 5.559801299197555e-06, "loss": 0.1448, "step": 1458 }, { "epoch": 0.4460409660654234, "grad_norm": 0.6711595058441162, "learning_rate": 5.563622468475353e-06, "loss": 0.169, "step": 1459 }, { "epoch": 0.4463466829715683, "grad_norm": 0.7706631422042847, "learning_rate": 5.567443637753153e-06, "loss": 0.1346, "step": 1460 }, { "epoch": 0.4466523998777132, "grad_norm": 0.6589921712875366, "learning_rate": 5.571264807030952e-06, "loss": 0.217, "step": 1461 }, { "epoch": 0.44695811678385816, "grad_norm": 0.6047631502151489, "learning_rate": 5.5750859763087504e-06, "loss": 0.1543, "step": 1462 }, { "epoch": 0.44726383369000305, "grad_norm": 0.901934802532196, "learning_rate": 5.578907145586549e-06, "loss": 0.2012, "step": 1463 }, { "epoch": 0.44756955059614795, "grad_norm": 0.8441657423973083, "learning_rate": 5.582728314864349e-06, "loss": 0.2004, "step": 1464 }, { "epoch": 0.4478752675022929, "grad_norm": 0.9855977296829224, "learning_rate": 5.5865494841421475e-06, "loss": 0.2299, "step": 1465 }, { "epoch": 0.4481809844084378, "grad_norm": 0.9420602321624756, "learning_rate": 5.590370653419946e-06, "loss": 0.2572, "step": 1466 }, { "epoch": 0.4484867013145827, "grad_norm": 1.2400072813034058, "learning_rate": 5.594191822697745e-06, "loss": 0.2665, "step": 1467 }, { "epoch": 0.44879241822072763, "grad_norm": 0.8167262077331543, "learning_rate": 5.598012991975545e-06, "loss": 0.3153, "step": 1468 }, { "epoch": 0.4490981351268725, "grad_norm": 1.2354620695114136, "learning_rate": 5.601834161253343e-06, "loss": 0.3112, "step": 1469 }, { "epoch": 0.4494038520330174, "grad_norm": 1.4385031461715698, "learning_rate": 5.605655330531142e-06, "loss": 0.3297, "step": 1470 }, { "epoch": 0.44970956893916236, "grad_norm": 1.140471339225769, "learning_rate": 5.609476499808942e-06, "loss": 0.3426, "step": 1471 }, { "epoch": 0.45001528584530726, "grad_norm": 1.5762871503829956, "learning_rate": 5.613297669086741e-06, "loss": 0.3459, "step": 1472 }, { "epoch": 0.45032100275145215, "grad_norm": 2.192809581756592, "learning_rate": 5.61711883836454e-06, "loss": 0.4316, "step": 1473 }, { "epoch": 0.45062671965759704, "grad_norm": 1.8700090646743774, "learning_rate": 5.620940007642339e-06, "loss": 0.4396, "step": 1474 }, { "epoch": 0.450932436563742, "grad_norm": 2.949990749359131, "learning_rate": 5.6247611769201375e-06, "loss": 0.5464, "step": 1475 }, { "epoch": 0.4512381534698869, "grad_norm": 0.5750002264976501, "learning_rate": 5.628582346197937e-06, "loss": 0.2602, "step": 1476 }, { "epoch": 0.4515438703760318, "grad_norm": 0.6884226202964783, "learning_rate": 5.632403515475736e-06, "loss": 0.1773, "step": 1477 }, { "epoch": 0.4518495872821767, "grad_norm": 0.7850894331932068, "learning_rate": 5.636224684753535e-06, "loss": 0.1433, "step": 1478 }, { "epoch": 0.4521553041883216, "grad_norm": 0.5394912362098694, "learning_rate": 5.640045854031334e-06, "loss": 0.1353, "step": 1479 }, { "epoch": 0.4524610210944665, "grad_norm": 0.43276676535606384, "learning_rate": 5.643867023309133e-06, "loss": 0.124, "step": 1480 }, { "epoch": 0.45276673800061146, "grad_norm": 0.5779910683631897, "learning_rate": 5.647688192586932e-06, "loss": 0.1468, "step": 1481 }, { "epoch": 0.45307245490675635, "grad_norm": 0.6509206891059875, "learning_rate": 5.65150936186473e-06, "loss": 0.1229, "step": 1482 }, { "epoch": 0.45337817181290124, "grad_norm": 0.6439804434776306, "learning_rate": 5.65533053114253e-06, "loss": 0.1453, "step": 1483 }, { "epoch": 0.45368388871904614, "grad_norm": 0.5339728593826294, "learning_rate": 5.659151700420329e-06, "loss": 0.1248, "step": 1484 }, { "epoch": 0.4539896056251911, "grad_norm": 0.5214660167694092, "learning_rate": 5.6629728696981275e-06, "loss": 0.1615, "step": 1485 }, { "epoch": 0.454295322531336, "grad_norm": 0.7597001194953918, "learning_rate": 5.666794038975926e-06, "loss": 0.1557, "step": 1486 }, { "epoch": 0.45460103943748087, "grad_norm": 0.7975046038627625, "learning_rate": 5.670615208253726e-06, "loss": 0.1712, "step": 1487 }, { "epoch": 0.4549067563436258, "grad_norm": 0.9877157211303711, "learning_rate": 5.674436377531525e-06, "loss": 0.2003, "step": 1488 }, { "epoch": 0.4552124732497707, "grad_norm": 0.7565435171127319, "learning_rate": 5.678257546809323e-06, "loss": 0.1946, "step": 1489 }, { "epoch": 0.4555181901559156, "grad_norm": 0.917678713798523, "learning_rate": 5.682078716087123e-06, "loss": 0.2548, "step": 1490 }, { "epoch": 0.45582390706206055, "grad_norm": 1.4010577201843262, "learning_rate": 5.6858998853649225e-06, "loss": 0.2248, "step": 1491 }, { "epoch": 0.45612962396820544, "grad_norm": 1.313340425491333, "learning_rate": 5.689721054642721e-06, "loss": 0.3098, "step": 1492 }, { "epoch": 0.45643534087435034, "grad_norm": 1.1965148448944092, "learning_rate": 5.69354222392052e-06, "loss": 0.2985, "step": 1493 }, { "epoch": 0.4567410577804953, "grad_norm": 0.9224949479103088, "learning_rate": 5.697363393198319e-06, "loss": 0.2997, "step": 1494 }, { "epoch": 0.4570467746866402, "grad_norm": 1.384360909461975, "learning_rate": 5.701184562476118e-06, "loss": 0.3162, "step": 1495 }, { "epoch": 0.45735249159278507, "grad_norm": 1.2723926305770874, "learning_rate": 5.705005731753917e-06, "loss": 0.3502, "step": 1496 }, { "epoch": 0.45765820849892996, "grad_norm": 1.572136640548706, "learning_rate": 5.708826901031716e-06, "loss": 0.3442, "step": 1497 }, { "epoch": 0.4579639254050749, "grad_norm": 1.644147276878357, "learning_rate": 5.7126480703095146e-06, "loss": 0.3754, "step": 1498 }, { "epoch": 0.4582696423112198, "grad_norm": 2.1421966552734375, "learning_rate": 5.716469239587314e-06, "loss": 0.3957, "step": 1499 }, { "epoch": 0.4585753592173647, "grad_norm": 3.6919493675231934, "learning_rate": 5.720290408865113e-06, "loss": 0.4966, "step": 1500 }, { "epoch": 0.45888107612350965, "grad_norm": 0.8912977576255798, "learning_rate": 5.724111578142912e-06, "loss": 0.2853, "step": 1501 }, { "epoch": 0.45918679302965454, "grad_norm": 0.5687696933746338, "learning_rate": 5.72793274742071e-06, "loss": 0.159, "step": 1502 }, { "epoch": 0.45949250993579943, "grad_norm": 0.634182870388031, "learning_rate": 5.73175391669851e-06, "loss": 0.1756, "step": 1503 }, { "epoch": 0.4597982268419444, "grad_norm": 0.5093748569488525, "learning_rate": 5.735575085976309e-06, "loss": 0.1239, "step": 1504 }, { "epoch": 0.4601039437480893, "grad_norm": 0.5008350610733032, "learning_rate": 5.7393962552541075e-06, "loss": 0.104, "step": 1505 }, { "epoch": 0.46040966065423417, "grad_norm": 0.6922484040260315, "learning_rate": 5.743217424531906e-06, "loss": 0.1296, "step": 1506 }, { "epoch": 0.4607153775603791, "grad_norm": 0.5465528964996338, "learning_rate": 5.747038593809706e-06, "loss": 0.1274, "step": 1507 }, { "epoch": 0.461021094466524, "grad_norm": 0.5320935249328613, "learning_rate": 5.7508597630875046e-06, "loss": 0.1062, "step": 1508 }, { "epoch": 0.4613268113726689, "grad_norm": 0.6155261993408203, "learning_rate": 5.754680932365303e-06, "loss": 0.174, "step": 1509 }, { "epoch": 0.4616325282788138, "grad_norm": 1.0189393758773804, "learning_rate": 5.758502101643103e-06, "loss": 0.1433, "step": 1510 }, { "epoch": 0.46193824518495874, "grad_norm": 0.6382350325584412, "learning_rate": 5.7623232709209025e-06, "loss": 0.129, "step": 1511 }, { "epoch": 0.46224396209110363, "grad_norm": 1.652158498764038, "learning_rate": 5.766144440198701e-06, "loss": 0.1861, "step": 1512 }, { "epoch": 0.4625496789972485, "grad_norm": 0.9698798060417175, "learning_rate": 5.7699656094765e-06, "loss": 0.208, "step": 1513 }, { "epoch": 0.4628553959033935, "grad_norm": 0.8726076483726501, "learning_rate": 5.7737867787542996e-06, "loss": 0.2001, "step": 1514 }, { "epoch": 0.46316111280953837, "grad_norm": 0.753102719783783, "learning_rate": 5.777607948032098e-06, "loss": 0.2294, "step": 1515 }, { "epoch": 0.46346682971568326, "grad_norm": 0.9722453951835632, "learning_rate": 5.781429117309897e-06, "loss": 0.219, "step": 1516 }, { "epoch": 0.4637725466218282, "grad_norm": 1.0511925220489502, "learning_rate": 5.785250286587696e-06, "loss": 0.2646, "step": 1517 }, { "epoch": 0.4640782635279731, "grad_norm": 1.0059694051742554, "learning_rate": 5.789071455865495e-06, "loss": 0.2864, "step": 1518 }, { "epoch": 0.464383980434118, "grad_norm": 1.1343311071395874, "learning_rate": 5.792892625143294e-06, "loss": 0.3016, "step": 1519 }, { "epoch": 0.46468969734026294, "grad_norm": 4.117332458496094, "learning_rate": 5.796713794421093e-06, "loss": 0.3558, "step": 1520 }, { "epoch": 0.46499541424640783, "grad_norm": 1.7416610717773438, "learning_rate": 5.800534963698892e-06, "loss": 0.359, "step": 1521 }, { "epoch": 0.4653011311525527, "grad_norm": 4.492753505706787, "learning_rate": 5.804356132976691e-06, "loss": 0.3616, "step": 1522 }, { "epoch": 0.4656068480586976, "grad_norm": 3.5465192794799805, "learning_rate": 5.80817730225449e-06, "loss": 0.3859, "step": 1523 }, { "epoch": 0.46591256496484257, "grad_norm": 1.7752809524536133, "learning_rate": 5.811998471532289e-06, "loss": 0.3971, "step": 1524 }, { "epoch": 0.46621828187098746, "grad_norm": 3.450338363647461, "learning_rate": 5.8158196408100874e-06, "loss": 0.5869, "step": 1525 }, { "epoch": 0.46652399877713235, "grad_norm": 0.6518266201019287, "learning_rate": 5.819640810087887e-06, "loss": 0.2576, "step": 1526 }, { "epoch": 0.4668297156832773, "grad_norm": 0.4855239987373352, "learning_rate": 5.823461979365686e-06, "loss": 0.1598, "step": 1527 }, { "epoch": 0.4671354325894222, "grad_norm": 0.4652314782142639, "learning_rate": 5.8272831486434845e-06, "loss": 0.1582, "step": 1528 }, { "epoch": 0.4674411494955671, "grad_norm": 0.46380504965782166, "learning_rate": 5.831104317921284e-06, "loss": 0.1152, "step": 1529 }, { "epoch": 0.46774686640171204, "grad_norm": 0.4838676154613495, "learning_rate": 5.834925487199084e-06, "loss": 0.1138, "step": 1530 }, { "epoch": 0.46805258330785693, "grad_norm": 0.5403484106063843, "learning_rate": 5.8387466564768825e-06, "loss": 0.135, "step": 1531 }, { "epoch": 0.4683583002140018, "grad_norm": 0.4559294581413269, "learning_rate": 5.842567825754681e-06, "loss": 0.1169, "step": 1532 }, { "epoch": 0.46866401712014677, "grad_norm": 0.8320120573043823, "learning_rate": 5.84638899503248e-06, "loss": 0.1629, "step": 1533 }, { "epoch": 0.46896973402629166, "grad_norm": 0.5228766798973083, "learning_rate": 5.8502101643102795e-06, "loss": 0.1142, "step": 1534 }, { "epoch": 0.46927545093243656, "grad_norm": 0.8565366268157959, "learning_rate": 5.854031333588078e-06, "loss": 0.1487, "step": 1535 }, { "epoch": 0.46958116783858145, "grad_norm": 0.835739016532898, "learning_rate": 5.857852502865877e-06, "loss": 0.1852, "step": 1536 }, { "epoch": 0.4698868847447264, "grad_norm": 0.6322270631790161, "learning_rate": 5.861673672143676e-06, "loss": 0.1502, "step": 1537 }, { "epoch": 0.4701926016508713, "grad_norm": 0.6449726223945618, "learning_rate": 5.865494841421475e-06, "loss": 0.2125, "step": 1538 }, { "epoch": 0.4704983185570162, "grad_norm": 0.9615899324417114, "learning_rate": 5.869316010699274e-06, "loss": 0.1806, "step": 1539 }, { "epoch": 0.47080403546316113, "grad_norm": 1.5217723846435547, "learning_rate": 5.873137179977073e-06, "loss": 0.3039, "step": 1540 }, { "epoch": 0.471109752369306, "grad_norm": 1.047280192375183, "learning_rate": 5.8769583492548724e-06, "loss": 0.2308, "step": 1541 }, { "epoch": 0.4714154692754509, "grad_norm": 1.2710621356964111, "learning_rate": 5.880779518532671e-06, "loss": 0.2975, "step": 1542 }, { "epoch": 0.47172118618159586, "grad_norm": 1.2034118175506592, "learning_rate": 5.88460068781047e-06, "loss": 0.3252, "step": 1543 }, { "epoch": 0.47202690308774076, "grad_norm": 1.399906873703003, "learning_rate": 5.888421857088269e-06, "loss": 0.3477, "step": 1544 }, { "epoch": 0.47233261999388565, "grad_norm": 1.4099671840667725, "learning_rate": 5.892243026366068e-06, "loss": 0.3406, "step": 1545 }, { "epoch": 0.4726383369000306, "grad_norm": 2.3429577350616455, "learning_rate": 5.896064195643867e-06, "loss": 0.3249, "step": 1546 }, { "epoch": 0.4729440538061755, "grad_norm": 11.052801132202148, "learning_rate": 5.899885364921666e-06, "loss": 0.3721, "step": 1547 }, { "epoch": 0.4732497707123204, "grad_norm": 1.7301377058029175, "learning_rate": 5.903706534199465e-06, "loss": 0.3924, "step": 1548 }, { "epoch": 0.4735554876184653, "grad_norm": 1.925672173500061, "learning_rate": 5.907527703477265e-06, "loss": 0.4019, "step": 1549 }, { "epoch": 0.4738612045246102, "grad_norm": 2.3690290451049805, "learning_rate": 5.911348872755064e-06, "loss": 0.5798, "step": 1550 }, { "epoch": 0.4741669214307551, "grad_norm": 0.6058140993118286, "learning_rate": 5.9151700420328624e-06, "loss": 0.2334, "step": 1551 }, { "epoch": 0.4744726383369, "grad_norm": 0.6556515693664551, "learning_rate": 5.918991211310661e-06, "loss": 0.1489, "step": 1552 }, { "epoch": 0.47477835524304496, "grad_norm": 0.6158584356307983, "learning_rate": 5.922812380588461e-06, "loss": 0.1519, "step": 1553 }, { "epoch": 0.47508407214918985, "grad_norm": 0.5261646509170532, "learning_rate": 5.9266335498662595e-06, "loss": 0.1172, "step": 1554 }, { "epoch": 0.47538978905533474, "grad_norm": 0.4238406717777252, "learning_rate": 5.930454719144058e-06, "loss": 0.1138, "step": 1555 }, { "epoch": 0.4756955059614797, "grad_norm": 0.5132592916488647, "learning_rate": 5.934275888421857e-06, "loss": 0.1489, "step": 1556 }, { "epoch": 0.4760012228676246, "grad_norm": 0.489541232585907, "learning_rate": 5.938097057699657e-06, "loss": 0.1029, "step": 1557 }, { "epoch": 0.4763069397737695, "grad_norm": 0.7315694689750671, "learning_rate": 5.941918226977455e-06, "loss": 0.1105, "step": 1558 }, { "epoch": 0.4766126566799144, "grad_norm": 0.6320673227310181, "learning_rate": 5.945739396255254e-06, "loss": 0.1747, "step": 1559 }, { "epoch": 0.4769183735860593, "grad_norm": 0.6652668714523315, "learning_rate": 5.949560565533053e-06, "loss": 0.1051, "step": 1560 }, { "epoch": 0.4772240904922042, "grad_norm": 0.7119699120521545, "learning_rate": 5.953381734810852e-06, "loss": 0.2037, "step": 1561 }, { "epoch": 0.4775298073983491, "grad_norm": 0.9247368574142456, "learning_rate": 5.957202904088651e-06, "loss": 0.1742, "step": 1562 }, { "epoch": 0.47783552430449405, "grad_norm": 0.7939284443855286, "learning_rate": 5.96102407336645e-06, "loss": 0.2186, "step": 1563 }, { "epoch": 0.47814124121063895, "grad_norm": 1.5275377035140991, "learning_rate": 5.964845242644249e-06, "loss": 0.1897, "step": 1564 }, { "epoch": 0.47844695811678384, "grad_norm": 0.8849748373031616, "learning_rate": 5.968666411922048e-06, "loss": 0.226, "step": 1565 }, { "epoch": 0.4787526750229288, "grad_norm": 1.4264332056045532, "learning_rate": 5.972487581199847e-06, "loss": 0.2697, "step": 1566 }, { "epoch": 0.4790583919290737, "grad_norm": 1.2362239360809326, "learning_rate": 5.976308750477646e-06, "loss": 0.2519, "step": 1567 }, { "epoch": 0.47936410883521857, "grad_norm": 1.2370015382766724, "learning_rate": 5.980129919755445e-06, "loss": 0.2949, "step": 1568 }, { "epoch": 0.4796698257413635, "grad_norm": 1.231164574623108, "learning_rate": 5.983951089033245e-06, "loss": 0.3011, "step": 1569 }, { "epoch": 0.4799755426475084, "grad_norm": 1.7411141395568848, "learning_rate": 5.987772258311044e-06, "loss": 0.3082, "step": 1570 }, { "epoch": 0.4802812595536533, "grad_norm": 1.3877772092819214, "learning_rate": 5.991593427588842e-06, "loss": 0.3462, "step": 1571 }, { "epoch": 0.48058697645979825, "grad_norm": 1.653743028640747, "learning_rate": 5.995414596866641e-06, "loss": 0.3205, "step": 1572 }, { "epoch": 0.48089269336594315, "grad_norm": 2.664297580718994, "learning_rate": 5.999235766144441e-06, "loss": 0.3775, "step": 1573 }, { "epoch": 0.48119841027208804, "grad_norm": 2.1151676177978516, "learning_rate": 6.0030569354222395e-06, "loss": 0.441, "step": 1574 }, { "epoch": 0.48150412717823293, "grad_norm": 3.1114697456359863, "learning_rate": 6.006878104700038e-06, "loss": 0.5031, "step": 1575 }, { "epoch": 0.4818098440843779, "grad_norm": 1.0440870523452759, "learning_rate": 6.010699273977838e-06, "loss": 0.2442, "step": 1576 }, { "epoch": 0.4821155609905228, "grad_norm": 0.4584539532661438, "learning_rate": 6.0145204432556366e-06, "loss": 0.1554, "step": 1577 }, { "epoch": 0.48242127789666767, "grad_norm": 0.6052743196487427, "learning_rate": 6.018341612533435e-06, "loss": 0.1355, "step": 1578 }, { "epoch": 0.4827269948028126, "grad_norm": 0.5751933455467224, "learning_rate": 6.022162781811234e-06, "loss": 0.1239, "step": 1579 }, { "epoch": 0.4830327117089575, "grad_norm": 0.5758290886878967, "learning_rate": 6.025983951089034e-06, "loss": 0.1303, "step": 1580 }, { "epoch": 0.4833384286151024, "grad_norm": 0.8872753977775574, "learning_rate": 6.029805120366832e-06, "loss": 0.1557, "step": 1581 }, { "epoch": 0.48364414552124735, "grad_norm": 0.4593057334423065, "learning_rate": 6.033626289644631e-06, "loss": 0.1012, "step": 1582 }, { "epoch": 0.48394986242739224, "grad_norm": 2.508209705352783, "learning_rate": 6.03744745892243e-06, "loss": 0.1886, "step": 1583 }, { "epoch": 0.48425557933353713, "grad_norm": 0.5534228086471558, "learning_rate": 6.0412686282002295e-06, "loss": 0.1446, "step": 1584 }, { "epoch": 0.484561296239682, "grad_norm": 0.6376481652259827, "learning_rate": 6.045089797478028e-06, "loss": 0.1213, "step": 1585 }, { "epoch": 0.484867013145827, "grad_norm": 0.831822395324707, "learning_rate": 6.048910966755827e-06, "loss": 0.1827, "step": 1586 }, { "epoch": 0.48517273005197187, "grad_norm": 0.5340111255645752, "learning_rate": 6.0527321360336266e-06, "loss": 0.149, "step": 1587 }, { "epoch": 0.48547844695811676, "grad_norm": 0.899723470211029, "learning_rate": 6.056553305311426e-06, "loss": 0.183, "step": 1588 }, { "epoch": 0.4857841638642617, "grad_norm": 0.7544511556625366, "learning_rate": 6.060374474589225e-06, "loss": 0.1985, "step": 1589 }, { "epoch": 0.4860898807704066, "grad_norm": 0.837821900844574, "learning_rate": 6.064195643867024e-06, "loss": 0.2104, "step": 1590 }, { "epoch": 0.4863955976765515, "grad_norm": 1.1283105611801147, "learning_rate": 6.068016813144822e-06, "loss": 0.2526, "step": 1591 }, { "epoch": 0.48670131458269644, "grad_norm": 1.5213171243667603, "learning_rate": 6.071837982422622e-06, "loss": 0.2586, "step": 1592 }, { "epoch": 0.48700703148884134, "grad_norm": 1.7326734066009521, "learning_rate": 6.075659151700421e-06, "loss": 0.2846, "step": 1593 }, { "epoch": 0.48731274839498623, "grad_norm": 1.5707367658615112, "learning_rate": 6.0794803209782195e-06, "loss": 0.3226, "step": 1594 }, { "epoch": 0.4876184653011312, "grad_norm": 1.4573074579238892, "learning_rate": 6.083301490256018e-06, "loss": 0.3242, "step": 1595 }, { "epoch": 0.48792418220727607, "grad_norm": 1.2020325660705566, "learning_rate": 6.087122659533818e-06, "loss": 0.3074, "step": 1596 }, { "epoch": 0.48822989911342096, "grad_norm": 1.8295068740844727, "learning_rate": 6.0909438288116165e-06, "loss": 0.361, "step": 1597 }, { "epoch": 0.48853561601956585, "grad_norm": 1.915524959564209, "learning_rate": 6.094764998089415e-06, "loss": 0.3962, "step": 1598 }, { "epoch": 0.4888413329257108, "grad_norm": 2.43646240234375, "learning_rate": 6.098586167367214e-06, "loss": 0.3812, "step": 1599 }, { "epoch": 0.4891470498318557, "grad_norm": 3.28507924079895, "learning_rate": 6.102407336645014e-06, "loss": 0.5736, "step": 1600 }, { "epoch": 0.4894527667380006, "grad_norm": 0.5855198502540588, "learning_rate": 6.106228505922812e-06, "loss": 0.2374, "step": 1601 }, { "epoch": 0.48975848364414554, "grad_norm": 0.6425602436065674, "learning_rate": 6.110049675200611e-06, "loss": 0.1849, "step": 1602 }, { "epoch": 0.49006420055029043, "grad_norm": 0.5482865571975708, "learning_rate": 6.11387084447841e-06, "loss": 0.1223, "step": 1603 }, { "epoch": 0.4903699174564353, "grad_norm": 0.49301162362098694, "learning_rate": 6.1176920137562094e-06, "loss": 0.1182, "step": 1604 }, { "epoch": 0.49067563436258027, "grad_norm": 0.464394748210907, "learning_rate": 6.121513183034008e-06, "loss": 0.1091, "step": 1605 }, { "epoch": 0.49098135126872516, "grad_norm": 0.5490759015083313, "learning_rate": 6.125334352311807e-06, "loss": 0.1237, "step": 1606 }, { "epoch": 0.49128706817487006, "grad_norm": 0.5359674692153931, "learning_rate": 6.129155521589607e-06, "loss": 0.1106, "step": 1607 }, { "epoch": 0.491592785081015, "grad_norm": 0.6040338277816772, "learning_rate": 6.132976690867406e-06, "loss": 0.1769, "step": 1608 }, { "epoch": 0.4918985019871599, "grad_norm": 0.5274519920349121, "learning_rate": 6.136797860145205e-06, "loss": 0.1422, "step": 1609 }, { "epoch": 0.4922042188933048, "grad_norm": 0.6024336218833923, "learning_rate": 6.140619029423004e-06, "loss": 0.1534, "step": 1610 }, { "epoch": 0.4925099357994497, "grad_norm": 0.8656171560287476, "learning_rate": 6.144440198700803e-06, "loss": 0.1631, "step": 1611 }, { "epoch": 0.49281565270559463, "grad_norm": 0.5407431721687317, "learning_rate": 6.148261367978602e-06, "loss": 0.17, "step": 1612 }, { "epoch": 0.4931213696117395, "grad_norm": 0.8366139531135559, "learning_rate": 6.152082537256401e-06, "loss": 0.2147, "step": 1613 }, { "epoch": 0.4934270865178844, "grad_norm": 0.7693113088607788, "learning_rate": 6.1559037065341994e-06, "loss": 0.1943, "step": 1614 }, { "epoch": 0.49373280342402937, "grad_norm": 1.0070055723190308, "learning_rate": 6.159724875811999e-06, "loss": 0.2573, "step": 1615 }, { "epoch": 0.49403852033017426, "grad_norm": 1.0975981950759888, "learning_rate": 6.163546045089798e-06, "loss": 0.2419, "step": 1616 }, { "epoch": 0.49434423723631915, "grad_norm": 0.9759030342102051, "learning_rate": 6.1673672143675965e-06, "loss": 0.2551, "step": 1617 }, { "epoch": 0.4946499541424641, "grad_norm": 1.008446216583252, "learning_rate": 6.171188383645395e-06, "loss": 0.2874, "step": 1618 }, { "epoch": 0.494955671048609, "grad_norm": 1.062767505645752, "learning_rate": 6.175009552923195e-06, "loss": 0.3149, "step": 1619 }, { "epoch": 0.4952613879547539, "grad_norm": 1.5594909191131592, "learning_rate": 6.178830722200994e-06, "loss": 0.3361, "step": 1620 }, { "epoch": 0.49556710486089883, "grad_norm": 1.493957281112671, "learning_rate": 6.182651891478792e-06, "loss": 0.3783, "step": 1621 }, { "epoch": 0.4958728217670437, "grad_norm": 1.4434784650802612, "learning_rate": 6.186473060756591e-06, "loss": 0.3477, "step": 1622 }, { "epoch": 0.4961785386731886, "grad_norm": 1.7336812019348145, "learning_rate": 6.190294230034391e-06, "loss": 0.3558, "step": 1623 }, { "epoch": 0.4964842555793335, "grad_norm": 1.7658923864364624, "learning_rate": 6.1941153993121894e-06, "loss": 0.3954, "step": 1624 }, { "epoch": 0.49678997248547846, "grad_norm": 2.757596731185913, "learning_rate": 6.197936568589988e-06, "loss": 0.4981, "step": 1625 }, { "epoch": 0.49709568939162335, "grad_norm": 0.611805260181427, "learning_rate": 6.201757737867788e-06, "loss": 0.2782, "step": 1626 }, { "epoch": 0.49740140629776824, "grad_norm": 0.533721923828125, "learning_rate": 6.205578907145587e-06, "loss": 0.1676, "step": 1627 }, { "epoch": 0.4977071232039132, "grad_norm": 0.4837518632411957, "learning_rate": 6.209400076423386e-06, "loss": 0.1387, "step": 1628 }, { "epoch": 0.4980128401100581, "grad_norm": 0.45139122009277344, "learning_rate": 6.213221245701185e-06, "loss": 0.0965, "step": 1629 }, { "epoch": 0.498318557016203, "grad_norm": 0.5238125324249268, "learning_rate": 6.217042414978984e-06, "loss": 0.1077, "step": 1630 }, { "epoch": 0.4986242739223479, "grad_norm": 0.48245754837989807, "learning_rate": 6.220863584256783e-06, "loss": 0.1037, "step": 1631 }, { "epoch": 0.4989299908284928, "grad_norm": 0.5867476463317871, "learning_rate": 6.224684753534582e-06, "loss": 0.1477, "step": 1632 }, { "epoch": 0.4992357077346377, "grad_norm": 0.5914271473884583, "learning_rate": 6.228505922812381e-06, "loss": 0.128, "step": 1633 }, { "epoch": 0.49954142464078266, "grad_norm": 0.6185508370399475, "learning_rate": 6.232327092090179e-06, "loss": 0.1346, "step": 1634 }, { "epoch": 0.49984714154692755, "grad_norm": 0.5250635147094727, "learning_rate": 6.236148261367979e-06, "loss": 0.1191, "step": 1635 }, { "epoch": 0.5001528584530724, "grad_norm": 0.8693307042121887, "learning_rate": 6.239969430645778e-06, "loss": 0.1697, "step": 1636 }, { "epoch": 0.5004585753592173, "grad_norm": 0.8064185380935669, "learning_rate": 6.2437905999235765e-06, "loss": 0.1317, "step": 1637 }, { "epoch": 0.5007642922653622, "grad_norm": 0.618786633014679, "learning_rate": 6.247611769201376e-06, "loss": 0.162, "step": 1638 }, { "epoch": 0.5010700091715072, "grad_norm": 1.03717839717865, "learning_rate": 6.251432938479175e-06, "loss": 0.2306, "step": 1639 }, { "epoch": 0.5013757260776521, "grad_norm": 0.763977587223053, "learning_rate": 6.2552541077569736e-06, "loss": 0.1903, "step": 1640 }, { "epoch": 0.501681442983797, "grad_norm": 0.9757720828056335, "learning_rate": 6.259075277034772e-06, "loss": 0.2766, "step": 1641 }, { "epoch": 0.5019871598899419, "grad_norm": 0.8174242377281189, "learning_rate": 6.262896446312572e-06, "loss": 0.2642, "step": 1642 }, { "epoch": 0.5022928767960868, "grad_norm": 1.1471222639083862, "learning_rate": 6.266717615590371e-06, "loss": 0.2949, "step": 1643 }, { "epoch": 0.5025985937022317, "grad_norm": 1.6578450202941895, "learning_rate": 6.270538784868169e-06, "loss": 0.3133, "step": 1644 }, { "epoch": 0.5029043106083766, "grad_norm": 1.3881834745407104, "learning_rate": 6.274359954145968e-06, "loss": 0.3063, "step": 1645 }, { "epoch": 0.5032100275145216, "grad_norm": 1.5048508644104004, "learning_rate": 6.278181123423769e-06, "loss": 0.3251, "step": 1646 }, { "epoch": 0.5035157444206665, "grad_norm": 4.338088035583496, "learning_rate": 6.282002292701567e-06, "loss": 0.2975, "step": 1647 }, { "epoch": 0.5038214613268114, "grad_norm": 1.5654417276382446, "learning_rate": 6.285823461979366e-06, "loss": 0.3339, "step": 1648 }, { "epoch": 0.5041271782329563, "grad_norm": 2.7351977825164795, "learning_rate": 6.289644631257165e-06, "loss": 0.4566, "step": 1649 }, { "epoch": 0.5044328951391012, "grad_norm": 2.563049077987671, "learning_rate": 6.293465800534964e-06, "loss": 0.4367, "step": 1650 }, { "epoch": 0.5047386120452461, "grad_norm": 0.6670291423797607, "learning_rate": 6.297286969812763e-06, "loss": 0.2453, "step": 1651 }, { "epoch": 0.5050443289513911, "grad_norm": 0.5144705176353455, "learning_rate": 6.301108139090562e-06, "loss": 0.146, "step": 1652 }, { "epoch": 0.505350045857536, "grad_norm": 0.5700180530548096, "learning_rate": 6.304929308368361e-06, "loss": 0.1407, "step": 1653 }, { "epoch": 0.5056557627636808, "grad_norm": 0.4960145056247711, "learning_rate": 6.30875047764616e-06, "loss": 0.1274, "step": 1654 }, { "epoch": 0.5059614796698257, "grad_norm": 0.6106263399124146, "learning_rate": 6.312571646923959e-06, "loss": 0.1063, "step": 1655 }, { "epoch": 0.5062671965759706, "grad_norm": 0.6492846012115479, "learning_rate": 6.316392816201758e-06, "loss": 0.116, "step": 1656 }, { "epoch": 0.5065729134821155, "grad_norm": 0.47340261936187744, "learning_rate": 6.3202139854795565e-06, "loss": 0.1139, "step": 1657 }, { "epoch": 0.5068786303882604, "grad_norm": 0.6347806453704834, "learning_rate": 6.324035154757356e-06, "loss": 0.1088, "step": 1658 }, { "epoch": 0.5071843472944054, "grad_norm": 0.5559546947479248, "learning_rate": 6.327856324035155e-06, "loss": 0.1336, "step": 1659 }, { "epoch": 0.5074900642005503, "grad_norm": 0.8151801228523254, "learning_rate": 6.3316774933129535e-06, "loss": 0.1319, "step": 1660 }, { "epoch": 0.5077957811066952, "grad_norm": 0.729644775390625, "learning_rate": 6.335498662590752e-06, "loss": 0.1878, "step": 1661 }, { "epoch": 0.5081014980128401, "grad_norm": 1.1487776041030884, "learning_rate": 6.339319831868552e-06, "loss": 0.1573, "step": 1662 }, { "epoch": 0.508407214918985, "grad_norm": 0.7020917534828186, "learning_rate": 6.343141001146351e-06, "loss": 0.1656, "step": 1663 }, { "epoch": 0.5087129318251299, "grad_norm": 1.0798276662826538, "learning_rate": 6.346962170424149e-06, "loss": 0.2263, "step": 1664 }, { "epoch": 0.5090186487312749, "grad_norm": 0.8994076251983643, "learning_rate": 6.350783339701949e-06, "loss": 0.2019, "step": 1665 }, { "epoch": 0.5093243656374198, "grad_norm": 1.0153549909591675, "learning_rate": 6.3546045089797486e-06, "loss": 0.2553, "step": 1666 }, { "epoch": 0.5096300825435647, "grad_norm": 0.9655422568321228, "learning_rate": 6.358425678257547e-06, "loss": 0.2602, "step": 1667 }, { "epoch": 0.5099357994497096, "grad_norm": 1.2048882246017456, "learning_rate": 6.362246847535346e-06, "loss": 0.2632, "step": 1668 }, { "epoch": 0.5102415163558545, "grad_norm": 1.4865163564682007, "learning_rate": 6.366068016813145e-06, "loss": 0.3065, "step": 1669 }, { "epoch": 0.5105472332619994, "grad_norm": 1.376583456993103, "learning_rate": 6.369889186090944e-06, "loss": 0.3029, "step": 1670 }, { "epoch": 0.5108529501681442, "grad_norm": 1.279587984085083, "learning_rate": 6.373710355368743e-06, "loss": 0.3445, "step": 1671 }, { "epoch": 0.5111586670742893, "grad_norm": 1.405760645866394, "learning_rate": 6.377531524646542e-06, "loss": 0.3272, "step": 1672 }, { "epoch": 0.5114643839804341, "grad_norm": 1.6558563709259033, "learning_rate": 6.3813526939243415e-06, "loss": 0.3537, "step": 1673 }, { "epoch": 0.511770100886579, "grad_norm": 2.5302646160125732, "learning_rate": 6.38517386320214e-06, "loss": 0.3902, "step": 1674 }, { "epoch": 0.5120758177927239, "grad_norm": 2.25349497795105, "learning_rate": 6.388995032479939e-06, "loss": 0.4142, "step": 1675 }, { "epoch": 0.5123815346988688, "grad_norm": 0.7292546033859253, "learning_rate": 6.392816201757738e-06, "loss": 0.2218, "step": 1676 }, { "epoch": 0.5126872516050137, "grad_norm": 0.5065193176269531, "learning_rate": 6.396637371035537e-06, "loss": 0.1358, "step": 1677 }, { "epoch": 0.5129929685111587, "grad_norm": 0.4380475580692291, "learning_rate": 6.400458540313336e-06, "loss": 0.1439, "step": 1678 }, { "epoch": 0.5132986854173036, "grad_norm": 0.5089801549911499, "learning_rate": 6.404279709591135e-06, "loss": 0.1098, "step": 1679 }, { "epoch": 0.5136044023234485, "grad_norm": 0.38961079716682434, "learning_rate": 6.4081008788689335e-06, "loss": 0.0935, "step": 1680 }, { "epoch": 0.5139101192295934, "grad_norm": 0.4964177906513214, "learning_rate": 6.411922048146733e-06, "loss": 0.1197, "step": 1681 }, { "epoch": 0.5142158361357383, "grad_norm": 0.5046684145927429, "learning_rate": 6.415743217424532e-06, "loss": 0.0856, "step": 1682 }, { "epoch": 0.5145215530418832, "grad_norm": 0.6274060010910034, "learning_rate": 6.419564386702331e-06, "loss": 0.1414, "step": 1683 }, { "epoch": 0.5148272699480281, "grad_norm": 0.4708370864391327, "learning_rate": 6.42338555598013e-06, "loss": 0.1196, "step": 1684 }, { "epoch": 0.5151329868541731, "grad_norm": 0.6558343172073364, "learning_rate": 6.42720672525793e-06, "loss": 0.1299, "step": 1685 }, { "epoch": 0.515438703760318, "grad_norm": 0.6934751868247986, "learning_rate": 6.4310278945357285e-06, "loss": 0.2115, "step": 1686 }, { "epoch": 0.5157444206664629, "grad_norm": 0.6181259751319885, "learning_rate": 6.434849063813527e-06, "loss": 0.1112, "step": 1687 }, { "epoch": 0.5160501375726078, "grad_norm": 0.8272998929023743, "learning_rate": 6.438670233091326e-06, "loss": 0.1689, "step": 1688 }, { "epoch": 0.5163558544787527, "grad_norm": 0.7596091032028198, "learning_rate": 6.442491402369126e-06, "loss": 0.1981, "step": 1689 }, { "epoch": 0.5166615713848975, "grad_norm": 0.9862402677536011, "learning_rate": 6.446312571646924e-06, "loss": 0.2312, "step": 1690 }, { "epoch": 0.5169672882910425, "grad_norm": 0.9424225091934204, "learning_rate": 6.450133740924723e-06, "loss": 0.2756, "step": 1691 }, { "epoch": 0.5172730051971874, "grad_norm": 1.1410669088363647, "learning_rate": 6.453954910202522e-06, "loss": 0.2741, "step": 1692 }, { "epoch": 0.5175787221033323, "grad_norm": 0.9108328819274902, "learning_rate": 6.4577760794803214e-06, "loss": 0.2855, "step": 1693 }, { "epoch": 0.5178844390094772, "grad_norm": 1.3621546030044556, "learning_rate": 6.46159724875812e-06, "loss": 0.3046, "step": 1694 }, { "epoch": 0.5181901559156221, "grad_norm": 1.0968899726867676, "learning_rate": 6.465418418035919e-06, "loss": 0.2958, "step": 1695 }, { "epoch": 0.518495872821767, "grad_norm": 1.5882469415664673, "learning_rate": 6.469239587313718e-06, "loss": 0.2745, "step": 1696 }, { "epoch": 0.5188015897279119, "grad_norm": 2.4131453037261963, "learning_rate": 6.473060756591517e-06, "loss": 0.3655, "step": 1697 }, { "epoch": 0.5191073066340569, "grad_norm": 4.667337417602539, "learning_rate": 6.476881925869316e-06, "loss": 0.3067, "step": 1698 }, { "epoch": 0.5194130235402018, "grad_norm": 5.6264166831970215, "learning_rate": 6.480703095147115e-06, "loss": 0.4418, "step": 1699 }, { "epoch": 0.5197187404463467, "grad_norm": 4.336860656738281, "learning_rate": 6.4845242644249135e-06, "loss": 0.5145, "step": 1700 }, { "epoch": 0.5200244573524916, "grad_norm": 0.7986057996749878, "learning_rate": 6.488345433702713e-06, "loss": 0.2602, "step": 1701 }, { "epoch": 0.5203301742586365, "grad_norm": 0.5907853245735168, "learning_rate": 6.492166602980512e-06, "loss": 0.1502, "step": 1702 }, { "epoch": 0.5206358911647814, "grad_norm": 0.694563627243042, "learning_rate": 6.495987772258311e-06, "loss": 0.1106, "step": 1703 }, { "epoch": 0.5209416080709264, "grad_norm": 0.6006177067756653, "learning_rate": 6.499808941536111e-06, "loss": 0.1457, "step": 1704 }, { "epoch": 0.5212473249770713, "grad_norm": 0.5028936266899109, "learning_rate": 6.50363011081391e-06, "loss": 0.1322, "step": 1705 }, { "epoch": 0.5215530418832162, "grad_norm": 0.5607298016548157, "learning_rate": 6.5074512800917085e-06, "loss": 0.0956, "step": 1706 }, { "epoch": 0.521858758789361, "grad_norm": 0.5568869709968567, "learning_rate": 6.511272449369507e-06, "loss": 0.109, "step": 1707 }, { "epoch": 0.522164475695506, "grad_norm": 0.7582440376281738, "learning_rate": 6.515093618647307e-06, "loss": 0.1555, "step": 1708 }, { "epoch": 0.5224701926016508, "grad_norm": 0.5096423029899597, "learning_rate": 6.518914787925106e-06, "loss": 0.115, "step": 1709 }, { "epoch": 0.5227759095077957, "grad_norm": 0.4977381229400635, "learning_rate": 6.522735957202904e-06, "loss": 0.1243, "step": 1710 }, { "epoch": 0.5230816264139407, "grad_norm": 0.9183540940284729, "learning_rate": 6.526557126480703e-06, "loss": 0.1583, "step": 1711 }, { "epoch": 0.5233873433200856, "grad_norm": 0.6528307199478149, "learning_rate": 6.530378295758503e-06, "loss": 0.1546, "step": 1712 }, { "epoch": 0.5236930602262305, "grad_norm": 0.6433296799659729, "learning_rate": 6.534199465036301e-06, "loss": 0.1568, "step": 1713 }, { "epoch": 0.5239987771323754, "grad_norm": 0.765804648399353, "learning_rate": 6.5380206343141e-06, "loss": 0.1795, "step": 1714 }, { "epoch": 0.5243044940385203, "grad_norm": 1.1224417686462402, "learning_rate": 6.541841803591899e-06, "loss": 0.2195, "step": 1715 }, { "epoch": 0.5246102109446652, "grad_norm": 0.9797439575195312, "learning_rate": 6.5456629728696985e-06, "loss": 0.2198, "step": 1716 }, { "epoch": 0.5249159278508102, "grad_norm": 1.0683081150054932, "learning_rate": 6.549484142147497e-06, "loss": 0.254, "step": 1717 }, { "epoch": 0.5252216447569551, "grad_norm": 2.301647901535034, "learning_rate": 6.553305311425296e-06, "loss": 0.3259, "step": 1718 }, { "epoch": 0.5255273616631, "grad_norm": 1.9776228666305542, "learning_rate": 6.557126480703095e-06, "loss": 0.317, "step": 1719 }, { "epoch": 0.5258330785692449, "grad_norm": 1.1763262748718262, "learning_rate": 6.560947649980894e-06, "loss": 0.3543, "step": 1720 }, { "epoch": 0.5261387954753898, "grad_norm": 1.8406590223312378, "learning_rate": 6.564768819258693e-06, "loss": 0.3035, "step": 1721 }, { "epoch": 0.5264445123815347, "grad_norm": 3.476926565170288, "learning_rate": 6.568589988536492e-06, "loss": 0.3478, "step": 1722 }, { "epoch": 0.5267502292876796, "grad_norm": 1.8574650287628174, "learning_rate": 6.572411157814291e-06, "loss": 0.3549, "step": 1723 }, { "epoch": 0.5270559461938246, "grad_norm": 2.153702735900879, "learning_rate": 6.576232327092091e-06, "loss": 0.4455, "step": 1724 }, { "epoch": 0.5273616630999695, "grad_norm": 3.9471218585968018, "learning_rate": 6.58005349636989e-06, "loss": 0.4863, "step": 1725 }, { "epoch": 0.5276673800061144, "grad_norm": 0.9496064186096191, "learning_rate": 6.5838746656476885e-06, "loss": 0.2702, "step": 1726 }, { "epoch": 0.5279730969122592, "grad_norm": 0.8866237998008728, "learning_rate": 6.587695834925487e-06, "loss": 0.1316, "step": 1727 }, { "epoch": 0.5282788138184041, "grad_norm": 0.6133860349655151, "learning_rate": 6.591517004203287e-06, "loss": 0.1249, "step": 1728 }, { "epoch": 0.528584530724549, "grad_norm": 0.6006677746772766, "learning_rate": 6.5953381734810856e-06, "loss": 0.1499, "step": 1729 }, { "epoch": 0.528890247630694, "grad_norm": 0.47958436608314514, "learning_rate": 6.599159342758884e-06, "loss": 0.1053, "step": 1730 }, { "epoch": 0.5291959645368389, "grad_norm": 0.5783552527427673, "learning_rate": 6.602980512036683e-06, "loss": 0.1159, "step": 1731 }, { "epoch": 0.5295016814429838, "grad_norm": 0.5806292295455933, "learning_rate": 6.606801681314483e-06, "loss": 0.1372, "step": 1732 }, { "epoch": 0.5298073983491287, "grad_norm": 0.9246692061424255, "learning_rate": 6.610622850592281e-06, "loss": 0.1115, "step": 1733 }, { "epoch": 0.5301131152552736, "grad_norm": 0.7009364366531372, "learning_rate": 6.61444401987008e-06, "loss": 0.1308, "step": 1734 }, { "epoch": 0.5304188321614185, "grad_norm": 0.646483302116394, "learning_rate": 6.61826518914788e-06, "loss": 0.126, "step": 1735 }, { "epoch": 0.5307245490675634, "grad_norm": 0.7322234511375427, "learning_rate": 6.6220863584256785e-06, "loss": 0.1885, "step": 1736 }, { "epoch": 0.5310302659737084, "grad_norm": 0.7209143042564392, "learning_rate": 6.625907527703477e-06, "loss": 0.1593, "step": 1737 }, { "epoch": 0.5313359828798533, "grad_norm": 0.6876514554023743, "learning_rate": 6.629728696981276e-06, "loss": 0.1866, "step": 1738 }, { "epoch": 0.5316416997859982, "grad_norm": 0.721433162689209, "learning_rate": 6.6335498662590756e-06, "loss": 0.22, "step": 1739 }, { "epoch": 0.5319474166921431, "grad_norm": 1.4250761270523071, "learning_rate": 6.637371035536874e-06, "loss": 0.1925, "step": 1740 }, { "epoch": 0.532253133598288, "grad_norm": 0.9893543124198914, "learning_rate": 6.641192204814673e-06, "loss": 0.2499, "step": 1741 }, { "epoch": 0.5325588505044329, "grad_norm": 1.0274531841278076, "learning_rate": 6.645013374092472e-06, "loss": 0.2387, "step": 1742 }, { "epoch": 0.5328645674105777, "grad_norm": 1.2412492036819458, "learning_rate": 6.648834543370272e-06, "loss": 0.2847, "step": 1743 }, { "epoch": 0.5331702843167228, "grad_norm": 1.5094294548034668, "learning_rate": 6.652655712648071e-06, "loss": 0.3024, "step": 1744 }, { "epoch": 0.5334760012228676, "grad_norm": 1.6246464252471924, "learning_rate": 6.65647688192587e-06, "loss": 0.3876, "step": 1745 }, { "epoch": 0.5337817181290125, "grad_norm": 2.33821439743042, "learning_rate": 6.6602980512036685e-06, "loss": 0.3135, "step": 1746 }, { "epoch": 0.5340874350351574, "grad_norm": 2.55466890335083, "learning_rate": 6.664119220481468e-06, "loss": 0.3671, "step": 1747 }, { "epoch": 0.5343931519413023, "grad_norm": 1.6293448209762573, "learning_rate": 6.667940389759267e-06, "loss": 0.3639, "step": 1748 }, { "epoch": 0.5346988688474472, "grad_norm": 1.3241603374481201, "learning_rate": 6.6717615590370655e-06, "loss": 0.3982, "step": 1749 }, { "epoch": 0.5350045857535922, "grad_norm": 1.7417325973510742, "learning_rate": 6.675582728314864e-06, "loss": 0.4296, "step": 1750 }, { "epoch": 0.5353103026597371, "grad_norm": 0.780276358127594, "learning_rate": 6.679403897592664e-06, "loss": 0.2665, "step": 1751 }, { "epoch": 0.535616019565882, "grad_norm": 0.5509480237960815, "learning_rate": 6.683225066870463e-06, "loss": 0.1691, "step": 1752 }, { "epoch": 0.5359217364720269, "grad_norm": 0.42979732155799866, "learning_rate": 6.687046236148261e-06, "loss": 0.1185, "step": 1753 }, { "epoch": 0.5362274533781718, "grad_norm": 0.6056907176971436, "learning_rate": 6.69086740542606e-06, "loss": 0.1316, "step": 1754 }, { "epoch": 0.5365331702843167, "grad_norm": 0.560955822467804, "learning_rate": 6.69468857470386e-06, "loss": 0.1338, "step": 1755 }, { "epoch": 0.5368388871904616, "grad_norm": 0.6280403137207031, "learning_rate": 6.6985097439816584e-06, "loss": 0.1065, "step": 1756 }, { "epoch": 0.5371446040966066, "grad_norm": 0.5801345705986023, "learning_rate": 6.702330913259457e-06, "loss": 0.1126, "step": 1757 }, { "epoch": 0.5374503210027515, "grad_norm": 1.837843656539917, "learning_rate": 6.706152082537256e-06, "loss": 0.1205, "step": 1758 }, { "epoch": 0.5377560379088964, "grad_norm": 0.5427421927452087, "learning_rate": 6.7099732518150555e-06, "loss": 0.1318, "step": 1759 }, { "epoch": 0.5380617548150413, "grad_norm": 0.8376352190971375, "learning_rate": 6.713794421092854e-06, "loss": 0.1422, "step": 1760 }, { "epoch": 0.5383674717211862, "grad_norm": 0.6468901038169861, "learning_rate": 6.717615590370653e-06, "loss": 0.1909, "step": 1761 }, { "epoch": 0.538673188627331, "grad_norm": 0.637401819229126, "learning_rate": 6.721436759648453e-06, "loss": 0.1544, "step": 1762 }, { "epoch": 0.538978905533476, "grad_norm": 0.9416242837905884, "learning_rate": 6.725257928926252e-06, "loss": 0.1752, "step": 1763 }, { "epoch": 0.5392846224396209, "grad_norm": 0.8398690819740295, "learning_rate": 6.729079098204051e-06, "loss": 0.2028, "step": 1764 }, { "epoch": 0.5395903393457658, "grad_norm": 0.7533447742462158, "learning_rate": 6.73290026748185e-06, "loss": 0.1822, "step": 1765 }, { "epoch": 0.5398960562519107, "grad_norm": 1.7478350400924683, "learning_rate": 6.7367214367596484e-06, "loss": 0.2139, "step": 1766 }, { "epoch": 0.5402017731580556, "grad_norm": 0.8349722027778625, "learning_rate": 6.740542606037448e-06, "loss": 0.2829, "step": 1767 }, { "epoch": 0.5405074900642005, "grad_norm": 0.8959169983863831, "learning_rate": 6.744363775315247e-06, "loss": 0.2922, "step": 1768 }, { "epoch": 0.5408132069703454, "grad_norm": 1.2058583498001099, "learning_rate": 6.7481849445930455e-06, "loss": 0.2908, "step": 1769 }, { "epoch": 0.5411189238764904, "grad_norm": 1.068112850189209, "learning_rate": 6.752006113870845e-06, "loss": 0.2972, "step": 1770 }, { "epoch": 0.5414246407826353, "grad_norm": 1.0865105390548706, "learning_rate": 6.755827283148644e-06, "loss": 0.2919, "step": 1771 }, { "epoch": 0.5417303576887802, "grad_norm": 1.4941085577011108, "learning_rate": 6.759648452426443e-06, "loss": 0.3471, "step": 1772 }, { "epoch": 0.5420360745949251, "grad_norm": 1.738445520401001, "learning_rate": 6.763469621704241e-06, "loss": 0.3238, "step": 1773 }, { "epoch": 0.54234179150107, "grad_norm": 1.6376501321792603, "learning_rate": 6.767290790982041e-06, "loss": 0.3471, "step": 1774 }, { "epoch": 0.5426475084072149, "grad_norm": 2.122401475906372, "learning_rate": 6.77111196025984e-06, "loss": 0.4833, "step": 1775 }, { "epoch": 0.5429532253133599, "grad_norm": 0.7851406335830688, "learning_rate": 6.774933129537638e-06, "loss": 0.2308, "step": 1776 }, { "epoch": 0.5432589422195048, "grad_norm": 0.7153325080871582, "learning_rate": 6.778754298815437e-06, "loss": 0.1598, "step": 1777 }, { "epoch": 0.5435646591256497, "grad_norm": 0.9585533738136292, "learning_rate": 6.782575468093237e-06, "loss": 0.1256, "step": 1778 }, { "epoch": 0.5438703760317946, "grad_norm": 0.4236360490322113, "learning_rate": 6.7863966373710355e-06, "loss": 0.1114, "step": 1779 }, { "epoch": 0.5441760929379394, "grad_norm": 0.47418978810310364, "learning_rate": 6.790217806648834e-06, "loss": 0.0905, "step": 1780 }, { "epoch": 0.5444818098440843, "grad_norm": 0.3970838189125061, "learning_rate": 6.794038975926633e-06, "loss": 0.0894, "step": 1781 }, { "epoch": 0.5447875267502292, "grad_norm": 0.5803502798080444, "learning_rate": 6.7978601452044334e-06, "loss": 0.1147, "step": 1782 }, { "epoch": 0.5450932436563742, "grad_norm": 0.5404546856880188, "learning_rate": 6.801681314482232e-06, "loss": 0.1368, "step": 1783 }, { "epoch": 0.5453989605625191, "grad_norm": 0.5083880424499512, "learning_rate": 6.805502483760031e-06, "loss": 0.1239, "step": 1784 }, { "epoch": 0.545704677468664, "grad_norm": 0.5567473769187927, "learning_rate": 6.80932365303783e-06, "loss": 0.1137, "step": 1785 }, { "epoch": 0.5460103943748089, "grad_norm": 0.7537161707878113, "learning_rate": 6.813144822315629e-06, "loss": 0.1753, "step": 1786 }, { "epoch": 0.5463161112809538, "grad_norm": 1.0778430700302124, "learning_rate": 6.816965991593428e-06, "loss": 0.1618, "step": 1787 }, { "epoch": 0.5466218281870987, "grad_norm": 0.7368510961532593, "learning_rate": 6.820787160871227e-06, "loss": 0.1916, "step": 1788 }, { "epoch": 0.5469275450932437, "grad_norm": 0.7914478182792664, "learning_rate": 6.8246083301490255e-06, "loss": 0.1868, "step": 1789 }, { "epoch": 0.5472332619993886, "grad_norm": 0.8107283115386963, "learning_rate": 6.828429499426825e-06, "loss": 0.1835, "step": 1790 }, { "epoch": 0.5475389789055335, "grad_norm": 0.9262338280677795, "learning_rate": 6.832250668704624e-06, "loss": 0.2363, "step": 1791 }, { "epoch": 0.5478446958116784, "grad_norm": 0.7571263313293457, "learning_rate": 6.8360718379824226e-06, "loss": 0.2751, "step": 1792 }, { "epoch": 0.5481504127178233, "grad_norm": 1.1687792539596558, "learning_rate": 6.839893007260221e-06, "loss": 0.2702, "step": 1793 }, { "epoch": 0.5484561296239682, "grad_norm": 1.3911783695220947, "learning_rate": 6.843714176538021e-06, "loss": 0.3352, "step": 1794 }, { "epoch": 0.5487618465301131, "grad_norm": 4.138689041137695, "learning_rate": 6.84753534581582e-06, "loss": 0.3185, "step": 1795 }, { "epoch": 0.5490675634362581, "grad_norm": 1.644181489944458, "learning_rate": 6.851356515093618e-06, "loss": 0.3027, "step": 1796 }, { "epoch": 0.549373280342403, "grad_norm": 1.4144753217697144, "learning_rate": 6.855177684371418e-06, "loss": 0.3259, "step": 1797 }, { "epoch": 0.5496789972485479, "grad_norm": 2.061938524246216, "learning_rate": 6.858998853649217e-06, "loss": 0.3478, "step": 1798 }, { "epoch": 0.5499847141546927, "grad_norm": 3.06655216217041, "learning_rate": 6.8628200229270155e-06, "loss": 0.4054, "step": 1799 }, { "epoch": 0.5502904310608376, "grad_norm": 3.4562156200408936, "learning_rate": 6.866641192204814e-06, "loss": 0.5118, "step": 1800 }, { "epoch": 0.5505961479669825, "grad_norm": 0.7622950673103333, "learning_rate": 6.870462361482615e-06, "loss": 0.238, "step": 1801 }, { "epoch": 0.5509018648731275, "grad_norm": 0.6364317536354065, "learning_rate": 6.874283530760413e-06, "loss": 0.1448, "step": 1802 }, { "epoch": 0.5512075817792724, "grad_norm": 0.48387864232063293, "learning_rate": 6.878104700038212e-06, "loss": 0.1069, "step": 1803 }, { "epoch": 0.5515132986854173, "grad_norm": 0.5266673564910889, "learning_rate": 6.881925869316011e-06, "loss": 0.1277, "step": 1804 }, { "epoch": 0.5518190155915622, "grad_norm": 0.4874516427516937, "learning_rate": 6.8857470385938105e-06, "loss": 0.1209, "step": 1805 }, { "epoch": 0.5521247324977071, "grad_norm": 0.7448146343231201, "learning_rate": 6.889568207871609e-06, "loss": 0.099, "step": 1806 }, { "epoch": 0.552430449403852, "grad_norm": 0.47271373867988586, "learning_rate": 6.893389377149408e-06, "loss": 0.0935, "step": 1807 }, { "epoch": 0.5527361663099969, "grad_norm": 0.5431455969810486, "learning_rate": 6.897210546427207e-06, "loss": 0.1603, "step": 1808 }, { "epoch": 0.5530418832161419, "grad_norm": 0.5700578093528748, "learning_rate": 6.901031715705006e-06, "loss": 0.1013, "step": 1809 }, { "epoch": 0.5533476001222868, "grad_norm": 0.4578174352645874, "learning_rate": 6.904852884982805e-06, "loss": 0.1202, "step": 1810 }, { "epoch": 0.5536533170284317, "grad_norm": 0.7084099650382996, "learning_rate": 6.908674054260604e-06, "loss": 0.1227, "step": 1811 }, { "epoch": 0.5539590339345766, "grad_norm": 0.5544901490211487, "learning_rate": 6.9124952235384025e-06, "loss": 0.1457, "step": 1812 }, { "epoch": 0.5542647508407215, "grad_norm": 1.721486210823059, "learning_rate": 6.916316392816202e-06, "loss": 0.2126, "step": 1813 }, { "epoch": 0.5545704677468664, "grad_norm": 0.8468775749206543, "learning_rate": 6.920137562094001e-06, "loss": 0.1867, "step": 1814 }, { "epoch": 0.5548761846530114, "grad_norm": 0.8471442461013794, "learning_rate": 6.9239587313718e-06, "loss": 0.2173, "step": 1815 }, { "epoch": 0.5551819015591563, "grad_norm": 0.848345160484314, "learning_rate": 6.927779900649598e-06, "loss": 0.2569, "step": 1816 }, { "epoch": 0.5554876184653011, "grad_norm": 0.9333407282829285, "learning_rate": 6.931601069927398e-06, "loss": 0.2259, "step": 1817 }, { "epoch": 0.555793335371446, "grad_norm": 0.9360960125923157, "learning_rate": 6.935422239205197e-06, "loss": 0.2619, "step": 1818 }, { "epoch": 0.5560990522775909, "grad_norm": 0.8180817365646362, "learning_rate": 6.9392434084829955e-06, "loss": 0.2503, "step": 1819 }, { "epoch": 0.5564047691837358, "grad_norm": 1.3674391508102417, "learning_rate": 6.943064577760794e-06, "loss": 0.268, "step": 1820 }, { "epoch": 0.5567104860898807, "grad_norm": 1.4019544124603271, "learning_rate": 6.946885747038595e-06, "loss": 0.333, "step": 1821 }, { "epoch": 0.5570162029960257, "grad_norm": 2.144352436065674, "learning_rate": 6.950706916316393e-06, "loss": 0.3103, "step": 1822 }, { "epoch": 0.5573219199021706, "grad_norm": 1.6411892175674438, "learning_rate": 6.954528085594192e-06, "loss": 0.3443, "step": 1823 }, { "epoch": 0.5576276368083155, "grad_norm": 1.988133192062378, "learning_rate": 6.958349254871991e-06, "loss": 0.3871, "step": 1824 }, { "epoch": 0.5579333537144604, "grad_norm": 2.3542673587799072, "learning_rate": 6.9621704241497905e-06, "loss": 0.4486, "step": 1825 }, { "epoch": 0.5582390706206053, "grad_norm": 0.6053528785705566, "learning_rate": 6.965991593427589e-06, "loss": 0.2243, "step": 1826 }, { "epoch": 0.5585447875267502, "grad_norm": 0.5742367506027222, "learning_rate": 6.969812762705388e-06, "loss": 0.1573, "step": 1827 }, { "epoch": 0.5588505044328952, "grad_norm": 0.4263661205768585, "learning_rate": 6.973633931983187e-06, "loss": 0.1049, "step": 1828 }, { "epoch": 0.5591562213390401, "grad_norm": 0.5401995778083801, "learning_rate": 6.977455101260986e-06, "loss": 0.1469, "step": 1829 }, { "epoch": 0.559461938245185, "grad_norm": 0.6042286157608032, "learning_rate": 6.981276270538785e-06, "loss": 0.1294, "step": 1830 }, { "epoch": 0.5597676551513299, "grad_norm": 0.6417702436447144, "learning_rate": 6.985097439816584e-06, "loss": 0.1027, "step": 1831 }, { "epoch": 0.5600733720574748, "grad_norm": 0.5610223412513733, "learning_rate": 6.988918609094383e-06, "loss": 0.1123, "step": 1832 }, { "epoch": 0.5603790889636197, "grad_norm": 0.5107663869857788, "learning_rate": 6.992739778372182e-06, "loss": 0.1059, "step": 1833 }, { "epoch": 0.5606848058697645, "grad_norm": 0.5423054695129395, "learning_rate": 6.996560947649981e-06, "loss": 0.1275, "step": 1834 }, { "epoch": 0.5609905227759096, "grad_norm": 0.5394623875617981, "learning_rate": 7.00038211692778e-06, "loss": 0.1467, "step": 1835 }, { "epoch": 0.5612962396820544, "grad_norm": 0.706482470035553, "learning_rate": 7.004203286205579e-06, "loss": 0.1576, "step": 1836 }, { "epoch": 0.5616019565881993, "grad_norm": 0.873502254486084, "learning_rate": 7.008024455483378e-06, "loss": 0.1959, "step": 1837 }, { "epoch": 0.5619076734943442, "grad_norm": 0.6734521985054016, "learning_rate": 7.011845624761177e-06, "loss": 0.1791, "step": 1838 }, { "epoch": 0.5622133904004891, "grad_norm": 0.6665511727333069, "learning_rate": 7.0156667940389754e-06, "loss": 0.1849, "step": 1839 }, { "epoch": 0.562519107306634, "grad_norm": 0.8156330585479736, "learning_rate": 7.019487963316776e-06, "loss": 0.209, "step": 1840 }, { "epoch": 0.562824824212779, "grad_norm": 0.8651542067527771, "learning_rate": 7.023309132594575e-06, "loss": 0.2279, "step": 1841 }, { "epoch": 0.5631305411189239, "grad_norm": 1.0602695941925049, "learning_rate": 7.027130301872373e-06, "loss": 0.2359, "step": 1842 }, { "epoch": 0.5634362580250688, "grad_norm": 1.0098832845687866, "learning_rate": 7.030951471150172e-06, "loss": 0.2854, "step": 1843 }, { "epoch": 0.5637419749312137, "grad_norm": 1.1456831693649292, "learning_rate": 7.034772640427972e-06, "loss": 0.3244, "step": 1844 }, { "epoch": 0.5640476918373586, "grad_norm": 1.1722238063812256, "learning_rate": 7.0385938097057704e-06, "loss": 0.2631, "step": 1845 }, { "epoch": 0.5643534087435035, "grad_norm": 1.3385485410690308, "learning_rate": 7.042414978983569e-06, "loss": 0.3364, "step": 1846 }, { "epoch": 0.5646591256496484, "grad_norm": 1.5197807550430298, "learning_rate": 7.046236148261368e-06, "loss": 0.3114, "step": 1847 }, { "epoch": 0.5649648425557934, "grad_norm": 2.1980814933776855, "learning_rate": 7.0500573175391675e-06, "loss": 0.3204, "step": 1848 }, { "epoch": 0.5652705594619383, "grad_norm": 1.6695780754089355, "learning_rate": 7.053878486816966e-06, "loss": 0.3492, "step": 1849 }, { "epoch": 0.5655762763680832, "grad_norm": 5.04495096206665, "learning_rate": 7.057699656094765e-06, "loss": 0.4781, "step": 1850 }, { "epoch": 0.5658819932742281, "grad_norm": 0.6675856709480286, "learning_rate": 7.061520825372564e-06, "loss": 0.2337, "step": 1851 }, { "epoch": 0.566187710180373, "grad_norm": 0.8079965710639954, "learning_rate": 7.065341994650363e-06, "loss": 0.1631, "step": 1852 }, { "epoch": 0.5664934270865178, "grad_norm": 0.6248931884765625, "learning_rate": 7.069163163928162e-06, "loss": 0.1332, "step": 1853 }, { "epoch": 0.5667991439926628, "grad_norm": 0.5726986527442932, "learning_rate": 7.072984333205961e-06, "loss": 0.0992, "step": 1854 }, { "epoch": 0.5671048608988077, "grad_norm": 0.4778134226799011, "learning_rate": 7.0768055024837596e-06, "loss": 0.1044, "step": 1855 }, { "epoch": 0.5674105778049526, "grad_norm": 0.4854373037815094, "learning_rate": 7.080626671761559e-06, "loss": 0.1234, "step": 1856 }, { "epoch": 0.5677162947110975, "grad_norm": 0.48632878065109253, "learning_rate": 7.084447841039358e-06, "loss": 0.1118, "step": 1857 }, { "epoch": 0.5680220116172424, "grad_norm": 0.4848400950431824, "learning_rate": 7.088269010317157e-06, "loss": 0.1077, "step": 1858 }, { "epoch": 0.5683277285233873, "grad_norm": 0.623792827129364, "learning_rate": 7.092090179594956e-06, "loss": 0.1622, "step": 1859 }, { "epoch": 0.5686334454295322, "grad_norm": 0.5443034172058105, "learning_rate": 7.095911348872756e-06, "loss": 0.121, "step": 1860 }, { "epoch": 0.5689391623356772, "grad_norm": 0.5323377847671509, "learning_rate": 7.099732518150555e-06, "loss": 0.1692, "step": 1861 }, { "epoch": 0.5692448792418221, "grad_norm": 1.3336899280548096, "learning_rate": 7.103553687428353e-06, "loss": 0.1434, "step": 1862 }, { "epoch": 0.569550596147967, "grad_norm": 0.796190083026886, "learning_rate": 7.107374856706153e-06, "loss": 0.1692, "step": 1863 }, { "epoch": 0.5698563130541119, "grad_norm": 0.7670611143112183, "learning_rate": 7.111196025983952e-06, "loss": 0.1863, "step": 1864 }, { "epoch": 0.5701620299602568, "grad_norm": 0.754612922668457, "learning_rate": 7.11501719526175e-06, "loss": 0.222, "step": 1865 }, { "epoch": 0.5704677468664017, "grad_norm": 0.7055993676185608, "learning_rate": 7.118838364539549e-06, "loss": 0.2101, "step": 1866 }, { "epoch": 0.5707734637725467, "grad_norm": 1.0119719505310059, "learning_rate": 7.122659533817349e-06, "loss": 0.2757, "step": 1867 }, { "epoch": 0.5710791806786916, "grad_norm": 1.5251290798187256, "learning_rate": 7.1264807030951475e-06, "loss": 0.2741, "step": 1868 }, { "epoch": 0.5713848975848365, "grad_norm": 1.0728724002838135, "learning_rate": 7.130301872372946e-06, "loss": 0.2888, "step": 1869 }, { "epoch": 0.5716906144909814, "grad_norm": 1.1715260744094849, "learning_rate": 7.134123041650745e-06, "loss": 0.2991, "step": 1870 }, { "epoch": 0.5719963313971262, "grad_norm": 1.0131726264953613, "learning_rate": 7.1379442109285446e-06, "loss": 0.3092, "step": 1871 }, { "epoch": 0.5723020483032711, "grad_norm": 1.6379377841949463, "learning_rate": 7.141765380206343e-06, "loss": 0.3002, "step": 1872 }, { "epoch": 0.572607765209416, "grad_norm": 2.2192718982696533, "learning_rate": 7.145586549484142e-06, "loss": 0.3404, "step": 1873 }, { "epoch": 0.572913482115561, "grad_norm": 1.8767591714859009, "learning_rate": 7.149407718761941e-06, "loss": 0.396, "step": 1874 }, { "epoch": 0.5732191990217059, "grad_norm": 4.572295188903809, "learning_rate": 7.15322888803974e-06, "loss": 0.3777, "step": 1875 }, { "epoch": 0.5735249159278508, "grad_norm": 0.9931073188781738, "learning_rate": 7.157050057317539e-06, "loss": 0.2412, "step": 1876 }, { "epoch": 0.5738306328339957, "grad_norm": 0.7713192105293274, "learning_rate": 7.160871226595338e-06, "loss": 0.1342, "step": 1877 }, { "epoch": 0.5741363497401406, "grad_norm": 0.5861683487892151, "learning_rate": 7.164692395873137e-06, "loss": 0.1342, "step": 1878 }, { "epoch": 0.5744420666462855, "grad_norm": 0.5489619374275208, "learning_rate": 7.168513565150937e-06, "loss": 0.121, "step": 1879 }, { "epoch": 0.5747477835524305, "grad_norm": 0.6320542693138123, "learning_rate": 7.172334734428736e-06, "loss": 0.1096, "step": 1880 }, { "epoch": 0.5750535004585754, "grad_norm": 0.49416646361351013, "learning_rate": 7.1761559037065346e-06, "loss": 0.1148, "step": 1881 }, { "epoch": 0.5753592173647203, "grad_norm": 0.5392212271690369, "learning_rate": 7.179977072984333e-06, "loss": 0.1254, "step": 1882 }, { "epoch": 0.5756649342708652, "grad_norm": 0.5214961767196655, "learning_rate": 7.183798242262133e-06, "loss": 0.1178, "step": 1883 }, { "epoch": 0.5759706511770101, "grad_norm": 0.9156227707862854, "learning_rate": 7.187619411539932e-06, "loss": 0.139, "step": 1884 }, { "epoch": 0.576276368083155, "grad_norm": 0.4529374837875366, "learning_rate": 7.19144058081773e-06, "loss": 0.1276, "step": 1885 }, { "epoch": 0.5765820849892999, "grad_norm": 0.8996360301971436, "learning_rate": 7.195261750095529e-06, "loss": 0.2534, "step": 1886 }, { "epoch": 0.5768878018954449, "grad_norm": 0.6929118633270264, "learning_rate": 7.199082919373329e-06, "loss": 0.1587, "step": 1887 }, { "epoch": 0.5771935188015898, "grad_norm": 0.6763818860054016, "learning_rate": 7.2029040886511275e-06, "loss": 0.1793, "step": 1888 }, { "epoch": 0.5774992357077346, "grad_norm": 1.7264829874038696, "learning_rate": 7.206725257928926e-06, "loss": 0.2175, "step": 1889 }, { "epoch": 0.5778049526138795, "grad_norm": 1.155777096748352, "learning_rate": 7.210546427206725e-06, "loss": 0.2092, "step": 1890 }, { "epoch": 0.5781106695200244, "grad_norm": 0.8028881549835205, "learning_rate": 7.2143675964845245e-06, "loss": 0.2554, "step": 1891 }, { "epoch": 0.5784163864261693, "grad_norm": 1.0740469694137573, "learning_rate": 7.218188765762323e-06, "loss": 0.2911, "step": 1892 }, { "epoch": 0.5787221033323143, "grad_norm": 1.1702165603637695, "learning_rate": 7.222009935040122e-06, "loss": 0.2815, "step": 1893 }, { "epoch": 0.5790278202384592, "grad_norm": 1.360775113105774, "learning_rate": 7.225831104317922e-06, "loss": 0.3072, "step": 1894 }, { "epoch": 0.5793335371446041, "grad_norm": 1.7146896123886108, "learning_rate": 7.22965227359572e-06, "loss": 0.2708, "step": 1895 }, { "epoch": 0.579639254050749, "grad_norm": 1.5225187540054321, "learning_rate": 7.233473442873519e-06, "loss": 0.3322, "step": 1896 }, { "epoch": 0.5799449709568939, "grad_norm": 1.4397433996200562, "learning_rate": 7.237294612151318e-06, "loss": 0.3362, "step": 1897 }, { "epoch": 0.5802506878630388, "grad_norm": 1.6193811893463135, "learning_rate": 7.241115781429118e-06, "loss": 0.3135, "step": 1898 }, { "epoch": 0.5805564047691837, "grad_norm": 1.8463780879974365, "learning_rate": 7.244936950706917e-06, "loss": 0.3633, "step": 1899 }, { "epoch": 0.5808621216753287, "grad_norm": 3.014223098754883, "learning_rate": 7.248758119984716e-06, "loss": 0.4879, "step": 1900 }, { "epoch": 0.5811678385814736, "grad_norm": 0.606360673904419, "learning_rate": 7.2525792892625145e-06, "loss": 0.2055, "step": 1901 }, { "epoch": 0.5814735554876185, "grad_norm": 0.4634894132614136, "learning_rate": 7.256400458540314e-06, "loss": 0.142, "step": 1902 }, { "epoch": 0.5817792723937634, "grad_norm": 0.777320921421051, "learning_rate": 7.260221627818113e-06, "loss": 0.1469, "step": 1903 }, { "epoch": 0.5820849892999083, "grad_norm": 0.6068689227104187, "learning_rate": 7.264042797095912e-06, "loss": 0.1077, "step": 1904 }, { "epoch": 0.5823907062060532, "grad_norm": 0.578449010848999, "learning_rate": 7.26786396637371e-06, "loss": 0.1259, "step": 1905 }, { "epoch": 0.5826964231121982, "grad_norm": 0.44615569710731506, "learning_rate": 7.27168513565151e-06, "loss": 0.1274, "step": 1906 }, { "epoch": 0.583002140018343, "grad_norm": 0.7546061277389526, "learning_rate": 7.275506304929309e-06, "loss": 0.1319, "step": 1907 }, { "epoch": 0.5833078569244879, "grad_norm": 0.5305916666984558, "learning_rate": 7.2793274742071074e-06, "loss": 0.111, "step": 1908 }, { "epoch": 0.5836135738306328, "grad_norm": 0.6672648191452026, "learning_rate": 7.283148643484906e-06, "loss": 0.1498, "step": 1909 }, { "epoch": 0.5839192907367777, "grad_norm": 0.645152747631073, "learning_rate": 7.286969812762706e-06, "loss": 0.1032, "step": 1910 }, { "epoch": 0.5842250076429226, "grad_norm": 0.6592658758163452, "learning_rate": 7.2907909820405045e-06, "loss": 0.1464, "step": 1911 }, { "epoch": 0.5845307245490675, "grad_norm": 0.5453980565071106, "learning_rate": 7.294612151318303e-06, "loss": 0.1453, "step": 1912 }, { "epoch": 0.5848364414552125, "grad_norm": 2.6207962036132812, "learning_rate": 7.298433320596102e-06, "loss": 0.1414, "step": 1913 }, { "epoch": 0.5851421583613574, "grad_norm": 0.9048108458518982, "learning_rate": 7.302254489873902e-06, "loss": 0.2271, "step": 1914 }, { "epoch": 0.5854478752675023, "grad_norm": 0.9043570756912231, "learning_rate": 7.3060756591517e-06, "loss": 0.2123, "step": 1915 }, { "epoch": 0.5857535921736472, "grad_norm": 0.7846548557281494, "learning_rate": 7.309896828429499e-06, "loss": 0.2544, "step": 1916 }, { "epoch": 0.5860593090797921, "grad_norm": 0.8672215342521667, "learning_rate": 7.313717997707298e-06, "loss": 0.2716, "step": 1917 }, { "epoch": 0.586365025985937, "grad_norm": 1.4152439832687378, "learning_rate": 7.317539166985098e-06, "loss": 0.2872, "step": 1918 }, { "epoch": 0.586670742892082, "grad_norm": 1.0666393041610718, "learning_rate": 7.321360336262897e-06, "loss": 0.2653, "step": 1919 }, { "epoch": 0.5869764597982269, "grad_norm": 0.8719692826271057, "learning_rate": 7.325181505540696e-06, "loss": 0.3009, "step": 1920 }, { "epoch": 0.5872821767043718, "grad_norm": 1.164770245552063, "learning_rate": 7.3290026748184945e-06, "loss": 0.3038, "step": 1921 }, { "epoch": 0.5875878936105167, "grad_norm": 1.359133243560791, "learning_rate": 7.332823844096294e-06, "loss": 0.3053, "step": 1922 }, { "epoch": 0.5878936105166616, "grad_norm": 1.3975558280944824, "learning_rate": 7.336645013374093e-06, "loss": 0.3819, "step": 1923 }, { "epoch": 0.5881993274228065, "grad_norm": 2.2978293895721436, "learning_rate": 7.340466182651892e-06, "loss": 0.3591, "step": 1924 }, { "epoch": 0.5885050443289513, "grad_norm": 2.55303955078125, "learning_rate": 7.34428735192969e-06, "loss": 0.4354, "step": 1925 }, { "epoch": 0.5888107612350963, "grad_norm": 0.6490985751152039, "learning_rate": 7.34810852120749e-06, "loss": 0.2003, "step": 1926 }, { "epoch": 0.5891164781412412, "grad_norm": 0.6329155564308167, "learning_rate": 7.351929690485289e-06, "loss": 0.1321, "step": 1927 }, { "epoch": 0.5894221950473861, "grad_norm": 0.5672707557678223, "learning_rate": 7.355750859763087e-06, "loss": 0.137, "step": 1928 }, { "epoch": 0.589727911953531, "grad_norm": 0.4347907602787018, "learning_rate": 7.359572029040887e-06, "loss": 0.1096, "step": 1929 }, { "epoch": 0.5900336288596759, "grad_norm": 0.4673721194267273, "learning_rate": 7.363393198318686e-06, "loss": 0.087, "step": 1930 }, { "epoch": 0.5903393457658208, "grad_norm": 0.5681637525558472, "learning_rate": 7.3672143675964845e-06, "loss": 0.1282, "step": 1931 }, { "epoch": 0.5906450626719658, "grad_norm": 0.4858405292034149, "learning_rate": 7.371035536874283e-06, "loss": 0.1021, "step": 1932 }, { "epoch": 0.5909507795781107, "grad_norm": 0.7588855624198914, "learning_rate": 7.374856706152083e-06, "loss": 0.1471, "step": 1933 }, { "epoch": 0.5912564964842556, "grad_norm": 0.5144547820091248, "learning_rate": 7.378677875429882e-06, "loss": 0.1286, "step": 1934 }, { "epoch": 0.5915622133904005, "grad_norm": 0.4558835029602051, "learning_rate": 7.38249904470768e-06, "loss": 0.1126, "step": 1935 }, { "epoch": 0.5918679302965454, "grad_norm": 1.0903987884521484, "learning_rate": 7.386320213985479e-06, "loss": 0.1689, "step": 1936 }, { "epoch": 0.5921736472026903, "grad_norm": 0.5751377940177917, "learning_rate": 7.3901413832632795e-06, "loss": 0.1315, "step": 1937 }, { "epoch": 0.5924793641088352, "grad_norm": 1.0316221714019775, "learning_rate": 7.393962552541078e-06, "loss": 0.1657, "step": 1938 }, { "epoch": 0.5927850810149802, "grad_norm": 1.7573440074920654, "learning_rate": 7.397783721818877e-06, "loss": 0.1639, "step": 1939 }, { "epoch": 0.5930907979211251, "grad_norm": 1.1260974407196045, "learning_rate": 7.401604891096676e-06, "loss": 0.2168, "step": 1940 }, { "epoch": 0.59339651482727, "grad_norm": 0.8659827709197998, "learning_rate": 7.405426060374475e-06, "loss": 0.2438, "step": 1941 }, { "epoch": 0.5937022317334149, "grad_norm": 1.1062947511672974, "learning_rate": 7.409247229652274e-06, "loss": 0.236, "step": 1942 }, { "epoch": 0.5940079486395597, "grad_norm": 1.2706793546676636, "learning_rate": 7.413068398930073e-06, "loss": 0.2982, "step": 1943 }, { "epoch": 0.5943136655457046, "grad_norm": 1.1166890859603882, "learning_rate": 7.4168895682078716e-06, "loss": 0.2564, "step": 1944 }, { "epoch": 0.5946193824518495, "grad_norm": 1.195608377456665, "learning_rate": 7.420710737485671e-06, "loss": 0.2565, "step": 1945 }, { "epoch": 0.5949250993579945, "grad_norm": 1.3918237686157227, "learning_rate": 7.42453190676347e-06, "loss": 0.3364, "step": 1946 }, { "epoch": 0.5952308162641394, "grad_norm": 1.5003726482391357, "learning_rate": 7.428353076041269e-06, "loss": 0.2856, "step": 1947 }, { "epoch": 0.5955365331702843, "grad_norm": 1.7223433256149292, "learning_rate": 7.432174245319067e-06, "loss": 0.3776, "step": 1948 }, { "epoch": 0.5958422500764292, "grad_norm": 1.8875541687011719, "learning_rate": 7.435995414596867e-06, "loss": 0.3658, "step": 1949 }, { "epoch": 0.5961479669825741, "grad_norm": 3.2051913738250732, "learning_rate": 7.439816583874666e-06, "loss": 0.5088, "step": 1950 }, { "epoch": 0.596453683888719, "grad_norm": 0.7080013155937195, "learning_rate": 7.4436377531524645e-06, "loss": 0.2045, "step": 1951 }, { "epoch": 0.596759400794864, "grad_norm": 0.4593445956707001, "learning_rate": 7.447458922430263e-06, "loss": 0.1319, "step": 1952 }, { "epoch": 0.5970651177010089, "grad_norm": 0.6685171127319336, "learning_rate": 7.451280091708063e-06, "loss": 0.121, "step": 1953 }, { "epoch": 0.5973708346071538, "grad_norm": 0.6758092641830444, "learning_rate": 7.4551012609858616e-06, "loss": 0.114, "step": 1954 }, { "epoch": 0.5976765515132987, "grad_norm": 0.47277897596359253, "learning_rate": 7.45892243026366e-06, "loss": 0.0935, "step": 1955 }, { "epoch": 0.5979822684194436, "grad_norm": 0.7233219146728516, "learning_rate": 7.462743599541459e-06, "loss": 0.1018, "step": 1956 }, { "epoch": 0.5982879853255885, "grad_norm": 0.5520886182785034, "learning_rate": 7.4665647688192595e-06, "loss": 0.1128, "step": 1957 }, { "epoch": 0.5985937022317334, "grad_norm": 0.9269227981567383, "learning_rate": 7.470385938097058e-06, "loss": 0.1194, "step": 1958 }, { "epoch": 0.5988994191378784, "grad_norm": 0.6073543429374695, "learning_rate": 7.474207107374857e-06, "loss": 0.1448, "step": 1959 }, { "epoch": 0.5992051360440233, "grad_norm": 0.6807029247283936, "learning_rate": 7.4780282766526566e-06, "loss": 0.1282, "step": 1960 }, { "epoch": 0.5995108529501681, "grad_norm": 0.5982428789138794, "learning_rate": 7.481849445930455e-06, "loss": 0.1179, "step": 1961 }, { "epoch": 0.599816569856313, "grad_norm": 0.6457120776176453, "learning_rate": 7.485670615208254e-06, "loss": 0.133, "step": 1962 }, { "epoch": 0.6001222867624579, "grad_norm": 0.7989453077316284, "learning_rate": 7.489491784486053e-06, "loss": 0.1788, "step": 1963 }, { "epoch": 0.6004280036686028, "grad_norm": 0.7645642757415771, "learning_rate": 7.493312953763852e-06, "loss": 0.2336, "step": 1964 }, { "epoch": 0.6007337205747478, "grad_norm": 0.807452380657196, "learning_rate": 7.497134123041651e-06, "loss": 0.2023, "step": 1965 }, { "epoch": 0.6010394374808927, "grad_norm": 0.7843896150588989, "learning_rate": 7.500955292319451e-06, "loss": 0.2204, "step": 1966 }, { "epoch": 0.6013451543870376, "grad_norm": 1.6914281845092773, "learning_rate": 7.5047764615972495e-06, "loss": 0.2777, "step": 1967 }, { "epoch": 0.6016508712931825, "grad_norm": 2.399582862854004, "learning_rate": 7.508597630875049e-06, "loss": 0.2443, "step": 1968 }, { "epoch": 0.6019565881993274, "grad_norm": 1.7999681234359741, "learning_rate": 7.512418800152848e-06, "loss": 0.2426, "step": 1969 }, { "epoch": 0.6022623051054723, "grad_norm": 1.30343496799469, "learning_rate": 7.5162399694306465e-06, "loss": 0.306, "step": 1970 }, { "epoch": 0.6025680220116172, "grad_norm": 1.2521511316299438, "learning_rate": 7.520061138708445e-06, "loss": 0.3187, "step": 1971 }, { "epoch": 0.6028737389177622, "grad_norm": 1.4214974641799927, "learning_rate": 7.523882307986245e-06, "loss": 0.3291, "step": 1972 }, { "epoch": 0.6031794558239071, "grad_norm": 1.6459029912948608, "learning_rate": 7.527703477264044e-06, "loss": 0.3514, "step": 1973 }, { "epoch": 0.603485172730052, "grad_norm": 1.73680579662323, "learning_rate": 7.531524646541842e-06, "loss": 0.3365, "step": 1974 }, { "epoch": 0.6037908896361969, "grad_norm": 2.2698380947113037, "learning_rate": 7.535345815819641e-06, "loss": 0.4793, "step": 1975 }, { "epoch": 0.6040966065423418, "grad_norm": 1.160672903060913, "learning_rate": 7.539166985097441e-06, "loss": 0.203, "step": 1976 }, { "epoch": 0.6044023234484867, "grad_norm": 0.45741963386535645, "learning_rate": 7.5429881543752395e-06, "loss": 0.1336, "step": 1977 }, { "epoch": 0.6047080403546317, "grad_norm": 0.544162392616272, "learning_rate": 7.546809323653038e-06, "loss": 0.1171, "step": 1978 }, { "epoch": 0.6050137572607766, "grad_norm": 0.681744396686554, "learning_rate": 7.550630492930837e-06, "loss": 0.1024, "step": 1979 }, { "epoch": 0.6053194741669214, "grad_norm": 0.4992324411869049, "learning_rate": 7.5544516622086365e-06, "loss": 0.1021, "step": 1980 }, { "epoch": 0.6056251910730663, "grad_norm": 0.4904412031173706, "learning_rate": 7.558272831486435e-06, "loss": 0.1071, "step": 1981 }, { "epoch": 0.6059309079792112, "grad_norm": 0.581933856010437, "learning_rate": 7.562094000764234e-06, "loss": 0.1193, "step": 1982 }, { "epoch": 0.6062366248853561, "grad_norm": 0.43909770250320435, "learning_rate": 7.565915170042033e-06, "loss": 0.1275, "step": 1983 }, { "epoch": 0.606542341791501, "grad_norm": 0.5250309109687805, "learning_rate": 7.569736339319832e-06, "loss": 0.1341, "step": 1984 }, { "epoch": 0.606848058697646, "grad_norm": 0.48730164766311646, "learning_rate": 7.573557508597631e-06, "loss": 0.109, "step": 1985 }, { "epoch": 0.6071537756037909, "grad_norm": 0.646440327167511, "learning_rate": 7.57737867787543e-06, "loss": 0.1364, "step": 1986 }, { "epoch": 0.6074594925099358, "grad_norm": 0.6552844643592834, "learning_rate": 7.581199847153229e-06, "loss": 0.187, "step": 1987 }, { "epoch": 0.6077652094160807, "grad_norm": 0.8944993615150452, "learning_rate": 7.585021016431028e-06, "loss": 0.1884, "step": 1988 }, { "epoch": 0.6080709263222256, "grad_norm": 1.3205132484436035, "learning_rate": 7.588842185708827e-06, "loss": 0.1867, "step": 1989 }, { "epoch": 0.6083766432283705, "grad_norm": 0.8520851135253906, "learning_rate": 7.592663354986626e-06, "loss": 0.2219, "step": 1990 }, { "epoch": 0.6086823601345155, "grad_norm": 0.7794179916381836, "learning_rate": 7.596484524264425e-06, "loss": 0.2362, "step": 1991 }, { "epoch": 0.6089880770406604, "grad_norm": 1.0107321739196777, "learning_rate": 7.600305693542224e-06, "loss": 0.2607, "step": 1992 }, { "epoch": 0.6092937939468053, "grad_norm": 0.7697958946228027, "learning_rate": 7.604126862820023e-06, "loss": 0.2569, "step": 1993 }, { "epoch": 0.6095995108529502, "grad_norm": 0.9239106178283691, "learning_rate": 7.6079480320978215e-06, "loss": 0.2955, "step": 1994 }, { "epoch": 0.6099052277590951, "grad_norm": 2.1650359630584717, "learning_rate": 7.611769201375621e-06, "loss": 0.2667, "step": 1995 }, { "epoch": 0.61021094466524, "grad_norm": 2.0550553798675537, "learning_rate": 7.61559037065342e-06, "loss": 0.3121, "step": 1996 }, { "epoch": 0.6105166615713848, "grad_norm": 1.3943654298782349, "learning_rate": 7.619411539931219e-06, "loss": 0.3325, "step": 1997 }, { "epoch": 0.6108223784775298, "grad_norm": 1.320082664489746, "learning_rate": 7.623232709209017e-06, "loss": 0.3516, "step": 1998 }, { "epoch": 0.6111280953836747, "grad_norm": 3.998629570007324, "learning_rate": 7.627053878486817e-06, "loss": 0.35, "step": 1999 }, { "epoch": 0.6114338122898196, "grad_norm": 2.585301399230957, "learning_rate": 7.630875047764615e-06, "loss": 0.5207, "step": 2000 }, { "epoch": 0.6114338122898196, "eval_cer": 0.20164097192672362, "eval_loss": 0.3285868167877197, "eval_runtime": 19.2728, "eval_samples_per_second": 235.461, "eval_steps_per_second": 0.778, "eval_wer": 0.37571576212954944, "step": 2000 }, { "epoch": 0.6117395291959645, "grad_norm": 0.5187902450561523, "learning_rate": 7.634696217042415e-06, "loss": 0.194, "step": 2001 }, { "epoch": 0.6120452461021094, "grad_norm": 0.6038624048233032, "learning_rate": 7.638517386320214e-06, "loss": 0.1398, "step": 2002 }, { "epoch": 0.6123509630082543, "grad_norm": 0.430724173784256, "learning_rate": 7.642338555598013e-06, "loss": 0.126, "step": 2003 }, { "epoch": 0.6126566799143993, "grad_norm": 0.45426061749458313, "learning_rate": 7.646159724875811e-06, "loss": 0.1154, "step": 2004 }, { "epoch": 0.6129623968205442, "grad_norm": 0.4350703954696655, "learning_rate": 7.649980894153612e-06, "loss": 0.1011, "step": 2005 }, { "epoch": 0.6132681137266891, "grad_norm": 0.37412363290786743, "learning_rate": 7.65380206343141e-06, "loss": 0.0869, "step": 2006 }, { "epoch": 0.613573830632834, "grad_norm": 0.44846847653388977, "learning_rate": 7.65762323270921e-06, "loss": 0.1012, "step": 2007 }, { "epoch": 0.6138795475389789, "grad_norm": 0.6391977071762085, "learning_rate": 7.661444401987008e-06, "loss": 0.1188, "step": 2008 }, { "epoch": 0.6141852644451238, "grad_norm": 0.47337374091148376, "learning_rate": 7.665265571264809e-06, "loss": 0.1136, "step": 2009 }, { "epoch": 0.6144909813512687, "grad_norm": 0.5771970748901367, "learning_rate": 7.669086740542607e-06, "loss": 0.1171, "step": 2010 }, { "epoch": 0.6147966982574137, "grad_norm": 0.540390133857727, "learning_rate": 7.672907909820406e-06, "loss": 0.1473, "step": 2011 }, { "epoch": 0.6151024151635586, "grad_norm": 0.7346463799476624, "learning_rate": 7.676729079098205e-06, "loss": 0.15, "step": 2012 }, { "epoch": 0.6154081320697035, "grad_norm": 1.2267534732818604, "learning_rate": 7.680550248376004e-06, "loss": 0.1422, "step": 2013 }, { "epoch": 0.6157138489758484, "grad_norm": 0.7928211092948914, "learning_rate": 7.684371417653802e-06, "loss": 0.1755, "step": 2014 }, { "epoch": 0.6160195658819932, "grad_norm": 0.8881862163543701, "learning_rate": 7.688192586931601e-06, "loss": 0.244, "step": 2015 }, { "epoch": 0.6163252827881381, "grad_norm": 1.2150815725326538, "learning_rate": 7.6920137562094e-06, "loss": 0.2139, "step": 2016 }, { "epoch": 0.6166309996942831, "grad_norm": 0.8944963216781616, "learning_rate": 7.6958349254872e-06, "loss": 0.2747, "step": 2017 }, { "epoch": 0.616936716600428, "grad_norm": 1.2540019750595093, "learning_rate": 7.699656094764999e-06, "loss": 0.2959, "step": 2018 }, { "epoch": 0.6172424335065729, "grad_norm": 1.0746394395828247, "learning_rate": 7.703477264042798e-06, "loss": 0.2871, "step": 2019 }, { "epoch": 0.6175481504127178, "grad_norm": 1.1277234554290771, "learning_rate": 7.707298433320596e-06, "loss": 0.318, "step": 2020 }, { "epoch": 0.6178538673188627, "grad_norm": 1.2350902557373047, "learning_rate": 7.711119602598395e-06, "loss": 0.2858, "step": 2021 }, { "epoch": 0.6181595842250076, "grad_norm": 1.9677119255065918, "learning_rate": 7.714940771876194e-06, "loss": 0.2861, "step": 2022 }, { "epoch": 0.6184653011311525, "grad_norm": 2.8836381435394287, "learning_rate": 7.718761941153993e-06, "loss": 0.3575, "step": 2023 }, { "epoch": 0.6187710180372975, "grad_norm": 1.7863613367080688, "learning_rate": 7.722583110431793e-06, "loss": 0.4289, "step": 2024 }, { "epoch": 0.6190767349434424, "grad_norm": 3.1880104541778564, "learning_rate": 7.726404279709592e-06, "loss": 0.5282, "step": 2025 }, { "epoch": 0.6193824518495873, "grad_norm": 0.6489416360855103, "learning_rate": 7.73022544898739e-06, "loss": 0.2168, "step": 2026 }, { "epoch": 0.6196881687557322, "grad_norm": 0.4780120253562927, "learning_rate": 7.73404661826519e-06, "loss": 0.1617, "step": 2027 }, { "epoch": 0.6199938856618771, "grad_norm": 0.7685484886169434, "learning_rate": 7.737867787542988e-06, "loss": 0.1085, "step": 2028 }, { "epoch": 0.620299602568022, "grad_norm": 0.6536751985549927, "learning_rate": 7.741688956820787e-06, "loss": 0.1166, "step": 2029 }, { "epoch": 0.620605319474167, "grad_norm": 0.5400654673576355, "learning_rate": 7.745510126098586e-06, "loss": 0.104, "step": 2030 }, { "epoch": 0.6209110363803119, "grad_norm": 0.7401245832443237, "learning_rate": 7.749331295376384e-06, "loss": 0.1395, "step": 2031 }, { "epoch": 0.6212167532864568, "grad_norm": 0.5311408638954163, "learning_rate": 7.753152464654185e-06, "loss": 0.1103, "step": 2032 }, { "epoch": 0.6215224701926017, "grad_norm": 0.4083317220211029, "learning_rate": 7.756973633931984e-06, "loss": 0.0839, "step": 2033 }, { "epoch": 0.6218281870987465, "grad_norm": 0.5604150891304016, "learning_rate": 7.760794803209782e-06, "loss": 0.1357, "step": 2034 }, { "epoch": 0.6221339040048914, "grad_norm": 1.2679128646850586, "learning_rate": 7.764615972487581e-06, "loss": 0.1137, "step": 2035 }, { "epoch": 0.6224396209110363, "grad_norm": 0.4440188705921173, "learning_rate": 7.76843714176538e-06, "loss": 0.1328, "step": 2036 }, { "epoch": 0.6227453378171813, "grad_norm": 0.7043231129646301, "learning_rate": 7.772258311043179e-06, "loss": 0.1691, "step": 2037 }, { "epoch": 0.6230510547233262, "grad_norm": 0.9576815366744995, "learning_rate": 7.776079480320977e-06, "loss": 0.1715, "step": 2038 }, { "epoch": 0.6233567716294711, "grad_norm": 0.5889386534690857, "learning_rate": 7.779900649598776e-06, "loss": 0.2005, "step": 2039 }, { "epoch": 0.623662488535616, "grad_norm": 1.869053840637207, "learning_rate": 7.783721818876576e-06, "loss": 0.2192, "step": 2040 }, { "epoch": 0.6239682054417609, "grad_norm": 0.845515787601471, "learning_rate": 7.787542988154375e-06, "loss": 0.2086, "step": 2041 }, { "epoch": 0.6242739223479058, "grad_norm": 5.7928338050842285, "learning_rate": 7.791364157432174e-06, "loss": 0.2535, "step": 2042 }, { "epoch": 0.6245796392540508, "grad_norm": 0.8845643997192383, "learning_rate": 7.795185326709973e-06, "loss": 0.2302, "step": 2043 }, { "epoch": 0.6248853561601957, "grad_norm": 1.0290377140045166, "learning_rate": 7.799006495987773e-06, "loss": 0.2769, "step": 2044 }, { "epoch": 0.6251910730663406, "grad_norm": 1.3524507284164429, "learning_rate": 7.802827665265572e-06, "loss": 0.3156, "step": 2045 }, { "epoch": 0.6254967899724855, "grad_norm": 1.0882244110107422, "learning_rate": 7.80664883454337e-06, "loss": 0.2782, "step": 2046 }, { "epoch": 0.6258025068786304, "grad_norm": 1.3886784315109253, "learning_rate": 7.81047000382117e-06, "loss": 0.3211, "step": 2047 }, { "epoch": 0.6261082237847753, "grad_norm": 1.6012904644012451, "learning_rate": 7.81429117309897e-06, "loss": 0.3185, "step": 2048 }, { "epoch": 0.6264139406909202, "grad_norm": 1.5593615770339966, "learning_rate": 7.818112342376769e-06, "loss": 0.3439, "step": 2049 }, { "epoch": 0.6267196575970652, "grad_norm": 2.3732826709747314, "learning_rate": 7.821933511654567e-06, "loss": 0.4586, "step": 2050 }, { "epoch": 0.62702537450321, "grad_norm": 0.8152393698692322, "learning_rate": 7.825754680932366e-06, "loss": 0.1938, "step": 2051 }, { "epoch": 0.627331091409355, "grad_norm": 0.4624381363391876, "learning_rate": 7.829575850210165e-06, "loss": 0.125, "step": 2052 }, { "epoch": 0.6276368083154998, "grad_norm": 0.5955327153205872, "learning_rate": 7.833397019487964e-06, "loss": 0.1397, "step": 2053 }, { "epoch": 0.6279425252216447, "grad_norm": 0.47326764464378357, "learning_rate": 7.837218188765762e-06, "loss": 0.1081, "step": 2054 }, { "epoch": 0.6282482421277896, "grad_norm": 0.6850059032440186, "learning_rate": 7.841039358043563e-06, "loss": 0.1067, "step": 2055 }, { "epoch": 0.6285539590339346, "grad_norm": 0.4454417824745178, "learning_rate": 7.844860527321361e-06, "loss": 0.0882, "step": 2056 }, { "epoch": 0.6288596759400795, "grad_norm": 0.7535659074783325, "learning_rate": 7.84868169659916e-06, "loss": 0.1154, "step": 2057 }, { "epoch": 0.6291653928462244, "grad_norm": 0.6505241394042969, "learning_rate": 7.852502865876959e-06, "loss": 0.164, "step": 2058 }, { "epoch": 0.6294711097523693, "grad_norm": 0.5871323943138123, "learning_rate": 7.856324035154758e-06, "loss": 0.1379, "step": 2059 }, { "epoch": 0.6297768266585142, "grad_norm": 0.575226366519928, "learning_rate": 7.860145204432556e-06, "loss": 0.1155, "step": 2060 }, { "epoch": 0.6300825435646591, "grad_norm": 0.9004877805709839, "learning_rate": 7.863966373710355e-06, "loss": 0.1084, "step": 2061 }, { "epoch": 0.630388260470804, "grad_norm": 0.9147818684577942, "learning_rate": 7.867787542988154e-06, "loss": 0.1443, "step": 2062 }, { "epoch": 0.630693977376949, "grad_norm": 1.1063188314437866, "learning_rate": 7.871608712265954e-06, "loss": 0.2254, "step": 2063 }, { "epoch": 0.6309996942830939, "grad_norm": 0.6520466208457947, "learning_rate": 7.875429881543753e-06, "loss": 0.1614, "step": 2064 }, { "epoch": 0.6313054111892388, "grad_norm": 0.7221137881278992, "learning_rate": 7.879251050821552e-06, "loss": 0.2396, "step": 2065 }, { "epoch": 0.6316111280953837, "grad_norm": 0.8394717574119568, "learning_rate": 7.88307222009935e-06, "loss": 0.2054, "step": 2066 }, { "epoch": 0.6319168450015286, "grad_norm": 1.6745811700820923, "learning_rate": 7.88689338937715e-06, "loss": 0.2989, "step": 2067 }, { "epoch": 0.6322225619076735, "grad_norm": 1.0665745735168457, "learning_rate": 7.890714558654948e-06, "loss": 0.2787, "step": 2068 }, { "epoch": 0.6325282788138185, "grad_norm": 1.9412670135498047, "learning_rate": 7.894535727932747e-06, "loss": 0.2962, "step": 2069 }, { "epoch": 0.6328339957199633, "grad_norm": 1.23017156124115, "learning_rate": 7.898356897210546e-06, "loss": 0.3329, "step": 2070 }, { "epoch": 0.6331397126261082, "grad_norm": 1.7694157361984253, "learning_rate": 7.902178066488346e-06, "loss": 0.3038, "step": 2071 }, { "epoch": 0.6334454295322531, "grad_norm": 1.2622723579406738, "learning_rate": 7.905999235766145e-06, "loss": 0.2973, "step": 2072 }, { "epoch": 0.633751146438398, "grad_norm": 2.9322917461395264, "learning_rate": 7.909820405043944e-06, "loss": 0.4583, "step": 2073 }, { "epoch": 0.6340568633445429, "grad_norm": 3.0593667030334473, "learning_rate": 7.913641574321742e-06, "loss": 0.384, "step": 2074 }, { "epoch": 0.6343625802506878, "grad_norm": 4.861026763916016, "learning_rate": 7.917462743599541e-06, "loss": 0.4604, "step": 2075 }, { "epoch": 0.6346682971568328, "grad_norm": 1.4252387285232544, "learning_rate": 7.92128391287734e-06, "loss": 0.2289, "step": 2076 }, { "epoch": 0.6349740140629777, "grad_norm": 0.7187891006469727, "learning_rate": 7.925105082155138e-06, "loss": 0.1647, "step": 2077 }, { "epoch": 0.6352797309691226, "grad_norm": 0.5627343654632568, "learning_rate": 7.928926251432937e-06, "loss": 0.1375, "step": 2078 }, { "epoch": 0.6355854478752675, "grad_norm": 0.5516590476036072, "learning_rate": 7.932747420710738e-06, "loss": 0.1481, "step": 2079 }, { "epoch": 0.6358911647814124, "grad_norm": 0.5831177830696106, "learning_rate": 7.936568589988536e-06, "loss": 0.0965, "step": 2080 }, { "epoch": 0.6361968816875573, "grad_norm": 0.3841116428375244, "learning_rate": 7.940389759266335e-06, "loss": 0.0742, "step": 2081 }, { "epoch": 0.6365025985937023, "grad_norm": 0.5494676232337952, "learning_rate": 7.944210928544134e-06, "loss": 0.1009, "step": 2082 }, { "epoch": 0.6368083154998472, "grad_norm": 0.5728955864906311, "learning_rate": 7.948032097821934e-06, "loss": 0.1137, "step": 2083 }, { "epoch": 0.6371140324059921, "grad_norm": 0.4761159420013428, "learning_rate": 7.951853267099733e-06, "loss": 0.089, "step": 2084 }, { "epoch": 0.637419749312137, "grad_norm": 0.5311678051948547, "learning_rate": 7.955674436377532e-06, "loss": 0.1188, "step": 2085 }, { "epoch": 0.6377254662182819, "grad_norm": 0.6980441808700562, "learning_rate": 7.95949560565533e-06, "loss": 0.1562, "step": 2086 }, { "epoch": 0.6380311831244267, "grad_norm": 0.7395910620689392, "learning_rate": 7.963316774933131e-06, "loss": 0.1821, "step": 2087 }, { "epoch": 0.6383369000305716, "grad_norm": 0.9286815524101257, "learning_rate": 7.96713794421093e-06, "loss": 0.1755, "step": 2088 }, { "epoch": 0.6386426169367166, "grad_norm": 0.7462678551673889, "learning_rate": 7.970959113488729e-06, "loss": 0.2032, "step": 2089 }, { "epoch": 0.6389483338428615, "grad_norm": 1.3060532808303833, "learning_rate": 7.974780282766527e-06, "loss": 0.2097, "step": 2090 }, { "epoch": 0.6392540507490064, "grad_norm": 1.0439081192016602, "learning_rate": 7.978601452044326e-06, "loss": 0.195, "step": 2091 }, { "epoch": 0.6395597676551513, "grad_norm": 4.609252452850342, "learning_rate": 7.982422621322125e-06, "loss": 0.2445, "step": 2092 }, { "epoch": 0.6398654845612962, "grad_norm": 1.3947752714157104, "learning_rate": 7.986243790599923e-06, "loss": 0.2731, "step": 2093 }, { "epoch": 0.6401712014674411, "grad_norm": 1.494159460067749, "learning_rate": 7.990064959877724e-06, "loss": 0.3384, "step": 2094 }, { "epoch": 0.6404769183735861, "grad_norm": 1.5055195093154907, "learning_rate": 7.993886129155523e-06, "loss": 0.2745, "step": 2095 }, { "epoch": 0.640782635279731, "grad_norm": 1.0835999250411987, "learning_rate": 7.997707298433321e-06, "loss": 0.3143, "step": 2096 }, { "epoch": 0.6410883521858759, "grad_norm": 1.353200078010559, "learning_rate": 8.00152846771112e-06, "loss": 0.3008, "step": 2097 }, { "epoch": 0.6413940690920208, "grad_norm": 1.6830744743347168, "learning_rate": 8.005349636988919e-06, "loss": 0.2926, "step": 2098 }, { "epoch": 0.6416997859981657, "grad_norm": 1.6070001125335693, "learning_rate": 8.009170806266718e-06, "loss": 0.3375, "step": 2099 }, { "epoch": 0.6420055029043106, "grad_norm": 2.1234073638916016, "learning_rate": 8.012991975544516e-06, "loss": 0.46, "step": 2100 }, { "epoch": 0.6423112198104555, "grad_norm": 1.0086764097213745, "learning_rate": 8.016813144822315e-06, "loss": 0.2285, "step": 2101 }, { "epoch": 0.6426169367166005, "grad_norm": 0.4094775915145874, "learning_rate": 8.020634314100116e-06, "loss": 0.1218, "step": 2102 }, { "epoch": 0.6429226536227454, "grad_norm": 0.6444330811500549, "learning_rate": 8.024455483377914e-06, "loss": 0.1181, "step": 2103 }, { "epoch": 0.6432283705288903, "grad_norm": 0.6410729885101318, "learning_rate": 8.028276652655713e-06, "loss": 0.1169, "step": 2104 }, { "epoch": 0.6435340874350352, "grad_norm": 0.4469577968120575, "learning_rate": 8.032097821933512e-06, "loss": 0.0961, "step": 2105 }, { "epoch": 0.64383980434118, "grad_norm": 0.41036680340766907, "learning_rate": 8.03591899121131e-06, "loss": 0.0868, "step": 2106 }, { "epoch": 0.6441455212473249, "grad_norm": 0.5413553714752197, "learning_rate": 8.03974016048911e-06, "loss": 0.1085, "step": 2107 }, { "epoch": 0.6444512381534699, "grad_norm": 0.6858155131340027, "learning_rate": 8.043561329766908e-06, "loss": 0.1044, "step": 2108 }, { "epoch": 0.6447569550596148, "grad_norm": 0.6070111393928528, "learning_rate": 8.047382499044707e-06, "loss": 0.1231, "step": 2109 }, { "epoch": 0.6450626719657597, "grad_norm": 0.5327774286270142, "learning_rate": 8.051203668322507e-06, "loss": 0.1265, "step": 2110 }, { "epoch": 0.6453683888719046, "grad_norm": 0.5920222997665405, "learning_rate": 8.055024837600306e-06, "loss": 0.1608, "step": 2111 }, { "epoch": 0.6456741057780495, "grad_norm": 0.7388197779655457, "learning_rate": 8.058846006878105e-06, "loss": 0.1624, "step": 2112 }, { "epoch": 0.6459798226841944, "grad_norm": 0.6805545687675476, "learning_rate": 8.062667176155903e-06, "loss": 0.1985, "step": 2113 }, { "epoch": 0.6462855395903393, "grad_norm": 0.6527850031852722, "learning_rate": 8.066488345433702e-06, "loss": 0.1837, "step": 2114 }, { "epoch": 0.6465912564964843, "grad_norm": 0.7970917224884033, "learning_rate": 8.070309514711501e-06, "loss": 0.1629, "step": 2115 }, { "epoch": 0.6468969734026292, "grad_norm": 0.7753785848617554, "learning_rate": 8.0741306839893e-06, "loss": 0.2045, "step": 2116 }, { "epoch": 0.6472026903087741, "grad_norm": 1.6173797845840454, "learning_rate": 8.0779518532671e-06, "loss": 0.2314, "step": 2117 }, { "epoch": 0.647508407214919, "grad_norm": 0.8957337737083435, "learning_rate": 8.081773022544899e-06, "loss": 0.2976, "step": 2118 }, { "epoch": 0.6478141241210639, "grad_norm": 1.0854274034500122, "learning_rate": 8.085594191822698e-06, "loss": 0.2499, "step": 2119 }, { "epoch": 0.6481198410272088, "grad_norm": 0.9155086874961853, "learning_rate": 8.089415361100496e-06, "loss": 0.2714, "step": 2120 }, { "epoch": 0.6484255579333538, "grad_norm": 1.3727308511734009, "learning_rate": 8.093236530378295e-06, "loss": 0.3285, "step": 2121 }, { "epoch": 0.6487312748394987, "grad_norm": 1.3563233613967896, "learning_rate": 8.097057699656096e-06, "loss": 0.339, "step": 2122 }, { "epoch": 0.6490369917456436, "grad_norm": 2.36318302154541, "learning_rate": 8.100878868933894e-06, "loss": 0.2994, "step": 2123 }, { "epoch": 0.6493427086517884, "grad_norm": 1.8329064846038818, "learning_rate": 8.104700038211693e-06, "loss": 0.3775, "step": 2124 }, { "epoch": 0.6496484255579333, "grad_norm": 2.573214292526245, "learning_rate": 8.108521207489493e-06, "loss": 0.5023, "step": 2125 }, { "epoch": 0.6499541424640782, "grad_norm": 0.49748942255973816, "learning_rate": 8.112342376767292e-06, "loss": 0.2348, "step": 2126 }, { "epoch": 0.6502598593702231, "grad_norm": 0.6136089563369751, "learning_rate": 8.116163546045091e-06, "loss": 0.1099, "step": 2127 }, { "epoch": 0.6505655762763681, "grad_norm": 0.341797411441803, "learning_rate": 8.11998471532289e-06, "loss": 0.1102, "step": 2128 }, { "epoch": 0.650871293182513, "grad_norm": 0.40247640013694763, "learning_rate": 8.123805884600688e-06, "loss": 0.1115, "step": 2129 }, { "epoch": 0.6511770100886579, "grad_norm": 0.5074516534805298, "learning_rate": 8.127627053878487e-06, "loss": 0.1063, "step": 2130 }, { "epoch": 0.6514827269948028, "grad_norm": 0.43491944670677185, "learning_rate": 8.131448223156286e-06, "loss": 0.1109, "step": 2131 }, { "epoch": 0.6517884439009477, "grad_norm": 0.504197359085083, "learning_rate": 8.135269392434085e-06, "loss": 0.0986, "step": 2132 }, { "epoch": 0.6520941608070926, "grad_norm": 0.8430514931678772, "learning_rate": 8.139090561711885e-06, "loss": 0.1082, "step": 2133 }, { "epoch": 0.6523998777132376, "grad_norm": 0.5364850759506226, "learning_rate": 8.142911730989684e-06, "loss": 0.1262, "step": 2134 }, { "epoch": 0.6527055946193825, "grad_norm": 0.5194720029830933, "learning_rate": 8.146732900267483e-06, "loss": 0.1221, "step": 2135 }, { "epoch": 0.6530113115255274, "grad_norm": 0.6051045060157776, "learning_rate": 8.150554069545281e-06, "loss": 0.1495, "step": 2136 }, { "epoch": 0.6533170284316723, "grad_norm": 0.5159919261932373, "learning_rate": 8.15437523882308e-06, "loss": 0.1635, "step": 2137 }, { "epoch": 0.6536227453378172, "grad_norm": 1.025882363319397, "learning_rate": 8.158196408100879e-06, "loss": 0.1678, "step": 2138 }, { "epoch": 0.6539284622439621, "grad_norm": 0.7689366340637207, "learning_rate": 8.162017577378678e-06, "loss": 0.14, "step": 2139 }, { "epoch": 0.654234179150107, "grad_norm": 1.2561746835708618, "learning_rate": 8.165838746656476e-06, "loss": 0.2569, "step": 2140 }, { "epoch": 0.654539896056252, "grad_norm": 0.9622258543968201, "learning_rate": 8.169659915934277e-06, "loss": 0.2676, "step": 2141 }, { "epoch": 0.6548456129623969, "grad_norm": 2.4272615909576416, "learning_rate": 8.173481085212076e-06, "loss": 0.2549, "step": 2142 }, { "epoch": 0.6551513298685417, "grad_norm": 0.980562686920166, "learning_rate": 8.177302254489874e-06, "loss": 0.2535, "step": 2143 }, { "epoch": 0.6554570467746866, "grad_norm": 0.8645119667053223, "learning_rate": 8.181123423767673e-06, "loss": 0.259, "step": 2144 }, { "epoch": 0.6557627636808315, "grad_norm": 1.1742675304412842, "learning_rate": 8.184944593045472e-06, "loss": 0.2937, "step": 2145 }, { "epoch": 0.6560684805869764, "grad_norm": 1.2084094285964966, "learning_rate": 8.18876576232327e-06, "loss": 0.2869, "step": 2146 }, { "epoch": 0.6563741974931213, "grad_norm": 1.8248767852783203, "learning_rate": 8.19258693160107e-06, "loss": 0.3151, "step": 2147 }, { "epoch": 0.6566799143992663, "grad_norm": 1.4562664031982422, "learning_rate": 8.196408100878868e-06, "loss": 0.3271, "step": 2148 }, { "epoch": 0.6569856313054112, "grad_norm": 1.9381396770477295, "learning_rate": 8.200229270156668e-06, "loss": 0.3811, "step": 2149 }, { "epoch": 0.6572913482115561, "grad_norm": 1.9084031581878662, "learning_rate": 8.204050439434467e-06, "loss": 0.4023, "step": 2150 }, { "epoch": 0.657597065117701, "grad_norm": 0.6166840195655823, "learning_rate": 8.207871608712266e-06, "loss": 0.2131, "step": 2151 }, { "epoch": 0.6579027820238459, "grad_norm": 0.8479235172271729, "learning_rate": 8.211692777990065e-06, "loss": 0.1362, "step": 2152 }, { "epoch": 0.6582084989299908, "grad_norm": 0.8066351413726807, "learning_rate": 8.215513947267863e-06, "loss": 0.1512, "step": 2153 }, { "epoch": 0.6585142158361358, "grad_norm": 0.5315497517585754, "learning_rate": 8.219335116545662e-06, "loss": 0.1104, "step": 2154 }, { "epoch": 0.6588199327422807, "grad_norm": 0.48157474398612976, "learning_rate": 8.223156285823461e-06, "loss": 0.0971, "step": 2155 }, { "epoch": 0.6591256496484256, "grad_norm": 0.5097467303276062, "learning_rate": 8.226977455101261e-06, "loss": 0.1146, "step": 2156 }, { "epoch": 0.6594313665545705, "grad_norm": 1.3486497402191162, "learning_rate": 8.23079862437906e-06, "loss": 0.0991, "step": 2157 }, { "epoch": 0.6597370834607154, "grad_norm": 0.5089870095252991, "learning_rate": 8.234619793656859e-06, "loss": 0.1133, "step": 2158 }, { "epoch": 0.6600428003668602, "grad_norm": 0.5073222517967224, "learning_rate": 8.238440962934658e-06, "loss": 0.1548, "step": 2159 }, { "epoch": 0.6603485172730051, "grad_norm": 0.5371313691139221, "learning_rate": 8.242262132212456e-06, "loss": 0.118, "step": 2160 }, { "epoch": 0.6606542341791501, "grad_norm": 1.0455652475357056, "learning_rate": 8.246083301490257e-06, "loss": 0.1151, "step": 2161 }, { "epoch": 0.660959951085295, "grad_norm": 0.5314789414405823, "learning_rate": 8.249904470768055e-06, "loss": 0.1478, "step": 2162 }, { "epoch": 0.6612656679914399, "grad_norm": 0.5724758505821228, "learning_rate": 8.253725640045854e-06, "loss": 0.1588, "step": 2163 }, { "epoch": 0.6615713848975848, "grad_norm": 0.713639497756958, "learning_rate": 8.257546809323655e-06, "loss": 0.185, "step": 2164 }, { "epoch": 0.6618771018037297, "grad_norm": 1.4886302947998047, "learning_rate": 8.261367978601453e-06, "loss": 0.2273, "step": 2165 }, { "epoch": 0.6621828187098746, "grad_norm": 0.9653937816619873, "learning_rate": 8.265189147879252e-06, "loss": 0.2633, "step": 2166 }, { "epoch": 0.6624885356160196, "grad_norm": 0.7831928133964539, "learning_rate": 8.269010317157051e-06, "loss": 0.2132, "step": 2167 }, { "epoch": 0.6627942525221645, "grad_norm": 0.9198587536811829, "learning_rate": 8.27283148643485e-06, "loss": 0.2766, "step": 2168 }, { "epoch": 0.6630999694283094, "grad_norm": 1.2338401079177856, "learning_rate": 8.276652655712648e-06, "loss": 0.3147, "step": 2169 }, { "epoch": 0.6634056863344543, "grad_norm": 1.3956259489059448, "learning_rate": 8.280473824990447e-06, "loss": 0.3205, "step": 2170 }, { "epoch": 0.6637114032405992, "grad_norm": 1.048843502998352, "learning_rate": 8.284294994268246e-06, "loss": 0.318, "step": 2171 }, { "epoch": 0.6640171201467441, "grad_norm": 1.5389643907546997, "learning_rate": 8.288116163546046e-06, "loss": 0.3368, "step": 2172 }, { "epoch": 0.664322837052889, "grad_norm": 1.5854166746139526, "learning_rate": 8.291937332823845e-06, "loss": 0.3692, "step": 2173 }, { "epoch": 0.664628553959034, "grad_norm": 1.5241695642471313, "learning_rate": 8.295758502101644e-06, "loss": 0.3815, "step": 2174 }, { "epoch": 0.6649342708651789, "grad_norm": 2.4132726192474365, "learning_rate": 8.299579671379443e-06, "loss": 0.4478, "step": 2175 }, { "epoch": 0.6652399877713238, "grad_norm": 0.6744420528411865, "learning_rate": 8.303400840657241e-06, "loss": 0.1884, "step": 2176 }, { "epoch": 0.6655457046774687, "grad_norm": 2.551630973815918, "learning_rate": 8.30722200993504e-06, "loss": 0.1166, "step": 2177 }, { "epoch": 0.6658514215836135, "grad_norm": 0.6490741968154907, "learning_rate": 8.311043179212839e-06, "loss": 0.1177, "step": 2178 }, { "epoch": 0.6661571384897584, "grad_norm": 0.42154601216316223, "learning_rate": 8.314864348490638e-06, "loss": 0.1078, "step": 2179 }, { "epoch": 0.6664628553959034, "grad_norm": 0.48295557498931885, "learning_rate": 8.318685517768438e-06, "loss": 0.1101, "step": 2180 }, { "epoch": 0.6667685723020483, "grad_norm": 0.856315553188324, "learning_rate": 8.322506687046237e-06, "loss": 0.1224, "step": 2181 }, { "epoch": 0.6670742892081932, "grad_norm": 0.642306387424469, "learning_rate": 8.326327856324035e-06, "loss": 0.1329, "step": 2182 }, { "epoch": 0.6673800061143381, "grad_norm": 0.8629612326622009, "learning_rate": 8.330149025601834e-06, "loss": 0.1021, "step": 2183 }, { "epoch": 0.667685723020483, "grad_norm": 0.5254561901092529, "learning_rate": 8.333970194879633e-06, "loss": 0.1174, "step": 2184 }, { "epoch": 0.6679914399266279, "grad_norm": 0.6319891214370728, "learning_rate": 8.337791364157432e-06, "loss": 0.1377, "step": 2185 }, { "epoch": 0.6682971568327728, "grad_norm": 0.6036531925201416, "learning_rate": 8.34161253343523e-06, "loss": 0.161, "step": 2186 }, { "epoch": 0.6686028737389178, "grad_norm": 0.8825640082359314, "learning_rate": 8.345433702713031e-06, "loss": 0.1162, "step": 2187 }, { "epoch": 0.6689085906450627, "grad_norm": 0.5315311551094055, "learning_rate": 8.34925487199083e-06, "loss": 0.1878, "step": 2188 }, { "epoch": 0.6692143075512076, "grad_norm": 0.8576556444168091, "learning_rate": 8.353076041268628e-06, "loss": 0.1929, "step": 2189 }, { "epoch": 0.6695200244573525, "grad_norm": 0.634548008441925, "learning_rate": 8.356897210546427e-06, "loss": 0.1607, "step": 2190 }, { "epoch": 0.6698257413634974, "grad_norm": 0.795169472694397, "learning_rate": 8.360718379824226e-06, "loss": 0.2213, "step": 2191 }, { "epoch": 0.6701314582696423, "grad_norm": 0.6457976698875427, "learning_rate": 8.364539549102025e-06, "loss": 0.2721, "step": 2192 }, { "epoch": 0.6704371751757873, "grad_norm": 1.2385672330856323, "learning_rate": 8.368360718379823e-06, "loss": 0.2676, "step": 2193 }, { "epoch": 0.6707428920819322, "grad_norm": 1.0434664487838745, "learning_rate": 8.372181887657622e-06, "loss": 0.3192, "step": 2194 }, { "epoch": 0.671048608988077, "grad_norm": 0.9268093705177307, "learning_rate": 8.376003056935423e-06, "loss": 0.2644, "step": 2195 }, { "epoch": 0.671354325894222, "grad_norm": 1.313844084739685, "learning_rate": 8.379824226213221e-06, "loss": 0.2761, "step": 2196 }, { "epoch": 0.6716600428003668, "grad_norm": 1.2908694744110107, "learning_rate": 8.38364539549102e-06, "loss": 0.3271, "step": 2197 }, { "epoch": 0.6719657597065117, "grad_norm": 2.1844394207000732, "learning_rate": 8.387466564768819e-06, "loss": 0.2913, "step": 2198 }, { "epoch": 0.6722714766126566, "grad_norm": 2.1975326538085938, "learning_rate": 8.39128773404662e-06, "loss": 0.3913, "step": 2199 }, { "epoch": 0.6725771935188016, "grad_norm": 1.7605775594711304, "learning_rate": 8.395108903324418e-06, "loss": 0.3891, "step": 2200 }, { "epoch": 0.6728829104249465, "grad_norm": 0.5336949825286865, "learning_rate": 8.398930072602217e-06, "loss": 0.1984, "step": 2201 }, { "epoch": 0.6731886273310914, "grad_norm": 0.4739847779273987, "learning_rate": 8.402751241880015e-06, "loss": 0.1403, "step": 2202 }, { "epoch": 0.6734943442372363, "grad_norm": 0.7635122537612915, "learning_rate": 8.406572411157816e-06, "loss": 0.1337, "step": 2203 }, { "epoch": 0.6738000611433812, "grad_norm": 0.5298926830291748, "learning_rate": 8.410393580435615e-06, "loss": 0.1077, "step": 2204 }, { "epoch": 0.6741057780495261, "grad_norm": 0.4208395481109619, "learning_rate": 8.414214749713413e-06, "loss": 0.1258, "step": 2205 }, { "epoch": 0.6744114949556711, "grad_norm": 0.34379327297210693, "learning_rate": 8.418035918991212e-06, "loss": 0.1015, "step": 2206 }, { "epoch": 0.674717211861816, "grad_norm": 0.6589096784591675, "learning_rate": 8.421857088269011e-06, "loss": 0.1044, "step": 2207 }, { "epoch": 0.6750229287679609, "grad_norm": 0.43147751688957214, "learning_rate": 8.42567825754681e-06, "loss": 0.1078, "step": 2208 }, { "epoch": 0.6753286456741058, "grad_norm": 0.5299631357192993, "learning_rate": 8.429499426824608e-06, "loss": 0.1208, "step": 2209 }, { "epoch": 0.6756343625802507, "grad_norm": 0.5101324915885925, "learning_rate": 8.433320596102407e-06, "loss": 0.1331, "step": 2210 }, { "epoch": 0.6759400794863956, "grad_norm": 0.7721606492996216, "learning_rate": 8.437141765380208e-06, "loss": 0.1798, "step": 2211 }, { "epoch": 0.6762457963925405, "grad_norm": 0.7137681841850281, "learning_rate": 8.440962934658006e-06, "loss": 0.1107, "step": 2212 }, { "epoch": 0.6765515132986855, "grad_norm": 0.598042368888855, "learning_rate": 8.444784103935805e-06, "loss": 0.1658, "step": 2213 }, { "epoch": 0.6768572302048304, "grad_norm": 0.7714128494262695, "learning_rate": 8.448605273213604e-06, "loss": 0.1872, "step": 2214 }, { "epoch": 0.6771629471109752, "grad_norm": 0.7921777367591858, "learning_rate": 8.452426442491403e-06, "loss": 0.2124, "step": 2215 }, { "epoch": 0.6774686640171201, "grad_norm": 0.8288670182228088, "learning_rate": 8.456247611769201e-06, "loss": 0.2072, "step": 2216 }, { "epoch": 0.677774380923265, "grad_norm": 1.1291987895965576, "learning_rate": 8.460068781047e-06, "loss": 0.2786, "step": 2217 }, { "epoch": 0.6780800978294099, "grad_norm": 0.9784873723983765, "learning_rate": 8.4638899503248e-06, "loss": 0.2452, "step": 2218 }, { "epoch": 0.6783858147355549, "grad_norm": 1.6217217445373535, "learning_rate": 8.4677111196026e-06, "loss": 0.2942, "step": 2219 }, { "epoch": 0.6786915316416998, "grad_norm": 1.0026724338531494, "learning_rate": 8.471532288880398e-06, "loss": 0.2451, "step": 2220 }, { "epoch": 0.6789972485478447, "grad_norm": 1.293858289718628, "learning_rate": 8.475353458158197e-06, "loss": 0.3171, "step": 2221 }, { "epoch": 0.6793029654539896, "grad_norm": 1.7060985565185547, "learning_rate": 8.479174627435995e-06, "loss": 0.2816, "step": 2222 }, { "epoch": 0.6796086823601345, "grad_norm": 2.3749477863311768, "learning_rate": 8.482995796713794e-06, "loss": 0.3212, "step": 2223 }, { "epoch": 0.6799143992662794, "grad_norm": 2.695993423461914, "learning_rate": 8.486816965991593e-06, "loss": 0.3755, "step": 2224 }, { "epoch": 0.6802201161724243, "grad_norm": 2.4149162769317627, "learning_rate": 8.490638135269392e-06, "loss": 0.4446, "step": 2225 }, { "epoch": 0.6805258330785693, "grad_norm": 0.533627450466156, "learning_rate": 8.494459304547192e-06, "loss": 0.2236, "step": 2226 }, { "epoch": 0.6808315499847142, "grad_norm": 0.6192163825035095, "learning_rate": 8.49828047382499e-06, "loss": 0.1399, "step": 2227 }, { "epoch": 0.6811372668908591, "grad_norm": 0.4438275694847107, "learning_rate": 8.50210164310279e-06, "loss": 0.1365, "step": 2228 }, { "epoch": 0.681442983797004, "grad_norm": 0.5843666195869446, "learning_rate": 8.505922812380588e-06, "loss": 0.1207, "step": 2229 }, { "epoch": 0.6817487007031489, "grad_norm": 0.46538078784942627, "learning_rate": 8.509743981658387e-06, "loss": 0.1054, "step": 2230 }, { "epoch": 0.6820544176092938, "grad_norm": 0.40470853447914124, "learning_rate": 8.513565150936186e-06, "loss": 0.109, "step": 2231 }, { "epoch": 0.6823601345154388, "grad_norm": 0.41524580121040344, "learning_rate": 8.517386320213985e-06, "loss": 0.1134, "step": 2232 }, { "epoch": 0.6826658514215836, "grad_norm": 0.5145371556282043, "learning_rate": 8.521207489491783e-06, "loss": 0.0955, "step": 2233 }, { "epoch": 0.6829715683277285, "grad_norm": 0.554099977016449, "learning_rate": 8.525028658769584e-06, "loss": 0.1294, "step": 2234 }, { "epoch": 0.6832772852338734, "grad_norm": 0.5350748300552368, "learning_rate": 8.528849828047382e-06, "loss": 0.1289, "step": 2235 }, { "epoch": 0.6835830021400183, "grad_norm": 0.4908585250377655, "learning_rate": 8.532670997325181e-06, "loss": 0.1353, "step": 2236 }, { "epoch": 0.6838887190461632, "grad_norm": 0.5616809725761414, "learning_rate": 8.53649216660298e-06, "loss": 0.156, "step": 2237 }, { "epoch": 0.6841944359523081, "grad_norm": 0.6418547034263611, "learning_rate": 8.54031333588078e-06, "loss": 0.1776, "step": 2238 }, { "epoch": 0.6845001528584531, "grad_norm": 0.7592685222625732, "learning_rate": 8.54413450515858e-06, "loss": 0.221, "step": 2239 }, { "epoch": 0.684805869764598, "grad_norm": 1.158677339553833, "learning_rate": 8.547955674436378e-06, "loss": 0.2091, "step": 2240 }, { "epoch": 0.6851115866707429, "grad_norm": 0.8325470685958862, "learning_rate": 8.551776843714177e-06, "loss": 0.2106, "step": 2241 }, { "epoch": 0.6854173035768878, "grad_norm": 0.8375606536865234, "learning_rate": 8.555598012991977e-06, "loss": 0.2018, "step": 2242 }, { "epoch": 0.6857230204830327, "grad_norm": 0.9773740172386169, "learning_rate": 8.559419182269776e-06, "loss": 0.3094, "step": 2243 }, { "epoch": 0.6860287373891776, "grad_norm": 0.941293478012085, "learning_rate": 8.563240351547575e-06, "loss": 0.2414, "step": 2244 }, { "epoch": 0.6863344542953226, "grad_norm": 1.116223692893982, "learning_rate": 8.567061520825373e-06, "loss": 0.254, "step": 2245 }, { "epoch": 0.6866401712014675, "grad_norm": 3.6116907596588135, "learning_rate": 8.570882690103172e-06, "loss": 0.3001, "step": 2246 }, { "epoch": 0.6869458881076124, "grad_norm": 1.3963475227355957, "learning_rate": 8.57470385938097e-06, "loss": 0.3178, "step": 2247 }, { "epoch": 0.6872516050137573, "grad_norm": 2.333280563354492, "learning_rate": 8.57852502865877e-06, "loss": 0.3639, "step": 2248 }, { "epoch": 0.6875573219199022, "grad_norm": 2.0362138748168945, "learning_rate": 8.58234619793657e-06, "loss": 0.3951, "step": 2249 }, { "epoch": 0.687863038826047, "grad_norm": 2.4964733123779297, "learning_rate": 8.586167367214369e-06, "loss": 0.4414, "step": 2250 }, { "epoch": 0.6881687557321919, "grad_norm": 0.5067337155342102, "learning_rate": 8.589988536492167e-06, "loss": 0.2181, "step": 2251 }, { "epoch": 0.6884744726383369, "grad_norm": 0.45199164748191833, "learning_rate": 8.593809705769966e-06, "loss": 0.1451, "step": 2252 }, { "epoch": 0.6887801895444818, "grad_norm": 0.5657305121421814, "learning_rate": 8.597630875047765e-06, "loss": 0.1239, "step": 2253 }, { "epoch": 0.6890859064506267, "grad_norm": 0.4357755482196808, "learning_rate": 8.601452044325564e-06, "loss": 0.106, "step": 2254 }, { "epoch": 0.6893916233567716, "grad_norm": 0.38092732429504395, "learning_rate": 8.605273213603362e-06, "loss": 0.1027, "step": 2255 }, { "epoch": 0.6896973402629165, "grad_norm": 0.530462920665741, "learning_rate": 8.609094382881161e-06, "loss": 0.109, "step": 2256 }, { "epoch": 0.6900030571690614, "grad_norm": 0.504198431968689, "learning_rate": 8.612915552158962e-06, "loss": 0.1001, "step": 2257 }, { "epoch": 0.6903087740752064, "grad_norm": 0.7219339609146118, "learning_rate": 8.61673672143676e-06, "loss": 0.0765, "step": 2258 }, { "epoch": 0.6906144909813513, "grad_norm": 0.8413931727409363, "learning_rate": 8.620557890714559e-06, "loss": 0.1621, "step": 2259 }, { "epoch": 0.6909202078874962, "grad_norm": 0.8333097100257874, "learning_rate": 8.624379059992358e-06, "loss": 0.1178, "step": 2260 }, { "epoch": 0.6912259247936411, "grad_norm": 0.4821109175682068, "learning_rate": 8.628200229270157e-06, "loss": 0.1166, "step": 2261 }, { "epoch": 0.691531641699786, "grad_norm": 0.6278350949287415, "learning_rate": 8.632021398547955e-06, "loss": 0.1151, "step": 2262 }, { "epoch": 0.6918373586059309, "grad_norm": 0.5334621071815491, "learning_rate": 8.635842567825754e-06, "loss": 0.1578, "step": 2263 }, { "epoch": 0.6921430755120758, "grad_norm": 0.5672036409378052, "learning_rate": 8.639663737103553e-06, "loss": 0.1511, "step": 2264 }, { "epoch": 0.6924487924182208, "grad_norm": 0.6677817702293396, "learning_rate": 8.643484906381353e-06, "loss": 0.1836, "step": 2265 }, { "epoch": 0.6927545093243657, "grad_norm": 1.122306227684021, "learning_rate": 8.647306075659152e-06, "loss": 0.2437, "step": 2266 }, { "epoch": 0.6930602262305106, "grad_norm": 0.9394242763519287, "learning_rate": 8.65112724493695e-06, "loss": 0.2126, "step": 2267 }, { "epoch": 0.6933659431366554, "grad_norm": 1.2128372192382812, "learning_rate": 8.65494841421475e-06, "loss": 0.2573, "step": 2268 }, { "epoch": 0.6936716600428003, "grad_norm": 1.23612380027771, "learning_rate": 8.658769583492548e-06, "loss": 0.2794, "step": 2269 }, { "epoch": 0.6939773769489452, "grad_norm": 0.931220531463623, "learning_rate": 8.662590752770347e-06, "loss": 0.2802, "step": 2270 }, { "epoch": 0.6942830938550902, "grad_norm": 1.1379950046539307, "learning_rate": 8.666411922048146e-06, "loss": 0.2653, "step": 2271 }, { "epoch": 0.6945888107612351, "grad_norm": 2.062136173248291, "learning_rate": 8.670233091325945e-06, "loss": 0.2762, "step": 2272 }, { "epoch": 0.69489452766738, "grad_norm": 1.8608852624893188, "learning_rate": 8.674054260603745e-06, "loss": 0.331, "step": 2273 }, { "epoch": 0.6952002445735249, "grad_norm": 1.8502509593963623, "learning_rate": 8.677875429881544e-06, "loss": 0.3427, "step": 2274 }, { "epoch": 0.6955059614796698, "grad_norm": 1.6292979717254639, "learning_rate": 8.681696599159342e-06, "loss": 0.4411, "step": 2275 }, { "epoch": 0.6958116783858147, "grad_norm": 0.7640476226806641, "learning_rate": 8.685517768437141e-06, "loss": 0.239, "step": 2276 }, { "epoch": 0.6961173952919596, "grad_norm": 0.43284645676612854, "learning_rate": 8.689338937714942e-06, "loss": 0.1359, "step": 2277 }, { "epoch": 0.6964231121981046, "grad_norm": 0.44169285893440247, "learning_rate": 8.69316010699274e-06, "loss": 0.1246, "step": 2278 }, { "epoch": 0.6967288291042495, "grad_norm": 0.4845556318759918, "learning_rate": 8.696981276270539e-06, "loss": 0.1097, "step": 2279 }, { "epoch": 0.6970345460103944, "grad_norm": 0.5113290548324585, "learning_rate": 8.70080244554834e-06, "loss": 0.1021, "step": 2280 }, { "epoch": 0.6973402629165393, "grad_norm": 0.4943855106830597, "learning_rate": 8.704623614826138e-06, "loss": 0.109, "step": 2281 }, { "epoch": 0.6976459798226842, "grad_norm": 0.49679869413375854, "learning_rate": 8.708444784103937e-06, "loss": 0.1065, "step": 2282 }, { "epoch": 0.6979516967288291, "grad_norm": 0.8132513165473938, "learning_rate": 8.712265953381736e-06, "loss": 0.1478, "step": 2283 }, { "epoch": 0.6982574136349741, "grad_norm": 0.4840562045574188, "learning_rate": 8.716087122659535e-06, "loss": 0.1175, "step": 2284 }, { "epoch": 0.698563130541119, "grad_norm": 0.5090274810791016, "learning_rate": 8.719908291937333e-06, "loss": 0.1259, "step": 2285 }, { "epoch": 0.6988688474472639, "grad_norm": 0.9197195172309875, "learning_rate": 8.723729461215132e-06, "loss": 0.1984, "step": 2286 }, { "epoch": 0.6991745643534087, "grad_norm": 0.5403217673301697, "learning_rate": 8.72755063049293e-06, "loss": 0.1078, "step": 2287 }, { "epoch": 0.6994802812595536, "grad_norm": 0.6682656407356262, "learning_rate": 8.731371799770731e-06, "loss": 0.1516, "step": 2288 }, { "epoch": 0.6997859981656985, "grad_norm": 0.5218607783317566, "learning_rate": 8.73519296904853e-06, "loss": 0.1672, "step": 2289 }, { "epoch": 0.7000917150718434, "grad_norm": 0.5985652804374695, "learning_rate": 8.739014138326329e-06, "loss": 0.1751, "step": 2290 }, { "epoch": 0.7003974319779884, "grad_norm": 0.847692608833313, "learning_rate": 8.742835307604127e-06, "loss": 0.2104, "step": 2291 }, { "epoch": 0.7007031488841333, "grad_norm": 0.8565949201583862, "learning_rate": 8.746656476881926e-06, "loss": 0.2297, "step": 2292 }, { "epoch": 0.7010088657902782, "grad_norm": 1.0674355030059814, "learning_rate": 8.750477646159725e-06, "loss": 0.2547, "step": 2293 }, { "epoch": 0.7013145826964231, "grad_norm": 0.9931284785270691, "learning_rate": 8.754298815437524e-06, "loss": 0.2686, "step": 2294 }, { "epoch": 0.701620299602568, "grad_norm": 1.1029061079025269, "learning_rate": 8.758119984715322e-06, "loss": 0.2767, "step": 2295 }, { "epoch": 0.7019260165087129, "grad_norm": 1.1337616443634033, "learning_rate": 8.761941153993123e-06, "loss": 0.3167, "step": 2296 }, { "epoch": 0.7022317334148579, "grad_norm": 2.6895012855529785, "learning_rate": 8.765762323270922e-06, "loss": 0.2728, "step": 2297 }, { "epoch": 0.7025374503210028, "grad_norm": 1.9623466730117798, "learning_rate": 8.76958349254872e-06, "loss": 0.2959, "step": 2298 }, { "epoch": 0.7028431672271477, "grad_norm": 1.460923433303833, "learning_rate": 8.773404661826519e-06, "loss": 0.3601, "step": 2299 }, { "epoch": 0.7031488841332926, "grad_norm": 2.394845724105835, "learning_rate": 8.777225831104318e-06, "loss": 0.3978, "step": 2300 }, { "epoch": 0.7034546010394375, "grad_norm": 0.5000455975532532, "learning_rate": 8.781047000382117e-06, "loss": 0.1898, "step": 2301 }, { "epoch": 0.7037603179455824, "grad_norm": 0.6097943782806396, "learning_rate": 8.784868169659915e-06, "loss": 0.1404, "step": 2302 }, { "epoch": 0.7040660348517273, "grad_norm": 0.47218140959739685, "learning_rate": 8.788689338937714e-06, "loss": 0.1436, "step": 2303 }, { "epoch": 0.7043717517578723, "grad_norm": 0.9118728041648865, "learning_rate": 8.792510508215515e-06, "loss": 0.1068, "step": 2304 }, { "epoch": 0.7046774686640171, "grad_norm": 0.4284038245677948, "learning_rate": 8.796331677493313e-06, "loss": 0.0879, "step": 2305 }, { "epoch": 0.704983185570162, "grad_norm": 0.43854203820228577, "learning_rate": 8.800152846771112e-06, "loss": 0.1132, "step": 2306 }, { "epoch": 0.7052889024763069, "grad_norm": 0.37134239077568054, "learning_rate": 8.80397401604891e-06, "loss": 0.0849, "step": 2307 }, { "epoch": 0.7055946193824518, "grad_norm": 0.3931886851787567, "learning_rate": 8.80779518532671e-06, "loss": 0.0924, "step": 2308 }, { "epoch": 0.7059003362885967, "grad_norm": 0.503882884979248, "learning_rate": 8.811616354604508e-06, "loss": 0.1319, "step": 2309 }, { "epoch": 0.7062060531947417, "grad_norm": 0.5988165140151978, "learning_rate": 8.815437523882307e-06, "loss": 0.1005, "step": 2310 }, { "epoch": 0.7065117701008866, "grad_norm": 0.613615095615387, "learning_rate": 8.819258693160107e-06, "loss": 0.169, "step": 2311 }, { "epoch": 0.7068174870070315, "grad_norm": 1.2925477027893066, "learning_rate": 8.823079862437906e-06, "loss": 0.149, "step": 2312 }, { "epoch": 0.7071232039131764, "grad_norm": 0.6035140156745911, "learning_rate": 8.826901031715705e-06, "loss": 0.1373, "step": 2313 }, { "epoch": 0.7074289208193213, "grad_norm": 3.823395013809204, "learning_rate": 8.830722200993504e-06, "loss": 0.1638, "step": 2314 }, { "epoch": 0.7077346377254662, "grad_norm": 0.7739012837409973, "learning_rate": 8.834543370271302e-06, "loss": 0.1764, "step": 2315 }, { "epoch": 0.7080403546316111, "grad_norm": 0.7637941837310791, "learning_rate": 8.838364539549103e-06, "loss": 0.2035, "step": 2316 }, { "epoch": 0.7083460715377561, "grad_norm": 1.144938588142395, "learning_rate": 8.842185708826902e-06, "loss": 0.2168, "step": 2317 }, { "epoch": 0.708651788443901, "grad_norm": 1.0288162231445312, "learning_rate": 8.8460068781047e-06, "loss": 0.251, "step": 2318 }, { "epoch": 0.7089575053500459, "grad_norm": 1.2882295846939087, "learning_rate": 8.8498280473825e-06, "loss": 0.2655, "step": 2319 }, { "epoch": 0.7092632222561908, "grad_norm": 1.0414283275604248, "learning_rate": 8.8536492166603e-06, "loss": 0.257, "step": 2320 }, { "epoch": 0.7095689391623357, "grad_norm": 1.8675198554992676, "learning_rate": 8.857470385938098e-06, "loss": 0.2826, "step": 2321 }, { "epoch": 0.7098746560684805, "grad_norm": 1.8178998231887817, "learning_rate": 8.861291555215897e-06, "loss": 0.3105, "step": 2322 }, { "epoch": 0.7101803729746256, "grad_norm": 1.3213679790496826, "learning_rate": 8.865112724493696e-06, "loss": 0.3021, "step": 2323 }, { "epoch": 0.7104860898807704, "grad_norm": 1.5043472051620483, "learning_rate": 8.868933893771494e-06, "loss": 0.377, "step": 2324 }, { "epoch": 0.7107918067869153, "grad_norm": 2.510061025619507, "learning_rate": 8.872755063049293e-06, "loss": 0.4413, "step": 2325 }, { "epoch": 0.7110975236930602, "grad_norm": 0.5274940729141235, "learning_rate": 8.876576232327092e-06, "loss": 0.2295, "step": 2326 }, { "epoch": 0.7114032405992051, "grad_norm": 0.4964894652366638, "learning_rate": 8.880397401604892e-06, "loss": 0.1554, "step": 2327 }, { "epoch": 0.71170895750535, "grad_norm": 0.6294508576393127, "learning_rate": 8.884218570882691e-06, "loss": 0.1318, "step": 2328 }, { "epoch": 0.7120146744114949, "grad_norm": 0.4254167973995209, "learning_rate": 8.88803974016049e-06, "loss": 0.1078, "step": 2329 }, { "epoch": 0.7123203913176399, "grad_norm": 0.5516884326934814, "learning_rate": 8.891860909438289e-06, "loss": 0.1082, "step": 2330 }, { "epoch": 0.7126261082237848, "grad_norm": 0.4276992678642273, "learning_rate": 8.895682078716087e-06, "loss": 0.1055, "step": 2331 }, { "epoch": 0.7129318251299297, "grad_norm": 0.6422765851020813, "learning_rate": 8.899503247993886e-06, "loss": 0.0809, "step": 2332 }, { "epoch": 0.7132375420360746, "grad_norm": 0.4808218777179718, "learning_rate": 8.903324417271685e-06, "loss": 0.1007, "step": 2333 }, { "epoch": 0.7135432589422195, "grad_norm": 0.45484215021133423, "learning_rate": 8.907145586549484e-06, "loss": 0.0992, "step": 2334 }, { "epoch": 0.7138489758483644, "grad_norm": 0.6082325577735901, "learning_rate": 8.910966755827284e-06, "loss": 0.1137, "step": 2335 }, { "epoch": 0.7141546927545094, "grad_norm": 0.49855417013168335, "learning_rate": 8.914787925105083e-06, "loss": 0.1191, "step": 2336 }, { "epoch": 0.7144604096606543, "grad_norm": 0.655446469783783, "learning_rate": 8.918609094382882e-06, "loss": 0.1514, "step": 2337 }, { "epoch": 0.7147661265667992, "grad_norm": 0.6854950785636902, "learning_rate": 8.92243026366068e-06, "loss": 0.1746, "step": 2338 }, { "epoch": 0.7150718434729441, "grad_norm": 0.9457300901412964, "learning_rate": 8.926251432938479e-06, "loss": 0.1909, "step": 2339 }, { "epoch": 0.715377560379089, "grad_norm": 0.8044342994689941, "learning_rate": 8.930072602216278e-06, "loss": 0.2032, "step": 2340 }, { "epoch": 0.7156832772852338, "grad_norm": 0.7724645733833313, "learning_rate": 8.933893771494077e-06, "loss": 0.2201, "step": 2341 }, { "epoch": 0.7159889941913787, "grad_norm": 1.1193134784698486, "learning_rate": 8.937714940771877e-06, "loss": 0.2304, "step": 2342 }, { "epoch": 0.7162947110975237, "grad_norm": 0.8979930281639099, "learning_rate": 8.941536110049676e-06, "loss": 0.2337, "step": 2343 }, { "epoch": 0.7166004280036686, "grad_norm": 1.0020456314086914, "learning_rate": 8.945357279327474e-06, "loss": 0.269, "step": 2344 }, { "epoch": 0.7169061449098135, "grad_norm": 1.0708725452423096, "learning_rate": 8.949178448605273e-06, "loss": 0.2512, "step": 2345 }, { "epoch": 0.7172118618159584, "grad_norm": 1.7721545696258545, "learning_rate": 8.952999617883072e-06, "loss": 0.2635, "step": 2346 }, { "epoch": 0.7175175787221033, "grad_norm": 1.427876353263855, "learning_rate": 8.95682078716087e-06, "loss": 0.3334, "step": 2347 }, { "epoch": 0.7178232956282482, "grad_norm": 1.6724989414215088, "learning_rate": 8.96064195643867e-06, "loss": 0.3112, "step": 2348 }, { "epoch": 0.7181290125343932, "grad_norm": 2.3301873207092285, "learning_rate": 8.964463125716468e-06, "loss": 0.3799, "step": 2349 }, { "epoch": 0.7184347294405381, "grad_norm": 2.663996696472168, "learning_rate": 8.968284294994269e-06, "loss": 0.399, "step": 2350 }, { "epoch": 0.718740446346683, "grad_norm": 0.6381139755249023, "learning_rate": 8.972105464272067e-06, "loss": 0.2128, "step": 2351 }, { "epoch": 0.7190461632528279, "grad_norm": 0.44501152634620667, "learning_rate": 8.975926633549866e-06, "loss": 0.1363, "step": 2352 }, { "epoch": 0.7193518801589728, "grad_norm": 0.9872630834579468, "learning_rate": 8.979747802827665e-06, "loss": 0.1252, "step": 2353 }, { "epoch": 0.7196575970651177, "grad_norm": 0.34286537766456604, "learning_rate": 8.983568972105464e-06, "loss": 0.0781, "step": 2354 }, { "epoch": 0.7199633139712626, "grad_norm": 0.5091730356216431, "learning_rate": 8.987390141383264e-06, "loss": 0.1097, "step": 2355 }, { "epoch": 0.7202690308774076, "grad_norm": 0.5159571170806885, "learning_rate": 8.991211310661063e-06, "loss": 0.0845, "step": 2356 }, { "epoch": 0.7205747477835525, "grad_norm": 0.4927806258201599, "learning_rate": 8.995032479938862e-06, "loss": 0.1182, "step": 2357 }, { "epoch": 0.7208804646896974, "grad_norm": 0.44063276052474976, "learning_rate": 8.998853649216662e-06, "loss": 0.0766, "step": 2358 }, { "epoch": 0.7211861815958422, "grad_norm": 0.7033730745315552, "learning_rate": 9.00267481849446e-06, "loss": 0.107, "step": 2359 }, { "epoch": 0.7214918985019871, "grad_norm": 0.5316545963287354, "learning_rate": 9.00649598777226e-06, "loss": 0.1071, "step": 2360 }, { "epoch": 0.721797615408132, "grad_norm": 1.0832189321517944, "learning_rate": 9.010317157050058e-06, "loss": 0.1254, "step": 2361 }, { "epoch": 0.7221033323142769, "grad_norm": 0.5308778882026672, "learning_rate": 9.014138326327857e-06, "loss": 0.1297, "step": 2362 }, { "epoch": 0.7224090492204219, "grad_norm": 0.5385238528251648, "learning_rate": 9.017959495605656e-06, "loss": 0.1224, "step": 2363 }, { "epoch": 0.7227147661265668, "grad_norm": 0.9681349992752075, "learning_rate": 9.021780664883454e-06, "loss": 0.2014, "step": 2364 }, { "epoch": 0.7230204830327117, "grad_norm": 0.8247058987617493, "learning_rate": 9.025601834161253e-06, "loss": 0.2089, "step": 2365 }, { "epoch": 0.7233261999388566, "grad_norm": 1.1083128452301025, "learning_rate": 9.029423003439054e-06, "loss": 0.2221, "step": 2366 }, { "epoch": 0.7236319168450015, "grad_norm": 1.333994746208191, "learning_rate": 9.033244172716852e-06, "loss": 0.246, "step": 2367 }, { "epoch": 0.7239376337511464, "grad_norm": 1.5191866159439087, "learning_rate": 9.037065341994651e-06, "loss": 0.2461, "step": 2368 }, { "epoch": 0.7242433506572914, "grad_norm": 1.3066326379776, "learning_rate": 9.04088651127245e-06, "loss": 0.2311, "step": 2369 }, { "epoch": 0.7245490675634363, "grad_norm": 1.1492563486099243, "learning_rate": 9.044707680550249e-06, "loss": 0.2574, "step": 2370 }, { "epoch": 0.7248547844695812, "grad_norm": 1.1902903318405151, "learning_rate": 9.048528849828047e-06, "loss": 0.2913, "step": 2371 }, { "epoch": 0.7251605013757261, "grad_norm": 1.4794169664382935, "learning_rate": 9.052350019105846e-06, "loss": 0.2678, "step": 2372 }, { "epoch": 0.725466218281871, "grad_norm": 1.6252349615097046, "learning_rate": 9.056171188383645e-06, "loss": 0.3417, "step": 2373 }, { "epoch": 0.7257719351880159, "grad_norm": 2.3861358165740967, "learning_rate": 9.059992357661445e-06, "loss": 0.3616, "step": 2374 }, { "epoch": 0.7260776520941608, "grad_norm": 2.6507456302642822, "learning_rate": 9.063813526939244e-06, "loss": 0.4314, "step": 2375 }, { "epoch": 0.7263833690003058, "grad_norm": 0.6492313146591187, "learning_rate": 9.067634696217043e-06, "loss": 0.2012, "step": 2376 }, { "epoch": 0.7266890859064506, "grad_norm": 0.6160467863082886, "learning_rate": 9.071455865494841e-06, "loss": 0.1186, "step": 2377 }, { "epoch": 0.7269948028125955, "grad_norm": 0.44237270951271057, "learning_rate": 9.07527703477264e-06, "loss": 0.1174, "step": 2378 }, { "epoch": 0.7273005197187404, "grad_norm": 0.4370216727256775, "learning_rate": 9.079098204050439e-06, "loss": 0.0837, "step": 2379 }, { "epoch": 0.7276062366248853, "grad_norm": 0.42271265387535095, "learning_rate": 9.082919373328238e-06, "loss": 0.0834, "step": 2380 }, { "epoch": 0.7279119535310302, "grad_norm": 0.34754472970962524, "learning_rate": 9.086740542606038e-06, "loss": 0.085, "step": 2381 }, { "epoch": 0.7282176704371752, "grad_norm": 0.4635731279850006, "learning_rate": 9.090561711883837e-06, "loss": 0.1117, "step": 2382 }, { "epoch": 0.7285233873433201, "grad_norm": 0.6364822387695312, "learning_rate": 9.094382881161636e-06, "loss": 0.093, "step": 2383 }, { "epoch": 0.728829104249465, "grad_norm": 0.9228109121322632, "learning_rate": 9.098204050439434e-06, "loss": 0.0965, "step": 2384 }, { "epoch": 0.7291348211556099, "grad_norm": 0.5148409605026245, "learning_rate": 9.102025219717233e-06, "loss": 0.1198, "step": 2385 }, { "epoch": 0.7294405380617548, "grad_norm": 0.774448812007904, "learning_rate": 9.105846388995032e-06, "loss": 0.159, "step": 2386 }, { "epoch": 0.7297462549678997, "grad_norm": 0.5254757404327393, "learning_rate": 9.10966755827283e-06, "loss": 0.1237, "step": 2387 }, { "epoch": 0.7300519718740446, "grad_norm": 1.3450536727905273, "learning_rate": 9.11348872755063e-06, "loss": 0.1393, "step": 2388 }, { "epoch": 0.7303576887801896, "grad_norm": 0.653856635093689, "learning_rate": 9.11730989682843e-06, "loss": 0.1633, "step": 2389 }, { "epoch": 0.7306634056863345, "grad_norm": 0.7850512266159058, "learning_rate": 9.121131066106229e-06, "loss": 0.2007, "step": 2390 }, { "epoch": 0.7309691225924794, "grad_norm": 0.8620153665542603, "learning_rate": 9.124952235384027e-06, "loss": 0.2007, "step": 2391 }, { "epoch": 0.7312748394986243, "grad_norm": 0.9758855104446411, "learning_rate": 9.128773404661826e-06, "loss": 0.2244, "step": 2392 }, { "epoch": 0.7315805564047692, "grad_norm": 0.8017004728317261, "learning_rate": 9.132594573939625e-06, "loss": 0.2249, "step": 2393 }, { "epoch": 0.731886273310914, "grad_norm": 0.9895491600036621, "learning_rate": 9.136415743217425e-06, "loss": 0.2542, "step": 2394 }, { "epoch": 0.732191990217059, "grad_norm": 0.9211504459381104, "learning_rate": 9.140236912495224e-06, "loss": 0.2523, "step": 2395 }, { "epoch": 0.732497707123204, "grad_norm": 1.3325068950653076, "learning_rate": 9.144058081773023e-06, "loss": 0.2714, "step": 2396 }, { "epoch": 0.7328034240293488, "grad_norm": 1.9395289421081543, "learning_rate": 9.147879251050823e-06, "loss": 0.3217, "step": 2397 }, { "epoch": 0.7331091409354937, "grad_norm": 1.7682430744171143, "learning_rate": 9.151700420328622e-06, "loss": 0.3048, "step": 2398 }, { "epoch": 0.7334148578416386, "grad_norm": 1.7665178775787354, "learning_rate": 9.15552158960642e-06, "loss": 0.3172, "step": 2399 }, { "epoch": 0.7337205747477835, "grad_norm": 2.108240842819214, "learning_rate": 9.15934275888422e-06, "loss": 0.4002, "step": 2400 }, { "epoch": 0.7340262916539284, "grad_norm": 1.1000782251358032, "learning_rate": 9.163163928162018e-06, "loss": 0.2459, "step": 2401 }, { "epoch": 0.7343320085600734, "grad_norm": 0.4195878505706787, "learning_rate": 9.166985097439817e-06, "loss": 0.1489, "step": 2402 }, { "epoch": 0.7346377254662183, "grad_norm": 0.5791486501693726, "learning_rate": 9.170806266717616e-06, "loss": 0.1114, "step": 2403 }, { "epoch": 0.7349434423723632, "grad_norm": 0.4480782151222229, "learning_rate": 9.174627435995414e-06, "loss": 0.0992, "step": 2404 }, { "epoch": 0.7352491592785081, "grad_norm": 0.4661538302898407, "learning_rate": 9.178448605273215e-06, "loss": 0.0909, "step": 2405 }, { "epoch": 0.735554876184653, "grad_norm": 0.4505203664302826, "learning_rate": 9.182269774551014e-06, "loss": 0.0865, "step": 2406 }, { "epoch": 0.7358605930907979, "grad_norm": 0.5235399007797241, "learning_rate": 9.186090943828812e-06, "loss": 0.0987, "step": 2407 }, { "epoch": 0.7361663099969429, "grad_norm": 0.49063459038734436, "learning_rate": 9.189912113106611e-06, "loss": 0.1223, "step": 2408 }, { "epoch": 0.7364720269030878, "grad_norm": 0.388651967048645, "learning_rate": 9.19373328238441e-06, "loss": 0.1008, "step": 2409 }, { "epoch": 0.7367777438092327, "grad_norm": 0.7696850299835205, "learning_rate": 9.197554451662209e-06, "loss": 0.1051, "step": 2410 }, { "epoch": 0.7370834607153776, "grad_norm": 0.5919461250305176, "learning_rate": 9.201375620940007e-06, "loss": 0.116, "step": 2411 }, { "epoch": 0.7373891776215225, "grad_norm": 0.8811898231506348, "learning_rate": 9.205196790217808e-06, "loss": 0.1168, "step": 2412 }, { "epoch": 0.7376948945276673, "grad_norm": 0.6328366994857788, "learning_rate": 9.209017959495606e-06, "loss": 0.1179, "step": 2413 }, { "epoch": 0.7380006114338122, "grad_norm": 0.6111212372779846, "learning_rate": 9.212839128773405e-06, "loss": 0.175, "step": 2414 }, { "epoch": 0.7383063283399572, "grad_norm": 0.896183431148529, "learning_rate": 9.216660298051204e-06, "loss": 0.2041, "step": 2415 }, { "epoch": 0.7386120452461021, "grad_norm": 0.7913030982017517, "learning_rate": 9.220481467329003e-06, "loss": 0.2025, "step": 2416 }, { "epoch": 0.738917762152247, "grad_norm": 0.7184579372406006, "learning_rate": 9.224302636606801e-06, "loss": 0.2167, "step": 2417 }, { "epoch": 0.7392234790583919, "grad_norm": 1.1730397939682007, "learning_rate": 9.2281238058846e-06, "loss": 0.2689, "step": 2418 }, { "epoch": 0.7395291959645368, "grad_norm": 2.5078048706054688, "learning_rate": 9.231944975162399e-06, "loss": 0.2636, "step": 2419 }, { "epoch": 0.7398349128706817, "grad_norm": 1.3194046020507812, "learning_rate": 9.2357661444402e-06, "loss": 0.2936, "step": 2420 }, { "epoch": 0.7401406297768267, "grad_norm": 1.2152255773544312, "learning_rate": 9.239587313717998e-06, "loss": 0.3037, "step": 2421 }, { "epoch": 0.7404463466829716, "grad_norm": 1.351432204246521, "learning_rate": 9.243408482995797e-06, "loss": 0.2717, "step": 2422 }, { "epoch": 0.7407520635891165, "grad_norm": 1.7026994228363037, "learning_rate": 9.247229652273596e-06, "loss": 0.3483, "step": 2423 }, { "epoch": 0.7410577804952614, "grad_norm": 1.573121428489685, "learning_rate": 9.251050821551394e-06, "loss": 0.3087, "step": 2424 }, { "epoch": 0.7413634974014063, "grad_norm": 2.3626508712768555, "learning_rate": 9.254871990829193e-06, "loss": 0.4705, "step": 2425 }, { "epoch": 0.7416692143075512, "grad_norm": 0.5444228053092957, "learning_rate": 9.258693160106992e-06, "loss": 0.2227, "step": 2426 }, { "epoch": 0.7419749312136961, "grad_norm": 0.6124668717384338, "learning_rate": 9.26251432938479e-06, "loss": 0.1231, "step": 2427 }, { "epoch": 0.7422806481198411, "grad_norm": 0.514008641242981, "learning_rate": 9.266335498662591e-06, "loss": 0.157, "step": 2428 }, { "epoch": 0.742586365025986, "grad_norm": 0.37836381793022156, "learning_rate": 9.27015666794039e-06, "loss": 0.0967, "step": 2429 }, { "epoch": 0.7428920819321309, "grad_norm": 0.38575801253318787, "learning_rate": 9.273977837218189e-06, "loss": 0.1055, "step": 2430 }, { "epoch": 0.7431977988382757, "grad_norm": 0.4063425660133362, "learning_rate": 9.277799006495987e-06, "loss": 0.0661, "step": 2431 }, { "epoch": 0.7435035157444206, "grad_norm": 0.4100622236728668, "learning_rate": 9.281620175773786e-06, "loss": 0.1004, "step": 2432 }, { "epoch": 0.7438092326505655, "grad_norm": 0.7449556589126587, "learning_rate": 9.285441345051586e-06, "loss": 0.1095, "step": 2433 }, { "epoch": 0.7441149495567105, "grad_norm": 1.0441219806671143, "learning_rate": 9.289262514329385e-06, "loss": 0.1226, "step": 2434 }, { "epoch": 0.7444206664628554, "grad_norm": 0.486929327249527, "learning_rate": 9.293083683607184e-06, "loss": 0.133, "step": 2435 }, { "epoch": 0.7447263833690003, "grad_norm": 0.7131247520446777, "learning_rate": 9.296904852884984e-06, "loss": 0.113, "step": 2436 }, { "epoch": 0.7450321002751452, "grad_norm": 0.6539348363876343, "learning_rate": 9.300726022162783e-06, "loss": 0.1384, "step": 2437 }, { "epoch": 0.7453378171812901, "grad_norm": 0.8282005786895752, "learning_rate": 9.304547191440582e-06, "loss": 0.1896, "step": 2438 }, { "epoch": 0.745643534087435, "grad_norm": 0.6568138003349304, "learning_rate": 9.30836836071838e-06, "loss": 0.2035, "step": 2439 }, { "epoch": 0.7459492509935799, "grad_norm": 0.6615884900093079, "learning_rate": 9.31218952999618e-06, "loss": 0.199, "step": 2440 }, { "epoch": 0.7462549678997249, "grad_norm": 0.7249413132667542, "learning_rate": 9.316010699273978e-06, "loss": 0.2079, "step": 2441 }, { "epoch": 0.7465606848058698, "grad_norm": 0.8707079291343689, "learning_rate": 9.319831868551777e-06, "loss": 0.2791, "step": 2442 }, { "epoch": 0.7468664017120147, "grad_norm": 1.3782247304916382, "learning_rate": 9.323653037829577e-06, "loss": 0.2525, "step": 2443 }, { "epoch": 0.7471721186181596, "grad_norm": 1.113921046257019, "learning_rate": 9.327474207107376e-06, "loss": 0.2578, "step": 2444 }, { "epoch": 0.7474778355243045, "grad_norm": 1.043644905090332, "learning_rate": 9.331295376385175e-06, "loss": 0.2766, "step": 2445 }, { "epoch": 0.7477835524304494, "grad_norm": 1.070979356765747, "learning_rate": 9.335116545662974e-06, "loss": 0.2312, "step": 2446 }, { "epoch": 0.7480892693365944, "grad_norm": 1.1920804977416992, "learning_rate": 9.338937714940772e-06, "loss": 0.2739, "step": 2447 }, { "epoch": 0.7483949862427393, "grad_norm": 1.5266748666763306, "learning_rate": 9.342758884218571e-06, "loss": 0.3418, "step": 2448 }, { "epoch": 0.7487007031488842, "grad_norm": 1.7952054738998413, "learning_rate": 9.34658005349637e-06, "loss": 0.3454, "step": 2449 }, { "epoch": 0.749006420055029, "grad_norm": 2.9125685691833496, "learning_rate": 9.350401222774168e-06, "loss": 0.4205, "step": 2450 }, { "epoch": 0.7493121369611739, "grad_norm": 0.7071707248687744, "learning_rate": 9.354222392051969e-06, "loss": 0.2284, "step": 2451 }, { "epoch": 0.7496178538673188, "grad_norm": 0.5182474255561829, "learning_rate": 9.358043561329768e-06, "loss": 0.1228, "step": 2452 }, { "epoch": 0.7499235707734637, "grad_norm": 1.519235372543335, "learning_rate": 9.361864730607566e-06, "loss": 0.1117, "step": 2453 }, { "epoch": 0.7502292876796087, "grad_norm": 0.4408988356590271, "learning_rate": 9.365685899885365e-06, "loss": 0.1143, "step": 2454 }, { "epoch": 0.7505350045857536, "grad_norm": 0.5972708463668823, "learning_rate": 9.369507069163164e-06, "loss": 0.1073, "step": 2455 }, { "epoch": 0.7508407214918985, "grad_norm": 0.48089078068733215, "learning_rate": 9.373328238440963e-06, "loss": 0.0847, "step": 2456 }, { "epoch": 0.7511464383980434, "grad_norm": 0.46935415267944336, "learning_rate": 9.377149407718761e-06, "loss": 0.1362, "step": 2457 }, { "epoch": 0.7514521553041883, "grad_norm": 0.8307324647903442, "learning_rate": 9.38097057699656e-06, "loss": 0.0922, "step": 2458 }, { "epoch": 0.7517578722103332, "grad_norm": 0.667719304561615, "learning_rate": 9.38479174627436e-06, "loss": 0.119, "step": 2459 }, { "epoch": 0.7520635891164782, "grad_norm": 0.4444475471973419, "learning_rate": 9.38861291555216e-06, "loss": 0.094, "step": 2460 }, { "epoch": 0.7523693060226231, "grad_norm": 0.5887052416801453, "learning_rate": 9.392434084829958e-06, "loss": 0.1577, "step": 2461 }, { "epoch": 0.752675022928768, "grad_norm": 0.573174238204956, "learning_rate": 9.396255254107757e-06, "loss": 0.1254, "step": 2462 }, { "epoch": 0.7529807398349129, "grad_norm": 0.6424505710601807, "learning_rate": 9.400076423385556e-06, "loss": 0.1633, "step": 2463 }, { "epoch": 0.7532864567410578, "grad_norm": 0.9555872082710266, "learning_rate": 9.403897592663354e-06, "loss": 0.1658, "step": 2464 }, { "epoch": 0.7535921736472027, "grad_norm": 0.7887927889823914, "learning_rate": 9.407718761941153e-06, "loss": 0.1976, "step": 2465 }, { "epoch": 0.7538978905533475, "grad_norm": 0.7506389617919922, "learning_rate": 9.411539931218952e-06, "loss": 0.1934, "step": 2466 }, { "epoch": 0.7542036074594926, "grad_norm": 0.6936265230178833, "learning_rate": 9.415361100496752e-06, "loss": 0.225, "step": 2467 }, { "epoch": 0.7545093243656374, "grad_norm": 0.7693415284156799, "learning_rate": 9.419182269774551e-06, "loss": 0.2408, "step": 2468 }, { "epoch": 0.7548150412717823, "grad_norm": 1.0278724431991577, "learning_rate": 9.42300343905235e-06, "loss": 0.2373, "step": 2469 }, { "epoch": 0.7551207581779272, "grad_norm": 1.1192102432250977, "learning_rate": 9.426824608330148e-06, "loss": 0.241, "step": 2470 }, { "epoch": 0.7554264750840721, "grad_norm": 1.360282063484192, "learning_rate": 9.430645777607947e-06, "loss": 0.3459, "step": 2471 }, { "epoch": 0.755732191990217, "grad_norm": 1.5918654203414917, "learning_rate": 9.434466946885748e-06, "loss": 0.2947, "step": 2472 }, { "epoch": 0.756037908896362, "grad_norm": 1.684103012084961, "learning_rate": 9.438288116163546e-06, "loss": 0.3448, "step": 2473 }, { "epoch": 0.7563436258025069, "grad_norm": 3.913331985473633, "learning_rate": 9.442109285441347e-06, "loss": 0.3109, "step": 2474 }, { "epoch": 0.7566493427086518, "grad_norm": 4.037960529327393, "learning_rate": 9.445930454719146e-06, "loss": 0.4515, "step": 2475 }, { "epoch": 0.7569550596147967, "grad_norm": 0.6110999584197998, "learning_rate": 9.449751623996944e-06, "loss": 0.205, "step": 2476 }, { "epoch": 0.7572607765209416, "grad_norm": 0.4705647826194763, "learning_rate": 9.453572793274743e-06, "loss": 0.1356, "step": 2477 }, { "epoch": 0.7575664934270865, "grad_norm": 0.42865124344825745, "learning_rate": 9.457393962552542e-06, "loss": 0.0881, "step": 2478 }, { "epoch": 0.7578722103332314, "grad_norm": 0.3670295178890228, "learning_rate": 9.46121513183034e-06, "loss": 0.0988, "step": 2479 }, { "epoch": 0.7581779272393764, "grad_norm": 0.38634631037712097, "learning_rate": 9.46503630110814e-06, "loss": 0.1128, "step": 2480 }, { "epoch": 0.7584836441455213, "grad_norm": 0.3694614768028259, "learning_rate": 9.468857470385938e-06, "loss": 0.0686, "step": 2481 }, { "epoch": 0.7587893610516662, "grad_norm": 0.4281258285045624, "learning_rate": 9.472678639663738e-06, "loss": 0.108, "step": 2482 }, { "epoch": 0.7590950779578111, "grad_norm": 0.4580366313457489, "learning_rate": 9.476499808941537e-06, "loss": 0.0913, "step": 2483 }, { "epoch": 0.759400794863956, "grad_norm": 0.8265007138252258, "learning_rate": 9.480320978219336e-06, "loss": 0.0913, "step": 2484 }, { "epoch": 0.7597065117701008, "grad_norm": 0.4918110966682434, "learning_rate": 9.484142147497135e-06, "loss": 0.1286, "step": 2485 }, { "epoch": 0.7600122286762458, "grad_norm": 0.6069576740264893, "learning_rate": 9.487963316774933e-06, "loss": 0.1381, "step": 2486 }, { "epoch": 0.7603179455823907, "grad_norm": 0.5939327478408813, "learning_rate": 9.491784486052732e-06, "loss": 0.1106, "step": 2487 }, { "epoch": 0.7606236624885356, "grad_norm": 0.6200658679008484, "learning_rate": 9.495605655330531e-06, "loss": 0.1651, "step": 2488 }, { "epoch": 0.7609293793946805, "grad_norm": 0.6411346197128296, "learning_rate": 9.49942682460833e-06, "loss": 0.1702, "step": 2489 }, { "epoch": 0.7612350963008254, "grad_norm": 0.6577548980712891, "learning_rate": 9.50324799388613e-06, "loss": 0.2023, "step": 2490 }, { "epoch": 0.7615408132069703, "grad_norm": 0.8322537541389465, "learning_rate": 9.507069163163929e-06, "loss": 0.2228, "step": 2491 }, { "epoch": 0.7618465301131152, "grad_norm": 0.8094200491905212, "learning_rate": 9.510890332441728e-06, "loss": 0.2675, "step": 2492 }, { "epoch": 0.7621522470192602, "grad_norm": 0.8478187918663025, "learning_rate": 9.514711501719526e-06, "loss": 0.2595, "step": 2493 }, { "epoch": 0.7624579639254051, "grad_norm": 1.1935769319534302, "learning_rate": 9.518532670997325e-06, "loss": 0.2899, "step": 2494 }, { "epoch": 0.76276368083155, "grad_norm": 0.97035151720047, "learning_rate": 9.522353840275124e-06, "loss": 0.2613, "step": 2495 }, { "epoch": 0.7630693977376949, "grad_norm": 1.3635863065719604, "learning_rate": 9.526175009552923e-06, "loss": 0.2722, "step": 2496 }, { "epoch": 0.7633751146438398, "grad_norm": 1.350322961807251, "learning_rate": 9.529996178830721e-06, "loss": 0.2816, "step": 2497 }, { "epoch": 0.7636808315499847, "grad_norm": 1.5189380645751953, "learning_rate": 9.533817348108522e-06, "loss": 0.3127, "step": 2498 }, { "epoch": 0.7639865484561297, "grad_norm": 1.605339527130127, "learning_rate": 9.53763851738632e-06, "loss": 0.3245, "step": 2499 }, { "epoch": 0.7642922653622746, "grad_norm": 12.506855964660645, "learning_rate": 9.54145968666412e-06, "loss": 0.3879, "step": 2500 }, { "epoch": 0.7645979822684195, "grad_norm": 0.5530233383178711, "learning_rate": 9.545280855941918e-06, "loss": 0.211, "step": 2501 }, { "epoch": 0.7649036991745644, "grad_norm": 0.5467696785926819, "learning_rate": 9.549102025219717e-06, "loss": 0.1506, "step": 2502 }, { "epoch": 0.7652094160807092, "grad_norm": 0.4578435719013214, "learning_rate": 9.552923194497516e-06, "loss": 0.1143, "step": 2503 }, { "epoch": 0.7655151329868541, "grad_norm": 0.3455944359302521, "learning_rate": 9.556744363775314e-06, "loss": 0.083, "step": 2504 }, { "epoch": 0.765820849892999, "grad_norm": 0.771053671836853, "learning_rate": 9.560565533053115e-06, "loss": 0.0948, "step": 2505 }, { "epoch": 0.766126566799144, "grad_norm": 0.3839539885520935, "learning_rate": 9.564386702330913e-06, "loss": 0.1216, "step": 2506 }, { "epoch": 0.7664322837052889, "grad_norm": 0.46379968523979187, "learning_rate": 9.568207871608712e-06, "loss": 0.0945, "step": 2507 }, { "epoch": 0.7667380006114338, "grad_norm": 0.4062173068523407, "learning_rate": 9.572029040886511e-06, "loss": 0.0903, "step": 2508 }, { "epoch": 0.7670437175175787, "grad_norm": 0.6148442625999451, "learning_rate": 9.57585021016431e-06, "loss": 0.1427, "step": 2509 }, { "epoch": 0.7673494344237236, "grad_norm": 0.4554442763328552, "learning_rate": 9.57967137944211e-06, "loss": 0.0937, "step": 2510 }, { "epoch": 0.7676551513298685, "grad_norm": 0.6180024147033691, "learning_rate": 9.583492548719909e-06, "loss": 0.1356, "step": 2511 }, { "epoch": 0.7679608682360135, "grad_norm": 0.4169529676437378, "learning_rate": 9.587313717997708e-06, "loss": 0.1069, "step": 2512 }, { "epoch": 0.7682665851421584, "grad_norm": 0.6711704730987549, "learning_rate": 9.591134887275508e-06, "loss": 0.1379, "step": 2513 }, { "epoch": 0.7685723020483033, "grad_norm": 1.4167042970657349, "learning_rate": 9.594956056553307e-06, "loss": 0.1873, "step": 2514 }, { "epoch": 0.7688780189544482, "grad_norm": 0.7531862854957581, "learning_rate": 9.598777225831106e-06, "loss": 0.1804, "step": 2515 }, { "epoch": 0.7691837358605931, "grad_norm": 0.8953850269317627, "learning_rate": 9.602598395108904e-06, "loss": 0.2192, "step": 2516 }, { "epoch": 0.769489452766738, "grad_norm": 0.6947640180587769, "learning_rate": 9.606419564386703e-06, "loss": 0.2175, "step": 2517 }, { "epoch": 0.7697951696728829, "grad_norm": 0.733005940914154, "learning_rate": 9.610240733664502e-06, "loss": 0.2333, "step": 2518 }, { "epoch": 0.7701008865790279, "grad_norm": 1.0150882005691528, "learning_rate": 9.6140619029423e-06, "loss": 0.2934, "step": 2519 }, { "epoch": 0.7704066034851728, "grad_norm": 1.1450037956237793, "learning_rate": 9.6178830722201e-06, "loss": 0.2757, "step": 2520 }, { "epoch": 0.7707123203913177, "grad_norm": 1.329506516456604, "learning_rate": 9.6217042414979e-06, "loss": 0.3047, "step": 2521 }, { "epoch": 0.7710180372974625, "grad_norm": 1.2694729566574097, "learning_rate": 9.625525410775698e-06, "loss": 0.3238, "step": 2522 }, { "epoch": 0.7713237542036074, "grad_norm": 1.3297109603881836, "learning_rate": 9.629346580053497e-06, "loss": 0.2883, "step": 2523 }, { "epoch": 0.7716294711097523, "grad_norm": 2.0155794620513916, "learning_rate": 9.633167749331296e-06, "loss": 0.3045, "step": 2524 }, { "epoch": 0.7719351880158973, "grad_norm": 2.4185328483581543, "learning_rate": 9.636988918609095e-06, "loss": 0.3403, "step": 2525 }, { "epoch": 0.7722409049220422, "grad_norm": 0.6235795617103577, "learning_rate": 9.640810087886893e-06, "loss": 0.2119, "step": 2526 }, { "epoch": 0.7725466218281871, "grad_norm": 0.41755738854408264, "learning_rate": 9.644631257164692e-06, "loss": 0.1176, "step": 2527 }, { "epoch": 0.772852338734332, "grad_norm": 0.41394704580307007, "learning_rate": 9.648452426442491e-06, "loss": 0.1108, "step": 2528 }, { "epoch": 0.7731580556404769, "grad_norm": 0.37285444140434265, "learning_rate": 9.652273595720291e-06, "loss": 0.0849, "step": 2529 }, { "epoch": 0.7734637725466218, "grad_norm": 0.5734539031982422, "learning_rate": 9.65609476499809e-06, "loss": 0.0883, "step": 2530 }, { "epoch": 0.7737694894527667, "grad_norm": 0.46368440985679626, "learning_rate": 9.659915934275889e-06, "loss": 0.1203, "step": 2531 }, { "epoch": 0.7740752063589117, "grad_norm": 0.5953476428985596, "learning_rate": 9.663737103553688e-06, "loss": 0.0954, "step": 2532 }, { "epoch": 0.7743809232650566, "grad_norm": 0.45024529099464417, "learning_rate": 9.667558272831486e-06, "loss": 0.1337, "step": 2533 }, { "epoch": 0.7746866401712015, "grad_norm": 0.5090838074684143, "learning_rate": 9.671379442109285e-06, "loss": 0.1154, "step": 2534 }, { "epoch": 0.7749923570773464, "grad_norm": 0.5019803047180176, "learning_rate": 9.675200611387084e-06, "loss": 0.0934, "step": 2535 }, { "epoch": 0.7752980739834913, "grad_norm": 0.6705944538116455, "learning_rate": 9.679021780664884e-06, "loss": 0.1489, "step": 2536 }, { "epoch": 0.7756037908896362, "grad_norm": 0.5575826168060303, "learning_rate": 9.682842949942683e-06, "loss": 0.1255, "step": 2537 }, { "epoch": 0.7759095077957812, "grad_norm": 0.48000097274780273, "learning_rate": 9.686664119220482e-06, "loss": 0.1356, "step": 2538 }, { "epoch": 0.776215224701926, "grad_norm": 0.6467621922492981, "learning_rate": 9.69048528849828e-06, "loss": 0.1426, "step": 2539 }, { "epoch": 0.776520941608071, "grad_norm": 1.0518765449523926, "learning_rate": 9.69430645777608e-06, "loss": 0.1911, "step": 2540 }, { "epoch": 0.7768266585142158, "grad_norm": 0.7542822360992432, "learning_rate": 9.698127627053878e-06, "loss": 0.1953, "step": 2541 }, { "epoch": 0.7771323754203607, "grad_norm": 0.8455517292022705, "learning_rate": 9.701948796331677e-06, "loss": 0.2367, "step": 2542 }, { "epoch": 0.7774380923265056, "grad_norm": 1.0229703187942505, "learning_rate": 9.705769965609475e-06, "loss": 0.2362, "step": 2543 }, { "epoch": 0.7777438092326505, "grad_norm": 0.8846530914306641, "learning_rate": 9.709591134887276e-06, "loss": 0.2217, "step": 2544 }, { "epoch": 0.7780495261387955, "grad_norm": 1.0641655921936035, "learning_rate": 9.713412304165075e-06, "loss": 0.3024, "step": 2545 }, { "epoch": 0.7783552430449404, "grad_norm": 1.3481818437576294, "learning_rate": 9.717233473442873e-06, "loss": 0.2692, "step": 2546 }, { "epoch": 0.7786609599510853, "grad_norm": 1.7248295545578003, "learning_rate": 9.721054642720672e-06, "loss": 0.303, "step": 2547 }, { "epoch": 0.7789666768572302, "grad_norm": 1.4553056955337524, "learning_rate": 9.724875811998471e-06, "loss": 0.3063, "step": 2548 }, { "epoch": 0.7792723937633751, "grad_norm": 1.9896763563156128, "learning_rate": 9.728696981276271e-06, "loss": 0.3541, "step": 2549 }, { "epoch": 0.77957811066952, "grad_norm": 1.9619395732879639, "learning_rate": 9.73251815055407e-06, "loss": 0.4237, "step": 2550 }, { "epoch": 0.779883827575665, "grad_norm": 0.590855598449707, "learning_rate": 9.736339319831869e-06, "loss": 0.2026, "step": 2551 }, { "epoch": 0.7801895444818099, "grad_norm": 0.6254149675369263, "learning_rate": 9.74016048910967e-06, "loss": 0.1227, "step": 2552 }, { "epoch": 0.7804952613879548, "grad_norm": 0.457303524017334, "learning_rate": 9.743981658387468e-06, "loss": 0.1045, "step": 2553 }, { "epoch": 0.7808009782940997, "grad_norm": 0.399716854095459, "learning_rate": 9.747802827665267e-06, "loss": 0.0861, "step": 2554 }, { "epoch": 0.7811066952002446, "grad_norm": 0.4164433777332306, "learning_rate": 9.751623996943065e-06, "loss": 0.1086, "step": 2555 }, { "epoch": 0.7814124121063895, "grad_norm": 0.4560070037841797, "learning_rate": 9.755445166220864e-06, "loss": 0.104, "step": 2556 }, { "epoch": 0.7817181290125343, "grad_norm": 0.526213526725769, "learning_rate": 9.759266335498663e-06, "loss": 0.0889, "step": 2557 }, { "epoch": 0.7820238459186794, "grad_norm": 0.4382590055465698, "learning_rate": 9.763087504776462e-06, "loss": 0.0926, "step": 2558 }, { "epoch": 0.7823295628248242, "grad_norm": 0.457150936126709, "learning_rate": 9.76690867405426e-06, "loss": 0.1264, "step": 2559 }, { "epoch": 0.7826352797309691, "grad_norm": 0.45541560649871826, "learning_rate": 9.770729843332061e-06, "loss": 0.1154, "step": 2560 }, { "epoch": 0.782940996637114, "grad_norm": 0.7914599180221558, "learning_rate": 9.77455101260986e-06, "loss": 0.1734, "step": 2561 }, { "epoch": 0.7832467135432589, "grad_norm": 0.5212724804878235, "learning_rate": 9.778372181887658e-06, "loss": 0.1534, "step": 2562 }, { "epoch": 0.7835524304494038, "grad_norm": 0.504765510559082, "learning_rate": 9.782193351165457e-06, "loss": 0.1579, "step": 2563 }, { "epoch": 0.7838581473555487, "grad_norm": 0.9041082859039307, "learning_rate": 9.786014520443256e-06, "loss": 0.153, "step": 2564 }, { "epoch": 0.7841638642616937, "grad_norm": 2.910095691680908, "learning_rate": 9.789835689721055e-06, "loss": 0.2072, "step": 2565 }, { "epoch": 0.7844695811678386, "grad_norm": 0.8283528685569763, "learning_rate": 9.793656858998853e-06, "loss": 0.2321, "step": 2566 }, { "epoch": 0.7847752980739835, "grad_norm": 0.8201271295547485, "learning_rate": 9.797478028276654e-06, "loss": 0.1872, "step": 2567 }, { "epoch": 0.7850810149801284, "grad_norm": 0.9702547788619995, "learning_rate": 9.801299197554453e-06, "loss": 0.2109, "step": 2568 }, { "epoch": 0.7853867318862733, "grad_norm": 1.6394846439361572, "learning_rate": 9.805120366832251e-06, "loss": 0.2536, "step": 2569 }, { "epoch": 0.7856924487924182, "grad_norm": 0.9593629240989685, "learning_rate": 9.80894153611005e-06, "loss": 0.2796, "step": 2570 }, { "epoch": 0.7859981656985632, "grad_norm": 1.4883129596710205, "learning_rate": 9.812762705387849e-06, "loss": 0.2743, "step": 2571 }, { "epoch": 0.7863038826047081, "grad_norm": 1.2911609411239624, "learning_rate": 9.816583874665648e-06, "loss": 0.2842, "step": 2572 }, { "epoch": 0.786609599510853, "grad_norm": 1.786234974861145, "learning_rate": 9.820405043943446e-06, "loss": 0.299, "step": 2573 }, { "epoch": 0.7869153164169979, "grad_norm": 1.6575485467910767, "learning_rate": 9.824226213221245e-06, "loss": 0.3086, "step": 2574 }, { "epoch": 0.7872210333231427, "grad_norm": 2.8727316856384277, "learning_rate": 9.828047382499045e-06, "loss": 0.4506, "step": 2575 }, { "epoch": 0.7875267502292876, "grad_norm": 0.5621397495269775, "learning_rate": 9.831868551776844e-06, "loss": 0.219, "step": 2576 }, { "epoch": 0.7878324671354325, "grad_norm": 0.4363175928592682, "learning_rate": 9.835689721054643e-06, "loss": 0.137, "step": 2577 }, { "epoch": 0.7881381840415775, "grad_norm": 0.5141528248786926, "learning_rate": 9.839510890332442e-06, "loss": 0.0866, "step": 2578 }, { "epoch": 0.7884439009477224, "grad_norm": 0.4561907947063446, "learning_rate": 9.84333205961024e-06, "loss": 0.1078, "step": 2579 }, { "epoch": 0.7887496178538673, "grad_norm": 0.7177948951721191, "learning_rate": 9.84715322888804e-06, "loss": 0.1283, "step": 2580 }, { "epoch": 0.7890553347600122, "grad_norm": 0.427041232585907, "learning_rate": 9.850974398165838e-06, "loss": 0.0951, "step": 2581 }, { "epoch": 0.7893610516661571, "grad_norm": 0.41207680106163025, "learning_rate": 9.854795567443637e-06, "loss": 0.1084, "step": 2582 }, { "epoch": 0.789666768572302, "grad_norm": 0.5212504267692566, "learning_rate": 9.858616736721437e-06, "loss": 0.1014, "step": 2583 }, { "epoch": 0.789972485478447, "grad_norm": 0.5249597430229187, "learning_rate": 9.862437905999236e-06, "loss": 0.1199, "step": 2584 }, { "epoch": 0.7902782023845919, "grad_norm": 0.4155944585800171, "learning_rate": 9.866259075277035e-06, "loss": 0.1017, "step": 2585 }, { "epoch": 0.7905839192907368, "grad_norm": 0.517004668712616, "learning_rate": 9.870080244554833e-06, "loss": 0.1389, "step": 2586 }, { "epoch": 0.7908896361968817, "grad_norm": 0.5294260382652283, "learning_rate": 9.873901413832632e-06, "loss": 0.142, "step": 2587 }, { "epoch": 0.7911953531030266, "grad_norm": 0.6064632534980774, "learning_rate": 9.877722583110433e-06, "loss": 0.1752, "step": 2588 }, { "epoch": 0.7915010700091715, "grad_norm": 0.5776891708374023, "learning_rate": 9.881543752388231e-06, "loss": 0.1369, "step": 2589 }, { "epoch": 0.7918067869153164, "grad_norm": 0.6576943397521973, "learning_rate": 9.88536492166603e-06, "loss": 0.1835, "step": 2590 }, { "epoch": 0.7921125038214614, "grad_norm": 0.6344665884971619, "learning_rate": 9.88918609094383e-06, "loss": 0.1766, "step": 2591 }, { "epoch": 0.7924182207276063, "grad_norm": 1.1109619140625, "learning_rate": 9.89300726022163e-06, "loss": 0.2167, "step": 2592 }, { "epoch": 0.7927239376337512, "grad_norm": 0.8278517127037048, "learning_rate": 9.896828429499428e-06, "loss": 0.2563, "step": 2593 }, { "epoch": 0.793029654539896, "grad_norm": 0.9473411440849304, "learning_rate": 9.900649598777227e-06, "loss": 0.253, "step": 2594 }, { "epoch": 0.7933353714460409, "grad_norm": 1.0560479164123535, "learning_rate": 9.904470768055025e-06, "loss": 0.2832, "step": 2595 }, { "epoch": 0.7936410883521858, "grad_norm": 1.0989123582839966, "learning_rate": 9.908291937332824e-06, "loss": 0.257, "step": 2596 }, { "epoch": 0.7939468052583308, "grad_norm": 1.083175539970398, "learning_rate": 9.912113106610623e-06, "loss": 0.2421, "step": 2597 }, { "epoch": 0.7942525221644757, "grad_norm": 1.5604733228683472, "learning_rate": 9.915934275888422e-06, "loss": 0.3097, "step": 2598 }, { "epoch": 0.7945582390706206, "grad_norm": 2.1447010040283203, "learning_rate": 9.919755445166222e-06, "loss": 0.341, "step": 2599 }, { "epoch": 0.7948639559767655, "grad_norm": 2.1557655334472656, "learning_rate": 9.92357661444402e-06, "loss": 0.4099, "step": 2600 }, { "epoch": 0.7951696728829104, "grad_norm": 0.7053086161613464, "learning_rate": 9.92739778372182e-06, "loss": 0.2024, "step": 2601 }, { "epoch": 0.7954753897890553, "grad_norm": 0.4040185213088989, "learning_rate": 9.931218952999618e-06, "loss": 0.1445, "step": 2602 }, { "epoch": 0.7957811066952002, "grad_norm": 0.3607569634914398, "learning_rate": 9.935040122277417e-06, "loss": 0.0881, "step": 2603 }, { "epoch": 0.7960868236013452, "grad_norm": 0.534721851348877, "learning_rate": 9.938861291555216e-06, "loss": 0.1301, "step": 2604 }, { "epoch": 0.7963925405074901, "grad_norm": 0.43297186493873596, "learning_rate": 9.942682460833015e-06, "loss": 0.0802, "step": 2605 }, { "epoch": 0.796698257413635, "grad_norm": 0.39340656995773315, "learning_rate": 9.946503630110815e-06, "loss": 0.0904, "step": 2606 }, { "epoch": 0.7970039743197799, "grad_norm": 0.4970964789390564, "learning_rate": 9.950324799388614e-06, "loss": 0.0881, "step": 2607 }, { "epoch": 0.7973096912259248, "grad_norm": 0.502224326133728, "learning_rate": 9.954145968666412e-06, "loss": 0.1418, "step": 2608 }, { "epoch": 0.7976154081320697, "grad_norm": 0.4741884469985962, "learning_rate": 9.957967137944211e-06, "loss": 0.0906, "step": 2609 }, { "epoch": 0.7979211250382147, "grad_norm": 0.44710034132003784, "learning_rate": 9.96178830722201e-06, "loss": 0.0989, "step": 2610 }, { "epoch": 0.7982268419443596, "grad_norm": 0.47619882225990295, "learning_rate": 9.965609476499809e-06, "loss": 0.1457, "step": 2611 }, { "epoch": 0.7985325588505044, "grad_norm": 1.610327124595642, "learning_rate": 9.969430645777607e-06, "loss": 0.1218, "step": 2612 }, { "epoch": 0.7988382757566493, "grad_norm": 1.2519185543060303, "learning_rate": 9.973251815055406e-06, "loss": 0.1881, "step": 2613 }, { "epoch": 0.7991439926627942, "grad_norm": 0.6737523078918457, "learning_rate": 9.977072984333207e-06, "loss": 0.147, "step": 2614 }, { "epoch": 0.7994497095689391, "grad_norm": 0.6842924356460571, "learning_rate": 9.980894153611005e-06, "loss": 0.1964, "step": 2615 }, { "epoch": 0.799755426475084, "grad_norm": 0.6262239813804626, "learning_rate": 9.984715322888804e-06, "loss": 0.1744, "step": 2616 }, { "epoch": 0.800061143381229, "grad_norm": 0.8770407438278198, "learning_rate": 9.988536492166603e-06, "loss": 0.2242, "step": 2617 }, { "epoch": 0.8003668602873739, "grad_norm": 0.86372309923172, "learning_rate": 9.992357661444402e-06, "loss": 0.2589, "step": 2618 }, { "epoch": 0.8006725771935188, "grad_norm": 0.8047781586647034, "learning_rate": 9.9961788307222e-06, "loss": 0.2264, "step": 2619 }, { "epoch": 0.8009782940996637, "grad_norm": 1.4239659309387207, "learning_rate": 9.999999999999999e-06, "loss": 0.319, "step": 2620 }, { "epoch": 0.8012840110058086, "grad_norm": 1.2684130668640137, "learning_rate": 1.0003821169277798e-05, "loss": 0.3002, "step": 2621 }, { "epoch": 0.8015897279119535, "grad_norm": 1.0307118892669678, "learning_rate": 1.0007642338555598e-05, "loss": 0.287, "step": 2622 }, { "epoch": 0.8018954448180985, "grad_norm": 4.115602493286133, "learning_rate": 1.0011463507833397e-05, "loss": 0.3261, "step": 2623 }, { "epoch": 0.8022011617242434, "grad_norm": 1.338805913925171, "learning_rate": 1.0015284677111196e-05, "loss": 0.3052, "step": 2624 }, { "epoch": 0.8025068786303883, "grad_norm": 1.8015447854995728, "learning_rate": 1.0019105846388995e-05, "loss": 0.3728, "step": 2625 }, { "epoch": 0.8028125955365332, "grad_norm": 0.938104510307312, "learning_rate": 1.0022927015666793e-05, "loss": 0.1956, "step": 2626 }, { "epoch": 0.8031183124426781, "grad_norm": 0.39401668310165405, "learning_rate": 1.0026748184944594e-05, "loss": 0.1138, "step": 2627 }, { "epoch": 0.803424029348823, "grad_norm": 0.5076119303703308, "learning_rate": 1.0030569354222392e-05, "loss": 0.1341, "step": 2628 }, { "epoch": 0.8037297462549678, "grad_norm": 0.42815831303596497, "learning_rate": 1.0034390523500191e-05, "loss": 0.0852, "step": 2629 }, { "epoch": 0.8040354631611129, "grad_norm": 0.539563000202179, "learning_rate": 1.0038211692777992e-05, "loss": 0.0823, "step": 2630 }, { "epoch": 0.8043411800672577, "grad_norm": 0.4089325964450836, "learning_rate": 1.004203286205579e-05, "loss": 0.1014, "step": 2631 }, { "epoch": 0.8046468969734026, "grad_norm": 0.4188878536224365, "learning_rate": 1.0045854031333589e-05, "loss": 0.0913, "step": 2632 }, { "epoch": 0.8049526138795475, "grad_norm": 0.4448242485523224, "learning_rate": 1.0049675200611388e-05, "loss": 0.0914, "step": 2633 }, { "epoch": 0.8052583307856924, "grad_norm": 0.4662359356880188, "learning_rate": 1.0053496369889187e-05, "loss": 0.1277, "step": 2634 }, { "epoch": 0.8055640476918373, "grad_norm": 0.45505592226982117, "learning_rate": 1.0057317539166985e-05, "loss": 0.0907, "step": 2635 }, { "epoch": 0.8058697645979823, "grad_norm": 0.4483852684497833, "learning_rate": 1.0061138708444784e-05, "loss": 0.1299, "step": 2636 }, { "epoch": 0.8061754815041272, "grad_norm": 0.4546952247619629, "learning_rate": 1.0064959877722585e-05, "loss": 0.1241, "step": 2637 }, { "epoch": 0.8064811984102721, "grad_norm": 0.6013298034667969, "learning_rate": 1.0068781047000383e-05, "loss": 0.2006, "step": 2638 }, { "epoch": 0.806786915316417, "grad_norm": 0.5092337727546692, "learning_rate": 1.0072602216278182e-05, "loss": 0.1458, "step": 2639 }, { "epoch": 0.8070926322225619, "grad_norm": 0.702775239944458, "learning_rate": 1.007642338555598e-05, "loss": 0.1873, "step": 2640 }, { "epoch": 0.8073983491287068, "grad_norm": 0.6225880980491638, "learning_rate": 1.008024455483378e-05, "loss": 0.23, "step": 2641 }, { "epoch": 0.8077040660348517, "grad_norm": 0.7579215168952942, "learning_rate": 1.0084065724111578e-05, "loss": 0.2239, "step": 2642 }, { "epoch": 0.8080097829409967, "grad_norm": 0.7688767910003662, "learning_rate": 1.0087886893389377e-05, "loss": 0.2476, "step": 2643 }, { "epoch": 0.8083154998471416, "grad_norm": 0.8292747139930725, "learning_rate": 1.0091708062667176e-05, "loss": 0.2227, "step": 2644 }, { "epoch": 0.8086212167532865, "grad_norm": 1.3755688667297363, "learning_rate": 1.0095529231944976e-05, "loss": 0.2576, "step": 2645 }, { "epoch": 0.8089269336594314, "grad_norm": 1.60938560962677, "learning_rate": 1.0099350401222775e-05, "loss": 0.2639, "step": 2646 }, { "epoch": 0.8092326505655763, "grad_norm": 1.2197619676589966, "learning_rate": 1.0103171570500574e-05, "loss": 0.2581, "step": 2647 }, { "epoch": 0.8095383674717211, "grad_norm": 1.4927300214767456, "learning_rate": 1.0106992739778372e-05, "loss": 0.293, "step": 2648 }, { "epoch": 0.8098440843778661, "grad_norm": 1.9834568500518799, "learning_rate": 1.0110813909056171e-05, "loss": 0.3269, "step": 2649 }, { "epoch": 0.810149801284011, "grad_norm": 2.8334314823150635, "learning_rate": 1.011463507833397e-05, "loss": 0.4133, "step": 2650 }, { "epoch": 0.8104555181901559, "grad_norm": 0.5006808638572693, "learning_rate": 1.0118456247611769e-05, "loss": 0.1989, "step": 2651 }, { "epoch": 0.8107612350963008, "grad_norm": 0.6638453602790833, "learning_rate": 1.0122277416889567e-05, "loss": 0.1413, "step": 2652 }, { "epoch": 0.8110669520024457, "grad_norm": 0.4254145622253418, "learning_rate": 1.0126098586167368e-05, "loss": 0.0795, "step": 2653 }, { "epoch": 0.8113726689085906, "grad_norm": 0.3154521882534027, "learning_rate": 1.0129919755445167e-05, "loss": 0.0826, "step": 2654 }, { "epoch": 0.8116783858147355, "grad_norm": 0.5340317487716675, "learning_rate": 1.0133740924722965e-05, "loss": 0.1039, "step": 2655 }, { "epoch": 0.8119841027208805, "grad_norm": 0.3642619550228119, "learning_rate": 1.0137562094000764e-05, "loss": 0.1046, "step": 2656 }, { "epoch": 0.8122898196270254, "grad_norm": 0.4244588315486908, "learning_rate": 1.0141383263278563e-05, "loss": 0.0984, "step": 2657 }, { "epoch": 0.8125955365331703, "grad_norm": 0.4497390687465668, "learning_rate": 1.0145204432556362e-05, "loss": 0.1083, "step": 2658 }, { "epoch": 0.8129012534393152, "grad_norm": 0.47265222668647766, "learning_rate": 1.014902560183416e-05, "loss": 0.133, "step": 2659 }, { "epoch": 0.8132069703454601, "grad_norm": 0.42886224389076233, "learning_rate": 1.0152846771111959e-05, "loss": 0.0982, "step": 2660 }, { "epoch": 0.813512687251605, "grad_norm": 0.44536954164505005, "learning_rate": 1.015666794038976e-05, "loss": 0.1434, "step": 2661 }, { "epoch": 0.81381840415775, "grad_norm": 0.7972167730331421, "learning_rate": 1.0160489109667558e-05, "loss": 0.128, "step": 2662 }, { "epoch": 0.8141241210638949, "grad_norm": 0.5563852787017822, "learning_rate": 1.0164310278945357e-05, "loss": 0.1485, "step": 2663 }, { "epoch": 0.8144298379700398, "grad_norm": 0.5481545329093933, "learning_rate": 1.0168131448223156e-05, "loss": 0.1603, "step": 2664 }, { "epoch": 0.8147355548761847, "grad_norm": 0.6063458323478699, "learning_rate": 1.0171952617500954e-05, "loss": 0.1885, "step": 2665 }, { "epoch": 0.8150412717823295, "grad_norm": 0.6633944511413574, "learning_rate": 1.0175773786778755e-05, "loss": 0.1911, "step": 2666 }, { "epoch": 0.8153469886884744, "grad_norm": 0.9465271234512329, "learning_rate": 1.0179594956056554e-05, "loss": 0.2656, "step": 2667 }, { "epoch": 0.8156527055946193, "grad_norm": 0.9913036823272705, "learning_rate": 1.0183416125334354e-05, "loss": 0.2449, "step": 2668 }, { "epoch": 0.8159584225007643, "grad_norm": 1.0758776664733887, "learning_rate": 1.0187237294612153e-05, "loss": 0.2432, "step": 2669 }, { "epoch": 0.8162641394069092, "grad_norm": 0.8601739406585693, "learning_rate": 1.0191058463889952e-05, "loss": 0.289, "step": 2670 }, { "epoch": 0.8165698563130541, "grad_norm": 1.2025723457336426, "learning_rate": 1.019487963316775e-05, "loss": 0.2968, "step": 2671 }, { "epoch": 0.816875573219199, "grad_norm": 1.2072973251342773, "learning_rate": 1.0198700802445549e-05, "loss": 0.2708, "step": 2672 }, { "epoch": 0.8171812901253439, "grad_norm": 1.03488028049469, "learning_rate": 1.0202521971723348e-05, "loss": 0.2731, "step": 2673 }, { "epoch": 0.8174870070314888, "grad_norm": 2.353135824203491, "learning_rate": 1.0206343141001147e-05, "loss": 0.3218, "step": 2674 }, { "epoch": 0.8177927239376338, "grad_norm": 2.160511016845703, "learning_rate": 1.0210164310278945e-05, "loss": 0.392, "step": 2675 }, { "epoch": 0.8180984408437787, "grad_norm": 0.49470964074134827, "learning_rate": 1.0213985479556746e-05, "loss": 0.2097, "step": 2676 }, { "epoch": 0.8184041577499236, "grad_norm": 0.4149441421031952, "learning_rate": 1.0217806648834545e-05, "loss": 0.1361, "step": 2677 }, { "epoch": 0.8187098746560685, "grad_norm": 0.36247092485427856, "learning_rate": 1.0221627818112343e-05, "loss": 0.1065, "step": 2678 }, { "epoch": 0.8190155915622134, "grad_norm": 0.4891511797904968, "learning_rate": 1.0225448987390142e-05, "loss": 0.1143, "step": 2679 }, { "epoch": 0.8193213084683583, "grad_norm": 0.4215775728225708, "learning_rate": 1.022927015666794e-05, "loss": 0.0918, "step": 2680 }, { "epoch": 0.8196270253745032, "grad_norm": 0.44632688164711, "learning_rate": 1.023309132594574e-05, "loss": 0.1041, "step": 2681 }, { "epoch": 0.8199327422806482, "grad_norm": 0.39979422092437744, "learning_rate": 1.0236912495223538e-05, "loss": 0.1101, "step": 2682 }, { "epoch": 0.8202384591867931, "grad_norm": 0.4780469834804535, "learning_rate": 1.0240733664501337e-05, "loss": 0.1131, "step": 2683 }, { "epoch": 0.820544176092938, "grad_norm": 0.5904396772384644, "learning_rate": 1.0244554833779137e-05, "loss": 0.0929, "step": 2684 }, { "epoch": 0.8208498929990828, "grad_norm": 0.6922567486763, "learning_rate": 1.0248376003056936e-05, "loss": 0.1155, "step": 2685 }, { "epoch": 0.8211556099052277, "grad_norm": 0.46451500058174133, "learning_rate": 1.0252197172334735e-05, "loss": 0.1018, "step": 2686 }, { "epoch": 0.8214613268113726, "grad_norm": 0.6084572672843933, "learning_rate": 1.0256018341612534e-05, "loss": 0.1286, "step": 2687 }, { "epoch": 0.8217670437175176, "grad_norm": 0.5727601647377014, "learning_rate": 1.0259839510890332e-05, "loss": 0.1201, "step": 2688 }, { "epoch": 0.8220727606236625, "grad_norm": 0.6302147507667542, "learning_rate": 1.0263660680168131e-05, "loss": 0.181, "step": 2689 }, { "epoch": 0.8223784775298074, "grad_norm": 0.7509560585021973, "learning_rate": 1.026748184944593e-05, "loss": 0.1952, "step": 2690 }, { "epoch": 0.8226841944359523, "grad_norm": 0.5848539471626282, "learning_rate": 1.0271303018723729e-05, "loss": 0.1805, "step": 2691 }, { "epoch": 0.8229899113420972, "grad_norm": 0.7842033505439758, "learning_rate": 1.0275124188001529e-05, "loss": 0.1996, "step": 2692 }, { "epoch": 0.8232956282482421, "grad_norm": 0.8918296098709106, "learning_rate": 1.0278945357279328e-05, "loss": 0.2553, "step": 2693 }, { "epoch": 0.823601345154387, "grad_norm": 0.8631632328033447, "learning_rate": 1.0282766526557127e-05, "loss": 0.2185, "step": 2694 }, { "epoch": 0.823907062060532, "grad_norm": 0.894744336605072, "learning_rate": 1.0286587695834925e-05, "loss": 0.2532, "step": 2695 }, { "epoch": 0.8242127789666769, "grad_norm": 0.9335241913795471, "learning_rate": 1.0290408865112724e-05, "loss": 0.2687, "step": 2696 }, { "epoch": 0.8245184958728218, "grad_norm": 1.0624096393585205, "learning_rate": 1.0294230034390523e-05, "loss": 0.2868, "step": 2697 }, { "epoch": 0.8248242127789667, "grad_norm": 2.713038682937622, "learning_rate": 1.0298051203668322e-05, "loss": 0.2971, "step": 2698 }, { "epoch": 0.8251299296851116, "grad_norm": 1.9276816844940186, "learning_rate": 1.0301872372946122e-05, "loss": 0.3233, "step": 2699 }, { "epoch": 0.8254356465912565, "grad_norm": 1.6911698579788208, "learning_rate": 1.030569354222392e-05, "loss": 0.4258, "step": 2700 }, { "epoch": 0.8257413634974015, "grad_norm": 0.43886327743530273, "learning_rate": 1.030951471150172e-05, "loss": 0.2191, "step": 2701 }, { "epoch": 0.8260470804035464, "grad_norm": 0.5682336091995239, "learning_rate": 1.0313335880779518e-05, "loss": 0.112, "step": 2702 }, { "epoch": 0.8263527973096912, "grad_norm": 0.38871726393699646, "learning_rate": 1.0317157050057317e-05, "loss": 0.1051, "step": 2703 }, { "epoch": 0.8266585142158361, "grad_norm": 0.3348056972026825, "learning_rate": 1.0320978219335116e-05, "loss": 0.1081, "step": 2704 }, { "epoch": 0.826964231121981, "grad_norm": 0.6860211491584778, "learning_rate": 1.0324799388612916e-05, "loss": 0.1327, "step": 2705 }, { "epoch": 0.8272699480281259, "grad_norm": 0.4940170347690582, "learning_rate": 1.0328620557890715e-05, "loss": 0.1054, "step": 2706 }, { "epoch": 0.8275756649342708, "grad_norm": 0.7077764868736267, "learning_rate": 1.0332441727168515e-05, "loss": 0.0948, "step": 2707 }, { "epoch": 0.8278813818404158, "grad_norm": 0.49454790353775024, "learning_rate": 1.0336262896446314e-05, "loss": 0.0853, "step": 2708 }, { "epoch": 0.8281870987465607, "grad_norm": 0.4234122931957245, "learning_rate": 1.0340084065724113e-05, "loss": 0.1118, "step": 2709 }, { "epoch": 0.8284928156527056, "grad_norm": 0.5756216049194336, "learning_rate": 1.0343905235001912e-05, "loss": 0.1195, "step": 2710 }, { "epoch": 0.8287985325588505, "grad_norm": 0.5116868019104004, "learning_rate": 1.034772640427971e-05, "loss": 0.121, "step": 2711 }, { "epoch": 0.8291042494649954, "grad_norm": 0.6026672124862671, "learning_rate": 1.0351547573557509e-05, "loss": 0.1161, "step": 2712 }, { "epoch": 0.8294099663711403, "grad_norm": 0.6034939289093018, "learning_rate": 1.0355368742835308e-05, "loss": 0.1771, "step": 2713 }, { "epoch": 0.8297156832772853, "grad_norm": 0.9179158806800842, "learning_rate": 1.0359189912113107e-05, "loss": 0.1636, "step": 2714 }, { "epoch": 0.8300214001834302, "grad_norm": 0.7176274657249451, "learning_rate": 1.0363011081390907e-05, "loss": 0.1769, "step": 2715 }, { "epoch": 0.8303271170895751, "grad_norm": 0.7101695537567139, "learning_rate": 1.0366832250668706e-05, "loss": 0.1995, "step": 2716 }, { "epoch": 0.83063283399572, "grad_norm": 0.7133851051330566, "learning_rate": 1.0370653419946504e-05, "loss": 0.1974, "step": 2717 }, { "epoch": 0.8309385509018649, "grad_norm": 0.6162519454956055, "learning_rate": 1.0374474589224303e-05, "loss": 0.2155, "step": 2718 }, { "epoch": 0.8312442678080098, "grad_norm": 0.8116396069526672, "learning_rate": 1.0378295758502102e-05, "loss": 0.2421, "step": 2719 }, { "epoch": 0.8315499847141546, "grad_norm": 1.0192192792892456, "learning_rate": 1.03821169277799e-05, "loss": 0.2671, "step": 2720 }, { "epoch": 0.8318557016202996, "grad_norm": 1.7973120212554932, "learning_rate": 1.03859380970577e-05, "loss": 0.3028, "step": 2721 }, { "epoch": 0.8321614185264445, "grad_norm": 1.197771430015564, "learning_rate": 1.0389759266335498e-05, "loss": 0.3178, "step": 2722 }, { "epoch": 0.8324671354325894, "grad_norm": 1.1727361679077148, "learning_rate": 1.0393580435613299e-05, "loss": 0.2802, "step": 2723 }, { "epoch": 0.8327728523387343, "grad_norm": 1.8272231817245483, "learning_rate": 1.0397401604891097e-05, "loss": 0.3497, "step": 2724 }, { "epoch": 0.8330785692448792, "grad_norm": 9.384324073791504, "learning_rate": 1.0401222774168896e-05, "loss": 0.4417, "step": 2725 }, { "epoch": 0.8333842861510241, "grad_norm": 0.6080257892608643, "learning_rate": 1.0405043943446695e-05, "loss": 0.2059, "step": 2726 }, { "epoch": 0.8336900030571691, "grad_norm": 0.4994317889213562, "learning_rate": 1.0408865112724494e-05, "loss": 0.1159, "step": 2727 }, { "epoch": 0.833995719963314, "grad_norm": 0.436990886926651, "learning_rate": 1.0412686282002292e-05, "loss": 0.11, "step": 2728 }, { "epoch": 0.8343014368694589, "grad_norm": 0.3655126392841339, "learning_rate": 1.0416507451280091e-05, "loss": 0.0933, "step": 2729 }, { "epoch": 0.8346071537756038, "grad_norm": 0.579567551612854, "learning_rate": 1.0420328620557892e-05, "loss": 0.1001, "step": 2730 }, { "epoch": 0.8349128706817487, "grad_norm": 0.36911001801490784, "learning_rate": 1.042414978983569e-05, "loss": 0.084, "step": 2731 }, { "epoch": 0.8352185875878936, "grad_norm": 0.5375697016716003, "learning_rate": 1.0427970959113489e-05, "loss": 0.0988, "step": 2732 }, { "epoch": 0.8355243044940385, "grad_norm": 0.38624992966651917, "learning_rate": 1.0431792128391288e-05, "loss": 0.1094, "step": 2733 }, { "epoch": 0.8358300214001835, "grad_norm": 0.4918343424797058, "learning_rate": 1.0435613297669087e-05, "loss": 0.1324, "step": 2734 }, { "epoch": 0.8361357383063284, "grad_norm": 0.5002794861793518, "learning_rate": 1.0439434466946885e-05, "loss": 0.1305, "step": 2735 }, { "epoch": 0.8364414552124733, "grad_norm": 0.47890692949295044, "learning_rate": 1.0443255636224684e-05, "loss": 0.1209, "step": 2736 }, { "epoch": 0.8367471721186182, "grad_norm": 0.4349498152732849, "learning_rate": 1.0447076805502483e-05, "loss": 0.1137, "step": 2737 }, { "epoch": 0.837052889024763, "grad_norm": 0.46539872884750366, "learning_rate": 1.0450897974780283e-05, "loss": 0.1348, "step": 2738 }, { "epoch": 0.8373586059309079, "grad_norm": 0.5796501040458679, "learning_rate": 1.0454719144058082e-05, "loss": 0.1886, "step": 2739 }, { "epoch": 0.8376643228370529, "grad_norm": 0.559927761554718, "learning_rate": 1.045854031333588e-05, "loss": 0.1693, "step": 2740 }, { "epoch": 0.8379700397431978, "grad_norm": 0.6667305827140808, "learning_rate": 1.046236148261368e-05, "loss": 0.1834, "step": 2741 }, { "epoch": 0.8382757566493427, "grad_norm": 0.8264313340187073, "learning_rate": 1.0466182651891478e-05, "loss": 0.2374, "step": 2742 }, { "epoch": 0.8385814735554876, "grad_norm": 0.9184008836746216, "learning_rate": 1.0470003821169277e-05, "loss": 0.2127, "step": 2743 }, { "epoch": 0.8388871904616325, "grad_norm": 1.0119194984436035, "learning_rate": 1.0473824990447077e-05, "loss": 0.2492, "step": 2744 }, { "epoch": 0.8391929073677774, "grad_norm": 1.0189100503921509, "learning_rate": 1.0477646159724876e-05, "loss": 0.2567, "step": 2745 }, { "epoch": 0.8394986242739223, "grad_norm": 1.035668134689331, "learning_rate": 1.0481467329002677e-05, "loss": 0.2635, "step": 2746 }, { "epoch": 0.8398043411800673, "grad_norm": 1.1679913997650146, "learning_rate": 1.0485288498280475e-05, "loss": 0.2671, "step": 2747 }, { "epoch": 0.8401100580862122, "grad_norm": 1.369438648223877, "learning_rate": 1.0489109667558274e-05, "loss": 0.3012, "step": 2748 }, { "epoch": 0.8404157749923571, "grad_norm": 1.5794320106506348, "learning_rate": 1.0492930836836073e-05, "loss": 0.3619, "step": 2749 }, { "epoch": 0.840721491898502, "grad_norm": 2.450791358947754, "learning_rate": 1.0496752006113872e-05, "loss": 0.3651, "step": 2750 }, { "epoch": 0.8410272088046469, "grad_norm": 0.4541258215904236, "learning_rate": 1.050057317539167e-05, "loss": 0.1717, "step": 2751 }, { "epoch": 0.8413329257107918, "grad_norm": 0.3333888053894043, "learning_rate": 1.0504394344669469e-05, "loss": 0.1077, "step": 2752 }, { "epoch": 0.8416386426169368, "grad_norm": 0.35408729314804077, "learning_rate": 1.0508215513947268e-05, "loss": 0.1147, "step": 2753 }, { "epoch": 0.8419443595230817, "grad_norm": 0.4184246361255646, "learning_rate": 1.0512036683225068e-05, "loss": 0.098, "step": 2754 }, { "epoch": 0.8422500764292266, "grad_norm": 0.36315903067588806, "learning_rate": 1.0515857852502867e-05, "loss": 0.0845, "step": 2755 }, { "epoch": 0.8425557933353715, "grad_norm": 0.3816969096660614, "learning_rate": 1.0519679021780666e-05, "loss": 0.0716, "step": 2756 }, { "epoch": 0.8428615102415163, "grad_norm": 0.48533526062965393, "learning_rate": 1.0523500191058464e-05, "loss": 0.1142, "step": 2757 }, { "epoch": 0.8431672271476612, "grad_norm": 0.7747724652290344, "learning_rate": 1.0527321360336263e-05, "loss": 0.1023, "step": 2758 }, { "epoch": 0.8434729440538061, "grad_norm": 0.5113374590873718, "learning_rate": 1.0531142529614062e-05, "loss": 0.1296, "step": 2759 }, { "epoch": 0.8437786609599511, "grad_norm": 0.5016030073165894, "learning_rate": 1.053496369889186e-05, "loss": 0.1281, "step": 2760 }, { "epoch": 0.844084377866096, "grad_norm": 0.6011331081390381, "learning_rate": 1.0538784868169661e-05, "loss": 0.1225, "step": 2761 }, { "epoch": 0.8443900947722409, "grad_norm": 0.5902177095413208, "learning_rate": 1.054260603744746e-05, "loss": 0.1031, "step": 2762 }, { "epoch": 0.8446958116783858, "grad_norm": 0.5465698838233948, "learning_rate": 1.0546427206725259e-05, "loss": 0.1389, "step": 2763 }, { "epoch": 0.8450015285845307, "grad_norm": 1.401891827583313, "learning_rate": 1.0550248376003057e-05, "loss": 0.2009, "step": 2764 }, { "epoch": 0.8453072454906756, "grad_norm": 0.7809762358665466, "learning_rate": 1.0554069545280856e-05, "loss": 0.1455, "step": 2765 }, { "epoch": 0.8456129623968205, "grad_norm": 0.748387336730957, "learning_rate": 1.0557890714558655e-05, "loss": 0.1953, "step": 2766 }, { "epoch": 0.8459186793029655, "grad_norm": 0.8908138871192932, "learning_rate": 1.0561711883836454e-05, "loss": 0.2172, "step": 2767 }, { "epoch": 0.8462243962091104, "grad_norm": 1.379197120666504, "learning_rate": 1.0565533053114252e-05, "loss": 0.2464, "step": 2768 }, { "epoch": 0.8465301131152553, "grad_norm": 0.8978850245475769, "learning_rate": 1.0569354222392053e-05, "loss": 0.2287, "step": 2769 }, { "epoch": 0.8468358300214002, "grad_norm": 1.1102849245071411, "learning_rate": 1.0573175391669851e-05, "loss": 0.2791, "step": 2770 }, { "epoch": 0.8471415469275451, "grad_norm": 1.2840856313705444, "learning_rate": 1.057699656094765e-05, "loss": 0.239, "step": 2771 }, { "epoch": 0.84744726383369, "grad_norm": 1.3378690481185913, "learning_rate": 1.0580817730225449e-05, "loss": 0.2799, "step": 2772 }, { "epoch": 0.847752980739835, "grad_norm": 1.5433465242385864, "learning_rate": 1.0584638899503248e-05, "loss": 0.242, "step": 2773 }, { "epoch": 0.8480586976459799, "grad_norm": 1.9549063444137573, "learning_rate": 1.0588460068781046e-05, "loss": 0.3581, "step": 2774 }, { "epoch": 0.8483644145521247, "grad_norm": 4.625490188598633, "learning_rate": 1.0592281238058845e-05, "loss": 0.4089, "step": 2775 }, { "epoch": 0.8486701314582696, "grad_norm": 0.604476809501648, "learning_rate": 1.0596102407336644e-05, "loss": 0.1876, "step": 2776 }, { "epoch": 0.8489758483644145, "grad_norm": 0.3805790841579437, "learning_rate": 1.0599923576614444e-05, "loss": 0.1072, "step": 2777 }, { "epoch": 0.8492815652705594, "grad_norm": 0.4730716347694397, "learning_rate": 1.0603744745892243e-05, "loss": 0.1083, "step": 2778 }, { "epoch": 0.8495872821767043, "grad_norm": 0.4074982702732086, "learning_rate": 1.0607565915170042e-05, "loss": 0.1111, "step": 2779 }, { "epoch": 0.8498929990828493, "grad_norm": 0.39510518312454224, "learning_rate": 1.061138708444784e-05, "loss": 0.0738, "step": 2780 }, { "epoch": 0.8501987159889942, "grad_norm": 0.40070030093193054, "learning_rate": 1.061520825372564e-05, "loss": 0.1004, "step": 2781 }, { "epoch": 0.8505044328951391, "grad_norm": 0.4255559742450714, "learning_rate": 1.061902942300344e-05, "loss": 0.0956, "step": 2782 }, { "epoch": 0.850810149801284, "grad_norm": 0.42108026146888733, "learning_rate": 1.0622850592281239e-05, "loss": 0.0744, "step": 2783 }, { "epoch": 0.8511158667074289, "grad_norm": 0.5555685758590698, "learning_rate": 1.0626671761559037e-05, "loss": 0.148, "step": 2784 }, { "epoch": 0.8514215836135738, "grad_norm": 0.40412527322769165, "learning_rate": 1.0630492930836838e-05, "loss": 0.1014, "step": 2785 }, { "epoch": 0.8517273005197188, "grad_norm": 0.518484890460968, "learning_rate": 1.0634314100114636e-05, "loss": 0.1339, "step": 2786 }, { "epoch": 0.8520330174258637, "grad_norm": 0.5012656450271606, "learning_rate": 1.0638135269392435e-05, "loss": 0.1124, "step": 2787 }, { "epoch": 0.8523387343320086, "grad_norm": 0.5051573514938354, "learning_rate": 1.0641956438670234e-05, "loss": 0.1338, "step": 2788 }, { "epoch": 0.8526444512381535, "grad_norm": 0.6864897012710571, "learning_rate": 1.0645777607948033e-05, "loss": 0.1506, "step": 2789 }, { "epoch": 0.8529501681442984, "grad_norm": 0.9090233445167542, "learning_rate": 1.0649598777225831e-05, "loss": 0.1847, "step": 2790 }, { "epoch": 0.8532558850504433, "grad_norm": 1.2784740924835205, "learning_rate": 1.065341994650363e-05, "loss": 0.218, "step": 2791 }, { "epoch": 0.8535616019565881, "grad_norm": 1.1337172985076904, "learning_rate": 1.065724111578143e-05, "loss": 0.2131, "step": 2792 }, { "epoch": 0.8538673188627331, "grad_norm": 1.0143535137176514, "learning_rate": 1.066106228505923e-05, "loss": 0.2678, "step": 2793 }, { "epoch": 0.854173035768878, "grad_norm": 1.1509093046188354, "learning_rate": 1.0664883454337028e-05, "loss": 0.2692, "step": 2794 }, { "epoch": 0.8544787526750229, "grad_norm": 0.9132645726203918, "learning_rate": 1.0668704623614827e-05, "loss": 0.264, "step": 2795 }, { "epoch": 0.8547844695811678, "grad_norm": 1.193927526473999, "learning_rate": 1.0672525792892626e-05, "loss": 0.3112, "step": 2796 }, { "epoch": 0.8550901864873127, "grad_norm": 1.2361729145050049, "learning_rate": 1.0676346962170424e-05, "loss": 0.3274, "step": 2797 }, { "epoch": 0.8553959033934576, "grad_norm": 1.2190651893615723, "learning_rate": 1.0680168131448223e-05, "loss": 0.2829, "step": 2798 }, { "epoch": 0.8557016202996026, "grad_norm": 1.6414883136749268, "learning_rate": 1.0683989300726022e-05, "loss": 0.3008, "step": 2799 }, { "epoch": 0.8560073372057475, "grad_norm": 2.0641915798187256, "learning_rate": 1.0687810470003822e-05, "loss": 0.4241, "step": 2800 }, { "epoch": 0.8563130541118924, "grad_norm": 0.5247491002082825, "learning_rate": 1.0691631639281621e-05, "loss": 0.1887, "step": 2801 }, { "epoch": 0.8566187710180373, "grad_norm": 0.3700738251209259, "learning_rate": 1.069545280855942e-05, "loss": 0.1212, "step": 2802 }, { "epoch": 0.8569244879241822, "grad_norm": 0.4272330403327942, "learning_rate": 1.0699273977837219e-05, "loss": 0.1092, "step": 2803 }, { "epoch": 0.8572302048303271, "grad_norm": 0.48416557908058167, "learning_rate": 1.0703095147115017e-05, "loss": 0.0924, "step": 2804 }, { "epoch": 0.857535921736472, "grad_norm": 0.43716931343078613, "learning_rate": 1.0706916316392816e-05, "loss": 0.0978, "step": 2805 }, { "epoch": 0.857841638642617, "grad_norm": 0.46379804611206055, "learning_rate": 1.0710737485670615e-05, "loss": 0.1137, "step": 2806 }, { "epoch": 0.8581473555487619, "grad_norm": 0.34742334485054016, "learning_rate": 1.0714558654948413e-05, "loss": 0.0812, "step": 2807 }, { "epoch": 0.8584530724549068, "grad_norm": 0.40452226996421814, "learning_rate": 1.0718379824226214e-05, "loss": 0.1234, "step": 2808 }, { "epoch": 0.8587587893610517, "grad_norm": 0.4609726667404175, "learning_rate": 1.0722200993504013e-05, "loss": 0.1349, "step": 2809 }, { "epoch": 0.8590645062671965, "grad_norm": 0.4643462002277374, "learning_rate": 1.0726022162781811e-05, "loss": 0.0889, "step": 2810 }, { "epoch": 0.8593702231733414, "grad_norm": 0.5083468556404114, "learning_rate": 1.072984333205961e-05, "loss": 0.168, "step": 2811 }, { "epoch": 0.8596759400794864, "grad_norm": 0.49590837955474854, "learning_rate": 1.0733664501337409e-05, "loss": 0.1226, "step": 2812 }, { "epoch": 0.8599816569856313, "grad_norm": 0.4791555106639862, "learning_rate": 1.0737485670615208e-05, "loss": 0.1443, "step": 2813 }, { "epoch": 0.8602873738917762, "grad_norm": 0.5862661004066467, "learning_rate": 1.0741306839893006e-05, "loss": 0.1632, "step": 2814 }, { "epoch": 0.8605930907979211, "grad_norm": 0.621521532535553, "learning_rate": 1.0745128009170805e-05, "loss": 0.2006, "step": 2815 }, { "epoch": 0.860898807704066, "grad_norm": 0.6400212049484253, "learning_rate": 1.0748949178448606e-05, "loss": 0.1816, "step": 2816 }, { "epoch": 0.8612045246102109, "grad_norm": 0.5871902704238892, "learning_rate": 1.0752770347726404e-05, "loss": 0.1811, "step": 2817 }, { "epoch": 0.8615102415163558, "grad_norm": 0.7071124911308289, "learning_rate": 1.0756591517004203e-05, "loss": 0.212, "step": 2818 }, { "epoch": 0.8618159584225008, "grad_norm": 0.9788336753845215, "learning_rate": 1.0760412686282002e-05, "loss": 0.2716, "step": 2819 }, { "epoch": 0.8621216753286457, "grad_norm": 0.8687110543251038, "learning_rate": 1.07642338555598e-05, "loss": 0.2488, "step": 2820 }, { "epoch": 0.8624273922347906, "grad_norm": 1.0162981748580933, "learning_rate": 1.0768055024837601e-05, "loss": 0.2438, "step": 2821 }, { "epoch": 0.8627331091409355, "grad_norm": 2.261063814163208, "learning_rate": 1.07718761941154e-05, "loss": 0.2795, "step": 2822 }, { "epoch": 0.8630388260470804, "grad_norm": 1.3310582637786865, "learning_rate": 1.0775697363393198e-05, "loss": 0.2673, "step": 2823 }, { "epoch": 0.8633445429532253, "grad_norm": 1.8010623455047607, "learning_rate": 1.0779518532670999e-05, "loss": 0.2951, "step": 2824 }, { "epoch": 0.8636502598593703, "grad_norm": 1.8848446607589722, "learning_rate": 1.0783339701948798e-05, "loss": 0.4069, "step": 2825 }, { "epoch": 0.8639559767655152, "grad_norm": 0.4958030581474304, "learning_rate": 1.0787160871226596e-05, "loss": 0.2028, "step": 2826 }, { "epoch": 0.8642616936716601, "grad_norm": 0.560827374458313, "learning_rate": 1.0790982040504395e-05, "loss": 0.0951, "step": 2827 }, { "epoch": 0.864567410577805, "grad_norm": 0.3952960669994354, "learning_rate": 1.0794803209782194e-05, "loss": 0.0913, "step": 2828 }, { "epoch": 0.8648731274839498, "grad_norm": 0.3796207308769226, "learning_rate": 1.0798624379059993e-05, "loss": 0.0865, "step": 2829 }, { "epoch": 0.8651788443900947, "grad_norm": 0.569622814655304, "learning_rate": 1.0802445548337791e-05, "loss": 0.1191, "step": 2830 }, { "epoch": 0.8654845612962396, "grad_norm": 0.4867333769798279, "learning_rate": 1.0806266717615592e-05, "loss": 0.1195, "step": 2831 }, { "epoch": 0.8657902782023846, "grad_norm": 0.5379158854484558, "learning_rate": 1.081008788689339e-05, "loss": 0.1005, "step": 2832 }, { "epoch": 0.8660959951085295, "grad_norm": 0.47973304986953735, "learning_rate": 1.081390905617119e-05, "loss": 0.1136, "step": 2833 }, { "epoch": 0.8664017120146744, "grad_norm": 0.41333457827568054, "learning_rate": 1.0817730225448988e-05, "loss": 0.1206, "step": 2834 }, { "epoch": 0.8667074289208193, "grad_norm": 0.3723582327365875, "learning_rate": 1.0821551394726787e-05, "loss": 0.0942, "step": 2835 }, { "epoch": 0.8670131458269642, "grad_norm": 0.4610383212566376, "learning_rate": 1.0825372564004586e-05, "loss": 0.1309, "step": 2836 }, { "epoch": 0.8673188627331091, "grad_norm": 0.40014228224754333, "learning_rate": 1.0829193733282384e-05, "loss": 0.1045, "step": 2837 }, { "epoch": 0.8676245796392541, "grad_norm": 0.5419617295265198, "learning_rate": 1.0833014902560183e-05, "loss": 0.1757, "step": 2838 }, { "epoch": 0.867930296545399, "grad_norm": 0.7410321235656738, "learning_rate": 1.0836836071837983e-05, "loss": 0.1596, "step": 2839 }, { "epoch": 0.8682360134515439, "grad_norm": 0.5875413417816162, "learning_rate": 1.0840657241115782e-05, "loss": 0.165, "step": 2840 }, { "epoch": 0.8685417303576888, "grad_norm": 0.64943927526474, "learning_rate": 1.0844478410393581e-05, "loss": 0.2122, "step": 2841 }, { "epoch": 0.8688474472638337, "grad_norm": 0.8595467209815979, "learning_rate": 1.084829957967138e-05, "loss": 0.2295, "step": 2842 }, { "epoch": 0.8691531641699786, "grad_norm": 0.8279783129692078, "learning_rate": 1.0852120748949178e-05, "loss": 0.2213, "step": 2843 }, { "epoch": 0.8694588810761235, "grad_norm": 0.7356963157653809, "learning_rate": 1.0855941918226977e-05, "loss": 0.2323, "step": 2844 }, { "epoch": 0.8697645979822685, "grad_norm": 0.8933345079421997, "learning_rate": 1.0859763087504776e-05, "loss": 0.2694, "step": 2845 }, { "epoch": 0.8700703148884134, "grad_norm": 0.9495128989219666, "learning_rate": 1.0863584256782575e-05, "loss": 0.2608, "step": 2846 }, { "epoch": 0.8703760317945582, "grad_norm": 0.9985908269882202, "learning_rate": 1.0867405426060375e-05, "loss": 0.277, "step": 2847 }, { "epoch": 0.8706817487007031, "grad_norm": 2.0876870155334473, "learning_rate": 1.0871226595338174e-05, "loss": 0.2626, "step": 2848 }, { "epoch": 0.870987465606848, "grad_norm": 1.9679478406906128, "learning_rate": 1.0875047764615973e-05, "loss": 0.2951, "step": 2849 }, { "epoch": 0.8712931825129929, "grad_norm": 2.3658885955810547, "learning_rate": 1.0878868933893771e-05, "loss": 0.3994, "step": 2850 }, { "epoch": 0.8715988994191379, "grad_norm": 0.480333536863327, "learning_rate": 1.088269010317157e-05, "loss": 0.2047, "step": 2851 }, { "epoch": 0.8719046163252828, "grad_norm": 0.5793631076812744, "learning_rate": 1.0886511272449369e-05, "loss": 0.1276, "step": 2852 }, { "epoch": 0.8722103332314277, "grad_norm": 0.45580190420150757, "learning_rate": 1.0890332441727168e-05, "loss": 0.1514, "step": 2853 }, { "epoch": 0.8725160501375726, "grad_norm": 0.4165154695510864, "learning_rate": 1.0894153611004968e-05, "loss": 0.0919, "step": 2854 }, { "epoch": 0.8728217670437175, "grad_norm": 0.3022501468658447, "learning_rate": 1.0897974780282767e-05, "loss": 0.071, "step": 2855 }, { "epoch": 0.8731274839498624, "grad_norm": 0.39715975522994995, "learning_rate": 1.0901795949560566e-05, "loss": 0.075, "step": 2856 }, { "epoch": 0.8734332008560073, "grad_norm": 0.35931283235549927, "learning_rate": 1.0905617118838364e-05, "loss": 0.0832, "step": 2857 }, { "epoch": 0.8737389177621523, "grad_norm": 0.7784217596054077, "learning_rate": 1.0909438288116163e-05, "loss": 0.1085, "step": 2858 }, { "epoch": 0.8740446346682972, "grad_norm": 0.5351943969726562, "learning_rate": 1.0913259457393962e-05, "loss": 0.1023, "step": 2859 }, { "epoch": 0.8743503515744421, "grad_norm": 0.4210098385810852, "learning_rate": 1.0917080626671762e-05, "loss": 0.0889, "step": 2860 }, { "epoch": 0.874656068480587, "grad_norm": 0.6373433470726013, "learning_rate": 1.0920901795949561e-05, "loss": 0.1348, "step": 2861 }, { "epoch": 0.8749617853867319, "grad_norm": 0.41415613889694214, "learning_rate": 1.0924722965227361e-05, "loss": 0.1172, "step": 2862 }, { "epoch": 0.8752675022928768, "grad_norm": 0.4888051748275757, "learning_rate": 1.092854413450516e-05, "loss": 0.1253, "step": 2863 }, { "epoch": 0.8755732191990218, "grad_norm": 0.8585177659988403, "learning_rate": 1.0932365303782959e-05, "loss": 0.1648, "step": 2864 }, { "epoch": 0.8758789361051666, "grad_norm": 0.670505940914154, "learning_rate": 1.0936186473060758e-05, "loss": 0.1876, "step": 2865 }, { "epoch": 0.8761846530113115, "grad_norm": 0.7183477878570557, "learning_rate": 1.0940007642338556e-05, "loss": 0.2485, "step": 2866 }, { "epoch": 0.8764903699174564, "grad_norm": 0.828275203704834, "learning_rate": 1.0943828811616355e-05, "loss": 0.2537, "step": 2867 }, { "epoch": 0.8767960868236013, "grad_norm": 0.7440007328987122, "learning_rate": 1.0947649980894154e-05, "loss": 0.209, "step": 2868 }, { "epoch": 0.8771018037297462, "grad_norm": 0.9077831506729126, "learning_rate": 1.0951471150171953e-05, "loss": 0.2403, "step": 2869 }, { "epoch": 0.8774075206358911, "grad_norm": 0.8311123251914978, "learning_rate": 1.0955292319449753e-05, "loss": 0.2608, "step": 2870 }, { "epoch": 0.8777132375420361, "grad_norm": 1.0120830535888672, "learning_rate": 1.0959113488727552e-05, "loss": 0.2884, "step": 2871 }, { "epoch": 0.878018954448181, "grad_norm": 0.8847616314888, "learning_rate": 1.096293465800535e-05, "loss": 0.2364, "step": 2872 }, { "epoch": 0.8783246713543259, "grad_norm": 1.1198071241378784, "learning_rate": 1.096675582728315e-05, "loss": 0.2809, "step": 2873 }, { "epoch": 0.8786303882604708, "grad_norm": 1.5499933958053589, "learning_rate": 1.0970576996560948e-05, "loss": 0.3785, "step": 2874 }, { "epoch": 0.8789361051666157, "grad_norm": 2.0456345081329346, "learning_rate": 1.0974398165838747e-05, "loss": 0.4426, "step": 2875 }, { "epoch": 0.8792418220727606, "grad_norm": 0.5315583944320679, "learning_rate": 1.0978219335116546e-05, "loss": 0.1794, "step": 2876 }, { "epoch": 0.8795475389789056, "grad_norm": 0.4477832615375519, "learning_rate": 1.0982040504394344e-05, "loss": 0.1281, "step": 2877 }, { "epoch": 0.8798532558850505, "grad_norm": 0.3798954486846924, "learning_rate": 1.0985861673672145e-05, "loss": 0.1095, "step": 2878 }, { "epoch": 0.8801589727911954, "grad_norm": 0.46957242488861084, "learning_rate": 1.0989682842949943e-05, "loss": 0.0983, "step": 2879 }, { "epoch": 0.8804646896973403, "grad_norm": 0.4097435772418976, "learning_rate": 1.0993504012227742e-05, "loss": 0.0972, "step": 2880 }, { "epoch": 0.8807704066034852, "grad_norm": 0.381367027759552, "learning_rate": 1.0997325181505541e-05, "loss": 0.0697, "step": 2881 }, { "epoch": 0.88107612350963, "grad_norm": 0.41547784209251404, "learning_rate": 1.100114635078334e-05, "loss": 0.0799, "step": 2882 }, { "epoch": 0.8813818404157749, "grad_norm": 0.4203016757965088, "learning_rate": 1.1004967520061138e-05, "loss": 0.1072, "step": 2883 }, { "epoch": 0.88168755732192, "grad_norm": 0.38584083318710327, "learning_rate": 1.1008788689338937e-05, "loss": 0.096, "step": 2884 }, { "epoch": 0.8819932742280648, "grad_norm": 0.37973010540008545, "learning_rate": 1.1012609858616736e-05, "loss": 0.1005, "step": 2885 }, { "epoch": 0.8822989911342097, "grad_norm": 0.5457665324211121, "learning_rate": 1.1016431027894536e-05, "loss": 0.1209, "step": 2886 }, { "epoch": 0.8826047080403546, "grad_norm": 0.44151970744132996, "learning_rate": 1.1020252197172335e-05, "loss": 0.1111, "step": 2887 }, { "epoch": 0.8829104249464995, "grad_norm": 0.49243366718292236, "learning_rate": 1.1024073366450134e-05, "loss": 0.1211, "step": 2888 }, { "epoch": 0.8832161418526444, "grad_norm": 3.3297908306121826, "learning_rate": 1.1027894535727933e-05, "loss": 0.1651, "step": 2889 }, { "epoch": 0.8835218587587894, "grad_norm": 0.5512127876281738, "learning_rate": 1.1031715705005731e-05, "loss": 0.1519, "step": 2890 }, { "epoch": 0.8838275756649343, "grad_norm": 1.2579424381256104, "learning_rate": 1.103553687428353e-05, "loss": 0.1868, "step": 2891 }, { "epoch": 0.8841332925710792, "grad_norm": 1.1795662641525269, "learning_rate": 1.1039358043561329e-05, "loss": 0.2225, "step": 2892 }, { "epoch": 0.8844390094772241, "grad_norm": 0.7548216581344604, "learning_rate": 1.104317921283913e-05, "loss": 0.2098, "step": 2893 }, { "epoch": 0.884744726383369, "grad_norm": 0.8025513887405396, "learning_rate": 1.1047000382116928e-05, "loss": 0.2278, "step": 2894 }, { "epoch": 0.8850504432895139, "grad_norm": 1.0729471445083618, "learning_rate": 1.1050821551394727e-05, "loss": 0.2375, "step": 2895 }, { "epoch": 0.8853561601956588, "grad_norm": 1.343372106552124, "learning_rate": 1.1054642720672525e-05, "loss": 0.2496, "step": 2896 }, { "epoch": 0.8856618771018038, "grad_norm": 1.3263108730316162, "learning_rate": 1.1058463889950324e-05, "loss": 0.2995, "step": 2897 }, { "epoch": 0.8859675940079487, "grad_norm": 1.6302849054336548, "learning_rate": 1.1062285059228123e-05, "loss": 0.3044, "step": 2898 }, { "epoch": 0.8862733109140936, "grad_norm": 1.715701699256897, "learning_rate": 1.1066106228505923e-05, "loss": 0.3149, "step": 2899 }, { "epoch": 0.8865790278202385, "grad_norm": 3.2864584922790527, "learning_rate": 1.1069927397783722e-05, "loss": 0.3638, "step": 2900 }, { "epoch": 0.8868847447263833, "grad_norm": 0.484831303358078, "learning_rate": 1.1073748567061523e-05, "loss": 0.2357, "step": 2901 }, { "epoch": 0.8871904616325282, "grad_norm": 0.49691298604011536, "learning_rate": 1.1077569736339321e-05, "loss": 0.1542, "step": 2902 }, { "epoch": 0.8874961785386732, "grad_norm": 0.4422440230846405, "learning_rate": 1.108139090561712e-05, "loss": 0.1147, "step": 2903 }, { "epoch": 0.8878018954448181, "grad_norm": 0.2951458990573883, "learning_rate": 1.1085212074894919e-05, "loss": 0.0759, "step": 2904 }, { "epoch": 0.888107612350963, "grad_norm": 0.402283251285553, "learning_rate": 1.1089033244172718e-05, "loss": 0.068, "step": 2905 }, { "epoch": 0.8884133292571079, "grad_norm": 0.3961677551269531, "learning_rate": 1.1092854413450516e-05, "loss": 0.0961, "step": 2906 }, { "epoch": 0.8887190461632528, "grad_norm": 0.44706377387046814, "learning_rate": 1.1096675582728315e-05, "loss": 0.1042, "step": 2907 }, { "epoch": 0.8890247630693977, "grad_norm": 0.520615816116333, "learning_rate": 1.1100496752006114e-05, "loss": 0.1253, "step": 2908 }, { "epoch": 0.8893304799755426, "grad_norm": 0.41290077567100525, "learning_rate": 1.1104317921283914e-05, "loss": 0.0863, "step": 2909 }, { "epoch": 0.8896361968816876, "grad_norm": 0.4618144631385803, "learning_rate": 1.1108139090561713e-05, "loss": 0.0935, "step": 2910 }, { "epoch": 0.8899419137878325, "grad_norm": 0.5702872276306152, "learning_rate": 1.1111960259839512e-05, "loss": 0.1258, "step": 2911 }, { "epoch": 0.8902476306939774, "grad_norm": 0.5685725808143616, "learning_rate": 1.111578142911731e-05, "loss": 0.1133, "step": 2912 }, { "epoch": 0.8905533476001223, "grad_norm": 0.5611013174057007, "learning_rate": 1.111960259839511e-05, "loss": 0.1378, "step": 2913 }, { "epoch": 0.8908590645062672, "grad_norm": 0.5937516689300537, "learning_rate": 1.1123423767672908e-05, "loss": 0.1318, "step": 2914 }, { "epoch": 0.8911647814124121, "grad_norm": 0.9624830484390259, "learning_rate": 1.1127244936950707e-05, "loss": 0.1736, "step": 2915 }, { "epoch": 0.8914704983185571, "grad_norm": 0.6981553435325623, "learning_rate": 1.1131066106228505e-05, "loss": 0.2052, "step": 2916 }, { "epoch": 0.891776215224702, "grad_norm": 0.7801915407180786, "learning_rate": 1.1134887275506306e-05, "loss": 0.2406, "step": 2917 }, { "epoch": 0.8920819321308469, "grad_norm": 0.71563720703125, "learning_rate": 1.1138708444784105e-05, "loss": 0.2223, "step": 2918 }, { "epoch": 0.8923876490369917, "grad_norm": 0.8708023428916931, "learning_rate": 1.1142529614061903e-05, "loss": 0.2638, "step": 2919 }, { "epoch": 0.8926933659431366, "grad_norm": 0.7496792674064636, "learning_rate": 1.1146350783339702e-05, "loss": 0.2308, "step": 2920 }, { "epoch": 0.8929990828492815, "grad_norm": 1.1899067163467407, "learning_rate": 1.1150171952617501e-05, "loss": 0.2589, "step": 2921 }, { "epoch": 0.8933047997554264, "grad_norm": 1.3301211595535278, "learning_rate": 1.11539931218953e-05, "loss": 0.3205, "step": 2922 }, { "epoch": 0.8936105166615714, "grad_norm": 1.1112548112869263, "learning_rate": 1.1157814291173098e-05, "loss": 0.2848, "step": 2923 }, { "epoch": 0.8939162335677163, "grad_norm": 1.2909464836120605, "learning_rate": 1.1161635460450899e-05, "loss": 0.3042, "step": 2924 }, { "epoch": 0.8942219504738612, "grad_norm": 2.5211145877838135, "learning_rate": 1.1165456629728698e-05, "loss": 0.3839, "step": 2925 }, { "epoch": 0.8945276673800061, "grad_norm": 0.6056529879570007, "learning_rate": 1.1169277799006496e-05, "loss": 0.1761, "step": 2926 }, { "epoch": 0.894833384286151, "grad_norm": 0.3727289140224457, "learning_rate": 1.1173098968284295e-05, "loss": 0.1067, "step": 2927 }, { "epoch": 0.8951391011922959, "grad_norm": 0.452119380235672, "learning_rate": 1.1176920137562094e-05, "loss": 0.1312, "step": 2928 }, { "epoch": 0.8954448180984409, "grad_norm": 0.3365665376186371, "learning_rate": 1.1180741306839893e-05, "loss": 0.0777, "step": 2929 }, { "epoch": 0.8957505350045858, "grad_norm": 0.5192652344703674, "learning_rate": 1.1184562476117691e-05, "loss": 0.0868, "step": 2930 }, { "epoch": 0.8960562519107307, "grad_norm": 0.391388863325119, "learning_rate": 1.118838364539549e-05, "loss": 0.0721, "step": 2931 }, { "epoch": 0.8963619688168756, "grad_norm": 0.3915162682533264, "learning_rate": 1.119220481467329e-05, "loss": 0.0923, "step": 2932 }, { "epoch": 0.8966676857230205, "grad_norm": 0.5019621849060059, "learning_rate": 1.119602598395109e-05, "loss": 0.1088, "step": 2933 }, { "epoch": 0.8969734026291654, "grad_norm": 0.4299291670322418, "learning_rate": 1.1199847153228888e-05, "loss": 0.1127, "step": 2934 }, { "epoch": 0.8972791195353103, "grad_norm": 0.45791012048721313, "learning_rate": 1.1203668322506687e-05, "loss": 0.1314, "step": 2935 }, { "epoch": 0.8975848364414553, "grad_norm": 0.7575715184211731, "learning_rate": 1.1207489491784485e-05, "loss": 0.1606, "step": 2936 }, { "epoch": 0.8978905533476002, "grad_norm": 0.5510888695716858, "learning_rate": 1.1211310661062284e-05, "loss": 0.1245, "step": 2937 }, { "epoch": 0.898196270253745, "grad_norm": 0.668006181716919, "learning_rate": 1.1215131830340085e-05, "loss": 0.1452, "step": 2938 }, { "epoch": 0.8985019871598899, "grad_norm": 0.6660405993461609, "learning_rate": 1.1218952999617883e-05, "loss": 0.1647, "step": 2939 }, { "epoch": 0.8988077040660348, "grad_norm": 0.8394568562507629, "learning_rate": 1.1222774168895684e-05, "loss": 0.2034, "step": 2940 }, { "epoch": 0.8991134209721797, "grad_norm": 0.7790534496307373, "learning_rate": 1.1226595338173483e-05, "loss": 0.1802, "step": 2941 }, { "epoch": 0.8994191378783247, "grad_norm": 0.8532803058624268, "learning_rate": 1.1230416507451281e-05, "loss": 0.2403, "step": 2942 }, { "epoch": 0.8997248547844696, "grad_norm": 0.7701433897018433, "learning_rate": 1.123423767672908e-05, "loss": 0.2228, "step": 2943 }, { "epoch": 0.9000305716906145, "grad_norm": 0.8302480578422546, "learning_rate": 1.1238058846006879e-05, "loss": 0.227, "step": 2944 }, { "epoch": 0.9003362885967594, "grad_norm": 1.0154753923416138, "learning_rate": 1.1241880015284678e-05, "loss": 0.2288, "step": 2945 }, { "epoch": 0.9006420055029043, "grad_norm": 15.72428035736084, "learning_rate": 1.1245701184562476e-05, "loss": 0.2863, "step": 2946 }, { "epoch": 0.9009477224090492, "grad_norm": 1.3436781167984009, "learning_rate": 1.1249522353840275e-05, "loss": 0.3564, "step": 2947 }, { "epoch": 0.9012534393151941, "grad_norm": 2.3462541103363037, "learning_rate": 1.1253343523118075e-05, "loss": 0.301, "step": 2948 }, { "epoch": 0.9015591562213391, "grad_norm": 1.5230966806411743, "learning_rate": 1.1257164692395874e-05, "loss": 0.286, "step": 2949 }, { "epoch": 0.901864873127484, "grad_norm": 2.5260510444641113, "learning_rate": 1.1260985861673673e-05, "loss": 0.3676, "step": 2950 }, { "epoch": 0.9021705900336289, "grad_norm": 0.5845708847045898, "learning_rate": 1.1264807030951472e-05, "loss": 0.1853, "step": 2951 }, { "epoch": 0.9024763069397738, "grad_norm": 0.4082469344139099, "learning_rate": 1.126862820022927e-05, "loss": 0.1234, "step": 2952 }, { "epoch": 0.9027820238459187, "grad_norm": 0.37996071577072144, "learning_rate": 1.127244936950707e-05, "loss": 0.1117, "step": 2953 }, { "epoch": 0.9030877407520636, "grad_norm": 0.6788279414176941, "learning_rate": 1.1276270538784868e-05, "loss": 0.0946, "step": 2954 }, { "epoch": 0.9033934576582086, "grad_norm": 0.6417231559753418, "learning_rate": 1.1280091708062668e-05, "loss": 0.0844, "step": 2955 }, { "epoch": 0.9036991745643534, "grad_norm": 0.49676185846328735, "learning_rate": 1.1283912877340467e-05, "loss": 0.0736, "step": 2956 }, { "epoch": 0.9040048914704983, "grad_norm": 0.35339850187301636, "learning_rate": 1.1287734046618266e-05, "loss": 0.0647, "step": 2957 }, { "epoch": 0.9043106083766432, "grad_norm": 0.49410662055015564, "learning_rate": 1.1291555215896065e-05, "loss": 0.1198, "step": 2958 }, { "epoch": 0.9046163252827881, "grad_norm": 0.4157877266407013, "learning_rate": 1.1295376385173863e-05, "loss": 0.0843, "step": 2959 }, { "epoch": 0.904922042188933, "grad_norm": 0.628097653388977, "learning_rate": 1.1299197554451662e-05, "loss": 0.0987, "step": 2960 }, { "epoch": 0.9052277590950779, "grad_norm": 0.5405787229537964, "learning_rate": 1.130301872372946e-05, "loss": 0.1327, "step": 2961 }, { "epoch": 0.9055334760012229, "grad_norm": 0.8043410181999207, "learning_rate": 1.130683989300726e-05, "loss": 0.146, "step": 2962 }, { "epoch": 0.9058391929073678, "grad_norm": 0.5210809707641602, "learning_rate": 1.131066106228506e-05, "loss": 0.1263, "step": 2963 }, { "epoch": 0.9061449098135127, "grad_norm": 0.8608220219612122, "learning_rate": 1.1314482231562859e-05, "loss": 0.1343, "step": 2964 }, { "epoch": 0.9064506267196576, "grad_norm": 1.181122064590454, "learning_rate": 1.1318303400840658e-05, "loss": 0.1898, "step": 2965 }, { "epoch": 0.9067563436258025, "grad_norm": 0.6273275017738342, "learning_rate": 1.1322124570118456e-05, "loss": 0.1864, "step": 2966 }, { "epoch": 0.9070620605319474, "grad_norm": 0.682072639465332, "learning_rate": 1.1325945739396255e-05, "loss": 0.2414, "step": 2967 }, { "epoch": 0.9073677774380923, "grad_norm": 0.9028895497322083, "learning_rate": 1.1329766908674054e-05, "loss": 0.2421, "step": 2968 }, { "epoch": 0.9076734943442373, "grad_norm": 0.8616337776184082, "learning_rate": 1.1333588077951852e-05, "loss": 0.2651, "step": 2969 }, { "epoch": 0.9079792112503822, "grad_norm": 1.0265978574752808, "learning_rate": 1.1337409247229651e-05, "loss": 0.2476, "step": 2970 }, { "epoch": 0.9082849281565271, "grad_norm": 0.948316752910614, "learning_rate": 1.1341230416507452e-05, "loss": 0.2602, "step": 2971 }, { "epoch": 0.908590645062672, "grad_norm": 1.1828045845031738, "learning_rate": 1.134505158578525e-05, "loss": 0.2656, "step": 2972 }, { "epoch": 0.9088963619688168, "grad_norm": 1.3427733182907104, "learning_rate": 1.134887275506305e-05, "loss": 0.3547, "step": 2973 }, { "epoch": 0.9092020788749617, "grad_norm": 1.292826533317566, "learning_rate": 1.1352693924340848e-05, "loss": 0.3522, "step": 2974 }, { "epoch": 0.9095077957811067, "grad_norm": 2.0130088329315186, "learning_rate": 1.1356515093618647e-05, "loss": 0.3762, "step": 2975 }, { "epoch": 0.9098135126872516, "grad_norm": 0.5117076635360718, "learning_rate": 1.1360336262896445e-05, "loss": 0.1812, "step": 2976 }, { "epoch": 0.9101192295933965, "grad_norm": 0.4219289720058441, "learning_rate": 1.1364157432174246e-05, "loss": 0.126, "step": 2977 }, { "epoch": 0.9104249464995414, "grad_norm": 0.3744547665119171, "learning_rate": 1.1367978601452045e-05, "loss": 0.1069, "step": 2978 }, { "epoch": 0.9107306634056863, "grad_norm": 0.32564160227775574, "learning_rate": 1.1371799770729845e-05, "loss": 0.0835, "step": 2979 }, { "epoch": 0.9110363803118312, "grad_norm": 0.37200793623924255, "learning_rate": 1.1375620940007644e-05, "loss": 0.0989, "step": 2980 }, { "epoch": 0.9113420972179761, "grad_norm": 0.5111452341079712, "learning_rate": 1.1379442109285443e-05, "loss": 0.0798, "step": 2981 }, { "epoch": 0.9116478141241211, "grad_norm": 0.4212460517883301, "learning_rate": 1.1383263278563241e-05, "loss": 0.0962, "step": 2982 }, { "epoch": 0.911953531030266, "grad_norm": 0.47865596413612366, "learning_rate": 1.138708444784104e-05, "loss": 0.11, "step": 2983 }, { "epoch": 0.9122592479364109, "grad_norm": 0.4735793471336365, "learning_rate": 1.1390905617118839e-05, "loss": 0.1211, "step": 2984 }, { "epoch": 0.9125649648425558, "grad_norm": 0.4020056426525116, "learning_rate": 1.1394726786396637e-05, "loss": 0.0812, "step": 2985 }, { "epoch": 0.9128706817487007, "grad_norm": 0.6244885325431824, "learning_rate": 1.1398547955674438e-05, "loss": 0.1617, "step": 2986 }, { "epoch": 0.9131763986548456, "grad_norm": 0.4928550720214844, "learning_rate": 1.1402369124952237e-05, "loss": 0.1127, "step": 2987 }, { "epoch": 0.9134821155609906, "grad_norm": 0.5160829424858093, "learning_rate": 1.1406190294230035e-05, "loss": 0.1695, "step": 2988 }, { "epoch": 0.9137878324671355, "grad_norm": 0.5230198502540588, "learning_rate": 1.1410011463507834e-05, "loss": 0.169, "step": 2989 }, { "epoch": 0.9140935493732804, "grad_norm": 0.5168899297714233, "learning_rate": 1.1413832632785633e-05, "loss": 0.1743, "step": 2990 }, { "epoch": 0.9143992662794252, "grad_norm": 0.5790212154388428, "learning_rate": 1.1417653802063432e-05, "loss": 0.1789, "step": 2991 }, { "epoch": 0.9147049831855701, "grad_norm": 0.872795045375824, "learning_rate": 1.142147497134123e-05, "loss": 0.2497, "step": 2992 }, { "epoch": 0.915010700091715, "grad_norm": 0.8759992122650146, "learning_rate": 1.1425296140619029e-05, "loss": 0.2344, "step": 2993 }, { "epoch": 0.9153164169978599, "grad_norm": 1.6808127164840698, "learning_rate": 1.142911730989683e-05, "loss": 0.2223, "step": 2994 }, { "epoch": 0.9156221339040049, "grad_norm": 0.7747117877006531, "learning_rate": 1.1432938479174628e-05, "loss": 0.2285, "step": 2995 }, { "epoch": 0.9159278508101498, "grad_norm": 1.3400094509124756, "learning_rate": 1.1436759648452427e-05, "loss": 0.2587, "step": 2996 }, { "epoch": 0.9162335677162947, "grad_norm": 0.9564950466156006, "learning_rate": 1.1440580817730226e-05, "loss": 0.2464, "step": 2997 }, { "epoch": 0.9165392846224396, "grad_norm": 1.2289472818374634, "learning_rate": 1.1444401987008025e-05, "loss": 0.3345, "step": 2998 }, { "epoch": 0.9168450015285845, "grad_norm": 1.2521336078643799, "learning_rate": 1.1448223156285823e-05, "loss": 0.2881, "step": 2999 }, { "epoch": 0.9171507184347294, "grad_norm": 2.630722999572754, "learning_rate": 1.1452044325563622e-05, "loss": 0.3766, "step": 3000 }, { "epoch": 0.9171507184347294, "eval_cer": 0.19613105953978593, "eval_loss": 0.29485461115837097, "eval_runtime": 19.0143, "eval_samples_per_second": 238.663, "eval_steps_per_second": 0.789, "eval_wer": 0.35591184603782017, "step": 3000 }, { "epoch": 0.9174564353408744, "grad_norm": 0.6094717979431152, "learning_rate": 1.145586549484142e-05, "loss": 0.2092, "step": 3001 }, { "epoch": 0.9177621522470193, "grad_norm": 0.49538132548332214, "learning_rate": 1.1459686664119221e-05, "loss": 0.1249, "step": 3002 }, { "epoch": 0.9180678691531642, "grad_norm": 0.3630075454711914, "learning_rate": 1.146350783339702e-05, "loss": 0.1063, "step": 3003 }, { "epoch": 0.9183735860593091, "grad_norm": 0.3629021942615509, "learning_rate": 1.1467329002674819e-05, "loss": 0.096, "step": 3004 }, { "epoch": 0.918679302965454, "grad_norm": 0.31935662031173706, "learning_rate": 1.1471150171952617e-05, "loss": 0.0795, "step": 3005 }, { "epoch": 0.9189850198715989, "grad_norm": 0.5039805769920349, "learning_rate": 1.1474971341230416e-05, "loss": 0.0912, "step": 3006 }, { "epoch": 0.9192907367777438, "grad_norm": 0.41487210988998413, "learning_rate": 1.1478792510508215e-05, "loss": 0.0932, "step": 3007 }, { "epoch": 0.9195964536838888, "grad_norm": 0.5537593960762024, "learning_rate": 1.1482613679786014e-05, "loss": 0.0726, "step": 3008 }, { "epoch": 0.9199021705900337, "grad_norm": 0.46186310052871704, "learning_rate": 1.1486434849063812e-05, "loss": 0.1492, "step": 3009 }, { "epoch": 0.9202078874961785, "grad_norm": 0.5484208464622498, "learning_rate": 1.1490256018341613e-05, "loss": 0.1069, "step": 3010 }, { "epoch": 0.9205136044023234, "grad_norm": 0.40302255749702454, "learning_rate": 1.1494077187619412e-05, "loss": 0.0966, "step": 3011 }, { "epoch": 0.9208193213084683, "grad_norm": 0.5142963528633118, "learning_rate": 1.149789835689721e-05, "loss": 0.1137, "step": 3012 }, { "epoch": 0.9211250382146132, "grad_norm": 0.8219571113586426, "learning_rate": 1.1501719526175009e-05, "loss": 0.1715, "step": 3013 }, { "epoch": 0.9214307551207582, "grad_norm": 0.7314652800559998, "learning_rate": 1.1505540695452808e-05, "loss": 0.1638, "step": 3014 }, { "epoch": 0.9217364720269031, "grad_norm": 0.8546196818351746, "learning_rate": 1.1509361864730607e-05, "loss": 0.1893, "step": 3015 }, { "epoch": 0.922042188933048, "grad_norm": 0.8051688075065613, "learning_rate": 1.1513183034008407e-05, "loss": 0.1961, "step": 3016 }, { "epoch": 0.9223479058391929, "grad_norm": 0.7583447694778442, "learning_rate": 1.1517004203286206e-05, "loss": 0.2324, "step": 3017 }, { "epoch": 0.9226536227453378, "grad_norm": 0.7158026695251465, "learning_rate": 1.1520825372564006e-05, "loss": 0.1864, "step": 3018 }, { "epoch": 0.9229593396514827, "grad_norm": 0.8084583282470703, "learning_rate": 1.1524646541841805e-05, "loss": 0.2281, "step": 3019 }, { "epoch": 0.9232650565576276, "grad_norm": 1.0293740034103394, "learning_rate": 1.1528467711119604e-05, "loss": 0.2616, "step": 3020 }, { "epoch": 0.9235707734637726, "grad_norm": 1.7498186826705933, "learning_rate": 1.1532288880397402e-05, "loss": 0.2736, "step": 3021 }, { "epoch": 0.9238764903699175, "grad_norm": 1.3752650022506714, "learning_rate": 1.1536110049675201e-05, "loss": 0.2814, "step": 3022 }, { "epoch": 0.9241822072760624, "grad_norm": 2.6834232807159424, "learning_rate": 1.1539931218953e-05, "loss": 0.2899, "step": 3023 }, { "epoch": 0.9244879241822073, "grad_norm": 1.2478841543197632, "learning_rate": 1.1543752388230799e-05, "loss": 0.2859, "step": 3024 }, { "epoch": 0.9247936410883522, "grad_norm": 1.8052457571029663, "learning_rate": 1.1547573557508599e-05, "loss": 0.3407, "step": 3025 }, { "epoch": 0.925099357994497, "grad_norm": 0.5231805443763733, "learning_rate": 1.1551394726786398e-05, "loss": 0.222, "step": 3026 }, { "epoch": 0.925405074900642, "grad_norm": 0.5492537617683411, "learning_rate": 1.1555215896064197e-05, "loss": 0.1377, "step": 3027 }, { "epoch": 0.925710791806787, "grad_norm": 0.43690863251686096, "learning_rate": 1.1559037065341995e-05, "loss": 0.1095, "step": 3028 }, { "epoch": 0.9260165087129318, "grad_norm": 0.4495159387588501, "learning_rate": 1.1562858234619794e-05, "loss": 0.0938, "step": 3029 }, { "epoch": 0.9263222256190767, "grad_norm": 0.5477100014686584, "learning_rate": 1.1566679403897593e-05, "loss": 0.1053, "step": 3030 }, { "epoch": 0.9266279425252216, "grad_norm": 0.5299806594848633, "learning_rate": 1.1570500573175392e-05, "loss": 0.0953, "step": 3031 }, { "epoch": 0.9269336594313665, "grad_norm": 0.47438350319862366, "learning_rate": 1.157432174245319e-05, "loss": 0.1106, "step": 3032 }, { "epoch": 0.9272393763375114, "grad_norm": 0.41983267664909363, "learning_rate": 1.157814291173099e-05, "loss": 0.1127, "step": 3033 }, { "epoch": 0.9275450932436564, "grad_norm": 0.45701727271080017, "learning_rate": 1.158196408100879e-05, "loss": 0.1169, "step": 3034 }, { "epoch": 0.9278508101498013, "grad_norm": 0.5443419814109802, "learning_rate": 1.1585785250286588e-05, "loss": 0.104, "step": 3035 }, { "epoch": 0.9281565270559462, "grad_norm": 0.49870482087135315, "learning_rate": 1.1589606419564387e-05, "loss": 0.1725, "step": 3036 }, { "epoch": 0.9284622439620911, "grad_norm": 0.5707470774650574, "learning_rate": 1.1593427588842186e-05, "loss": 0.1361, "step": 3037 }, { "epoch": 0.928767960868236, "grad_norm": 0.5046769380569458, "learning_rate": 1.1597248758119984e-05, "loss": 0.1948, "step": 3038 }, { "epoch": 0.9290736777743809, "grad_norm": 0.6744658350944519, "learning_rate": 1.1601069927397783e-05, "loss": 0.2046, "step": 3039 }, { "epoch": 0.9293793946805259, "grad_norm": 0.6707252264022827, "learning_rate": 1.1604891096675582e-05, "loss": 0.1596, "step": 3040 }, { "epoch": 0.9296851115866708, "grad_norm": 0.6023860573768616, "learning_rate": 1.1608712265953382e-05, "loss": 0.1921, "step": 3041 }, { "epoch": 0.9299908284928157, "grad_norm": 0.7500742673873901, "learning_rate": 1.1612533435231181e-05, "loss": 0.2343, "step": 3042 }, { "epoch": 0.9302965453989606, "grad_norm": 1.0655561685562134, "learning_rate": 1.161635460450898e-05, "loss": 0.2316, "step": 3043 }, { "epoch": 0.9306022623051055, "grad_norm": 0.857761025428772, "learning_rate": 1.1620175773786779e-05, "loss": 0.2501, "step": 3044 }, { "epoch": 0.9309079792112503, "grad_norm": 0.9370866417884827, "learning_rate": 1.1623996943064577e-05, "loss": 0.2484, "step": 3045 }, { "epoch": 0.9312136961173952, "grad_norm": 1.0246697664260864, "learning_rate": 1.1627818112342376e-05, "loss": 0.3157, "step": 3046 }, { "epoch": 0.9315194130235402, "grad_norm": 0.9705404043197632, "learning_rate": 1.1631639281620175e-05, "loss": 0.2959, "step": 3047 }, { "epoch": 0.9318251299296851, "grad_norm": 1.425606608390808, "learning_rate": 1.1635460450897975e-05, "loss": 0.3079, "step": 3048 }, { "epoch": 0.93213084683583, "grad_norm": 1.3221688270568848, "learning_rate": 1.1639281620175774e-05, "loss": 0.3638, "step": 3049 }, { "epoch": 0.9324365637419749, "grad_norm": 2.0449726581573486, "learning_rate": 1.1643102789453573e-05, "loss": 0.395, "step": 3050 }, { "epoch": 0.9327422806481198, "grad_norm": 0.8678000569343567, "learning_rate": 1.1646923958731372e-05, "loss": 0.1956, "step": 3051 }, { "epoch": 0.9330479975542647, "grad_norm": 0.35863080620765686, "learning_rate": 1.165074512800917e-05, "loss": 0.1138, "step": 3052 }, { "epoch": 0.9333537144604097, "grad_norm": 0.5624440312385559, "learning_rate": 1.1654566297286969e-05, "loss": 0.1286, "step": 3053 }, { "epoch": 0.9336594313665546, "grad_norm": 0.49314990639686584, "learning_rate": 1.1658387466564768e-05, "loss": 0.0691, "step": 3054 }, { "epoch": 0.9339651482726995, "grad_norm": 0.33078533411026, "learning_rate": 1.1662208635842568e-05, "loss": 0.068, "step": 3055 }, { "epoch": 0.9342708651788444, "grad_norm": 0.4017584025859833, "learning_rate": 1.1666029805120369e-05, "loss": 0.0931, "step": 3056 }, { "epoch": 0.9345765820849893, "grad_norm": 1.489977478981018, "learning_rate": 1.1669850974398167e-05, "loss": 0.0746, "step": 3057 }, { "epoch": 0.9348822989911342, "grad_norm": 0.5206894874572754, "learning_rate": 1.1673672143675966e-05, "loss": 0.0952, "step": 3058 }, { "epoch": 0.9351880158972791, "grad_norm": 0.5214557647705078, "learning_rate": 1.1677493312953765e-05, "loss": 0.1245, "step": 3059 }, { "epoch": 0.9354937328034241, "grad_norm": 0.793854832649231, "learning_rate": 1.1681314482231564e-05, "loss": 0.0864, "step": 3060 }, { "epoch": 0.935799449709569, "grad_norm": 0.5364009141921997, "learning_rate": 1.1685135651509362e-05, "loss": 0.1459, "step": 3061 }, { "epoch": 0.9361051666157139, "grad_norm": 0.47780901193618774, "learning_rate": 1.1688956820787161e-05, "loss": 0.0983, "step": 3062 }, { "epoch": 0.9364108835218588, "grad_norm": 0.5996263027191162, "learning_rate": 1.169277799006496e-05, "loss": 0.1347, "step": 3063 }, { "epoch": 0.9367166004280036, "grad_norm": 0.8027803301811218, "learning_rate": 1.169659915934276e-05, "loss": 0.1764, "step": 3064 }, { "epoch": 0.9370223173341485, "grad_norm": 0.5228124260902405, "learning_rate": 1.1700420328620559e-05, "loss": 0.1426, "step": 3065 }, { "epoch": 0.9373280342402935, "grad_norm": 0.5994173288345337, "learning_rate": 1.1704241497898358e-05, "loss": 0.2008, "step": 3066 }, { "epoch": 0.9376337511464384, "grad_norm": 0.7207835912704468, "learning_rate": 1.1708062667176157e-05, "loss": 0.2017, "step": 3067 }, { "epoch": 0.9379394680525833, "grad_norm": 0.8848859667778015, "learning_rate": 1.1711883836453955e-05, "loss": 0.2111, "step": 3068 }, { "epoch": 0.9382451849587282, "grad_norm": 0.881862998008728, "learning_rate": 1.1715705005731754e-05, "loss": 0.2439, "step": 3069 }, { "epoch": 0.9385509018648731, "grad_norm": 1.148964762687683, "learning_rate": 1.1719526175009553e-05, "loss": 0.2739, "step": 3070 }, { "epoch": 0.938856618771018, "grad_norm": 1.2161192893981934, "learning_rate": 1.1723347344287352e-05, "loss": 0.2515, "step": 3071 }, { "epoch": 0.9391623356771629, "grad_norm": 1.1852374076843262, "learning_rate": 1.1727168513565152e-05, "loss": 0.3044, "step": 3072 }, { "epoch": 0.9394680525833079, "grad_norm": 2.7221360206604004, "learning_rate": 1.173098968284295e-05, "loss": 0.303, "step": 3073 }, { "epoch": 0.9397737694894528, "grad_norm": 1.4548758268356323, "learning_rate": 1.173481085212075e-05, "loss": 0.3428, "step": 3074 }, { "epoch": 0.9400794863955977, "grad_norm": 2.009091854095459, "learning_rate": 1.1738632021398548e-05, "loss": 0.3947, "step": 3075 }, { "epoch": 0.9403852033017426, "grad_norm": 0.5045533776283264, "learning_rate": 1.1742453190676347e-05, "loss": 0.2186, "step": 3076 }, { "epoch": 0.9406909202078875, "grad_norm": 0.31606391072273254, "learning_rate": 1.1746274359954146e-05, "loss": 0.1008, "step": 3077 }, { "epoch": 0.9409966371140324, "grad_norm": 0.34255415201187134, "learning_rate": 1.1750095529231944e-05, "loss": 0.1075, "step": 3078 }, { "epoch": 0.9413023540201774, "grad_norm": 0.3365557789802551, "learning_rate": 1.1753916698509745e-05, "loss": 0.0841, "step": 3079 }, { "epoch": 0.9416080709263223, "grad_norm": 0.398433655500412, "learning_rate": 1.1757737867787544e-05, "loss": 0.0974, "step": 3080 }, { "epoch": 0.9419137878324672, "grad_norm": 0.5073704719543457, "learning_rate": 1.1761559037065342e-05, "loss": 0.0945, "step": 3081 }, { "epoch": 0.942219504738612, "grad_norm": 0.5003860592842102, "learning_rate": 1.1765380206343141e-05, "loss": 0.0929, "step": 3082 }, { "epoch": 0.9425252216447569, "grad_norm": 0.41817784309387207, "learning_rate": 1.176920137562094e-05, "loss": 0.0684, "step": 3083 }, { "epoch": 0.9428309385509018, "grad_norm": 0.7762680649757385, "learning_rate": 1.1773022544898739e-05, "loss": 0.1463, "step": 3084 }, { "epoch": 0.9431366554570467, "grad_norm": 0.488537460565567, "learning_rate": 1.1776843714176537e-05, "loss": 0.0966, "step": 3085 }, { "epoch": 0.9434423723631917, "grad_norm": 0.5260447859764099, "learning_rate": 1.1780664883454336e-05, "loss": 0.1293, "step": 3086 }, { "epoch": 0.9437480892693366, "grad_norm": 0.4259955883026123, "learning_rate": 1.1784486052732137e-05, "loss": 0.1241, "step": 3087 }, { "epoch": 0.9440538061754815, "grad_norm": 0.5980074405670166, "learning_rate": 1.1788307222009935e-05, "loss": 0.1324, "step": 3088 }, { "epoch": 0.9443595230816264, "grad_norm": 0.7142485976219177, "learning_rate": 1.1792128391287734e-05, "loss": 0.1776, "step": 3089 }, { "epoch": 0.9446652399877713, "grad_norm": 0.7556034922599792, "learning_rate": 1.1795949560565533e-05, "loss": 0.1415, "step": 3090 }, { "epoch": 0.9449709568939162, "grad_norm": 0.6711116433143616, "learning_rate": 1.1799770729843332e-05, "loss": 0.1835, "step": 3091 }, { "epoch": 0.9452766738000612, "grad_norm": 0.7155359983444214, "learning_rate": 1.180359189912113e-05, "loss": 0.198, "step": 3092 }, { "epoch": 0.9455823907062061, "grad_norm": 0.9181987047195435, "learning_rate": 1.180741306839893e-05, "loss": 0.2037, "step": 3093 }, { "epoch": 0.945888107612351, "grad_norm": 1.5561527013778687, "learning_rate": 1.181123423767673e-05, "loss": 0.2462, "step": 3094 }, { "epoch": 0.9461938245184959, "grad_norm": 0.875458836555481, "learning_rate": 1.181505540695453e-05, "loss": 0.2188, "step": 3095 }, { "epoch": 0.9464995414246408, "grad_norm": 0.9724174737930298, "learning_rate": 1.1818876576232329e-05, "loss": 0.2822, "step": 3096 }, { "epoch": 0.9468052583307857, "grad_norm": 1.0716722011566162, "learning_rate": 1.1822697745510127e-05, "loss": 0.2538, "step": 3097 }, { "epoch": 0.9471109752369306, "grad_norm": 1.1079601049423218, "learning_rate": 1.1826518914787926e-05, "loss": 0.2405, "step": 3098 }, { "epoch": 0.9474166921430756, "grad_norm": 2.660752773284912, "learning_rate": 1.1830340084065725e-05, "loss": 0.2852, "step": 3099 }, { "epoch": 0.9477224090492204, "grad_norm": 2.0655689239501953, "learning_rate": 1.1834161253343524e-05, "loss": 0.4113, "step": 3100 }, { "epoch": 0.9480281259553653, "grad_norm": 0.6245551705360413, "learning_rate": 1.1837982422621322e-05, "loss": 0.2306, "step": 3101 }, { "epoch": 0.9483338428615102, "grad_norm": 0.31905221939086914, "learning_rate": 1.1841803591899121e-05, "loss": 0.0923, "step": 3102 }, { "epoch": 0.9486395597676551, "grad_norm": 0.37246620655059814, "learning_rate": 1.1845624761176922e-05, "loss": 0.1061, "step": 3103 }, { "epoch": 0.9489452766738, "grad_norm": 0.3460008203983307, "learning_rate": 1.184944593045472e-05, "loss": 0.0869, "step": 3104 }, { "epoch": 0.949250993579945, "grad_norm": 0.47280824184417725, "learning_rate": 1.1853267099732519e-05, "loss": 0.1024, "step": 3105 }, { "epoch": 0.9495567104860899, "grad_norm": 0.3745613992214203, "learning_rate": 1.1857088269010318e-05, "loss": 0.0801, "step": 3106 }, { "epoch": 0.9498624273922348, "grad_norm": 0.4016205370426178, "learning_rate": 1.1860909438288117e-05, "loss": 0.0994, "step": 3107 }, { "epoch": 0.9501681442983797, "grad_norm": 0.35366952419281006, "learning_rate": 1.1864730607565915e-05, "loss": 0.0628, "step": 3108 }, { "epoch": 0.9504738612045246, "grad_norm": 0.4117097556591034, "learning_rate": 1.1868551776843714e-05, "loss": 0.1281, "step": 3109 }, { "epoch": 0.9507795781106695, "grad_norm": 0.37939825654029846, "learning_rate": 1.1872372946121513e-05, "loss": 0.0927, "step": 3110 }, { "epoch": 0.9510852950168144, "grad_norm": 0.5309694409370422, "learning_rate": 1.1876194115399313e-05, "loss": 0.1341, "step": 3111 }, { "epoch": 0.9513910119229594, "grad_norm": 0.5357214212417603, "learning_rate": 1.1880015284677112e-05, "loss": 0.1279, "step": 3112 }, { "epoch": 0.9516967288291043, "grad_norm": 0.4746829569339752, "learning_rate": 1.188383645395491e-05, "loss": 0.1295, "step": 3113 }, { "epoch": 0.9520024457352492, "grad_norm": 0.624019205570221, "learning_rate": 1.188765762323271e-05, "loss": 0.1329, "step": 3114 }, { "epoch": 0.9523081626413941, "grad_norm": 0.6966778039932251, "learning_rate": 1.1891478792510508e-05, "loss": 0.2175, "step": 3115 }, { "epoch": 0.952613879547539, "grad_norm": 0.6576070189476013, "learning_rate": 1.1895299961788307e-05, "loss": 0.2125, "step": 3116 }, { "epoch": 0.9529195964536838, "grad_norm": 0.9204330444335938, "learning_rate": 1.1899121131066106e-05, "loss": 0.2193, "step": 3117 }, { "epoch": 0.9532253133598289, "grad_norm": 0.9086167812347412, "learning_rate": 1.1902942300343906e-05, "loss": 0.2328, "step": 3118 }, { "epoch": 0.9535310302659737, "grad_norm": 1.1761406660079956, "learning_rate": 1.1906763469621705e-05, "loss": 0.2711, "step": 3119 }, { "epoch": 0.9538367471721186, "grad_norm": 1.2942794561386108, "learning_rate": 1.1910584638899504e-05, "loss": 0.2366, "step": 3120 }, { "epoch": 0.9541424640782635, "grad_norm": 1.6845922470092773, "learning_rate": 1.1914405808177302e-05, "loss": 0.2361, "step": 3121 }, { "epoch": 0.9544481809844084, "grad_norm": 1.0513603687286377, "learning_rate": 1.1918226977455101e-05, "loss": 0.2779, "step": 3122 }, { "epoch": 0.9547538978905533, "grad_norm": 1.58570396900177, "learning_rate": 1.19220481467329e-05, "loss": 0.3301, "step": 3123 }, { "epoch": 0.9550596147966982, "grad_norm": 2.0663018226623535, "learning_rate": 1.1925869316010699e-05, "loss": 0.3356, "step": 3124 }, { "epoch": 0.9553653317028432, "grad_norm": 1.75583815574646, "learning_rate": 1.1929690485288497e-05, "loss": 0.3918, "step": 3125 }, { "epoch": 0.9556710486089881, "grad_norm": 0.6798717379570007, "learning_rate": 1.1933511654566298e-05, "loss": 0.1937, "step": 3126 }, { "epoch": 0.955976765515133, "grad_norm": 0.5621383190155029, "learning_rate": 1.1937332823844096e-05, "loss": 0.1105, "step": 3127 }, { "epoch": 0.9562824824212779, "grad_norm": 0.41356372833251953, "learning_rate": 1.1941153993121895e-05, "loss": 0.08, "step": 3128 }, { "epoch": 0.9565881993274228, "grad_norm": 0.30644941329956055, "learning_rate": 1.1944975162399694e-05, "loss": 0.0848, "step": 3129 }, { "epoch": 0.9568939162335677, "grad_norm": 0.3746224641799927, "learning_rate": 1.1948796331677493e-05, "loss": 0.0895, "step": 3130 }, { "epoch": 0.9571996331397127, "grad_norm": 0.41722801327705383, "learning_rate": 1.1952617500955291e-05, "loss": 0.116, "step": 3131 }, { "epoch": 0.9575053500458576, "grad_norm": 0.4162527918815613, "learning_rate": 1.1956438670233092e-05, "loss": 0.0706, "step": 3132 }, { "epoch": 0.9578110669520025, "grad_norm": 0.410766065120697, "learning_rate": 1.196025983951089e-05, "loss": 0.0742, "step": 3133 }, { "epoch": 0.9581167838581474, "grad_norm": 0.4054067134857178, "learning_rate": 1.1964081008788691e-05, "loss": 0.1115, "step": 3134 }, { "epoch": 0.9584225007642923, "grad_norm": 0.5224018692970276, "learning_rate": 1.196790217806649e-05, "loss": 0.0843, "step": 3135 }, { "epoch": 0.9587282176704371, "grad_norm": 0.41606563329696655, "learning_rate": 1.1971723347344289e-05, "loss": 0.111, "step": 3136 }, { "epoch": 0.959033934576582, "grad_norm": 0.4582171142101288, "learning_rate": 1.1975544516622087e-05, "loss": 0.124, "step": 3137 }, { "epoch": 0.959339651482727, "grad_norm": 0.9061025381088257, "learning_rate": 1.1979365685899886e-05, "loss": 0.1529, "step": 3138 }, { "epoch": 0.9596453683888719, "grad_norm": 0.632064700126648, "learning_rate": 1.1983186855177685e-05, "loss": 0.1619, "step": 3139 }, { "epoch": 0.9599510852950168, "grad_norm": 0.6370052099227905, "learning_rate": 1.1987008024455484e-05, "loss": 0.1793, "step": 3140 }, { "epoch": 0.9602568022011617, "grad_norm": 1.1198172569274902, "learning_rate": 1.1990829193733282e-05, "loss": 0.2, "step": 3141 }, { "epoch": 0.9605625191073066, "grad_norm": 0.6073834300041199, "learning_rate": 1.1994650363011083e-05, "loss": 0.1902, "step": 3142 }, { "epoch": 0.9608682360134515, "grad_norm": 0.7214398980140686, "learning_rate": 1.1998471532288881e-05, "loss": 0.2256, "step": 3143 }, { "epoch": 0.9611739529195965, "grad_norm": 0.7250209450721741, "learning_rate": 1.200229270156668e-05, "loss": 0.2441, "step": 3144 }, { "epoch": 0.9614796698257414, "grad_norm": 0.8051237463951111, "learning_rate": 1.2006113870844479e-05, "loss": 0.2437, "step": 3145 }, { "epoch": 0.9617853867318863, "grad_norm": 0.8781859278678894, "learning_rate": 1.2009935040122278e-05, "loss": 0.2274, "step": 3146 }, { "epoch": 0.9620911036380312, "grad_norm": 0.99129319190979, "learning_rate": 1.2013756209400076e-05, "loss": 0.2312, "step": 3147 }, { "epoch": 0.9623968205441761, "grad_norm": 1.4280776977539062, "learning_rate": 1.2017577378677875e-05, "loss": 0.2688, "step": 3148 }, { "epoch": 0.962702537450321, "grad_norm": 1.2105438709259033, "learning_rate": 1.2021398547955676e-05, "loss": 0.2625, "step": 3149 }, { "epoch": 0.9630082543564659, "grad_norm": 2.10922908782959, "learning_rate": 1.2025219717233474e-05, "loss": 0.3571, "step": 3150 }, { "epoch": 0.9633139712626109, "grad_norm": 0.6091034412384033, "learning_rate": 1.2029040886511273e-05, "loss": 0.1906, "step": 3151 }, { "epoch": 0.9636196881687558, "grad_norm": 0.4947749376296997, "learning_rate": 1.2032862055789072e-05, "loss": 0.1364, "step": 3152 }, { "epoch": 0.9639254050749007, "grad_norm": 0.33551961183547974, "learning_rate": 1.203668322506687e-05, "loss": 0.0873, "step": 3153 }, { "epoch": 0.9642311219810455, "grad_norm": 0.45636382699012756, "learning_rate": 1.204050439434467e-05, "loss": 0.0999, "step": 3154 }, { "epoch": 0.9645368388871904, "grad_norm": 0.3531264364719391, "learning_rate": 1.2044325563622468e-05, "loss": 0.0733, "step": 3155 }, { "epoch": 0.9648425557933353, "grad_norm": 0.32733070850372314, "learning_rate": 1.2048146732900267e-05, "loss": 0.0753, "step": 3156 }, { "epoch": 0.9651482726994803, "grad_norm": 0.46069976687431335, "learning_rate": 1.2051967902178067e-05, "loss": 0.0804, "step": 3157 }, { "epoch": 0.9654539896056252, "grad_norm": 0.39397838711738586, "learning_rate": 1.2055789071455866e-05, "loss": 0.1011, "step": 3158 }, { "epoch": 0.9657597065117701, "grad_norm": 0.44641783833503723, "learning_rate": 1.2059610240733665e-05, "loss": 0.104, "step": 3159 }, { "epoch": 0.966065423417915, "grad_norm": 0.36873534321784973, "learning_rate": 1.2063431410011464e-05, "loss": 0.075, "step": 3160 }, { "epoch": 0.9663711403240599, "grad_norm": 0.444614052772522, "learning_rate": 1.2067252579289262e-05, "loss": 0.1458, "step": 3161 }, { "epoch": 0.9666768572302048, "grad_norm": 0.8550121784210205, "learning_rate": 1.2071073748567061e-05, "loss": 0.107, "step": 3162 }, { "epoch": 0.9669825741363497, "grad_norm": 0.5017456412315369, "learning_rate": 1.207489491784486e-05, "loss": 0.1325, "step": 3163 }, { "epoch": 0.9672882910424947, "grad_norm": 1.095397710800171, "learning_rate": 1.2078716087122659e-05, "loss": 0.1484, "step": 3164 }, { "epoch": 0.9675940079486396, "grad_norm": 0.5998310446739197, "learning_rate": 1.2082537256400459e-05, "loss": 0.2119, "step": 3165 }, { "epoch": 0.9678997248547845, "grad_norm": 0.6700792908668518, "learning_rate": 1.2086358425678258e-05, "loss": 0.1862, "step": 3166 }, { "epoch": 0.9682054417609294, "grad_norm": 0.8271272778511047, "learning_rate": 1.2090179594956056e-05, "loss": 0.2066, "step": 3167 }, { "epoch": 0.9685111586670743, "grad_norm": 0.7562806010246277, "learning_rate": 1.2094000764233855e-05, "loss": 0.2974, "step": 3168 }, { "epoch": 0.9688168755732192, "grad_norm": 1.135209321975708, "learning_rate": 1.2097821933511654e-05, "loss": 0.2755, "step": 3169 }, { "epoch": 0.969122592479364, "grad_norm": 0.9814943075180054, "learning_rate": 1.2101643102789453e-05, "loss": 0.2543, "step": 3170 }, { "epoch": 0.9694283093855091, "grad_norm": 1.0182932615280151, "learning_rate": 1.2105464272067253e-05, "loss": 0.2867, "step": 3171 }, { "epoch": 0.969734026291654, "grad_norm": 1.0416746139526367, "learning_rate": 1.2109285441345052e-05, "loss": 0.3131, "step": 3172 }, { "epoch": 0.9700397431977988, "grad_norm": 1.4090399742126465, "learning_rate": 1.2113106610622852e-05, "loss": 0.2822, "step": 3173 }, { "epoch": 0.9703454601039437, "grad_norm": 1.4564845561981201, "learning_rate": 1.2116927779900651e-05, "loss": 0.334, "step": 3174 }, { "epoch": 0.9706511770100886, "grad_norm": 2.002826452255249, "learning_rate": 1.212074894917845e-05, "loss": 0.3772, "step": 3175 }, { "epoch": 0.9709568939162335, "grad_norm": 0.40317240357398987, "learning_rate": 1.2124570118456249e-05, "loss": 0.1834, "step": 3176 }, { "epoch": 0.9712626108223785, "grad_norm": 0.46593916416168213, "learning_rate": 1.2128391287734047e-05, "loss": 0.1399, "step": 3177 }, { "epoch": 0.9715683277285234, "grad_norm": 0.39607441425323486, "learning_rate": 1.2132212457011846e-05, "loss": 0.0821, "step": 3178 }, { "epoch": 0.9718740446346683, "grad_norm": 0.40105581283569336, "learning_rate": 1.2136033626289645e-05, "loss": 0.0782, "step": 3179 }, { "epoch": 0.9721797615408132, "grad_norm": 0.4922158122062683, "learning_rate": 1.2139854795567445e-05, "loss": 0.0918, "step": 3180 }, { "epoch": 0.9724854784469581, "grad_norm": 0.383643239736557, "learning_rate": 1.2143675964845244e-05, "loss": 0.0828, "step": 3181 }, { "epoch": 0.972791195353103, "grad_norm": 0.37614068388938904, "learning_rate": 1.2147497134123043e-05, "loss": 0.0732, "step": 3182 }, { "epoch": 0.9730969122592479, "grad_norm": 0.4367503821849823, "learning_rate": 1.2151318303400841e-05, "loss": 0.0783, "step": 3183 }, { "epoch": 0.9734026291653929, "grad_norm": 0.417643278837204, "learning_rate": 1.215513947267864e-05, "loss": 0.1063, "step": 3184 }, { "epoch": 0.9737083460715378, "grad_norm": 0.46919623017311096, "learning_rate": 1.2158960641956439e-05, "loss": 0.0909, "step": 3185 }, { "epoch": 0.9740140629776827, "grad_norm": 0.5693230032920837, "learning_rate": 1.2162781811234238e-05, "loss": 0.1332, "step": 3186 }, { "epoch": 0.9743197798838276, "grad_norm": 0.6479252576828003, "learning_rate": 1.2166602980512036e-05, "loss": 0.1071, "step": 3187 }, { "epoch": 0.9746254967899725, "grad_norm": 0.5202141404151917, "learning_rate": 1.2170424149789837e-05, "loss": 0.137, "step": 3188 }, { "epoch": 0.9749312136961173, "grad_norm": 0.6304534673690796, "learning_rate": 1.2174245319067636e-05, "loss": 0.1468, "step": 3189 }, { "epoch": 0.9752369306022624, "grad_norm": 0.6721799373626709, "learning_rate": 1.2178066488345434e-05, "loss": 0.175, "step": 3190 }, { "epoch": 0.9755426475084072, "grad_norm": 0.6930961012840271, "learning_rate": 1.2181887657623233e-05, "loss": 0.2081, "step": 3191 }, { "epoch": 0.9758483644145521, "grad_norm": 0.7439306378364563, "learning_rate": 1.2185708826901032e-05, "loss": 0.2235, "step": 3192 }, { "epoch": 0.976154081320697, "grad_norm": 0.7310979962348938, "learning_rate": 1.218952999617883e-05, "loss": 0.1879, "step": 3193 }, { "epoch": 0.9764597982268419, "grad_norm": 0.7020347118377686, "learning_rate": 1.219335116545663e-05, "loss": 0.2192, "step": 3194 }, { "epoch": 0.9767655151329868, "grad_norm": 0.9674530029296875, "learning_rate": 1.2197172334734428e-05, "loss": 0.2302, "step": 3195 }, { "epoch": 0.9770712320391317, "grad_norm": 1.0266170501708984, "learning_rate": 1.2200993504012229e-05, "loss": 0.274, "step": 3196 }, { "epoch": 0.9773769489452767, "grad_norm": 2.747267723083496, "learning_rate": 1.2204814673290027e-05, "loss": 0.3055, "step": 3197 }, { "epoch": 0.9776826658514216, "grad_norm": 1.6522608995437622, "learning_rate": 1.2208635842567826e-05, "loss": 0.307, "step": 3198 }, { "epoch": 0.9779883827575665, "grad_norm": 3.1230483055114746, "learning_rate": 1.2212457011845625e-05, "loss": 0.3126, "step": 3199 }, { "epoch": 0.9782940996637114, "grad_norm": 2.161052703857422, "learning_rate": 1.2216278181123423e-05, "loss": 0.3906, "step": 3200 }, { "epoch": 0.9785998165698563, "grad_norm": 0.5798832178115845, "learning_rate": 1.2220099350401222e-05, "loss": 0.2016, "step": 3201 }, { "epoch": 0.9789055334760012, "grad_norm": 0.33538737893104553, "learning_rate": 1.2223920519679021e-05, "loss": 0.0896, "step": 3202 }, { "epoch": 0.9792112503821462, "grad_norm": 0.6652705669403076, "learning_rate": 1.222774168895682e-05, "loss": 0.1387, "step": 3203 }, { "epoch": 0.9795169672882911, "grad_norm": 0.4096473753452301, "learning_rate": 1.223156285823462e-05, "loss": 0.0726, "step": 3204 }, { "epoch": 0.979822684194436, "grad_norm": 0.3002910315990448, "learning_rate": 1.2235384027512419e-05, "loss": 0.0787, "step": 3205 }, { "epoch": 0.9801284011005809, "grad_norm": 0.37400224804878235, "learning_rate": 1.2239205196790218e-05, "loss": 0.0938, "step": 3206 }, { "epoch": 0.9804341180067258, "grad_norm": 0.6158922910690308, "learning_rate": 1.2243026366068016e-05, "loss": 0.09, "step": 3207 }, { "epoch": 0.9807398349128706, "grad_norm": 0.4815502464771271, "learning_rate": 1.2246847535345815e-05, "loss": 0.1045, "step": 3208 }, { "epoch": 0.9810455518190155, "grad_norm": 0.3926409184932709, "learning_rate": 1.2250668704623614e-05, "loss": 0.103, "step": 3209 }, { "epoch": 0.9813512687251605, "grad_norm": 0.36665141582489014, "learning_rate": 1.2254489873901414e-05, "loss": 0.0839, "step": 3210 }, { "epoch": 0.9816569856313054, "grad_norm": 0.49513351917266846, "learning_rate": 1.2258311043179215e-05, "loss": 0.1315, "step": 3211 }, { "epoch": 0.9819627025374503, "grad_norm": 0.5295798778533936, "learning_rate": 1.2262132212457013e-05, "loss": 0.1408, "step": 3212 }, { "epoch": 0.9822684194435952, "grad_norm": 0.8515869379043579, "learning_rate": 1.2265953381734812e-05, "loss": 0.1416, "step": 3213 }, { "epoch": 0.9825741363497401, "grad_norm": 0.563252329826355, "learning_rate": 1.2269774551012611e-05, "loss": 0.1638, "step": 3214 }, { "epoch": 0.982879853255885, "grad_norm": 0.5980503559112549, "learning_rate": 1.227359572029041e-05, "loss": 0.1682, "step": 3215 }, { "epoch": 0.98318557016203, "grad_norm": 0.6839807033538818, "learning_rate": 1.2277416889568208e-05, "loss": 0.1862, "step": 3216 }, { "epoch": 0.9834912870681749, "grad_norm": 0.9278534054756165, "learning_rate": 1.2281238058846007e-05, "loss": 0.2268, "step": 3217 }, { "epoch": 0.9837970039743198, "grad_norm": 0.7935358881950378, "learning_rate": 1.2285059228123806e-05, "loss": 0.2272, "step": 3218 }, { "epoch": 0.9841027208804647, "grad_norm": 0.9069288969039917, "learning_rate": 1.2288880397401606e-05, "loss": 0.2184, "step": 3219 }, { "epoch": 0.9844084377866096, "grad_norm": 0.9310320615768433, "learning_rate": 1.2292701566679405e-05, "loss": 0.2977, "step": 3220 }, { "epoch": 0.9847141546927545, "grad_norm": 1.11311936378479, "learning_rate": 1.2296522735957204e-05, "loss": 0.2621, "step": 3221 }, { "epoch": 0.9850198715988994, "grad_norm": 1.0219923257827759, "learning_rate": 1.2300343905235003e-05, "loss": 0.2319, "step": 3222 }, { "epoch": 0.9853255885050444, "grad_norm": 1.911068081855774, "learning_rate": 1.2304165074512801e-05, "loss": 0.2909, "step": 3223 }, { "epoch": 0.9856313054111893, "grad_norm": 1.4470555782318115, "learning_rate": 1.23079862437906e-05, "loss": 0.3402, "step": 3224 }, { "epoch": 0.9859370223173342, "grad_norm": 1.9982942342758179, "learning_rate": 1.2311807413068399e-05, "loss": 0.3238, "step": 3225 }, { "epoch": 0.986242739223479, "grad_norm": 0.5515090227127075, "learning_rate": 1.2315628582346198e-05, "loss": 0.1803, "step": 3226 }, { "epoch": 0.9865484561296239, "grad_norm": 0.3995044529438019, "learning_rate": 1.2319449751623998e-05, "loss": 0.1176, "step": 3227 }, { "epoch": 0.9868541730357688, "grad_norm": 0.5768081545829773, "learning_rate": 1.2323270920901797e-05, "loss": 0.1142, "step": 3228 }, { "epoch": 0.9871598899419138, "grad_norm": 0.6919368505477905, "learning_rate": 1.2327092090179596e-05, "loss": 0.0797, "step": 3229 }, { "epoch": 0.9874656068480587, "grad_norm": 0.33816269040107727, "learning_rate": 1.2330913259457394e-05, "loss": 0.0742, "step": 3230 }, { "epoch": 0.9877713237542036, "grad_norm": 0.3338421583175659, "learning_rate": 1.2334734428735193e-05, "loss": 0.0859, "step": 3231 }, { "epoch": 0.9880770406603485, "grad_norm": 0.35861867666244507, "learning_rate": 1.2338555598012992e-05, "loss": 0.0729, "step": 3232 }, { "epoch": 0.9883827575664934, "grad_norm": 0.51128751039505, "learning_rate": 1.234237676729079e-05, "loss": 0.1381, "step": 3233 }, { "epoch": 0.9886884744726383, "grad_norm": 0.6037584543228149, "learning_rate": 1.234619793656859e-05, "loss": 0.0833, "step": 3234 }, { "epoch": 0.9889941913787832, "grad_norm": 0.3994084894657135, "learning_rate": 1.235001910584639e-05, "loss": 0.0934, "step": 3235 }, { "epoch": 0.9892999082849282, "grad_norm": 0.4349466860294342, "learning_rate": 1.2353840275124188e-05, "loss": 0.1235, "step": 3236 }, { "epoch": 0.9896056251910731, "grad_norm": 0.4956713318824768, "learning_rate": 1.2357661444401987e-05, "loss": 0.1279, "step": 3237 }, { "epoch": 0.989911342097218, "grad_norm": 0.6862843632698059, "learning_rate": 1.2361482613679786e-05, "loss": 0.1661, "step": 3238 }, { "epoch": 0.9902170590033629, "grad_norm": 0.6372434496879578, "learning_rate": 1.2365303782957585e-05, "loss": 0.1243, "step": 3239 }, { "epoch": 0.9905227759095078, "grad_norm": 0.6078612208366394, "learning_rate": 1.2369124952235383e-05, "loss": 0.1642, "step": 3240 }, { "epoch": 0.9908284928156527, "grad_norm": 0.7266665697097778, "learning_rate": 1.2372946121513182e-05, "loss": 0.2043, "step": 3241 }, { "epoch": 0.9911342097217977, "grad_norm": 0.7343495488166809, "learning_rate": 1.2376767290790983e-05, "loss": 0.2225, "step": 3242 }, { "epoch": 0.9914399266279426, "grad_norm": 0.8440867066383362, "learning_rate": 1.2380588460068781e-05, "loss": 0.2375, "step": 3243 }, { "epoch": 0.9917456435340875, "grad_norm": 1.1364221572875977, "learning_rate": 1.238440962934658e-05, "loss": 0.2097, "step": 3244 }, { "epoch": 0.9920513604402323, "grad_norm": 4.291292667388916, "learning_rate": 1.2388230798624379e-05, "loss": 0.2598, "step": 3245 }, { "epoch": 0.9923570773463772, "grad_norm": 0.9706876873970032, "learning_rate": 1.2392051967902178e-05, "loss": 0.2552, "step": 3246 }, { "epoch": 0.9926627942525221, "grad_norm": 1.3924599885940552, "learning_rate": 1.2395873137179976e-05, "loss": 0.3023, "step": 3247 }, { "epoch": 0.992968511158667, "grad_norm": 1.1938780546188354, "learning_rate": 1.2399694306457775e-05, "loss": 0.3091, "step": 3248 }, { "epoch": 0.993274228064812, "grad_norm": 1.1592086553573608, "learning_rate": 1.2403515475735576e-05, "loss": 0.3085, "step": 3249 }, { "epoch": 0.9935799449709569, "grad_norm": 2.1471781730651855, "learning_rate": 1.2407336645013376e-05, "loss": 0.3165, "step": 3250 }, { "epoch": 0.9938856618771018, "grad_norm": 0.5745124816894531, "learning_rate": 1.2411157814291175e-05, "loss": 0.2148, "step": 3251 }, { "epoch": 0.9941913787832467, "grad_norm": 0.9873051047325134, "learning_rate": 1.2414978983568973e-05, "loss": 0.1097, "step": 3252 }, { "epoch": 0.9944970956893916, "grad_norm": 0.45054352283477783, "learning_rate": 1.2418800152846772e-05, "loss": 0.1312, "step": 3253 }, { "epoch": 0.9948028125955365, "grad_norm": 0.3725685477256775, "learning_rate": 1.2422621322124571e-05, "loss": 0.0761, "step": 3254 }, { "epoch": 0.9951085295016815, "grad_norm": 0.5229989886283875, "learning_rate": 1.242644249140237e-05, "loss": 0.0927, "step": 3255 }, { "epoch": 0.9954142464078264, "grad_norm": 0.3629966080188751, "learning_rate": 1.2430263660680168e-05, "loss": 0.0778, "step": 3256 }, { "epoch": 0.9957199633139713, "grad_norm": 0.38211309909820557, "learning_rate": 1.2434084829957967e-05, "loss": 0.0888, "step": 3257 }, { "epoch": 0.9960256802201162, "grad_norm": 0.37993139028549194, "learning_rate": 1.2437905999235768e-05, "loss": 0.0728, "step": 3258 }, { "epoch": 0.9963313971262611, "grad_norm": 0.4417772591114044, "learning_rate": 1.2441727168513566e-05, "loss": 0.1198, "step": 3259 }, { "epoch": 0.996637114032406, "grad_norm": 0.5525287389755249, "learning_rate": 1.2445548337791365e-05, "loss": 0.1387, "step": 3260 }, { "epoch": 0.9969428309385509, "grad_norm": 0.5894179344177246, "learning_rate": 1.2449369507069164e-05, "loss": 0.1557, "step": 3261 }, { "epoch": 0.9972485478446959, "grad_norm": 0.6071445941925049, "learning_rate": 1.2453190676346963e-05, "loss": 0.1917, "step": 3262 }, { "epoch": 0.9975542647508407, "grad_norm": 2.4323575496673584, "learning_rate": 1.2457011845624761e-05, "loss": 0.2247, "step": 3263 }, { "epoch": 0.9978599816569856, "grad_norm": 0.631656289100647, "learning_rate": 1.246083301490256e-05, "loss": 0.2058, "step": 3264 }, { "epoch": 0.9981656985631305, "grad_norm": 0.9731677770614624, "learning_rate": 1.2464654184180359e-05, "loss": 0.2107, "step": 3265 }, { "epoch": 0.9984714154692754, "grad_norm": 0.8280726075172424, "learning_rate": 1.246847535345816e-05, "loss": 0.2523, "step": 3266 }, { "epoch": 0.9987771323754203, "grad_norm": 0.7681115865707397, "learning_rate": 1.2472296522735958e-05, "loss": 0.2543, "step": 3267 }, { "epoch": 0.9990828492815653, "grad_norm": 0.9153480529785156, "learning_rate": 1.2476117692013757e-05, "loss": 0.2472, "step": 3268 }, { "epoch": 0.9993885661877102, "grad_norm": 1.7340799570083618, "learning_rate": 1.2479938861291555e-05, "loss": 0.311, "step": 3269 }, { "epoch": 0.9996942830938551, "grad_norm": 1.4459046125411987, "learning_rate": 1.2483760030569354e-05, "loss": 0.3077, "step": 3270 }, { "epoch": 1.0, "grad_norm": 1.5900042057037354, "learning_rate": 1.2487581199847153e-05, "loss": 0.3145, "step": 3271 }, { "epoch": 1.000305716906145, "grad_norm": 0.523026168346405, "learning_rate": 1.2491402369124952e-05, "loss": 0.1917, "step": 3272 }, { "epoch": 1.0006114338122898, "grad_norm": 0.30485799908638, "learning_rate": 1.2495223538402752e-05, "loss": 0.1004, "step": 3273 }, { "epoch": 1.0009171507184347, "grad_norm": 0.4151607155799866, "learning_rate": 1.2499044707680551e-05, "loss": 0.1044, "step": 3274 }, { "epoch": 1.0012228676245796, "grad_norm": 0.35217931866645813, "learning_rate": 1.250286587695835e-05, "loss": 0.0933, "step": 3275 }, { "epoch": 1.0015285845307245, "grad_norm": 0.455073744058609, "learning_rate": 1.2506687046236148e-05, "loss": 0.0823, "step": 3276 }, { "epoch": 1.0018343014368694, "grad_norm": 0.347404420375824, "learning_rate": 1.2510508215513947e-05, "loss": 0.0871, "step": 3277 }, { "epoch": 1.0021400183430145, "grad_norm": 0.3282896876335144, "learning_rate": 1.2514329384791746e-05, "loss": 0.0805, "step": 3278 }, { "epoch": 1.0024457352491594, "grad_norm": 0.4035723805427551, "learning_rate": 1.2518150554069545e-05, "loss": 0.0852, "step": 3279 }, { "epoch": 1.0027514521553043, "grad_norm": 0.44202688336372375, "learning_rate": 1.2521971723347343e-05, "loss": 0.1117, "step": 3280 }, { "epoch": 1.0030571690614491, "grad_norm": 0.4710269868373871, "learning_rate": 1.2525792892625144e-05, "loss": 0.0872, "step": 3281 }, { "epoch": 1.003362885967594, "grad_norm": 0.739876389503479, "learning_rate": 1.2529614061902943e-05, "loss": 0.1912, "step": 3282 }, { "epoch": 1.003668602873739, "grad_norm": 0.5529804825782776, "learning_rate": 1.2533435231180741e-05, "loss": 0.094, "step": 3283 }, { "epoch": 1.0039743197798838, "grad_norm": 0.46453428268432617, "learning_rate": 1.253725640045854e-05, "loss": 0.1261, "step": 3284 }, { "epoch": 1.0042800366860287, "grad_norm": 0.6516205072402954, "learning_rate": 1.2541077569736339e-05, "loss": 0.1844, "step": 3285 }, { "epoch": 1.0045857535921736, "grad_norm": 0.6506209373474121, "learning_rate": 1.2544898739014138e-05, "loss": 0.1604, "step": 3286 }, { "epoch": 1.0048914704983185, "grad_norm": 0.6350597143173218, "learning_rate": 1.2548719908291936e-05, "loss": 0.1707, "step": 3287 }, { "epoch": 1.0051971874044634, "grad_norm": 0.7633315920829773, "learning_rate": 1.2552541077569737e-05, "loss": 0.1576, "step": 3288 }, { "epoch": 1.0055029043106083, "grad_norm": 1.5689492225646973, "learning_rate": 1.2556362246847537e-05, "loss": 0.2247, "step": 3289 }, { "epoch": 1.0058086212167532, "grad_norm": 0.9994948506355286, "learning_rate": 1.2560183416125336e-05, "loss": 0.2751, "step": 3290 }, { "epoch": 1.0061143381228983, "grad_norm": 1.3745059967041016, "learning_rate": 1.2564004585403135e-05, "loss": 0.2452, "step": 3291 }, { "epoch": 1.0064200550290432, "grad_norm": 4.559655666351318, "learning_rate": 1.2567825754680933e-05, "loss": 0.254, "step": 3292 }, { "epoch": 1.006725771935188, "grad_norm": 1.9751355648040771, "learning_rate": 1.2571646923958732e-05, "loss": 0.238, "step": 3293 }, { "epoch": 1.007031488841333, "grad_norm": 1.304632544517517, "learning_rate": 1.2575468093236531e-05, "loss": 0.2949, "step": 3294 }, { "epoch": 1.0073372057474779, "grad_norm": 1.8364901542663574, "learning_rate": 1.257928926251433e-05, "loss": 0.3154, "step": 3295 }, { "epoch": 1.0076429226536228, "grad_norm": 2.023998975753784, "learning_rate": 1.2583110431792128e-05, "loss": 0.3772, "step": 3296 }, { "epoch": 1.0079486395597677, "grad_norm": 0.6140416264533997, "learning_rate": 1.2586931601069929e-05, "loss": 0.2123, "step": 3297 }, { "epoch": 1.0082543564659125, "grad_norm": 0.39847320318222046, "learning_rate": 1.2590752770347728e-05, "loss": 0.0999, "step": 3298 }, { "epoch": 1.0085600733720574, "grad_norm": 0.4859084486961365, "learning_rate": 1.2594573939625526e-05, "loss": 0.096, "step": 3299 }, { "epoch": 1.0088657902782023, "grad_norm": 0.5891222357749939, "learning_rate": 1.2598395108903325e-05, "loss": 0.1258, "step": 3300 }, { "epoch": 1.0091715071843472, "grad_norm": 0.35238388180732727, "learning_rate": 1.2602216278181124e-05, "loss": 0.0776, "step": 3301 }, { "epoch": 1.0094772240904921, "grad_norm": 0.4741254150867462, "learning_rate": 1.2606037447458923e-05, "loss": 0.0801, "step": 3302 }, { "epoch": 1.009782940996637, "grad_norm": 0.42375150322914124, "learning_rate": 1.2609858616736721e-05, "loss": 0.0826, "step": 3303 }, { "epoch": 1.0100886579027821, "grad_norm": 0.5618375539779663, "learning_rate": 1.2613679786014522e-05, "loss": 0.0926, "step": 3304 }, { "epoch": 1.010394374808927, "grad_norm": 0.4147689640522003, "learning_rate": 1.261750095529232e-05, "loss": 0.1123, "step": 3305 }, { "epoch": 1.010700091715072, "grad_norm": 1.0706758499145508, "learning_rate": 1.262132212457012e-05, "loss": 0.0751, "step": 3306 }, { "epoch": 1.0110058086212168, "grad_norm": 0.4225582182407379, "learning_rate": 1.2625143293847918e-05, "loss": 0.1083, "step": 3307 }, { "epoch": 1.0113115255273617, "grad_norm": 0.4806020259857178, "learning_rate": 1.2628964463125717e-05, "loss": 0.1034, "step": 3308 }, { "epoch": 1.0116172424335066, "grad_norm": 0.5840721130371094, "learning_rate": 1.2632785632403515e-05, "loss": 0.1476, "step": 3309 }, { "epoch": 1.0119229593396515, "grad_norm": 0.6282683610916138, "learning_rate": 1.2636606801681314e-05, "loss": 0.2001, "step": 3310 }, { "epoch": 1.0122286762457964, "grad_norm": 0.6195732355117798, "learning_rate": 1.2640427970959113e-05, "loss": 0.1389, "step": 3311 }, { "epoch": 1.0125343931519413, "grad_norm": 0.603095531463623, "learning_rate": 1.2644249140236913e-05, "loss": 0.1988, "step": 3312 }, { "epoch": 1.0128401100580862, "grad_norm": 1.0286253690719604, "learning_rate": 1.2648070309514712e-05, "loss": 0.2142, "step": 3313 }, { "epoch": 1.013145826964231, "grad_norm": 0.6956785917282104, "learning_rate": 1.2651891478792511e-05, "loss": 0.1834, "step": 3314 }, { "epoch": 1.013451543870376, "grad_norm": 1.3510342836380005, "learning_rate": 1.265571264807031e-05, "loss": 0.2589, "step": 3315 }, { "epoch": 1.0137572607765208, "grad_norm": 0.751301109790802, "learning_rate": 1.2659533817348108e-05, "loss": 0.2197, "step": 3316 }, { "epoch": 1.014062977682666, "grad_norm": 1.0760986804962158, "learning_rate": 1.2663354986625907e-05, "loss": 0.2534, "step": 3317 }, { "epoch": 1.0143686945888108, "grad_norm": 1.7020199298858643, "learning_rate": 1.2667176155903706e-05, "loss": 0.2858, "step": 3318 }, { "epoch": 1.0146744114949557, "grad_norm": 1.1337448358535767, "learning_rate": 1.2670997325181505e-05, "loss": 0.2459, "step": 3319 }, { "epoch": 1.0149801284011006, "grad_norm": 1.952995777130127, "learning_rate": 1.2674818494459305e-05, "loss": 0.2776, "step": 3320 }, { "epoch": 1.0152858453072455, "grad_norm": 4.871084690093994, "learning_rate": 1.2678639663737104e-05, "loss": 0.3638, "step": 3321 }, { "epoch": 1.0155915622133904, "grad_norm": 0.42215588688850403, "learning_rate": 1.2682460833014903e-05, "loss": 0.1796, "step": 3322 }, { "epoch": 1.0158972791195353, "grad_norm": 0.40707629919052124, "learning_rate": 1.2686282002292701e-05, "loss": 0.1144, "step": 3323 }, { "epoch": 1.0162029960256802, "grad_norm": 0.3634437620639801, "learning_rate": 1.26901031715705e-05, "loss": 0.1122, "step": 3324 }, { "epoch": 1.016508712931825, "grad_norm": 0.3637818992137909, "learning_rate": 1.2693924340848299e-05, "loss": 0.0928, "step": 3325 }, { "epoch": 1.01681442983797, "grad_norm": 0.453069806098938, "learning_rate": 1.2697745510126097e-05, "loss": 0.0777, "step": 3326 }, { "epoch": 1.0171201467441149, "grad_norm": 0.39156538248062134, "learning_rate": 1.2701566679403898e-05, "loss": 0.105, "step": 3327 }, { "epoch": 1.0174258636502598, "grad_norm": 0.8344194293022156, "learning_rate": 1.2705387848681698e-05, "loss": 0.0718, "step": 3328 }, { "epoch": 1.0177315805564047, "grad_norm": 0.45534253120422363, "learning_rate": 1.2709209017959497e-05, "loss": 0.0963, "step": 3329 }, { "epoch": 1.0180372974625498, "grad_norm": 0.3537411391735077, "learning_rate": 1.2713030187237296e-05, "loss": 0.0984, "step": 3330 }, { "epoch": 1.0183430143686947, "grad_norm": 0.39765554666519165, "learning_rate": 1.2716851356515095e-05, "loss": 0.0794, "step": 3331 }, { "epoch": 1.0186487312748396, "grad_norm": 0.7642285823822021, "learning_rate": 1.2720672525792893e-05, "loss": 0.1411, "step": 3332 }, { "epoch": 1.0189544481809845, "grad_norm": 0.41101911664009094, "learning_rate": 1.2724493695070692e-05, "loss": 0.0891, "step": 3333 }, { "epoch": 1.0192601650871294, "grad_norm": 0.6650947332382202, "learning_rate": 1.272831486434849e-05, "loss": 0.1341, "step": 3334 }, { "epoch": 1.0195658819932742, "grad_norm": 0.806943953037262, "learning_rate": 1.273213603362629e-05, "loss": 0.1391, "step": 3335 }, { "epoch": 1.0198715988994191, "grad_norm": 0.8302177786827087, "learning_rate": 1.273595720290409e-05, "loss": 0.1446, "step": 3336 }, { "epoch": 1.020177315805564, "grad_norm": 0.8942354917526245, "learning_rate": 1.2739778372181889e-05, "loss": 0.2454, "step": 3337 }, { "epoch": 1.020483032711709, "grad_norm": 0.7274819016456604, "learning_rate": 1.2743599541459688e-05, "loss": 0.2173, "step": 3338 }, { "epoch": 1.0207887496178538, "grad_norm": 0.9398229122161865, "learning_rate": 1.2747420710737486e-05, "loss": 0.2192, "step": 3339 }, { "epoch": 1.0210944665239987, "grad_norm": 2.4876389503479004, "learning_rate": 1.2751241880015285e-05, "loss": 0.2567, "step": 3340 }, { "epoch": 1.0214001834301436, "grad_norm": 0.8884159922599792, "learning_rate": 1.2755063049293084e-05, "loss": 0.2446, "step": 3341 }, { "epoch": 1.0217059003362885, "grad_norm": 0.8840627074241638, "learning_rate": 1.2758884218570882e-05, "loss": 0.2468, "step": 3342 }, { "epoch": 1.0220116172424336, "grad_norm": 0.9895331263542175, "learning_rate": 1.2762705387848683e-05, "loss": 0.2568, "step": 3343 }, { "epoch": 1.0223173341485785, "grad_norm": 1.737269401550293, "learning_rate": 1.2766526557126482e-05, "loss": 0.2497, "step": 3344 }, { "epoch": 1.0226230510547234, "grad_norm": 1.861284613609314, "learning_rate": 1.277034772640428e-05, "loss": 0.2626, "step": 3345 }, { "epoch": 1.0229287679608683, "grad_norm": 2.4455862045288086, "learning_rate": 1.277416889568208e-05, "loss": 0.3721, "step": 3346 }, { "epoch": 1.0232344848670132, "grad_norm": 0.5176630020141602, "learning_rate": 1.2777990064959878e-05, "loss": 0.1912, "step": 3347 }, { "epoch": 1.023540201773158, "grad_norm": 0.4056771993637085, "learning_rate": 1.2781811234237677e-05, "loss": 0.1296, "step": 3348 }, { "epoch": 1.023845918679303, "grad_norm": 0.5795426368713379, "learning_rate": 1.2785632403515475e-05, "loss": 0.1113, "step": 3349 }, { "epoch": 1.0241516355854479, "grad_norm": 0.3605540990829468, "learning_rate": 1.2789453572793274e-05, "loss": 0.0848, "step": 3350 }, { "epoch": 1.0244573524915928, "grad_norm": 0.4645642340183258, "learning_rate": 1.2793274742071075e-05, "loss": 0.088, "step": 3351 }, { "epoch": 1.0247630693977376, "grad_norm": 0.361945241689682, "learning_rate": 1.2797095911348873e-05, "loss": 0.0783, "step": 3352 }, { "epoch": 1.0250687863038825, "grad_norm": 0.3510211706161499, "learning_rate": 1.2800917080626672e-05, "loss": 0.084, "step": 3353 }, { "epoch": 1.0253745032100274, "grad_norm": 0.43100854754447937, "learning_rate": 1.280473824990447e-05, "loss": 0.1065, "step": 3354 }, { "epoch": 1.0256802201161723, "grad_norm": 0.34930944442749023, "learning_rate": 1.280855941918227e-05, "loss": 0.0757, "step": 3355 }, { "epoch": 1.0259859370223174, "grad_norm": 0.37202125787734985, "learning_rate": 1.2812380588460068e-05, "loss": 0.0965, "step": 3356 }, { "epoch": 1.0262916539284623, "grad_norm": 0.4783880412578583, "learning_rate": 1.2816201757737867e-05, "loss": 0.1339, "step": 3357 }, { "epoch": 1.0265973708346072, "grad_norm": 0.8890364170074463, "learning_rate": 1.2820022927015666e-05, "loss": 0.1173, "step": 3358 }, { "epoch": 1.0269030877407521, "grad_norm": 0.5560166239738464, "learning_rate": 1.2823844096293466e-05, "loss": 0.1497, "step": 3359 }, { "epoch": 1.027208804646897, "grad_norm": 0.5804054737091064, "learning_rate": 1.2827665265571265e-05, "loss": 0.131, "step": 3360 }, { "epoch": 1.027514521553042, "grad_norm": 0.6091653108596802, "learning_rate": 1.2831486434849064e-05, "loss": 0.1937, "step": 3361 }, { "epoch": 1.0278202384591868, "grad_norm": 0.6687932014465332, "learning_rate": 1.2835307604126862e-05, "loss": 0.229, "step": 3362 }, { "epoch": 1.0281259553653317, "grad_norm": 0.7198684811592102, "learning_rate": 1.2839128773404661e-05, "loss": 0.1932, "step": 3363 }, { "epoch": 1.0284316722714766, "grad_norm": 0.7959886789321899, "learning_rate": 1.284294994268246e-05, "loss": 0.1948, "step": 3364 }, { "epoch": 1.0287373891776215, "grad_norm": 1.506231665611267, "learning_rate": 1.284677111196026e-05, "loss": 0.258, "step": 3365 }, { "epoch": 1.0290431060837664, "grad_norm": 1.1810123920440674, "learning_rate": 1.2850592281238059e-05, "loss": 0.2524, "step": 3366 }, { "epoch": 1.0293488229899113, "grad_norm": 0.967574417591095, "learning_rate": 1.285441345051586e-05, "loss": 0.2199, "step": 3367 }, { "epoch": 1.0296545398960562, "grad_norm": 0.9379544258117676, "learning_rate": 1.2858234619793658e-05, "loss": 0.247, "step": 3368 }, { "epoch": 1.0299602568022013, "grad_norm": 1.4047356843948364, "learning_rate": 1.2862055789071457e-05, "loss": 0.2319, "step": 3369 }, { "epoch": 1.0302659737083462, "grad_norm": 1.7224922180175781, "learning_rate": 1.2865876958349256e-05, "loss": 0.249, "step": 3370 }, { "epoch": 1.030571690614491, "grad_norm": 2.8266820907592773, "learning_rate": 1.2869698127627055e-05, "loss": 0.3621, "step": 3371 }, { "epoch": 1.030877407520636, "grad_norm": 0.5122179985046387, "learning_rate": 1.2873519296904853e-05, "loss": 0.183, "step": 3372 }, { "epoch": 1.0311831244267808, "grad_norm": 0.44144532084465027, "learning_rate": 1.2877340466182652e-05, "loss": 0.1258, "step": 3373 }, { "epoch": 1.0314888413329257, "grad_norm": 0.4110734462738037, "learning_rate": 1.2881161635460452e-05, "loss": 0.0954, "step": 3374 }, { "epoch": 1.0317945582390706, "grad_norm": 1.185043454170227, "learning_rate": 1.2884982804738251e-05, "loss": 0.0663, "step": 3375 }, { "epoch": 1.0321002751452155, "grad_norm": 0.382121741771698, "learning_rate": 1.288880397401605e-05, "loss": 0.0918, "step": 3376 }, { "epoch": 1.0324059920513604, "grad_norm": 0.6995230913162231, "learning_rate": 1.2892625143293849e-05, "loss": 0.0854, "step": 3377 }, { "epoch": 1.0327117089575053, "grad_norm": 0.38066789507865906, "learning_rate": 1.2896446312571647e-05, "loss": 0.078, "step": 3378 }, { "epoch": 1.0330174258636502, "grad_norm": 0.43179142475128174, "learning_rate": 1.2900267481849446e-05, "loss": 0.1099, "step": 3379 }, { "epoch": 1.033323142769795, "grad_norm": 0.4755992889404297, "learning_rate": 1.2904088651127245e-05, "loss": 0.0942, "step": 3380 }, { "epoch": 1.03362885967594, "grad_norm": 0.7263742685317993, "learning_rate": 1.2907909820405044e-05, "loss": 0.0966, "step": 3381 }, { "epoch": 1.033934576582085, "grad_norm": 0.45189982652664185, "learning_rate": 1.2911730989682844e-05, "loss": 0.1284, "step": 3382 }, { "epoch": 1.03424029348823, "grad_norm": 0.36667564511299133, "learning_rate": 1.2915552158960643e-05, "loss": 0.0961, "step": 3383 }, { "epoch": 1.0345460103943749, "grad_norm": 0.6376466155052185, "learning_rate": 1.2919373328238442e-05, "loss": 0.1506, "step": 3384 }, { "epoch": 1.0348517273005198, "grad_norm": 0.47949907183647156, "learning_rate": 1.292319449751624e-05, "loss": 0.146, "step": 3385 }, { "epoch": 1.0351574442066647, "grad_norm": 0.507897675037384, "learning_rate": 1.2927015666794039e-05, "loss": 0.1758, "step": 3386 }, { "epoch": 1.0354631611128096, "grad_norm": 0.6209713816642761, "learning_rate": 1.2930836836071838e-05, "loss": 0.1815, "step": 3387 }, { "epoch": 1.0357688780189545, "grad_norm": 0.6755014061927795, "learning_rate": 1.2934658005349637e-05, "loss": 0.2064, "step": 3388 }, { "epoch": 1.0360745949250993, "grad_norm": 0.8212870955467224, "learning_rate": 1.2938479174627435e-05, "loss": 0.2271, "step": 3389 }, { "epoch": 1.0363803118312442, "grad_norm": 1.0722990036010742, "learning_rate": 1.2942300343905236e-05, "loss": 0.2146, "step": 3390 }, { "epoch": 1.0366860287373891, "grad_norm": 2.581890821456909, "learning_rate": 1.2946121513183035e-05, "loss": 0.2711, "step": 3391 }, { "epoch": 1.036991745643534, "grad_norm": 6.931295871734619, "learning_rate": 1.2949942682460833e-05, "loss": 0.2552, "step": 3392 }, { "epoch": 1.037297462549679, "grad_norm": 0.9800381660461426, "learning_rate": 1.2953763851738632e-05, "loss": 0.2731, "step": 3393 }, { "epoch": 1.0376031794558238, "grad_norm": 1.0490491390228271, "learning_rate": 1.295758502101643e-05, "loss": 0.2778, "step": 3394 }, { "epoch": 1.037908896361969, "grad_norm": 1.5706392526626587, "learning_rate": 1.296140619029423e-05, "loss": 0.3086, "step": 3395 }, { "epoch": 1.0382146132681138, "grad_norm": 2.4154508113861084, "learning_rate": 1.2965227359572028e-05, "loss": 0.345, "step": 3396 }, { "epoch": 1.0385203301742587, "grad_norm": 0.5237347483634949, "learning_rate": 1.2969048528849827e-05, "loss": 0.1967, "step": 3397 }, { "epoch": 1.0388260470804036, "grad_norm": 0.37209585309028625, "learning_rate": 1.2972869698127627e-05, "loss": 0.1184, "step": 3398 }, { "epoch": 1.0391317639865485, "grad_norm": 0.4081514775753021, "learning_rate": 1.2976690867405426e-05, "loss": 0.084, "step": 3399 }, { "epoch": 1.0394374808926934, "grad_norm": 0.358258455991745, "learning_rate": 1.2980512036683225e-05, "loss": 0.078, "step": 3400 }, { "epoch": 1.0397431977988383, "grad_norm": 0.8572607636451721, "learning_rate": 1.2984333205961024e-05, "loss": 0.0863, "step": 3401 }, { "epoch": 1.0400489147049832, "grad_norm": 0.35952770709991455, "learning_rate": 1.2988154375238822e-05, "loss": 0.0736, "step": 3402 }, { "epoch": 1.040354631611128, "grad_norm": 0.39683669805526733, "learning_rate": 1.2991975544516621e-05, "loss": 0.0995, "step": 3403 }, { "epoch": 1.040660348517273, "grad_norm": 0.38819634914398193, "learning_rate": 1.2995796713794422e-05, "loss": 0.0787, "step": 3404 }, { "epoch": 1.0409660654234179, "grad_norm": 0.5035747289657593, "learning_rate": 1.2999617883072222e-05, "loss": 0.0935, "step": 3405 }, { "epoch": 1.0412717823295627, "grad_norm": 0.49394065141677856, "learning_rate": 1.300343905235002e-05, "loss": 0.0879, "step": 3406 }, { "epoch": 1.0415774992357076, "grad_norm": 0.6078733801841736, "learning_rate": 1.300726022162782e-05, "loss": 0.1337, "step": 3407 }, { "epoch": 1.0418832161418528, "grad_norm": 0.4058288335800171, "learning_rate": 1.3011081390905618e-05, "loss": 0.1156, "step": 3408 }, { "epoch": 1.0421889330479976, "grad_norm": 0.5912354588508606, "learning_rate": 1.3014902560183417e-05, "loss": 0.1444, "step": 3409 }, { "epoch": 1.0424946499541425, "grad_norm": 0.6235601305961609, "learning_rate": 1.3018723729461216e-05, "loss": 0.1599, "step": 3410 }, { "epoch": 1.0428003668602874, "grad_norm": 0.8138360381126404, "learning_rate": 1.3022544898739015e-05, "loss": 0.1671, "step": 3411 }, { "epoch": 1.0431060837664323, "grad_norm": 1.3053557872772217, "learning_rate": 1.3026366068016813e-05, "loss": 0.1567, "step": 3412 }, { "epoch": 1.0434118006725772, "grad_norm": 4.561112403869629, "learning_rate": 1.3030187237294614e-05, "loss": 0.2073, "step": 3413 }, { "epoch": 1.043717517578722, "grad_norm": 1.0376298427581787, "learning_rate": 1.3034008406572412e-05, "loss": 0.2184, "step": 3414 }, { "epoch": 1.044023234484867, "grad_norm": 1.1728034019470215, "learning_rate": 1.3037829575850211e-05, "loss": 0.2862, "step": 3415 }, { "epoch": 1.044328951391012, "grad_norm": 1.718489170074463, "learning_rate": 1.304165074512801e-05, "loss": 0.2476, "step": 3416 }, { "epoch": 1.0446346682971568, "grad_norm": 1.1493741273880005, "learning_rate": 1.3045471914405809e-05, "loss": 0.2642, "step": 3417 }, { "epoch": 1.0449403852033017, "grad_norm": 0.9947616457939148, "learning_rate": 1.3049293083683607e-05, "loss": 0.2363, "step": 3418 }, { "epoch": 1.0452461021094466, "grad_norm": 1.3591034412384033, "learning_rate": 1.3053114252961406e-05, "loss": 0.2488, "step": 3419 }, { "epoch": 1.0455518190155915, "grad_norm": 2.155545949935913, "learning_rate": 1.3056935422239205e-05, "loss": 0.3302, "step": 3420 }, { "epoch": 1.0458575359217366, "grad_norm": 1.560060739517212, "learning_rate": 1.3060756591517005e-05, "loss": 0.3786, "step": 3421 }, { "epoch": 1.0461632528278815, "grad_norm": 2.7209043502807617, "learning_rate": 1.3064577760794804e-05, "loss": 0.1928, "step": 3422 }, { "epoch": 1.0464689697340264, "grad_norm": 0.3877220153808594, "learning_rate": 1.3068398930072603e-05, "loss": 0.0994, "step": 3423 }, { "epoch": 1.0467746866401713, "grad_norm": 0.43057548999786377, "learning_rate": 1.3072220099350402e-05, "loss": 0.079, "step": 3424 }, { "epoch": 1.0470804035463162, "grad_norm": 0.38833531737327576, "learning_rate": 1.30760412686282e-05, "loss": 0.0733, "step": 3425 }, { "epoch": 1.047386120452461, "grad_norm": 0.4371500313282013, "learning_rate": 1.3079862437905999e-05, "loss": 0.1029, "step": 3426 }, { "epoch": 1.047691837358606, "grad_norm": 0.3988548517227173, "learning_rate": 1.3083683607183798e-05, "loss": 0.0772, "step": 3427 }, { "epoch": 1.0479975542647508, "grad_norm": 0.5410102009773254, "learning_rate": 1.3087504776461597e-05, "loss": 0.1183, "step": 3428 }, { "epoch": 1.0483032711708957, "grad_norm": 0.39718857407569885, "learning_rate": 1.3091325945739397e-05, "loss": 0.0786, "step": 3429 }, { "epoch": 1.0486089880770406, "grad_norm": 0.7689939141273499, "learning_rate": 1.3095147115017196e-05, "loss": 0.1175, "step": 3430 }, { "epoch": 1.0489147049831855, "grad_norm": 0.4761989414691925, "learning_rate": 1.3098968284294994e-05, "loss": 0.099, "step": 3431 }, { "epoch": 1.0492204218893304, "grad_norm": 0.5723828673362732, "learning_rate": 1.3102789453572793e-05, "loss": 0.1018, "step": 3432 }, { "epoch": 1.0495261387954753, "grad_norm": 0.5243285298347473, "learning_rate": 1.3106610622850592e-05, "loss": 0.0967, "step": 3433 }, { "epoch": 1.0498318557016204, "grad_norm": 0.5096736550331116, "learning_rate": 1.311043179212839e-05, "loss": 0.1181, "step": 3434 }, { "epoch": 1.0501375726077653, "grad_norm": 0.6181472539901733, "learning_rate": 1.311425296140619e-05, "loss": 0.1448, "step": 3435 }, { "epoch": 1.0504432895139102, "grad_norm": 0.716534435749054, "learning_rate": 1.311807413068399e-05, "loss": 0.1863, "step": 3436 }, { "epoch": 1.050749006420055, "grad_norm": 0.7231935858726501, "learning_rate": 1.3121895299961789e-05, "loss": 0.203, "step": 3437 }, { "epoch": 1.0510547233262, "grad_norm": 0.7654871344566345, "learning_rate": 1.3125716469239587e-05, "loss": 0.207, "step": 3438 }, { "epoch": 1.0513604402323449, "grad_norm": 0.839260995388031, "learning_rate": 1.3129537638517386e-05, "loss": 0.2151, "step": 3439 }, { "epoch": 1.0516661571384898, "grad_norm": 0.8134298920631409, "learning_rate": 1.3133358807795185e-05, "loss": 0.2675, "step": 3440 }, { "epoch": 1.0519718740446347, "grad_norm": 1.0949217081069946, "learning_rate": 1.3137179977072984e-05, "loss": 0.2217, "step": 3441 }, { "epoch": 1.0522775909507796, "grad_norm": 1.1129297018051147, "learning_rate": 1.3141001146350782e-05, "loss": 0.2431, "step": 3442 }, { "epoch": 1.0525833078569244, "grad_norm": 1.353756070137024, "learning_rate": 1.3144822315628583e-05, "loss": 0.2302, "step": 3443 }, { "epoch": 1.0528890247630693, "grad_norm": 1.0730942487716675, "learning_rate": 1.3148643484906383e-05, "loss": 0.2543, "step": 3444 }, { "epoch": 1.0531947416692142, "grad_norm": 1.47787606716156, "learning_rate": 1.3152464654184182e-05, "loss": 0.2946, "step": 3445 }, { "epoch": 1.0535004585753591, "grad_norm": 2.340786933898926, "learning_rate": 1.315628582346198e-05, "loss": 0.3161, "step": 3446 }, { "epoch": 1.0538061754815042, "grad_norm": 0.5655141472816467, "learning_rate": 1.316010699273978e-05, "loss": 0.2085, "step": 3447 }, { "epoch": 1.0541118923876491, "grad_norm": 0.3198808431625366, "learning_rate": 1.3163928162017578e-05, "loss": 0.0866, "step": 3448 }, { "epoch": 1.054417609293794, "grad_norm": 0.32286888360977173, "learning_rate": 1.3167749331295377e-05, "loss": 0.0643, "step": 3449 }, { "epoch": 1.054723326199939, "grad_norm": 0.33292022347450256, "learning_rate": 1.3171570500573176e-05, "loss": 0.0724, "step": 3450 }, { "epoch": 1.0550290431060838, "grad_norm": 0.34757155179977417, "learning_rate": 1.3175391669850974e-05, "loss": 0.0803, "step": 3451 }, { "epoch": 1.0553347600122287, "grad_norm": 0.4292463958263397, "learning_rate": 1.3179212839128775e-05, "loss": 0.1013, "step": 3452 }, { "epoch": 1.0556404769183736, "grad_norm": 0.28325366973876953, "learning_rate": 1.3183034008406574e-05, "loss": 0.0707, "step": 3453 }, { "epoch": 1.0559461938245185, "grad_norm": 0.37486550211906433, "learning_rate": 1.3186855177684372e-05, "loss": 0.0934, "step": 3454 }, { "epoch": 1.0562519107306634, "grad_norm": 0.4844576120376587, "learning_rate": 1.3190676346962171e-05, "loss": 0.1641, "step": 3455 }, { "epoch": 1.0565576276368083, "grad_norm": 0.4804476797580719, "learning_rate": 1.319449751623997e-05, "loss": 0.0888, "step": 3456 }, { "epoch": 1.0568633445429532, "grad_norm": 0.47404345870018005, "learning_rate": 1.3198318685517769e-05, "loss": 0.1122, "step": 3457 }, { "epoch": 1.057169061449098, "grad_norm": 0.4747203588485718, "learning_rate": 1.3202139854795567e-05, "loss": 0.1099, "step": 3458 }, { "epoch": 1.057474778355243, "grad_norm": 0.47238969802856445, "learning_rate": 1.3205961024073366e-05, "loss": 0.1193, "step": 3459 }, { "epoch": 1.057780495261388, "grad_norm": 0.6839624643325806, "learning_rate": 1.3209782193351167e-05, "loss": 0.213, "step": 3460 }, { "epoch": 1.058086212167533, "grad_norm": 0.6204606890678406, "learning_rate": 1.3213603362628965e-05, "loss": 0.1486, "step": 3461 }, { "epoch": 1.0583919290736779, "grad_norm": 0.6607351303100586, "learning_rate": 1.3217424531906764e-05, "loss": 0.2092, "step": 3462 }, { "epoch": 1.0586976459798227, "grad_norm": 0.9217658638954163, "learning_rate": 1.3221245701184563e-05, "loss": 0.1775, "step": 3463 }, { "epoch": 1.0590033628859676, "grad_norm": 1.2072175741195679, "learning_rate": 1.3225066870462362e-05, "loss": 0.2128, "step": 3464 }, { "epoch": 1.0593090797921125, "grad_norm": 0.9937958717346191, "learning_rate": 1.322888803974016e-05, "loss": 0.2209, "step": 3465 }, { "epoch": 1.0596147966982574, "grad_norm": 0.8885544538497925, "learning_rate": 1.3232709209017959e-05, "loss": 0.2483, "step": 3466 }, { "epoch": 1.0599205136044023, "grad_norm": 1.2025705575942993, "learning_rate": 1.323653037829576e-05, "loss": 0.2391, "step": 3467 }, { "epoch": 1.0602262305105472, "grad_norm": 0.8999363780021667, "learning_rate": 1.3240351547573558e-05, "loss": 0.2433, "step": 3468 }, { "epoch": 1.060531947416692, "grad_norm": 1.0705933570861816, "learning_rate": 1.3244172716851357e-05, "loss": 0.2587, "step": 3469 }, { "epoch": 1.060837664322837, "grad_norm": 2.0787360668182373, "learning_rate": 1.3247993886129156e-05, "loss": 0.3116, "step": 3470 }, { "epoch": 1.0611433812289819, "grad_norm": 5.4005560874938965, "learning_rate": 1.3251815055406954e-05, "loss": 0.3851, "step": 3471 }, { "epoch": 1.0614490981351268, "grad_norm": 1.0912325382232666, "learning_rate": 1.3255636224684753e-05, "loss": 0.223, "step": 3472 }, { "epoch": 1.061754815041272, "grad_norm": 0.40913817286491394, "learning_rate": 1.3259457393962552e-05, "loss": 0.1085, "step": 3473 }, { "epoch": 1.0620605319474168, "grad_norm": 0.3519333302974701, "learning_rate": 1.326327856324035e-05, "loss": 0.0869, "step": 3474 }, { "epoch": 1.0623662488535617, "grad_norm": 0.523308277130127, "learning_rate": 1.3267099732518151e-05, "loss": 0.1112, "step": 3475 }, { "epoch": 1.0626719657597066, "grad_norm": 0.4450378119945526, "learning_rate": 1.327092090179595e-05, "loss": 0.0713, "step": 3476 }, { "epoch": 1.0629776826658515, "grad_norm": 0.36327341198921204, "learning_rate": 1.3274742071073749e-05, "loss": 0.0608, "step": 3477 }, { "epoch": 1.0632833995719964, "grad_norm": 0.44589927792549133, "learning_rate": 1.3278563240351547e-05, "loss": 0.0832, "step": 3478 }, { "epoch": 1.0635891164781412, "grad_norm": 0.5375767946243286, "learning_rate": 1.3282384409629346e-05, "loss": 0.1027, "step": 3479 }, { "epoch": 1.0638948333842861, "grad_norm": 0.5024489164352417, "learning_rate": 1.3286205578907145e-05, "loss": 0.1178, "step": 3480 }, { "epoch": 1.064200550290431, "grad_norm": 0.3880492150783539, "learning_rate": 1.3290026748184944e-05, "loss": 0.0752, "step": 3481 }, { "epoch": 1.064506267196576, "grad_norm": 1.1360548734664917, "learning_rate": 1.3293847917462744e-05, "loss": 0.1306, "step": 3482 }, { "epoch": 1.0648119841027208, "grad_norm": 1.0456324815750122, "learning_rate": 1.3297669086740544e-05, "loss": 0.1213, "step": 3483 }, { "epoch": 1.0651177010088657, "grad_norm": 2.4843673706054688, "learning_rate": 1.3301490256018343e-05, "loss": 0.1253, "step": 3484 }, { "epoch": 1.0654234179150106, "grad_norm": 0.6335281729698181, "learning_rate": 1.3305311425296142e-05, "loss": 0.1406, "step": 3485 }, { "epoch": 1.0657291348211557, "grad_norm": 1.0818294286727905, "learning_rate": 1.330913259457394e-05, "loss": 0.1503, "step": 3486 }, { "epoch": 1.0660348517273006, "grad_norm": 0.630302906036377, "learning_rate": 1.331295376385174e-05, "loss": 0.1748, "step": 3487 }, { "epoch": 1.0663405686334455, "grad_norm": 0.7543172240257263, "learning_rate": 1.3316774933129538e-05, "loss": 0.1936, "step": 3488 }, { "epoch": 1.0666462855395904, "grad_norm": 1.0082054138183594, "learning_rate": 1.3320596102407337e-05, "loss": 0.2199, "step": 3489 }, { "epoch": 1.0669520024457353, "grad_norm": 0.7996132969856262, "learning_rate": 1.3324417271685136e-05, "loss": 0.1897, "step": 3490 }, { "epoch": 1.0672577193518802, "grad_norm": 2.2606875896453857, "learning_rate": 1.3328238440962936e-05, "loss": 0.2442, "step": 3491 }, { "epoch": 1.067563436258025, "grad_norm": 1.4335640668869019, "learning_rate": 1.3332059610240735e-05, "loss": 0.2443, "step": 3492 }, { "epoch": 1.06786915316417, "grad_norm": 1.2955760955810547, "learning_rate": 1.3335880779518534e-05, "loss": 0.2904, "step": 3493 }, { "epoch": 1.0681748700703149, "grad_norm": 1.5493338108062744, "learning_rate": 1.3339701948796332e-05, "loss": 0.2973, "step": 3494 }, { "epoch": 1.0684805869764598, "grad_norm": 1.4561494588851929, "learning_rate": 1.3343523118074131e-05, "loss": 0.3172, "step": 3495 }, { "epoch": 1.0687863038826046, "grad_norm": 2.662029504776001, "learning_rate": 1.334734428735193e-05, "loss": 0.3789, "step": 3496 }, { "epoch": 1.0690920207887495, "grad_norm": 0.6018199920654297, "learning_rate": 1.3351165456629729e-05, "loss": 0.1854, "step": 3497 }, { "epoch": 1.0693977376948944, "grad_norm": 0.4586171507835388, "learning_rate": 1.3354986625907529e-05, "loss": 0.1111, "step": 3498 }, { "epoch": 1.0697034546010395, "grad_norm": 0.42636045813560486, "learning_rate": 1.3358807795185328e-05, "loss": 0.091, "step": 3499 }, { "epoch": 1.0700091715071844, "grad_norm": 0.37149274349212646, "learning_rate": 1.3362628964463126e-05, "loss": 0.0681, "step": 3500 }, { "epoch": 1.0703148884133293, "grad_norm": 0.25305724143981934, "learning_rate": 1.3366450133740925e-05, "loss": 0.0828, "step": 3501 }, { "epoch": 1.0706206053194742, "grad_norm": 0.3333028554916382, "learning_rate": 1.3370271303018724e-05, "loss": 0.08, "step": 3502 }, { "epoch": 1.0709263222256191, "grad_norm": 0.5960842967033386, "learning_rate": 1.3374092472296523e-05, "loss": 0.0798, "step": 3503 }, { "epoch": 1.071232039131764, "grad_norm": 0.564896285533905, "learning_rate": 1.3377913641574321e-05, "loss": 0.0775, "step": 3504 }, { "epoch": 1.071537756037909, "grad_norm": 0.877511739730835, "learning_rate": 1.338173481085212e-05, "loss": 0.124, "step": 3505 }, { "epoch": 1.0718434729440538, "grad_norm": 0.40789899230003357, "learning_rate": 1.338555598012992e-05, "loss": 0.1007, "step": 3506 }, { "epoch": 1.0721491898501987, "grad_norm": 0.5487057566642761, "learning_rate": 1.338937714940772e-05, "loss": 0.1132, "step": 3507 }, { "epoch": 1.0724549067563436, "grad_norm": 0.7463929653167725, "learning_rate": 1.3393198318685518e-05, "loss": 0.1134, "step": 3508 }, { "epoch": 1.0727606236624885, "grad_norm": 0.7275164723396301, "learning_rate": 1.3397019487963317e-05, "loss": 0.1634, "step": 3509 }, { "epoch": 1.0730663405686334, "grad_norm": 0.581189751625061, "learning_rate": 1.3400840657241116e-05, "loss": 0.1381, "step": 3510 }, { "epoch": 1.0733720574747783, "grad_norm": 0.5896050333976746, "learning_rate": 1.3404661826518914e-05, "loss": 0.1667, "step": 3511 }, { "epoch": 1.0736777743809234, "grad_norm": 0.8883576989173889, "learning_rate": 1.3408482995796713e-05, "loss": 0.21, "step": 3512 }, { "epoch": 1.0739834912870683, "grad_norm": 0.7067463994026184, "learning_rate": 1.3412304165074512e-05, "loss": 0.2241, "step": 3513 }, { "epoch": 1.0742892081932132, "grad_norm": 1.2545331716537476, "learning_rate": 1.3416125334352312e-05, "loss": 0.215, "step": 3514 }, { "epoch": 1.074594925099358, "grad_norm": 1.0058159828186035, "learning_rate": 1.3419946503630111e-05, "loss": 0.2302, "step": 3515 }, { "epoch": 1.074900642005503, "grad_norm": 1.043565034866333, "learning_rate": 1.342376767290791e-05, "loss": 0.2375, "step": 3516 }, { "epoch": 1.0752063589116478, "grad_norm": 1.8136638402938843, "learning_rate": 1.3427588842185709e-05, "loss": 0.2461, "step": 3517 }, { "epoch": 1.0755120758177927, "grad_norm": 2.325191020965576, "learning_rate": 1.3431410011463507e-05, "loss": 0.2743, "step": 3518 }, { "epoch": 1.0758177927239376, "grad_norm": 1.3558809757232666, "learning_rate": 1.3435231180741306e-05, "loss": 0.2932, "step": 3519 }, { "epoch": 1.0761235096300825, "grad_norm": 1.4956409931182861, "learning_rate": 1.3439052350019105e-05, "loss": 0.264, "step": 3520 }, { "epoch": 1.0764292265362274, "grad_norm": 2.2279374599456787, "learning_rate": 1.3442873519296905e-05, "loss": 0.3538, "step": 3521 }, { "epoch": 1.0767349434423723, "grad_norm": 0.7089594006538391, "learning_rate": 1.3446694688574706e-05, "loss": 0.2116, "step": 3522 }, { "epoch": 1.0770406603485172, "grad_norm": 0.5871909260749817, "learning_rate": 1.3450515857852504e-05, "loss": 0.1163, "step": 3523 }, { "epoch": 1.077346377254662, "grad_norm": 0.3462996780872345, "learning_rate": 1.3454337027130303e-05, "loss": 0.1098, "step": 3524 }, { "epoch": 1.0776520941608072, "grad_norm": 0.3832639157772064, "learning_rate": 1.3458158196408102e-05, "loss": 0.1097, "step": 3525 }, { "epoch": 1.077957811066952, "grad_norm": 0.3706163465976715, "learning_rate": 1.34619793656859e-05, "loss": 0.079, "step": 3526 }, { "epoch": 1.078263527973097, "grad_norm": 0.3760715425014496, "learning_rate": 1.34658005349637e-05, "loss": 0.0699, "step": 3527 }, { "epoch": 1.0785692448792419, "grad_norm": 0.416456937789917, "learning_rate": 1.3469621704241498e-05, "loss": 0.111, "step": 3528 }, { "epoch": 1.0788749617853868, "grad_norm": 0.42599523067474365, "learning_rate": 1.3473442873519297e-05, "loss": 0.0746, "step": 3529 }, { "epoch": 1.0791806786915317, "grad_norm": 0.5888670086860657, "learning_rate": 1.3477264042797097e-05, "loss": 0.1159, "step": 3530 }, { "epoch": 1.0794863955976766, "grad_norm": 0.4852854013442993, "learning_rate": 1.3481085212074896e-05, "loss": 0.0947, "step": 3531 }, { "epoch": 1.0797921125038215, "grad_norm": 0.41799622774124146, "learning_rate": 1.3484906381352695e-05, "loss": 0.1063, "step": 3532 }, { "epoch": 1.0800978294099663, "grad_norm": 0.5467134118080139, "learning_rate": 1.3488727550630494e-05, "loss": 0.1472, "step": 3533 }, { "epoch": 1.0804035463161112, "grad_norm": 0.4486512541770935, "learning_rate": 1.3492548719908292e-05, "loss": 0.1309, "step": 3534 }, { "epoch": 1.0807092632222561, "grad_norm": 0.7100693583488464, "learning_rate": 1.3496369889186091e-05, "loss": 0.1482, "step": 3535 }, { "epoch": 1.081014980128401, "grad_norm": 0.7605172991752625, "learning_rate": 1.350019105846389e-05, "loss": 0.1808, "step": 3536 }, { "epoch": 1.081320697034546, "grad_norm": 0.7627620697021484, "learning_rate": 1.350401222774169e-05, "loss": 0.1885, "step": 3537 }, { "epoch": 1.081626413940691, "grad_norm": 0.7717726230621338, "learning_rate": 1.3507833397019489e-05, "loss": 0.1946, "step": 3538 }, { "epoch": 1.081932130846836, "grad_norm": 1.3748269081115723, "learning_rate": 1.3511654566297288e-05, "loss": 0.2533, "step": 3539 }, { "epoch": 1.0822378477529808, "grad_norm": 1.0728331804275513, "learning_rate": 1.3515475735575086e-05, "loss": 0.27, "step": 3540 }, { "epoch": 1.0825435646591257, "grad_norm": 1.0784944295883179, "learning_rate": 1.3519296904852885e-05, "loss": 0.24, "step": 3541 }, { "epoch": 1.0828492815652706, "grad_norm": 1.1592758893966675, "learning_rate": 1.3523118074130684e-05, "loss": 0.1833, "step": 3542 }, { "epoch": 1.0831549984714155, "grad_norm": 1.6536858081817627, "learning_rate": 1.3526939243408483e-05, "loss": 0.2211, "step": 3543 }, { "epoch": 1.0834607153775604, "grad_norm": 1.4424371719360352, "learning_rate": 1.3530760412686281e-05, "loss": 0.2851, "step": 3544 }, { "epoch": 1.0837664322837053, "grad_norm": 1.8731298446655273, "learning_rate": 1.3534581581964082e-05, "loss": 0.3547, "step": 3545 }, { "epoch": 1.0840721491898502, "grad_norm": 1.8920941352844238, "learning_rate": 1.353840275124188e-05, "loss": 0.3443, "step": 3546 }, { "epoch": 1.084377866095995, "grad_norm": 0.4641076326370239, "learning_rate": 1.354222392051968e-05, "loss": 0.1781, "step": 3547 }, { "epoch": 1.08468358300214, "grad_norm": 0.4040733575820923, "learning_rate": 1.3546045089797478e-05, "loss": 0.1247, "step": 3548 }, { "epoch": 1.0849892999082849, "grad_norm": 0.6070154905319214, "learning_rate": 1.3549866259075277e-05, "loss": 0.0837, "step": 3549 }, { "epoch": 1.0852950168144297, "grad_norm": 0.40338262915611267, "learning_rate": 1.3553687428353076e-05, "loss": 0.0783, "step": 3550 }, { "epoch": 1.0856007337205749, "grad_norm": 0.3081102967262268, "learning_rate": 1.3557508597630874e-05, "loss": 0.0641, "step": 3551 }, { "epoch": 1.0859064506267198, "grad_norm": 0.3776710331439972, "learning_rate": 1.3561329766908673e-05, "loss": 0.1094, "step": 3552 }, { "epoch": 1.0862121675328646, "grad_norm": 0.32527872920036316, "learning_rate": 1.3565150936186474e-05, "loss": 0.0645, "step": 3553 }, { "epoch": 1.0865178844390095, "grad_norm": 0.34349459409713745, "learning_rate": 1.3568972105464272e-05, "loss": 0.088, "step": 3554 }, { "epoch": 1.0868236013451544, "grad_norm": 0.41359972953796387, "learning_rate": 1.3572793274742071e-05, "loss": 0.0839, "step": 3555 }, { "epoch": 1.0871293182512993, "grad_norm": 0.32262715697288513, "learning_rate": 1.357661444401987e-05, "loss": 0.084, "step": 3556 }, { "epoch": 1.0874350351574442, "grad_norm": 1.1770823001861572, "learning_rate": 1.3580435613297668e-05, "loss": 0.1401, "step": 3557 }, { "epoch": 1.0877407520635891, "grad_norm": 0.4841969907283783, "learning_rate": 1.3584256782575467e-05, "loss": 0.0971, "step": 3558 }, { "epoch": 1.088046468969734, "grad_norm": 0.4191941022872925, "learning_rate": 1.3588077951853266e-05, "loss": 0.1141, "step": 3559 }, { "epoch": 1.088352185875879, "grad_norm": 0.5742045044898987, "learning_rate": 1.3591899121131066e-05, "loss": 0.1622, "step": 3560 }, { "epoch": 1.0886579027820238, "grad_norm": 0.5907508730888367, "learning_rate": 1.3595720290408867e-05, "loss": 0.1808, "step": 3561 }, { "epoch": 1.0889636196881687, "grad_norm": 0.7449549436569214, "learning_rate": 1.3599541459686666e-05, "loss": 0.171, "step": 3562 }, { "epoch": 1.0892693365943136, "grad_norm": 0.6538222432136536, "learning_rate": 1.3603362628964464e-05, "loss": 0.1798, "step": 3563 }, { "epoch": 1.0895750535004587, "grad_norm": 1.4824936389923096, "learning_rate": 1.3607183798242263e-05, "loss": 0.2041, "step": 3564 }, { "epoch": 1.0898807704066036, "grad_norm": 0.922738254070282, "learning_rate": 1.3611004967520062e-05, "loss": 0.2329, "step": 3565 }, { "epoch": 1.0901864873127485, "grad_norm": 0.9853188395500183, "learning_rate": 1.361482613679786e-05, "loss": 0.2574, "step": 3566 }, { "epoch": 1.0904922042188934, "grad_norm": 0.9640428423881531, "learning_rate": 1.361864730607566e-05, "loss": 0.229, "step": 3567 }, { "epoch": 1.0907979211250383, "grad_norm": 9.24087905883789, "learning_rate": 1.362246847535346e-05, "loss": 0.2626, "step": 3568 }, { "epoch": 1.0911036380311832, "grad_norm": 1.2924261093139648, "learning_rate": 1.3626289644631259e-05, "loss": 0.3055, "step": 3569 }, { "epoch": 1.091409354937328, "grad_norm": 1.6198550462722778, "learning_rate": 1.3630110813909057e-05, "loss": 0.311, "step": 3570 }, { "epoch": 1.091715071843473, "grad_norm": 2.416065216064453, "learning_rate": 1.3633931983186856e-05, "loss": 0.351, "step": 3571 }, { "epoch": 1.0920207887496178, "grad_norm": 0.7451707720756531, "learning_rate": 1.3637753152464655e-05, "loss": 0.2026, "step": 3572 }, { "epoch": 1.0923265056557627, "grad_norm": 0.3675883114337921, "learning_rate": 1.3641574321742453e-05, "loss": 0.1095, "step": 3573 }, { "epoch": 1.0926322225619076, "grad_norm": 0.6783490180969238, "learning_rate": 1.3645395491020252e-05, "loss": 0.084, "step": 3574 }, { "epoch": 1.0929379394680525, "grad_norm": 0.5891165137290955, "learning_rate": 1.3649216660298051e-05, "loss": 0.0898, "step": 3575 }, { "epoch": 1.0932436563741974, "grad_norm": 0.3628169298171997, "learning_rate": 1.3653037829575851e-05, "loss": 0.0835, "step": 3576 }, { "epoch": 1.0935493732803425, "grad_norm": 0.6774737238883972, "learning_rate": 1.365685899885365e-05, "loss": 0.0635, "step": 3577 }, { "epoch": 1.0938550901864874, "grad_norm": 0.3621596097946167, "learning_rate": 1.3660680168131449e-05, "loss": 0.0839, "step": 3578 }, { "epoch": 1.0941608070926323, "grad_norm": 0.4963834881782532, "learning_rate": 1.3664501337409248e-05, "loss": 0.1118, "step": 3579 }, { "epoch": 1.0944665239987772, "grad_norm": 1.0009651184082031, "learning_rate": 1.3668322506687046e-05, "loss": 0.0756, "step": 3580 }, { "epoch": 1.094772240904922, "grad_norm": 0.6557985544204712, "learning_rate": 1.3672143675964845e-05, "loss": 0.0782, "step": 3581 }, { "epoch": 1.095077957811067, "grad_norm": 0.48790231347084045, "learning_rate": 1.3675964845242644e-05, "loss": 0.1172, "step": 3582 }, { "epoch": 1.0953836747172119, "grad_norm": 0.4715130627155304, "learning_rate": 1.3679786014520443e-05, "loss": 0.0885, "step": 3583 }, { "epoch": 1.0956893916233568, "grad_norm": 0.5325279831886292, "learning_rate": 1.3683607183798243e-05, "loss": 0.1009, "step": 3584 }, { "epoch": 1.0959951085295017, "grad_norm": 0.7422232627868652, "learning_rate": 1.3687428353076042e-05, "loss": 0.201, "step": 3585 }, { "epoch": 1.0963008254356466, "grad_norm": 0.7911832928657532, "learning_rate": 1.369124952235384e-05, "loss": 0.1779, "step": 3586 }, { "epoch": 1.0966065423417914, "grad_norm": 0.6884647607803345, "learning_rate": 1.369507069163164e-05, "loss": 0.1861, "step": 3587 }, { "epoch": 1.0969122592479363, "grad_norm": 0.9643663763999939, "learning_rate": 1.3698891860909438e-05, "loss": 0.209, "step": 3588 }, { "epoch": 1.0972179761540812, "grad_norm": 0.6985158324241638, "learning_rate": 1.3702713030187237e-05, "loss": 0.2083, "step": 3589 }, { "epoch": 1.0975236930602263, "grad_norm": 1.3170199394226074, "learning_rate": 1.3706534199465036e-05, "loss": 0.2367, "step": 3590 }, { "epoch": 1.0978294099663712, "grad_norm": 1.1512011289596558, "learning_rate": 1.3710355368742836e-05, "loss": 0.2435, "step": 3591 }, { "epoch": 1.0981351268725161, "grad_norm": 1.4257317781448364, "learning_rate": 1.3714176538020635e-05, "loss": 0.2183, "step": 3592 }, { "epoch": 1.098440843778661, "grad_norm": 1.6157718896865845, "learning_rate": 1.3717997707298433e-05, "loss": 0.2465, "step": 3593 }, { "epoch": 1.098746560684806, "grad_norm": 1.302146077156067, "learning_rate": 1.3721818876576232e-05, "loss": 0.2306, "step": 3594 }, { "epoch": 1.0990522775909508, "grad_norm": 1.887865424156189, "learning_rate": 1.3725640045854031e-05, "loss": 0.3197, "step": 3595 }, { "epoch": 1.0993579944970957, "grad_norm": 2.546576976776123, "learning_rate": 1.372946121513183e-05, "loss": 0.3701, "step": 3596 }, { "epoch": 1.0996637114032406, "grad_norm": 0.48241159319877625, "learning_rate": 1.3733282384409628e-05, "loss": 0.1851, "step": 3597 }, { "epoch": 1.0999694283093855, "grad_norm": 0.41008830070495605, "learning_rate": 1.3737103553687427e-05, "loss": 0.1029, "step": 3598 }, { "epoch": 1.1002751452155304, "grad_norm": 0.8847017884254456, "learning_rate": 1.374092472296523e-05, "loss": 0.1006, "step": 3599 }, { "epoch": 1.1005808621216753, "grad_norm": 0.35807421803474426, "learning_rate": 1.3744745892243028e-05, "loss": 0.0937, "step": 3600 }, { "epoch": 1.1008865790278202, "grad_norm": 0.38815319538116455, "learning_rate": 1.3748567061520827e-05, "loss": 0.0742, "step": 3601 }, { "epoch": 1.101192295933965, "grad_norm": 0.4144587218761444, "learning_rate": 1.3752388230798626e-05, "loss": 0.0813, "step": 3602 }, { "epoch": 1.1014980128401102, "grad_norm": 0.5315117239952087, "learning_rate": 1.3756209400076424e-05, "loss": 0.0917, "step": 3603 }, { "epoch": 1.101803729746255, "grad_norm": 0.8662590384483337, "learning_rate": 1.3760030569354223e-05, "loss": 0.103, "step": 3604 }, { "epoch": 1.1021094466524, "grad_norm": 0.549457311630249, "learning_rate": 1.3763851738632022e-05, "loss": 0.1116, "step": 3605 }, { "epoch": 1.1024151635585449, "grad_norm": 0.5030964612960815, "learning_rate": 1.376767290790982e-05, "loss": 0.0833, "step": 3606 }, { "epoch": 1.1027208804646897, "grad_norm": 0.6444730758666992, "learning_rate": 1.3771494077187621e-05, "loss": 0.1295, "step": 3607 }, { "epoch": 1.1030265973708346, "grad_norm": 0.6001731753349304, "learning_rate": 1.377531524646542e-05, "loss": 0.1448, "step": 3608 }, { "epoch": 1.1033323142769795, "grad_norm": 0.533423662185669, "learning_rate": 1.3779136415743218e-05, "loss": 0.175, "step": 3609 }, { "epoch": 1.1036380311831244, "grad_norm": 0.8740178346633911, "learning_rate": 1.3782957585021017e-05, "loss": 0.1476, "step": 3610 }, { "epoch": 1.1039437480892693, "grad_norm": 0.6171881556510925, "learning_rate": 1.3786778754298816e-05, "loss": 0.1533, "step": 3611 }, { "epoch": 1.1042494649954142, "grad_norm": 0.6815191507339478, "learning_rate": 1.3790599923576615e-05, "loss": 0.1864, "step": 3612 }, { "epoch": 1.104555181901559, "grad_norm": 0.6901628971099854, "learning_rate": 1.3794421092854413e-05, "loss": 0.1923, "step": 3613 }, { "epoch": 1.104860898807704, "grad_norm": 1.1416395902633667, "learning_rate": 1.3798242262132212e-05, "loss": 0.2305, "step": 3614 }, { "epoch": 1.1051666157138489, "grad_norm": 1.0246989727020264, "learning_rate": 1.3802063431410013e-05, "loss": 0.2179, "step": 3615 }, { "epoch": 1.105472332619994, "grad_norm": 1.0003010034561157, "learning_rate": 1.3805884600687811e-05, "loss": 0.2372, "step": 3616 }, { "epoch": 1.105778049526139, "grad_norm": 1.061488389968872, "learning_rate": 1.380970576996561e-05, "loss": 0.2782, "step": 3617 }, { "epoch": 1.1060837664322838, "grad_norm": 1.0477124452590942, "learning_rate": 1.3813526939243409e-05, "loss": 0.2455, "step": 3618 }, { "epoch": 1.1063894833384287, "grad_norm": 1.3805516958236694, "learning_rate": 1.3817348108521208e-05, "loss": 0.2827, "step": 3619 }, { "epoch": 1.1066952002445736, "grad_norm": 1.802405834197998, "learning_rate": 1.3821169277799006e-05, "loss": 0.3245, "step": 3620 }, { "epoch": 1.1070009171507185, "grad_norm": 3.401224136352539, "learning_rate": 1.3824990447076805e-05, "loss": 0.3701, "step": 3621 }, { "epoch": 1.1073066340568634, "grad_norm": 0.5032484531402588, "learning_rate": 1.3828811616354604e-05, "loss": 0.168, "step": 3622 }, { "epoch": 1.1076123509630083, "grad_norm": 0.41003942489624023, "learning_rate": 1.3832632785632404e-05, "loss": 0.1297, "step": 3623 }, { "epoch": 1.1079180678691531, "grad_norm": 0.36163872480392456, "learning_rate": 1.3836453954910203e-05, "loss": 0.1082, "step": 3624 }, { "epoch": 1.108223784775298, "grad_norm": 0.28793492913246155, "learning_rate": 1.3840275124188002e-05, "loss": 0.0717, "step": 3625 }, { "epoch": 1.108529501681443, "grad_norm": 0.2873941659927368, "learning_rate": 1.38440962934658e-05, "loss": 0.0712, "step": 3626 }, { "epoch": 1.1088352185875878, "grad_norm": 0.4013200104236603, "learning_rate": 1.38479174627436e-05, "loss": 0.0809, "step": 3627 }, { "epoch": 1.1091409354937327, "grad_norm": 0.6286357641220093, "learning_rate": 1.3851738632021398e-05, "loss": 0.1039, "step": 3628 }, { "epoch": 1.1094466523998778, "grad_norm": 0.3838531970977783, "learning_rate": 1.3855559801299197e-05, "loss": 0.0779, "step": 3629 }, { "epoch": 1.1097523693060227, "grad_norm": 0.37348291277885437, "learning_rate": 1.3859380970576997e-05, "loss": 0.1005, "step": 3630 }, { "epoch": 1.1100580862121676, "grad_norm": 0.46390050649642944, "learning_rate": 1.3863202139854796e-05, "loss": 0.118, "step": 3631 }, { "epoch": 1.1103638031183125, "grad_norm": 0.6812679767608643, "learning_rate": 1.3867023309132595e-05, "loss": 0.1372, "step": 3632 }, { "epoch": 1.1106695200244574, "grad_norm": 0.39647936820983887, "learning_rate": 1.3870844478410393e-05, "loss": 0.1203, "step": 3633 }, { "epoch": 1.1109752369306023, "grad_norm": 0.4959317147731781, "learning_rate": 1.3874665647688192e-05, "loss": 0.113, "step": 3634 }, { "epoch": 1.1112809538367472, "grad_norm": 1.0240930318832397, "learning_rate": 1.3878486816965991e-05, "loss": 0.15, "step": 3635 }, { "epoch": 1.111586670742892, "grad_norm": 0.6616315841674805, "learning_rate": 1.388230798624379e-05, "loss": 0.1329, "step": 3636 }, { "epoch": 1.111892387649037, "grad_norm": 0.7206836342811584, "learning_rate": 1.3886129155521588e-05, "loss": 0.2102, "step": 3637 }, { "epoch": 1.1121981045551819, "grad_norm": 0.680724561214447, "learning_rate": 1.388995032479939e-05, "loss": 0.1927, "step": 3638 }, { "epoch": 1.1125038214613268, "grad_norm": 1.1669903993606567, "learning_rate": 1.389377149407719e-05, "loss": 0.2645, "step": 3639 }, { "epoch": 1.1128095383674717, "grad_norm": 0.904487133026123, "learning_rate": 1.3897592663354988e-05, "loss": 0.2641, "step": 3640 }, { "epoch": 1.1131152552736165, "grad_norm": 2.1362130641937256, "learning_rate": 1.3901413832632787e-05, "loss": 0.2042, "step": 3641 }, { "epoch": 1.1134209721797614, "grad_norm": 1.750469446182251, "learning_rate": 1.3905235001910586e-05, "loss": 0.2549, "step": 3642 }, { "epoch": 1.1137266890859066, "grad_norm": 0.8521108031272888, "learning_rate": 1.3909056171188384e-05, "loss": 0.2397, "step": 3643 }, { "epoch": 1.1140324059920514, "grad_norm": 1.4793767929077148, "learning_rate": 1.3912877340466183e-05, "loss": 0.2374, "step": 3644 }, { "epoch": 1.1143381228981963, "grad_norm": 1.2826224565505981, "learning_rate": 1.3916698509743982e-05, "loss": 0.2933, "step": 3645 }, { "epoch": 1.1146438398043412, "grad_norm": 2.030794620513916, "learning_rate": 1.3920519679021782e-05, "loss": 0.3919, "step": 3646 }, { "epoch": 1.1149495567104861, "grad_norm": 0.6451963186264038, "learning_rate": 1.3924340848299581e-05, "loss": 0.2033, "step": 3647 }, { "epoch": 1.115255273616631, "grad_norm": 0.3902256488800049, "learning_rate": 1.392816201757738e-05, "loss": 0.1133, "step": 3648 }, { "epoch": 1.115560990522776, "grad_norm": 0.39578482508659363, "learning_rate": 1.3931983186855178e-05, "loss": 0.092, "step": 3649 }, { "epoch": 1.1158667074289208, "grad_norm": 0.4145074784755707, "learning_rate": 1.3935804356132977e-05, "loss": 0.0765, "step": 3650 }, { "epoch": 1.1161724243350657, "grad_norm": 0.3971862494945526, "learning_rate": 1.3939625525410776e-05, "loss": 0.0763, "step": 3651 }, { "epoch": 1.1164781412412106, "grad_norm": 0.32055434584617615, "learning_rate": 1.3943446694688575e-05, "loss": 0.093, "step": 3652 }, { "epoch": 1.1167838581473555, "grad_norm": 0.3881145417690277, "learning_rate": 1.3947267863966373e-05, "loss": 0.0925, "step": 3653 }, { "epoch": 1.1170895750535004, "grad_norm": 0.39582347869873047, "learning_rate": 1.3951089033244174e-05, "loss": 0.0766, "step": 3654 }, { "epoch": 1.1173952919596453, "grad_norm": 0.44554805755615234, "learning_rate": 1.3954910202521973e-05, "loss": 0.0761, "step": 3655 }, { "epoch": 1.1177010088657904, "grad_norm": 0.3877214193344116, "learning_rate": 1.3958731371799771e-05, "loss": 0.1108, "step": 3656 }, { "epoch": 1.1180067257719353, "grad_norm": 0.40304115414619446, "learning_rate": 1.396255254107757e-05, "loss": 0.1079, "step": 3657 }, { "epoch": 1.1183124426780802, "grad_norm": 0.577410101890564, "learning_rate": 1.3966373710355369e-05, "loss": 0.1393, "step": 3658 }, { "epoch": 1.118618159584225, "grad_norm": 0.5461241602897644, "learning_rate": 1.3970194879633168e-05, "loss": 0.1749, "step": 3659 }, { "epoch": 1.11892387649037, "grad_norm": 0.6546995043754578, "learning_rate": 1.3974016048910966e-05, "loss": 0.129, "step": 3660 }, { "epoch": 1.1192295933965148, "grad_norm": 0.6424639225006104, "learning_rate": 1.3977837218188767e-05, "loss": 0.1744, "step": 3661 }, { "epoch": 1.1195353103026597, "grad_norm": 0.5914255380630493, "learning_rate": 1.3981658387466565e-05, "loss": 0.1685, "step": 3662 }, { "epoch": 1.1198410272088046, "grad_norm": 0.8409000039100647, "learning_rate": 1.3985479556744364e-05, "loss": 0.2215, "step": 3663 }, { "epoch": 1.1201467441149495, "grad_norm": 1.6313433647155762, "learning_rate": 1.3989300726022163e-05, "loss": 0.23, "step": 3664 }, { "epoch": 1.1204524610210944, "grad_norm": 1.259299635887146, "learning_rate": 1.3993121895299962e-05, "loss": 0.2229, "step": 3665 }, { "epoch": 1.1207581779272393, "grad_norm": 1.0495622158050537, "learning_rate": 1.399694306457776e-05, "loss": 0.2353, "step": 3666 }, { "epoch": 1.1210638948333842, "grad_norm": 1.0486443042755127, "learning_rate": 1.400076423385556e-05, "loss": 0.2809, "step": 3667 }, { "epoch": 1.121369611739529, "grad_norm": 1.0372015237808228, "learning_rate": 1.4004585403133358e-05, "loss": 0.2244, "step": 3668 }, { "epoch": 1.1216753286456742, "grad_norm": 1.3866404294967651, "learning_rate": 1.4008406572411158e-05, "loss": 0.2816, "step": 3669 }, { "epoch": 1.121981045551819, "grad_norm": 1.1129133701324463, "learning_rate": 1.4012227741688957e-05, "loss": 0.2561, "step": 3670 }, { "epoch": 1.122286762457964, "grad_norm": 3.368788719177246, "learning_rate": 1.4016048910966756e-05, "loss": 0.337, "step": 3671 }, { "epoch": 1.1225924793641089, "grad_norm": 0.5434319972991943, "learning_rate": 1.4019870080244555e-05, "loss": 0.1963, "step": 3672 }, { "epoch": 1.1228981962702538, "grad_norm": 0.37834519147872925, "learning_rate": 1.4023691249522353e-05, "loss": 0.1113, "step": 3673 }, { "epoch": 1.1232039131763987, "grad_norm": 0.2956380248069763, "learning_rate": 1.4027512418800152e-05, "loss": 0.0777, "step": 3674 }, { "epoch": 1.1235096300825436, "grad_norm": 0.3306541442871094, "learning_rate": 1.4031333588077951e-05, "loss": 0.0721, "step": 3675 }, { "epoch": 1.1238153469886885, "grad_norm": 0.33198100328445435, "learning_rate": 1.4035154757355751e-05, "loss": 0.0736, "step": 3676 }, { "epoch": 1.1241210638948334, "grad_norm": 0.4119931757450104, "learning_rate": 1.4038975926633552e-05, "loss": 0.0745, "step": 3677 }, { "epoch": 1.1244267808009782, "grad_norm": 0.3792274594306946, "learning_rate": 1.404279709591135e-05, "loss": 0.0764, "step": 3678 }, { "epoch": 1.1247324977071231, "grad_norm": 0.44131675362586975, "learning_rate": 1.404661826518915e-05, "loss": 0.0896, "step": 3679 }, { "epoch": 1.125038214613268, "grad_norm": 0.5058593153953552, "learning_rate": 1.4050439434466948e-05, "loss": 0.106, "step": 3680 }, { "epoch": 1.1253439315194131, "grad_norm": 0.3046987056732178, "learning_rate": 1.4054260603744747e-05, "loss": 0.0699, "step": 3681 }, { "epoch": 1.1256496484255578, "grad_norm": 0.5419811010360718, "learning_rate": 1.4058081773022545e-05, "loss": 0.1517, "step": 3682 }, { "epoch": 1.125955365331703, "grad_norm": 0.5876270532608032, "learning_rate": 1.4061902942300344e-05, "loss": 0.1478, "step": 3683 }, { "epoch": 1.1262610822378478, "grad_norm": 1.0583444833755493, "learning_rate": 1.4065724111578143e-05, "loss": 0.1252, "step": 3684 }, { "epoch": 1.1265667991439927, "grad_norm": 0.5101488828659058, "learning_rate": 1.4069545280855943e-05, "loss": 0.1434, "step": 3685 }, { "epoch": 1.1268725160501376, "grad_norm": 0.6470577716827393, "learning_rate": 1.4073366450133742e-05, "loss": 0.1776, "step": 3686 }, { "epoch": 1.1271782329562825, "grad_norm": 0.5712004899978638, "learning_rate": 1.4077187619411541e-05, "loss": 0.1645, "step": 3687 }, { "epoch": 1.1274839498624274, "grad_norm": 0.6475619673728943, "learning_rate": 1.408100878868934e-05, "loss": 0.1791, "step": 3688 }, { "epoch": 1.1277896667685723, "grad_norm": 0.6398759484291077, "learning_rate": 1.4084829957967138e-05, "loss": 0.1876, "step": 3689 }, { "epoch": 1.1280953836747172, "grad_norm": 0.77228844165802, "learning_rate": 1.4088651127244937e-05, "loss": 0.2094, "step": 3690 }, { "epoch": 1.128401100580862, "grad_norm": 0.9232661724090576, "learning_rate": 1.4092472296522736e-05, "loss": 0.2542, "step": 3691 }, { "epoch": 1.128706817487007, "grad_norm": 0.9136829972267151, "learning_rate": 1.4096293465800536e-05, "loss": 0.2534, "step": 3692 }, { "epoch": 1.1290125343931519, "grad_norm": 0.9057154655456543, "learning_rate": 1.4100114635078335e-05, "loss": 0.2161, "step": 3693 }, { "epoch": 1.129318251299297, "grad_norm": 0.972487211227417, "learning_rate": 1.4103935804356134e-05, "loss": 0.2473, "step": 3694 }, { "epoch": 1.1296239682054416, "grad_norm": 1.4198299646377563, "learning_rate": 1.4107756973633933e-05, "loss": 0.3219, "step": 3695 }, { "epoch": 1.1299296851115868, "grad_norm": 1.7074863910675049, "learning_rate": 1.4111578142911731e-05, "loss": 0.3769, "step": 3696 }, { "epoch": 1.1302354020177316, "grad_norm": 0.6387917995452881, "learning_rate": 1.411539931218953e-05, "loss": 0.2397, "step": 3697 }, { "epoch": 1.1305411189238765, "grad_norm": 0.5965946912765503, "learning_rate": 1.4119220481467329e-05, "loss": 0.1005, "step": 3698 }, { "epoch": 1.1308468358300214, "grad_norm": 1.1823596954345703, "learning_rate": 1.4123041650745127e-05, "loss": 0.0877, "step": 3699 }, { "epoch": 1.1311525527361663, "grad_norm": 0.3112819194793701, "learning_rate": 1.4126862820022928e-05, "loss": 0.0742, "step": 3700 }, { "epoch": 1.1314582696423112, "grad_norm": 0.31130290031433105, "learning_rate": 1.4130683989300727e-05, "loss": 0.0756, "step": 3701 }, { "epoch": 1.1317639865484561, "grad_norm": 0.2719098627567291, "learning_rate": 1.4134505158578525e-05, "loss": 0.062, "step": 3702 }, { "epoch": 1.132069703454601, "grad_norm": 0.5055156350135803, "learning_rate": 1.4138326327856324e-05, "loss": 0.1237, "step": 3703 }, { "epoch": 1.132375420360746, "grad_norm": 0.7972448468208313, "learning_rate": 1.4142147497134123e-05, "loss": 0.0768, "step": 3704 }, { "epoch": 1.1326811372668908, "grad_norm": 0.8571481704711914, "learning_rate": 1.4145968666411922e-05, "loss": 0.0997, "step": 3705 }, { "epoch": 1.1329868541730357, "grad_norm": 0.7544311881065369, "learning_rate": 1.414978983568972e-05, "loss": 0.0862, "step": 3706 }, { "epoch": 1.1332925710791808, "grad_norm": 0.4160862863063812, "learning_rate": 1.4153611004967519e-05, "loss": 0.1065, "step": 3707 }, { "epoch": 1.1335982879853255, "grad_norm": 0.9249362945556641, "learning_rate": 1.415743217424532e-05, "loss": 0.1184, "step": 3708 }, { "epoch": 1.1339040048914706, "grad_norm": 0.41892093420028687, "learning_rate": 1.4161253343523118e-05, "loss": 0.1092, "step": 3709 }, { "epoch": 1.1342097217976155, "grad_norm": 0.9017132520675659, "learning_rate": 1.4165074512800917e-05, "loss": 0.1968, "step": 3710 }, { "epoch": 1.1345154387037604, "grad_norm": 0.6710590720176697, "learning_rate": 1.4168895682078716e-05, "loss": 0.1894, "step": 3711 }, { "epoch": 1.1348211556099053, "grad_norm": 0.6322555541992188, "learning_rate": 1.4172716851356515e-05, "loss": 0.1901, "step": 3712 }, { "epoch": 1.1351268725160502, "grad_norm": 0.8428823947906494, "learning_rate": 1.4176538020634313e-05, "loss": 0.2053, "step": 3713 }, { "epoch": 1.135432589422195, "grad_norm": 0.9170833230018616, "learning_rate": 1.4180359189912112e-05, "loss": 0.2289, "step": 3714 }, { "epoch": 1.13573830632834, "grad_norm": 0.7728933691978455, "learning_rate": 1.4184180359189912e-05, "loss": 0.2061, "step": 3715 }, { "epoch": 1.1360440232344848, "grad_norm": 0.8760802745819092, "learning_rate": 1.4188001528467713e-05, "loss": 0.227, "step": 3716 }, { "epoch": 1.1363497401406297, "grad_norm": 1.0362374782562256, "learning_rate": 1.4191822697745512e-05, "loss": 0.2419, "step": 3717 }, { "epoch": 1.1366554570467746, "grad_norm": 0.9934778213500977, "learning_rate": 1.419564386702331e-05, "loss": 0.2445, "step": 3718 }, { "epoch": 1.1369611739529195, "grad_norm": 0.9961568713188171, "learning_rate": 1.419946503630111e-05, "loss": 0.2676, "step": 3719 }, { "epoch": 1.1372668908590646, "grad_norm": 1.074027419090271, "learning_rate": 1.4203286205578908e-05, "loss": 0.2487, "step": 3720 }, { "epoch": 1.1375726077652093, "grad_norm": 1.5971406698226929, "learning_rate": 1.4207107374856707e-05, "loss": 0.3258, "step": 3721 }, { "epoch": 1.1378783246713544, "grad_norm": 0.5217958092689514, "learning_rate": 1.4210928544134505e-05, "loss": 0.1699, "step": 3722 }, { "epoch": 1.1381840415774993, "grad_norm": 0.3537219762802124, "learning_rate": 1.4214749713412306e-05, "loss": 0.1048, "step": 3723 }, { "epoch": 1.1384897584836442, "grad_norm": 0.3402404189109802, "learning_rate": 1.4218570882690105e-05, "loss": 0.0971, "step": 3724 }, { "epoch": 1.138795475389789, "grad_norm": 0.40692827105522156, "learning_rate": 1.4222392051967903e-05, "loss": 0.0914, "step": 3725 }, { "epoch": 1.139101192295934, "grad_norm": 0.3845665156841278, "learning_rate": 1.4226213221245702e-05, "loss": 0.0859, "step": 3726 }, { "epoch": 1.1394069092020789, "grad_norm": 0.3267472982406616, "learning_rate": 1.42300343905235e-05, "loss": 0.072, "step": 3727 }, { "epoch": 1.1397126261082238, "grad_norm": 0.29019439220428467, "learning_rate": 1.42338555598013e-05, "loss": 0.0815, "step": 3728 }, { "epoch": 1.1400183430143687, "grad_norm": 0.3384203016757965, "learning_rate": 1.4237676729079098e-05, "loss": 0.0882, "step": 3729 }, { "epoch": 1.1403240599205136, "grad_norm": 0.3508310616016388, "learning_rate": 1.4241497898356897e-05, "loss": 0.0686, "step": 3730 }, { "epoch": 1.1406297768266584, "grad_norm": 0.4534098505973816, "learning_rate": 1.4245319067634697e-05, "loss": 0.0864, "step": 3731 }, { "epoch": 1.1409354937328033, "grad_norm": 0.5458396077156067, "learning_rate": 1.4249140236912496e-05, "loss": 0.1244, "step": 3732 }, { "epoch": 1.1412412106389485, "grad_norm": 0.5063936114311218, "learning_rate": 1.4252961406190295e-05, "loss": 0.1022, "step": 3733 }, { "epoch": 1.1415469275450931, "grad_norm": 0.5140089988708496, "learning_rate": 1.4256782575468094e-05, "loss": 0.1058, "step": 3734 }, { "epoch": 1.1418526444512382, "grad_norm": 0.5799130201339722, "learning_rate": 1.4260603744745892e-05, "loss": 0.1545, "step": 3735 }, { "epoch": 1.1421583613573831, "grad_norm": 0.5863983035087585, "learning_rate": 1.4264424914023691e-05, "loss": 0.139, "step": 3736 }, { "epoch": 1.142464078263528, "grad_norm": 0.8592391014099121, "learning_rate": 1.426824608330149e-05, "loss": 0.1985, "step": 3737 }, { "epoch": 1.142769795169673, "grad_norm": 0.7722941040992737, "learning_rate": 1.4272067252579289e-05, "loss": 0.1853, "step": 3738 }, { "epoch": 1.1430755120758178, "grad_norm": 0.8423188924789429, "learning_rate": 1.4275888421857089e-05, "loss": 0.2397, "step": 3739 }, { "epoch": 1.1433812289819627, "grad_norm": 0.7528480887413025, "learning_rate": 1.4279709591134888e-05, "loss": 0.214, "step": 3740 }, { "epoch": 1.1436869458881076, "grad_norm": 0.904456377029419, "learning_rate": 1.4283530760412687e-05, "loss": 0.2246, "step": 3741 }, { "epoch": 1.1439926627942525, "grad_norm": 0.9289185404777527, "learning_rate": 1.4287351929690485e-05, "loss": 0.2854, "step": 3742 }, { "epoch": 1.1442983797003974, "grad_norm": 1.1075178384780884, "learning_rate": 1.4291173098968284e-05, "loss": 0.2564, "step": 3743 }, { "epoch": 1.1446040966065423, "grad_norm": 1.0858914852142334, "learning_rate": 1.4294994268246083e-05, "loss": 0.261, "step": 3744 }, { "epoch": 1.1449098135126872, "grad_norm": 1.350962519645691, "learning_rate": 1.4298815437523882e-05, "loss": 0.3014, "step": 3745 }, { "epoch": 1.1452155304188323, "grad_norm": 1.9445886611938477, "learning_rate": 1.430263660680168e-05, "loss": 0.2989, "step": 3746 }, { "epoch": 1.145521247324977, "grad_norm": 0.6232193112373352, "learning_rate": 1.430645777607948e-05, "loss": 0.1925, "step": 3747 }, { "epoch": 1.145826964231122, "grad_norm": 0.3223530054092407, "learning_rate": 1.431027894535728e-05, "loss": 0.0993, "step": 3748 }, { "epoch": 1.146132681137267, "grad_norm": 0.4407971203327179, "learning_rate": 1.4314100114635078e-05, "loss": 0.1056, "step": 3749 }, { "epoch": 1.1464383980434119, "grad_norm": 0.47916993498802185, "learning_rate": 1.4317921283912877e-05, "loss": 0.0684, "step": 3750 }, { "epoch": 1.1467441149495567, "grad_norm": 0.37353837490081787, "learning_rate": 1.4321742453190676e-05, "loss": 0.0626, "step": 3751 }, { "epoch": 1.1470498318557016, "grad_norm": 0.3603862524032593, "learning_rate": 1.4325563622468475e-05, "loss": 0.0745, "step": 3752 }, { "epoch": 1.1473555487618465, "grad_norm": 0.534824788570404, "learning_rate": 1.4329384791746273e-05, "loss": 0.08, "step": 3753 }, { "epoch": 1.1476612656679914, "grad_norm": 0.5972107648849487, "learning_rate": 1.4333205961024074e-05, "loss": 0.1288, "step": 3754 }, { "epoch": 1.1479669825741363, "grad_norm": 0.4657822251319885, "learning_rate": 1.4337027130301874e-05, "loss": 0.0958, "step": 3755 }, { "epoch": 1.1482726994802812, "grad_norm": 0.458296537399292, "learning_rate": 1.4340848299579673e-05, "loss": 0.0933, "step": 3756 }, { "epoch": 1.148578416386426, "grad_norm": 0.4565827250480652, "learning_rate": 1.4344669468857472e-05, "loss": 0.0971, "step": 3757 }, { "epoch": 1.148884133292571, "grad_norm": 0.4581966698169708, "learning_rate": 1.434849063813527e-05, "loss": 0.1126, "step": 3758 }, { "epoch": 1.1491898501987161, "grad_norm": 0.8127873539924622, "learning_rate": 1.4352311807413069e-05, "loss": 0.131, "step": 3759 }, { "epoch": 1.1494955671048608, "grad_norm": 0.8841579556465149, "learning_rate": 1.4356132976690868e-05, "loss": 0.1745, "step": 3760 }, { "epoch": 1.149801284011006, "grad_norm": 0.8788304328918457, "learning_rate": 1.4359954145968667e-05, "loss": 0.1568, "step": 3761 }, { "epoch": 1.1501070009171508, "grad_norm": 0.6276860237121582, "learning_rate": 1.4363775315246467e-05, "loss": 0.1716, "step": 3762 }, { "epoch": 1.1504127178232957, "grad_norm": 2.947676658630371, "learning_rate": 1.4367596484524266e-05, "loss": 0.1931, "step": 3763 }, { "epoch": 1.1507184347294406, "grad_norm": 0.9101258516311646, "learning_rate": 1.4371417653802065e-05, "loss": 0.2236, "step": 3764 }, { "epoch": 1.1510241516355855, "grad_norm": 1.435482144355774, "learning_rate": 1.4375238823079863e-05, "loss": 0.2292, "step": 3765 }, { "epoch": 1.1513298685417304, "grad_norm": 1.0214934349060059, "learning_rate": 1.4379059992357662e-05, "loss": 0.2485, "step": 3766 }, { "epoch": 1.1516355854478753, "grad_norm": 1.0099118947982788, "learning_rate": 1.438288116163546e-05, "loss": 0.2272, "step": 3767 }, { "epoch": 1.1519413023540201, "grad_norm": 1.31706702709198, "learning_rate": 1.438670233091326e-05, "loss": 0.2511, "step": 3768 }, { "epoch": 1.152247019260165, "grad_norm": 2.0377330780029297, "learning_rate": 1.4390523500191058e-05, "loss": 0.2787, "step": 3769 }, { "epoch": 1.15255273616631, "grad_norm": 1.5653493404388428, "learning_rate": 1.4394344669468859e-05, "loss": 0.2489, "step": 3770 }, { "epoch": 1.1528584530724548, "grad_norm": 1.9279053211212158, "learning_rate": 1.4398165838746657e-05, "loss": 0.3399, "step": 3771 }, { "epoch": 1.1531641699786, "grad_norm": 0.5721120834350586, "learning_rate": 1.4401987008024456e-05, "loss": 0.1811, "step": 3772 }, { "epoch": 1.1534698868847446, "grad_norm": 0.5852449536323547, "learning_rate": 1.4405808177302255e-05, "loss": 0.0917, "step": 3773 }, { "epoch": 1.1537756037908897, "grad_norm": 0.35119128227233887, "learning_rate": 1.4409629346580054e-05, "loss": 0.0966, "step": 3774 }, { "epoch": 1.1540813206970346, "grad_norm": 0.4686698019504547, "learning_rate": 1.4413450515857852e-05, "loss": 0.0851, "step": 3775 }, { "epoch": 1.1543870376031795, "grad_norm": 0.45871084928512573, "learning_rate": 1.4417271685135651e-05, "loss": 0.0914, "step": 3776 }, { "epoch": 1.1546927545093244, "grad_norm": 0.641860842704773, "learning_rate": 1.442109285441345e-05, "loss": 0.0742, "step": 3777 }, { "epoch": 1.1549984714154693, "grad_norm": 0.407721608877182, "learning_rate": 1.442491402369125e-05, "loss": 0.0807, "step": 3778 }, { "epoch": 1.1553041883216142, "grad_norm": 0.4615117013454437, "learning_rate": 1.4428735192969049e-05, "loss": 0.0991, "step": 3779 }, { "epoch": 1.155609905227759, "grad_norm": 0.44508621096611023, "learning_rate": 1.4432556362246848e-05, "loss": 0.1313, "step": 3780 }, { "epoch": 1.155915622133904, "grad_norm": 0.4256969392299652, "learning_rate": 1.4436377531524647e-05, "loss": 0.0917, "step": 3781 }, { "epoch": 1.1562213390400489, "grad_norm": 0.33615049719810486, "learning_rate": 1.4440198700802445e-05, "loss": 0.1023, "step": 3782 }, { "epoch": 1.1565270559461938, "grad_norm": 0.4994864761829376, "learning_rate": 1.4444019870080244e-05, "loss": 0.1143, "step": 3783 }, { "epoch": 1.1568327728523387, "grad_norm": 0.5033902525901794, "learning_rate": 1.4447841039358043e-05, "loss": 0.121, "step": 3784 }, { "epoch": 1.1571384897584838, "grad_norm": 0.65464848279953, "learning_rate": 1.4451662208635843e-05, "loss": 0.1329, "step": 3785 }, { "epoch": 1.1574442066646284, "grad_norm": 0.5628464818000793, "learning_rate": 1.4455483377913642e-05, "loss": 0.1997, "step": 3786 }, { "epoch": 1.1577499235707736, "grad_norm": 0.5761443376541138, "learning_rate": 1.445930454719144e-05, "loss": 0.167, "step": 3787 }, { "epoch": 1.1580556404769184, "grad_norm": 0.5720776915550232, "learning_rate": 1.446312571646924e-05, "loss": 0.1953, "step": 3788 }, { "epoch": 1.1583613573830633, "grad_norm": 1.0978305339813232, "learning_rate": 1.4466946885747038e-05, "loss": 0.2518, "step": 3789 }, { "epoch": 1.1586670742892082, "grad_norm": 0.853173017501831, "learning_rate": 1.4470768055024837e-05, "loss": 0.2171, "step": 3790 }, { "epoch": 1.1589727911953531, "grad_norm": 0.8561983704566956, "learning_rate": 1.4474589224302636e-05, "loss": 0.2391, "step": 3791 }, { "epoch": 1.159278508101498, "grad_norm": 0.8729215264320374, "learning_rate": 1.4478410393580434e-05, "loss": 0.2579, "step": 3792 }, { "epoch": 1.159584225007643, "grad_norm": 1.043200969696045, "learning_rate": 1.4482231562858237e-05, "loss": 0.3093, "step": 3793 }, { "epoch": 1.1598899419137878, "grad_norm": 1.1264472007751465, "learning_rate": 1.4486052732136035e-05, "loss": 0.2938, "step": 3794 }, { "epoch": 1.1601956588199327, "grad_norm": 1.2651269435882568, "learning_rate": 1.4489873901413834e-05, "loss": 0.2771, "step": 3795 }, { "epoch": 1.1605013757260776, "grad_norm": 2.69063401222229, "learning_rate": 1.4493695070691633e-05, "loss": 0.3691, "step": 3796 }, { "epoch": 1.1608070926322225, "grad_norm": 0.473033607006073, "learning_rate": 1.4497516239969432e-05, "loss": 0.158, "step": 3797 }, { "epoch": 1.1611128095383676, "grad_norm": 0.4807337522506714, "learning_rate": 1.450133740924723e-05, "loss": 0.1342, "step": 3798 }, { "epoch": 1.1614185264445123, "grad_norm": 0.42810148000717163, "learning_rate": 1.4505158578525029e-05, "loss": 0.0995, "step": 3799 }, { "epoch": 1.1617242433506574, "grad_norm": 0.3413153886795044, "learning_rate": 1.4508979747802828e-05, "loss": 0.0909, "step": 3800 }, { "epoch": 1.1620299602568023, "grad_norm": 0.3260716497898102, "learning_rate": 1.4512800917080628e-05, "loss": 0.075, "step": 3801 }, { "epoch": 1.1623356771629472, "grad_norm": 0.5079697966575623, "learning_rate": 1.4516622086358427e-05, "loss": 0.0972, "step": 3802 }, { "epoch": 1.162641394069092, "grad_norm": 0.7211304306983948, "learning_rate": 1.4520443255636226e-05, "loss": 0.0868, "step": 3803 }, { "epoch": 1.162947110975237, "grad_norm": 0.28437793254852295, "learning_rate": 1.4524264424914024e-05, "loss": 0.0756, "step": 3804 }, { "epoch": 1.1632528278813818, "grad_norm": 0.34992438554763794, "learning_rate": 1.4528085594191823e-05, "loss": 0.0857, "step": 3805 }, { "epoch": 1.1635585447875267, "grad_norm": 0.47803762555122375, "learning_rate": 1.4531906763469622e-05, "loss": 0.0763, "step": 3806 }, { "epoch": 1.1638642616936716, "grad_norm": 0.49907350540161133, "learning_rate": 1.453572793274742e-05, "loss": 0.1178, "step": 3807 }, { "epoch": 1.1641699785998165, "grad_norm": 0.5263296365737915, "learning_rate": 1.453954910202522e-05, "loss": 0.1167, "step": 3808 }, { "epoch": 1.1644756955059614, "grad_norm": 0.5875982046127319, "learning_rate": 1.454337027130302e-05, "loss": 0.1073, "step": 3809 }, { "epoch": 1.1647814124121063, "grad_norm": 0.6535350680351257, "learning_rate": 1.4547191440580819e-05, "loss": 0.1504, "step": 3810 }, { "epoch": 1.1650871293182512, "grad_norm": 0.5117608904838562, "learning_rate": 1.4551012609858617e-05, "loss": 0.1691, "step": 3811 }, { "epoch": 1.165392846224396, "grad_norm": 0.64020174741745, "learning_rate": 1.4554833779136416e-05, "loss": 0.1943, "step": 3812 }, { "epoch": 1.1656985631305412, "grad_norm": 0.6095613241195679, "learning_rate": 1.4558654948414215e-05, "loss": 0.1826, "step": 3813 }, { "epoch": 1.166004280036686, "grad_norm": 1.5526913404464722, "learning_rate": 1.4562476117692014e-05, "loss": 0.198, "step": 3814 }, { "epoch": 1.166309996942831, "grad_norm": 0.8464053273200989, "learning_rate": 1.4566297286969812e-05, "loss": 0.2143, "step": 3815 }, { "epoch": 1.1666157138489759, "grad_norm": 0.8184626698493958, "learning_rate": 1.4570118456247613e-05, "loss": 0.2624, "step": 3816 }, { "epoch": 1.1669214307551208, "grad_norm": 1.0747004747390747, "learning_rate": 1.4573939625525412e-05, "loss": 0.2556, "step": 3817 }, { "epoch": 1.1672271476612657, "grad_norm": 2.350855588912964, "learning_rate": 1.457776079480321e-05, "loss": 0.2433, "step": 3818 }, { "epoch": 1.1675328645674106, "grad_norm": 1.127926230430603, "learning_rate": 1.4581581964081009e-05, "loss": 0.2879, "step": 3819 }, { "epoch": 1.1678385814735555, "grad_norm": 2.053006887435913, "learning_rate": 1.4585403133358808e-05, "loss": 0.2385, "step": 3820 }, { "epoch": 1.1681442983797004, "grad_norm": 2.878096103668213, "learning_rate": 1.4589224302636607e-05, "loss": 0.3923, "step": 3821 }, { "epoch": 1.1684500152858452, "grad_norm": 0.433889240026474, "learning_rate": 1.4593045471914405e-05, "loss": 0.1887, "step": 3822 }, { "epoch": 1.1687557321919901, "grad_norm": 0.43005862832069397, "learning_rate": 1.4596866641192204e-05, "loss": 0.1142, "step": 3823 }, { "epoch": 1.169061449098135, "grad_norm": 0.3559347093105316, "learning_rate": 1.4600687810470004e-05, "loss": 0.1034, "step": 3824 }, { "epoch": 1.16936716600428, "grad_norm": 0.5198719501495361, "learning_rate": 1.4604508979747803e-05, "loss": 0.07, "step": 3825 }, { "epoch": 1.169672882910425, "grad_norm": 0.34456461668014526, "learning_rate": 1.4608330149025602e-05, "loss": 0.0786, "step": 3826 }, { "epoch": 1.16997859981657, "grad_norm": 0.25803354382514954, "learning_rate": 1.46121513183034e-05, "loss": 0.0542, "step": 3827 }, { "epoch": 1.1702843167227148, "grad_norm": 0.4048537611961365, "learning_rate": 1.46159724875812e-05, "loss": 0.0698, "step": 3828 }, { "epoch": 1.1705900336288597, "grad_norm": 0.5077415704727173, "learning_rate": 1.4619793656858998e-05, "loss": 0.0882, "step": 3829 }, { "epoch": 1.1708957505350046, "grad_norm": 0.4358462691307068, "learning_rate": 1.4623614826136797e-05, "loss": 0.0893, "step": 3830 }, { "epoch": 1.1712014674411495, "grad_norm": 0.48541709780693054, "learning_rate": 1.4627435995414596e-05, "loss": 0.0777, "step": 3831 }, { "epoch": 1.1715071843472944, "grad_norm": 0.48510313034057617, "learning_rate": 1.4631257164692398e-05, "loss": 0.1032, "step": 3832 }, { "epoch": 1.1718129012534393, "grad_norm": 0.5200100541114807, "learning_rate": 1.4635078333970197e-05, "loss": 0.1086, "step": 3833 }, { "epoch": 1.1721186181595842, "grad_norm": 0.5826969742774963, "learning_rate": 1.4638899503247995e-05, "loss": 0.1269, "step": 3834 }, { "epoch": 1.172424335065729, "grad_norm": 0.8807849287986755, "learning_rate": 1.4642720672525794e-05, "loss": 0.1204, "step": 3835 }, { "epoch": 1.172730051971874, "grad_norm": 0.8566864132881165, "learning_rate": 1.4646541841803593e-05, "loss": 0.1539, "step": 3836 }, { "epoch": 1.1730357688780189, "grad_norm": 0.8132956624031067, "learning_rate": 1.4650363011081392e-05, "loss": 0.2018, "step": 3837 }, { "epoch": 1.1733414857841638, "grad_norm": 1.1463074684143066, "learning_rate": 1.465418418035919e-05, "loss": 0.2342, "step": 3838 }, { "epoch": 1.1736472026903089, "grad_norm": 0.7296339273452759, "learning_rate": 1.4658005349636989e-05, "loss": 0.1959, "step": 3839 }, { "epoch": 1.1739529195964538, "grad_norm": 0.8365247845649719, "learning_rate": 1.466182651891479e-05, "loss": 0.2044, "step": 3840 }, { "epoch": 1.1742586365025987, "grad_norm": 0.917330265045166, "learning_rate": 1.4665647688192588e-05, "loss": 0.2492, "step": 3841 }, { "epoch": 1.1745643534087435, "grad_norm": 1.1642893552780151, "learning_rate": 1.4669468857470387e-05, "loss": 0.2838, "step": 3842 }, { "epoch": 1.1748700703148884, "grad_norm": 1.2833925485610962, "learning_rate": 1.4673290026748186e-05, "loss": 0.2414, "step": 3843 }, { "epoch": 1.1751757872210333, "grad_norm": 1.3854161500930786, "learning_rate": 1.4677111196025984e-05, "loss": 0.2875, "step": 3844 }, { "epoch": 1.1754815041271782, "grad_norm": 3.9037532806396484, "learning_rate": 1.4680932365303783e-05, "loss": 0.3301, "step": 3845 }, { "epoch": 1.1757872210333231, "grad_norm": 1.8634957075119019, "learning_rate": 1.4684753534581582e-05, "loss": 0.3362, "step": 3846 }, { "epoch": 1.176092937939468, "grad_norm": 0.7142884731292725, "learning_rate": 1.468857470385938e-05, "loss": 0.1993, "step": 3847 }, { "epoch": 1.176398654845613, "grad_norm": 0.4491424560546875, "learning_rate": 1.4692395873137181e-05, "loss": 0.1261, "step": 3848 }, { "epoch": 1.1767043717517578, "grad_norm": 0.35146498680114746, "learning_rate": 1.469621704241498e-05, "loss": 0.1277, "step": 3849 }, { "epoch": 1.1770100886579027, "grad_norm": 0.48800498247146606, "learning_rate": 1.4700038211692779e-05, "loss": 0.0893, "step": 3850 }, { "epoch": 1.1773158055640476, "grad_norm": 0.3719889521598816, "learning_rate": 1.4703859380970577e-05, "loss": 0.0882, "step": 3851 }, { "epoch": 1.1776215224701927, "grad_norm": 0.36558881402015686, "learning_rate": 1.4707680550248376e-05, "loss": 0.0767, "step": 3852 }, { "epoch": 1.1779272393763376, "grad_norm": 0.4011582136154175, "learning_rate": 1.4711501719526175e-05, "loss": 0.109, "step": 3853 }, { "epoch": 1.1782329562824825, "grad_norm": 0.3286702036857605, "learning_rate": 1.4715322888803974e-05, "loss": 0.0811, "step": 3854 }, { "epoch": 1.1785386731886274, "grad_norm": 0.3633192181587219, "learning_rate": 1.4719144058081774e-05, "loss": 0.0757, "step": 3855 }, { "epoch": 1.1788443900947723, "grad_norm": 0.48483335971832275, "learning_rate": 1.4722965227359573e-05, "loss": 0.095, "step": 3856 }, { "epoch": 1.1791501070009172, "grad_norm": 0.5001037120819092, "learning_rate": 1.4726786396637372e-05, "loss": 0.1448, "step": 3857 }, { "epoch": 1.179455823907062, "grad_norm": 0.4425494372844696, "learning_rate": 1.473060756591517e-05, "loss": 0.1102, "step": 3858 }, { "epoch": 1.179761540813207, "grad_norm": 0.594631552696228, "learning_rate": 1.4734428735192969e-05, "loss": 0.1592, "step": 3859 }, { "epoch": 1.1800672577193518, "grad_norm": 0.54784095287323, "learning_rate": 1.4738249904470768e-05, "loss": 0.1504, "step": 3860 }, { "epoch": 1.1803729746254967, "grad_norm": 0.5708235502243042, "learning_rate": 1.4742071073748566e-05, "loss": 0.1424, "step": 3861 }, { "epoch": 1.1806786915316416, "grad_norm": 0.5989855527877808, "learning_rate": 1.4745892243026365e-05, "loss": 0.1527, "step": 3862 }, { "epoch": 1.1809844084377865, "grad_norm": 0.8819751739501953, "learning_rate": 1.4749713412304166e-05, "loss": 0.2172, "step": 3863 }, { "epoch": 1.1812901253439314, "grad_norm": 0.8197996616363525, "learning_rate": 1.4753534581581964e-05, "loss": 0.2083, "step": 3864 }, { "epoch": 1.1815958422500765, "grad_norm": 0.753719687461853, "learning_rate": 1.4757355750859763e-05, "loss": 0.2235, "step": 3865 }, { "epoch": 1.1819015591562214, "grad_norm": 0.8852445483207703, "learning_rate": 1.4761176920137562e-05, "loss": 0.2183, "step": 3866 }, { "epoch": 1.1822072760623663, "grad_norm": 0.9368080496788025, "learning_rate": 1.476499808941536e-05, "loss": 0.195, "step": 3867 }, { "epoch": 1.1825129929685112, "grad_norm": 2.436161518096924, "learning_rate": 1.476881925869316e-05, "loss": 0.2121, "step": 3868 }, { "epoch": 1.182818709874656, "grad_norm": 1.2455928325653076, "learning_rate": 1.4772640427970958e-05, "loss": 0.285, "step": 3869 }, { "epoch": 1.183124426780801, "grad_norm": 1.5941458940505981, "learning_rate": 1.4776461597248757e-05, "loss": 0.3043, "step": 3870 }, { "epoch": 1.1834301436869459, "grad_norm": 2.8817880153656006, "learning_rate": 1.4780282766526559e-05, "loss": 0.3612, "step": 3871 }, { "epoch": 1.1837358605930908, "grad_norm": 0.429511159658432, "learning_rate": 1.4784103935804358e-05, "loss": 0.1759, "step": 3872 }, { "epoch": 1.1840415774992357, "grad_norm": 0.43135717511177063, "learning_rate": 1.4787925105082156e-05, "loss": 0.1087, "step": 3873 }, { "epoch": 1.1843472944053806, "grad_norm": 0.31623682379722595, "learning_rate": 1.4791746274359955e-05, "loss": 0.1076, "step": 3874 }, { "epoch": 1.1846530113115255, "grad_norm": 0.31817853450775146, "learning_rate": 1.4795567443637754e-05, "loss": 0.108, "step": 3875 }, { "epoch": 1.1849587282176703, "grad_norm": 0.4093184173107147, "learning_rate": 1.4799388612915553e-05, "loss": 0.0863, "step": 3876 }, { "epoch": 1.1852644451238152, "grad_norm": 0.279117614030838, "learning_rate": 1.4803209782193351e-05, "loss": 0.0691, "step": 3877 }, { "epoch": 1.1855701620299604, "grad_norm": 0.35567447543144226, "learning_rate": 1.480703095147115e-05, "loss": 0.0682, "step": 3878 }, { "epoch": 1.1858758789361052, "grad_norm": 0.38638097047805786, "learning_rate": 1.481085212074895e-05, "loss": 0.1035, "step": 3879 }, { "epoch": 1.1861815958422501, "grad_norm": 0.4357739984989166, "learning_rate": 1.481467329002675e-05, "loss": 0.0827, "step": 3880 }, { "epoch": 1.186487312748395, "grad_norm": 0.4065502882003784, "learning_rate": 1.4818494459304548e-05, "loss": 0.0845, "step": 3881 }, { "epoch": 1.18679302965454, "grad_norm": 0.40309083461761475, "learning_rate": 1.4822315628582347e-05, "loss": 0.1115, "step": 3882 }, { "epoch": 1.1870987465606848, "grad_norm": 0.414975106716156, "learning_rate": 1.4826136797860146e-05, "loss": 0.1065, "step": 3883 }, { "epoch": 1.1874044634668297, "grad_norm": 0.6156466007232666, "learning_rate": 1.4829957967137944e-05, "loss": 0.1081, "step": 3884 }, { "epoch": 1.1877101803729746, "grad_norm": 0.7446378469467163, "learning_rate": 1.4833779136415743e-05, "loss": 0.1536, "step": 3885 }, { "epoch": 1.1880158972791195, "grad_norm": 0.6793490052223206, "learning_rate": 1.4837600305693544e-05, "loss": 0.164, "step": 3886 }, { "epoch": 1.1883216141852644, "grad_norm": 0.9227910041809082, "learning_rate": 1.4841421474971342e-05, "loss": 0.1893, "step": 3887 }, { "epoch": 1.1886273310914093, "grad_norm": 0.9192295670509338, "learning_rate": 1.4845242644249141e-05, "loss": 0.2067, "step": 3888 }, { "epoch": 1.1889330479975542, "grad_norm": 0.7123789191246033, "learning_rate": 1.484906381352694e-05, "loss": 0.1999, "step": 3889 }, { "epoch": 1.189238764903699, "grad_norm": 1.294346570968628, "learning_rate": 1.4852884982804739e-05, "loss": 0.2388, "step": 3890 }, { "epoch": 1.1895444818098442, "grad_norm": 0.9390039443969727, "learning_rate": 1.4856706152082537e-05, "loss": 0.2123, "step": 3891 }, { "epoch": 1.189850198715989, "grad_norm": 0.8346195816993713, "learning_rate": 1.4860527321360336e-05, "loss": 0.211, "step": 3892 }, { "epoch": 1.190155915622134, "grad_norm": 1.0495984554290771, "learning_rate": 1.4864348490638135e-05, "loss": 0.2553, "step": 3893 }, { "epoch": 1.1904616325282789, "grad_norm": 1.2860547304153442, "learning_rate": 1.4868169659915935e-05, "loss": 0.2287, "step": 3894 }, { "epoch": 1.1907673494344237, "grad_norm": 1.63395094871521, "learning_rate": 1.4871990829193734e-05, "loss": 0.2421, "step": 3895 }, { "epoch": 1.1910730663405686, "grad_norm": 1.95939302444458, "learning_rate": 1.4875811998471533e-05, "loss": 0.3648, "step": 3896 }, { "epoch": 1.1913787832467135, "grad_norm": 0.3789427578449249, "learning_rate": 1.4879633167749331e-05, "loss": 0.1691, "step": 3897 }, { "epoch": 1.1916845001528584, "grad_norm": 0.5174517035484314, "learning_rate": 1.488345433702713e-05, "loss": 0.1179, "step": 3898 }, { "epoch": 1.1919902170590033, "grad_norm": 0.36335498094558716, "learning_rate": 1.4887275506304929e-05, "loss": 0.0805, "step": 3899 }, { "epoch": 1.1922959339651482, "grad_norm": 0.2955757975578308, "learning_rate": 1.4891096675582728e-05, "loss": 0.0775, "step": 3900 }, { "epoch": 1.192601650871293, "grad_norm": 0.4168683886528015, "learning_rate": 1.4894917844860526e-05, "loss": 0.085, "step": 3901 }, { "epoch": 1.192907367777438, "grad_norm": 0.3390093743801117, "learning_rate": 1.4898739014138327e-05, "loss": 0.0778, "step": 3902 }, { "epoch": 1.193213084683583, "grad_norm": 0.30220770835876465, "learning_rate": 1.4902560183416126e-05, "loss": 0.0744, "step": 3903 }, { "epoch": 1.193518801589728, "grad_norm": 0.4811137020587921, "learning_rate": 1.4906381352693924e-05, "loss": 0.0961, "step": 3904 }, { "epoch": 1.193824518495873, "grad_norm": 0.43678879737854004, "learning_rate": 1.4910202521971723e-05, "loss": 0.0815, "step": 3905 }, { "epoch": 1.1941302354020178, "grad_norm": 0.44103744626045227, "learning_rate": 1.4914023691249522e-05, "loss": 0.0764, "step": 3906 }, { "epoch": 1.1944359523081627, "grad_norm": 0.454527348279953, "learning_rate": 1.491784486052732e-05, "loss": 0.1092, "step": 3907 }, { "epoch": 1.1947416692143076, "grad_norm": 0.8479472398757935, "learning_rate": 1.492166602980512e-05, "loss": 0.1297, "step": 3908 }, { "epoch": 1.1950473861204525, "grad_norm": 0.43573981523513794, "learning_rate": 1.4925487199082918e-05, "loss": 0.1319, "step": 3909 }, { "epoch": 1.1953531030265974, "grad_norm": 0.6868088245391846, "learning_rate": 1.492930836836072e-05, "loss": 0.2167, "step": 3910 }, { "epoch": 1.1956588199327423, "grad_norm": 0.6345106363296509, "learning_rate": 1.4933129537638519e-05, "loss": 0.1796, "step": 3911 }, { "epoch": 1.1959645368388871, "grad_norm": 0.6973623037338257, "learning_rate": 1.4936950706916318e-05, "loss": 0.1762, "step": 3912 }, { "epoch": 1.196270253745032, "grad_norm": 0.7433644533157349, "learning_rate": 1.4940771876194116e-05, "loss": 0.1911, "step": 3913 }, { "epoch": 1.196575970651177, "grad_norm": 0.6618350744247437, "learning_rate": 1.4944593045471915e-05, "loss": 0.1882, "step": 3914 }, { "epoch": 1.1968816875573218, "grad_norm": 1.1035068035125732, "learning_rate": 1.4948414214749714e-05, "loss": 0.2897, "step": 3915 }, { "epoch": 1.1971874044634667, "grad_norm": 0.9534196853637695, "learning_rate": 1.4952235384027513e-05, "loss": 0.2403, "step": 3916 }, { "epoch": 1.1974931213696118, "grad_norm": 1.2769345045089722, "learning_rate": 1.4956056553305313e-05, "loss": 0.2041, "step": 3917 }, { "epoch": 1.1977988382757567, "grad_norm": 1.3365564346313477, "learning_rate": 1.4959877722583112e-05, "loss": 0.2922, "step": 3918 }, { "epoch": 1.1981045551819016, "grad_norm": 1.6026054620742798, "learning_rate": 1.496369889186091e-05, "loss": 0.2623, "step": 3919 }, { "epoch": 1.1984102720880465, "grad_norm": 1.1769306659698486, "learning_rate": 1.496752006113871e-05, "loss": 0.3419, "step": 3920 }, { "epoch": 1.1987159889941914, "grad_norm": 3.755866527557373, "learning_rate": 1.4971341230416508e-05, "loss": 0.359, "step": 3921 }, { "epoch": 1.1990217059003363, "grad_norm": 0.5003301501274109, "learning_rate": 1.4975162399694307e-05, "loss": 0.1791, "step": 3922 }, { "epoch": 1.1993274228064812, "grad_norm": 0.3515225946903229, "learning_rate": 1.4978983568972106e-05, "loss": 0.101, "step": 3923 }, { "epoch": 1.199633139712626, "grad_norm": 0.40259671211242676, "learning_rate": 1.4982804738249904e-05, "loss": 0.0933, "step": 3924 }, { "epoch": 1.199938856618771, "grad_norm": 0.5180710554122925, "learning_rate": 1.4986625907527705e-05, "loss": 0.089, "step": 3925 }, { "epoch": 1.2002445735249159, "grad_norm": 0.3413981795310974, "learning_rate": 1.4990447076805504e-05, "loss": 0.0835, "step": 3926 }, { "epoch": 1.2005502904310608, "grad_norm": 0.5615536570549011, "learning_rate": 1.4994268246083302e-05, "loss": 0.0957, "step": 3927 }, { "epoch": 1.2008560073372057, "grad_norm": 0.32040658593177795, "learning_rate": 1.4998089415361101e-05, "loss": 0.0623, "step": 3928 }, { "epoch": 1.2011617242433505, "grad_norm": 0.2740074098110199, "learning_rate": 1.5001910584638901e-05, "loss": 0.0613, "step": 3929 }, { "epoch": 1.2014674411494957, "grad_norm": 0.3712644875049591, "learning_rate": 1.50057317539167e-05, "loss": 0.1296, "step": 3930 }, { "epoch": 1.2017731580556406, "grad_norm": 0.4768141806125641, "learning_rate": 1.5009552923194499e-05, "loss": 0.117, "step": 3931 }, { "epoch": 1.2020788749617854, "grad_norm": 0.4983595311641693, "learning_rate": 1.5013374092472298e-05, "loss": 0.1212, "step": 3932 }, { "epoch": 1.2023845918679303, "grad_norm": 0.42652633786201477, "learning_rate": 1.5017195261750098e-05, "loss": 0.1003, "step": 3933 }, { "epoch": 1.2026903087740752, "grad_norm": 0.38822975754737854, "learning_rate": 1.5021016431027897e-05, "loss": 0.1341, "step": 3934 }, { "epoch": 1.2029960256802201, "grad_norm": 0.8292178511619568, "learning_rate": 1.5024837600305696e-05, "loss": 0.1946, "step": 3935 }, { "epoch": 1.203301742586365, "grad_norm": 0.6744150519371033, "learning_rate": 1.5028658769583494e-05, "loss": 0.1768, "step": 3936 }, { "epoch": 1.20360745949251, "grad_norm": 0.592361569404602, "learning_rate": 1.5032479938861293e-05, "loss": 0.167, "step": 3937 }, { "epoch": 1.2039131763986548, "grad_norm": 1.1129181385040283, "learning_rate": 1.5036301108139092e-05, "loss": 0.2032, "step": 3938 }, { "epoch": 1.2042188933047997, "grad_norm": 0.7228469252586365, "learning_rate": 1.504012227741689e-05, "loss": 0.2294, "step": 3939 }, { "epoch": 1.2045246102109446, "grad_norm": 0.6899764537811279, "learning_rate": 1.504394344669469e-05, "loss": 0.1959, "step": 3940 }, { "epoch": 1.2048303271170895, "grad_norm": 0.8034334778785706, "learning_rate": 1.504776461597249e-05, "loss": 0.2272, "step": 3941 }, { "epoch": 1.2051360440232344, "grad_norm": 2.3221099376678467, "learning_rate": 1.5051585785250289e-05, "loss": 0.2499, "step": 3942 }, { "epoch": 1.2054417609293795, "grad_norm": 0.8679633736610413, "learning_rate": 1.5055406954528087e-05, "loss": 0.2997, "step": 3943 }, { "epoch": 1.2057474778355244, "grad_norm": 3.1355345249176025, "learning_rate": 1.5059228123805886e-05, "loss": 0.2297, "step": 3944 }, { "epoch": 1.2060531947416693, "grad_norm": 1.9035682678222656, "learning_rate": 1.5063049293083685e-05, "loss": 0.3415, "step": 3945 }, { "epoch": 1.2063589116478142, "grad_norm": 2.039327621459961, "learning_rate": 1.5066870462361483e-05, "loss": 0.378, "step": 3946 }, { "epoch": 1.206664628553959, "grad_norm": 0.5793928503990173, "learning_rate": 1.5070691631639282e-05, "loss": 0.1983, "step": 3947 }, { "epoch": 1.206970345460104, "grad_norm": 0.33094388246536255, "learning_rate": 1.5074512800917083e-05, "loss": 0.1071, "step": 3948 }, { "epoch": 1.2072760623662488, "grad_norm": 0.3520072400569916, "learning_rate": 1.5078333970194881e-05, "loss": 0.0752, "step": 3949 }, { "epoch": 1.2075817792723937, "grad_norm": 0.3735126554965973, "learning_rate": 1.508215513947268e-05, "loss": 0.0817, "step": 3950 }, { "epoch": 1.2078874961785386, "grad_norm": 0.39761731028556824, "learning_rate": 1.5085976308750479e-05, "loss": 0.0616, "step": 3951 }, { "epoch": 1.2081932130846835, "grad_norm": 0.43507394194602966, "learning_rate": 1.5089797478028278e-05, "loss": 0.1016, "step": 3952 }, { "epoch": 1.2084989299908284, "grad_norm": 0.42752742767333984, "learning_rate": 1.5093618647306076e-05, "loss": 0.0788, "step": 3953 }, { "epoch": 1.2088046468969733, "grad_norm": 0.4609510600566864, "learning_rate": 1.5097439816583875e-05, "loss": 0.1056, "step": 3954 }, { "epoch": 1.2091103638031182, "grad_norm": 0.424041211605072, "learning_rate": 1.5101260985861674e-05, "loss": 0.0747, "step": 3955 }, { "epoch": 1.2094160807092633, "grad_norm": 0.435859739780426, "learning_rate": 1.5105082155139474e-05, "loss": 0.0903, "step": 3956 }, { "epoch": 1.2097217976154082, "grad_norm": 0.5884809494018555, "learning_rate": 1.5108903324417273e-05, "loss": 0.1582, "step": 3957 }, { "epoch": 1.210027514521553, "grad_norm": 0.44616737961769104, "learning_rate": 1.5112724493695072e-05, "loss": 0.0951, "step": 3958 }, { "epoch": 1.210333231427698, "grad_norm": 0.4760551452636719, "learning_rate": 1.511654566297287e-05, "loss": 0.1223, "step": 3959 }, { "epoch": 1.210638948333843, "grad_norm": 0.6164878010749817, "learning_rate": 1.512036683225067e-05, "loss": 0.1902, "step": 3960 }, { "epoch": 1.2109446652399878, "grad_norm": 0.7146462202072144, "learning_rate": 1.5124188001528468e-05, "loss": 0.1762, "step": 3961 }, { "epoch": 1.2112503821461327, "grad_norm": 0.6290848851203918, "learning_rate": 1.5128009170806267e-05, "loss": 0.1787, "step": 3962 }, { "epoch": 1.2115560990522776, "grad_norm": 0.6535750031471252, "learning_rate": 1.5131830340084066e-05, "loss": 0.2198, "step": 3963 }, { "epoch": 1.2118618159584225, "grad_norm": 0.6610704064369202, "learning_rate": 1.5135651509361866e-05, "loss": 0.1832, "step": 3964 }, { "epoch": 1.2121675328645674, "grad_norm": 1.5126785039901733, "learning_rate": 1.5139472678639665e-05, "loss": 0.2287, "step": 3965 }, { "epoch": 1.2124732497707122, "grad_norm": 1.3630045652389526, "learning_rate": 1.5143293847917463e-05, "loss": 0.2765, "step": 3966 }, { "epoch": 1.2127789666768571, "grad_norm": 1.4059979915618896, "learning_rate": 1.5147115017195262e-05, "loss": 0.2796, "step": 3967 }, { "epoch": 1.213084683583002, "grad_norm": 0.9616859555244446, "learning_rate": 1.5150936186473061e-05, "loss": 0.2452, "step": 3968 }, { "epoch": 1.2133904004891471, "grad_norm": 1.245732307434082, "learning_rate": 1.515475735575086e-05, "loss": 0.2574, "step": 3969 }, { "epoch": 1.213696117395292, "grad_norm": 1.3248755931854248, "learning_rate": 1.5158578525028658e-05, "loss": 0.382, "step": 3970 }, { "epoch": 1.214001834301437, "grad_norm": 1.7859441041946411, "learning_rate": 1.5162399694306457e-05, "loss": 0.315, "step": 3971 }, { "epoch": 1.2143075512075818, "grad_norm": 0.46416252851486206, "learning_rate": 1.5166220863584258e-05, "loss": 0.1896, "step": 3972 }, { "epoch": 1.2146132681137267, "grad_norm": 0.4385341703891754, "learning_rate": 1.5170042032862056e-05, "loss": 0.1342, "step": 3973 }, { "epoch": 1.2149189850198716, "grad_norm": 1.4851475954055786, "learning_rate": 1.5173863202139855e-05, "loss": 0.0753, "step": 3974 }, { "epoch": 1.2152247019260165, "grad_norm": 0.4284003973007202, "learning_rate": 1.5177684371417654e-05, "loss": 0.0831, "step": 3975 }, { "epoch": 1.2155304188321614, "grad_norm": 1.792839527130127, "learning_rate": 1.5181505540695453e-05, "loss": 0.1005, "step": 3976 }, { "epoch": 1.2158361357383063, "grad_norm": 0.35479632019996643, "learning_rate": 1.5185326709973251e-05, "loss": 0.0665, "step": 3977 }, { "epoch": 1.2161418526444512, "grad_norm": 0.43435433506965637, "learning_rate": 1.518914787925105e-05, "loss": 0.0839, "step": 3978 }, { "epoch": 1.216447569550596, "grad_norm": 0.3727359473705292, "learning_rate": 1.519296904852885e-05, "loss": 0.0632, "step": 3979 }, { "epoch": 1.216753286456741, "grad_norm": 0.3907489478588104, "learning_rate": 1.519679021780665e-05, "loss": 0.128, "step": 3980 }, { "epoch": 1.2170590033628859, "grad_norm": 0.40870562195777893, "learning_rate": 1.5200611387084448e-05, "loss": 0.0868, "step": 3981 }, { "epoch": 1.217364720269031, "grad_norm": 0.777276337146759, "learning_rate": 1.5204432556362247e-05, "loss": 0.0882, "step": 3982 }, { "epoch": 1.2176704371751759, "grad_norm": 0.6287702322006226, "learning_rate": 1.5208253725640046e-05, "loss": 0.1176, "step": 3983 }, { "epoch": 1.2179761540813208, "grad_norm": 0.8089359998703003, "learning_rate": 1.5212074894917844e-05, "loss": 0.1316, "step": 3984 }, { "epoch": 1.2182818709874657, "grad_norm": 0.861380934715271, "learning_rate": 1.5215896064195643e-05, "loss": 0.1354, "step": 3985 }, { "epoch": 1.2185875878936105, "grad_norm": 0.6139304041862488, "learning_rate": 1.5219717233473442e-05, "loss": 0.1601, "step": 3986 }, { "epoch": 1.2188933047997554, "grad_norm": 0.7399612665176392, "learning_rate": 1.5223538402751242e-05, "loss": 0.2028, "step": 3987 }, { "epoch": 1.2191990217059003, "grad_norm": 0.7141876816749573, "learning_rate": 1.5227359572029041e-05, "loss": 0.1838, "step": 3988 }, { "epoch": 1.2195047386120452, "grad_norm": 0.7473613619804382, "learning_rate": 1.523118074130684e-05, "loss": 0.2061, "step": 3989 }, { "epoch": 1.2198104555181901, "grad_norm": 0.9719789624214172, "learning_rate": 1.5235001910584638e-05, "loss": 0.2219, "step": 3990 }, { "epoch": 1.220116172424335, "grad_norm": 1.0591119527816772, "learning_rate": 1.5238823079862437e-05, "loss": 0.2128, "step": 3991 }, { "epoch": 1.22042188933048, "grad_norm": 0.8527912497520447, "learning_rate": 1.5242644249140236e-05, "loss": 0.2129, "step": 3992 }, { "epoch": 1.2207276062366248, "grad_norm": 0.9950820803642273, "learning_rate": 1.5246465418418035e-05, "loss": 0.2319, "step": 3993 }, { "epoch": 1.2210333231427697, "grad_norm": 1.467010498046875, "learning_rate": 1.5250286587695833e-05, "loss": 0.256, "step": 3994 }, { "epoch": 1.2213390400489148, "grad_norm": 2.1705939769744873, "learning_rate": 1.5254107756973634e-05, "loss": 0.2975, "step": 3995 }, { "epoch": 1.2216447569550597, "grad_norm": 2.069234848022461, "learning_rate": 1.5257928926251433e-05, "loss": 0.3542, "step": 3996 }, { "epoch": 1.2219504738612046, "grad_norm": 0.5531324148178101, "learning_rate": 1.526175009552923e-05, "loss": 0.1996, "step": 3997 }, { "epoch": 1.2222561907673495, "grad_norm": 0.5315176248550415, "learning_rate": 1.526557126480703e-05, "loss": 0.1007, "step": 3998 }, { "epoch": 1.2225619076734944, "grad_norm": 0.39715003967285156, "learning_rate": 1.526939243408483e-05, "loss": 0.1014, "step": 3999 }, { "epoch": 1.2228676245796393, "grad_norm": 0.6482269167900085, "learning_rate": 1.527321360336263e-05, "loss": 0.0778, "step": 4000 }, { "epoch": 1.2228676245796393, "eval_cer": 0.19562371117148375, "eval_loss": 0.2758716344833374, "eval_runtime": 18.9572, "eval_samples_per_second": 239.381, "eval_steps_per_second": 0.791, "eval_wer": 0.35591184603782017, "step": 4000 }, { "epoch": 1.2231733414857842, "grad_norm": 0.7528416514396667, "learning_rate": 1.5277034772640428e-05, "loss": 0.0876, "step": 4001 }, { "epoch": 1.223479058391929, "grad_norm": 0.29678186774253845, "learning_rate": 1.5280855941918227e-05, "loss": 0.0596, "step": 4002 }, { "epoch": 1.223784775298074, "grad_norm": 0.4451054036617279, "learning_rate": 1.5284677111196025e-05, "loss": 0.1076, "step": 4003 }, { "epoch": 1.2240904922042188, "grad_norm": 0.5282634496688843, "learning_rate": 1.5288498280473824e-05, "loss": 0.0895, "step": 4004 }, { "epoch": 1.2243962091103637, "grad_norm": 0.41505691409111023, "learning_rate": 1.5292319449751623e-05, "loss": 0.1163, "step": 4005 }, { "epoch": 1.2247019260165086, "grad_norm": 0.5807064175605774, "learning_rate": 1.5296140619029422e-05, "loss": 0.0966, "step": 4006 }, { "epoch": 1.2250076429226535, "grad_norm": 0.6081426739692688, "learning_rate": 1.5299961788307224e-05, "loss": 0.1611, "step": 4007 }, { "epoch": 1.2253133598287986, "grad_norm": 0.44320955872535706, "learning_rate": 1.5303782957585023e-05, "loss": 0.1411, "step": 4008 }, { "epoch": 1.2256190767349435, "grad_norm": 0.9132999181747437, "learning_rate": 1.530760412686282e-05, "loss": 0.1268, "step": 4009 }, { "epoch": 1.2259247936410884, "grad_norm": 0.6541072726249695, "learning_rate": 1.531142529614062e-05, "loss": 0.1641, "step": 4010 }, { "epoch": 1.2262305105472333, "grad_norm": 0.5931136608123779, "learning_rate": 1.531524646541842e-05, "loss": 0.164, "step": 4011 }, { "epoch": 1.2265362274533782, "grad_norm": 0.7697474956512451, "learning_rate": 1.5319067634696218e-05, "loss": 0.1751, "step": 4012 }, { "epoch": 1.226841944359523, "grad_norm": 0.7937294244766235, "learning_rate": 1.5322888803974016e-05, "loss": 0.2191, "step": 4013 }, { "epoch": 1.227147661265668, "grad_norm": 0.9031004309654236, "learning_rate": 1.532670997325182e-05, "loss": 0.2249, "step": 4014 }, { "epoch": 1.2274533781718129, "grad_norm": 1.2030225992202759, "learning_rate": 1.5330531142529617e-05, "loss": 0.2198, "step": 4015 }, { "epoch": 1.2277590950779578, "grad_norm": 0.7567319869995117, "learning_rate": 1.5334352311807416e-05, "loss": 0.2008, "step": 4016 }, { "epoch": 1.2280648119841027, "grad_norm": 1.0709590911865234, "learning_rate": 1.5338173481085215e-05, "loss": 0.2407, "step": 4017 }, { "epoch": 1.2283705288902476, "grad_norm": 1.7994681596755981, "learning_rate": 1.5341994650363013e-05, "loss": 0.283, "step": 4018 }, { "epoch": 1.2286762457963925, "grad_norm": 1.8519160747528076, "learning_rate": 1.5345815819640812e-05, "loss": 0.2502, "step": 4019 }, { "epoch": 1.2289819627025373, "grad_norm": 1.7662625312805176, "learning_rate": 1.534963698891861e-05, "loss": 0.2543, "step": 4020 }, { "epoch": 1.2292876796086825, "grad_norm": 3.432734251022339, "learning_rate": 1.535345815819641e-05, "loss": 0.3496, "step": 4021 }, { "epoch": 1.2295933965148274, "grad_norm": 0.3942650258541107, "learning_rate": 1.535727932747421e-05, "loss": 0.187, "step": 4022 }, { "epoch": 1.2298991134209722, "grad_norm": 0.3469155728816986, "learning_rate": 1.5361100496752007e-05, "loss": 0.1201, "step": 4023 }, { "epoch": 1.2302048303271171, "grad_norm": 0.3274579346179962, "learning_rate": 1.5364921666029806e-05, "loss": 0.0817, "step": 4024 }, { "epoch": 1.230510547233262, "grad_norm": 0.32187432050704956, "learning_rate": 1.5368742835307605e-05, "loss": 0.0984, "step": 4025 }, { "epoch": 1.230816264139407, "grad_norm": 0.3398663103580475, "learning_rate": 1.5372564004585403e-05, "loss": 0.079, "step": 4026 }, { "epoch": 1.2311219810455518, "grad_norm": 0.42567193508148193, "learning_rate": 1.5376385173863202e-05, "loss": 0.073, "step": 4027 }, { "epoch": 1.2314276979516967, "grad_norm": 0.42081254720687866, "learning_rate": 1.5380206343141e-05, "loss": 0.0793, "step": 4028 }, { "epoch": 1.2317334148578416, "grad_norm": 0.38944512605667114, "learning_rate": 1.53840275124188e-05, "loss": 0.0781, "step": 4029 }, { "epoch": 1.2320391317639865, "grad_norm": 0.4506172835826874, "learning_rate": 1.5387848681696602e-05, "loss": 0.1054, "step": 4030 }, { "epoch": 1.2323448486701314, "grad_norm": 0.33394038677215576, "learning_rate": 1.53916698509744e-05, "loss": 0.0907, "step": 4031 }, { "epoch": 1.2326505655762763, "grad_norm": 0.44463109970092773, "learning_rate": 1.53954910202522e-05, "loss": 0.1022, "step": 4032 }, { "epoch": 1.2329562824824212, "grad_norm": 0.4924067258834839, "learning_rate": 1.5399312189529998e-05, "loss": 0.1049, "step": 4033 }, { "epoch": 1.2332619993885663, "grad_norm": 1.0410585403442383, "learning_rate": 1.5403133358807797e-05, "loss": 0.1222, "step": 4034 }, { "epoch": 1.2335677162947112, "grad_norm": 0.5252612829208374, "learning_rate": 1.5406954528085595e-05, "loss": 0.1308, "step": 4035 }, { "epoch": 1.233873433200856, "grad_norm": 0.5210179686546326, "learning_rate": 1.5410775697363394e-05, "loss": 0.1467, "step": 4036 }, { "epoch": 1.234179150107001, "grad_norm": 0.8332182168960571, "learning_rate": 1.5414596866641193e-05, "loss": 0.2038, "step": 4037 }, { "epoch": 1.2344848670131459, "grad_norm": 0.6858619451522827, "learning_rate": 1.5418418035918992e-05, "loss": 0.1883, "step": 4038 }, { "epoch": 1.2347905839192908, "grad_norm": 1.2753064632415771, "learning_rate": 1.542223920519679e-05, "loss": 0.2189, "step": 4039 }, { "epoch": 1.2350963008254356, "grad_norm": 1.0509653091430664, "learning_rate": 1.542606037447459e-05, "loss": 0.2367, "step": 4040 }, { "epoch": 1.2354020177315805, "grad_norm": 1.1994432210922241, "learning_rate": 1.5429881543752388e-05, "loss": 0.2382, "step": 4041 }, { "epoch": 1.2357077346377254, "grad_norm": 1.191679835319519, "learning_rate": 1.5433702713030187e-05, "loss": 0.2634, "step": 4042 }, { "epoch": 1.2360134515438703, "grad_norm": 0.8284063339233398, "learning_rate": 1.5437523882307985e-05, "loss": 0.2127, "step": 4043 }, { "epoch": 1.2363191684500152, "grad_norm": 0.9076023697853088, "learning_rate": 1.5441345051585784e-05, "loss": 0.221, "step": 4044 }, { "epoch": 1.23662488535616, "grad_norm": 1.3721202611923218, "learning_rate": 1.5445166220863586e-05, "loss": 0.2509, "step": 4045 }, { "epoch": 1.236930602262305, "grad_norm": 1.7763761281967163, "learning_rate": 1.5448987390141385e-05, "loss": 0.3584, "step": 4046 }, { "epoch": 1.2372363191684501, "grad_norm": 0.5645197033882141, "learning_rate": 1.5452808559419184e-05, "loss": 0.1981, "step": 4047 }, { "epoch": 1.237542036074595, "grad_norm": 0.4377846419811249, "learning_rate": 1.5456629728696983e-05, "loss": 0.1252, "step": 4048 }, { "epoch": 1.23784775298074, "grad_norm": 0.3523373305797577, "learning_rate": 1.546045089797478e-05, "loss": 0.0984, "step": 4049 }, { "epoch": 1.2381534698868848, "grad_norm": 0.3800857365131378, "learning_rate": 1.546427206725258e-05, "loss": 0.0967, "step": 4050 }, { "epoch": 1.2384591867930297, "grad_norm": 0.31009718775749207, "learning_rate": 1.546809323653038e-05, "loss": 0.0708, "step": 4051 }, { "epoch": 1.2387649036991746, "grad_norm": 0.36954352259635925, "learning_rate": 1.5471914405808178e-05, "loss": 0.0878, "step": 4052 }, { "epoch": 1.2390706206053195, "grad_norm": 0.24114727973937988, "learning_rate": 1.5475735575085976e-05, "loss": 0.0847, "step": 4053 }, { "epoch": 1.2393763375114644, "grad_norm": 0.4082677364349365, "learning_rate": 1.5479556744363775e-05, "loss": 0.0868, "step": 4054 }, { "epoch": 1.2396820544176093, "grad_norm": 1.1338554620742798, "learning_rate": 1.5483377913641574e-05, "loss": 0.1115, "step": 4055 }, { "epoch": 1.2399877713237542, "grad_norm": 0.3935781717300415, "learning_rate": 1.5487199082919373e-05, "loss": 0.1028, "step": 4056 }, { "epoch": 1.240293488229899, "grad_norm": 0.5365492701530457, "learning_rate": 1.549102025219717e-05, "loss": 0.128, "step": 4057 }, { "epoch": 1.240599205136044, "grad_norm": 0.4242883324623108, "learning_rate": 1.549484142147497e-05, "loss": 0.1125, "step": 4058 }, { "epoch": 1.2409049220421888, "grad_norm": 0.3777497410774231, "learning_rate": 1.549866259075277e-05, "loss": 0.107, "step": 4059 }, { "epoch": 1.241210638948334, "grad_norm": 0.5446922183036804, "learning_rate": 1.5502483760030567e-05, "loss": 0.1457, "step": 4060 }, { "epoch": 1.2415163558544788, "grad_norm": 0.47529181838035583, "learning_rate": 1.550630492930837e-05, "loss": 0.1333, "step": 4061 }, { "epoch": 1.2418220727606237, "grad_norm": 0.59477299451828, "learning_rate": 1.551012609858617e-05, "loss": 0.1594, "step": 4062 }, { "epoch": 1.2421277896667686, "grad_norm": 0.7496638298034668, "learning_rate": 1.5513947267863967e-05, "loss": 0.2061, "step": 4063 }, { "epoch": 1.2424335065729135, "grad_norm": 0.8088547587394714, "learning_rate": 1.5517768437141766e-05, "loss": 0.2263, "step": 4064 }, { "epoch": 1.2427392234790584, "grad_norm": 1.2030723094940186, "learning_rate": 1.5521589606419565e-05, "loss": 0.222, "step": 4065 }, { "epoch": 1.2430449403852033, "grad_norm": 0.6785872578620911, "learning_rate": 1.5525410775697363e-05, "loss": 0.2192, "step": 4066 }, { "epoch": 1.2433506572913482, "grad_norm": 0.8134067058563232, "learning_rate": 1.5529231944975162e-05, "loss": 0.1956, "step": 4067 }, { "epoch": 1.243656374197493, "grad_norm": 0.767875611782074, "learning_rate": 1.553305311425296e-05, "loss": 0.2015, "step": 4068 }, { "epoch": 1.243962091103638, "grad_norm": 1.3381571769714355, "learning_rate": 1.553687428353076e-05, "loss": 0.3178, "step": 4069 }, { "epoch": 1.2442678080097829, "grad_norm": 2.2440991401672363, "learning_rate": 1.554069545280856e-05, "loss": 0.2435, "step": 4070 }, { "epoch": 1.2445735249159278, "grad_norm": 2.37255859375, "learning_rate": 1.5544516622086357e-05, "loss": 0.3568, "step": 4071 }, { "epoch": 1.2448792418220727, "grad_norm": 0.6291599869728088, "learning_rate": 1.5548337791364156e-05, "loss": 0.1855, "step": 4072 }, { "epoch": 1.2451849587282178, "grad_norm": 0.41899725794792175, "learning_rate": 1.5552158960641955e-05, "loss": 0.1111, "step": 4073 }, { "epoch": 1.2454906756343627, "grad_norm": 0.32082051038742065, "learning_rate": 1.5555980129919753e-05, "loss": 0.0819, "step": 4074 }, { "epoch": 1.2457963925405076, "grad_norm": 0.4022083282470703, "learning_rate": 1.5559801299197552e-05, "loss": 0.0908, "step": 4075 }, { "epoch": 1.2461021094466525, "grad_norm": 0.6915343403816223, "learning_rate": 1.5563622468475354e-05, "loss": 0.0717, "step": 4076 }, { "epoch": 1.2464078263527973, "grad_norm": 0.38915374875068665, "learning_rate": 1.5567443637753153e-05, "loss": 0.0697, "step": 4077 }, { "epoch": 1.2467135432589422, "grad_norm": 0.4180588126182556, "learning_rate": 1.557126480703095e-05, "loss": 0.0688, "step": 4078 }, { "epoch": 1.2470192601650871, "grad_norm": 0.4475787281990051, "learning_rate": 1.557508597630875e-05, "loss": 0.1374, "step": 4079 }, { "epoch": 1.247324977071232, "grad_norm": 0.5211975574493408, "learning_rate": 1.557890714558655e-05, "loss": 0.0857, "step": 4080 }, { "epoch": 1.247630693977377, "grad_norm": 0.43061378598213196, "learning_rate": 1.5582728314864348e-05, "loss": 0.0721, "step": 4081 }, { "epoch": 1.2479364108835218, "grad_norm": 0.4147164821624756, "learning_rate": 1.5586549484142147e-05, "loss": 0.1225, "step": 4082 }, { "epoch": 1.2482421277896667, "grad_norm": 0.8903311491012573, "learning_rate": 1.5590370653419945e-05, "loss": 0.1072, "step": 4083 }, { "epoch": 1.2485478446958116, "grad_norm": 0.5471884608268738, "learning_rate": 1.5594191822697748e-05, "loss": 0.1401, "step": 4084 }, { "epoch": 1.2488535616019565, "grad_norm": 0.9537472128868103, "learning_rate": 1.5598012991975546e-05, "loss": 0.1399, "step": 4085 }, { "epoch": 1.2491592785081016, "grad_norm": 0.54946368932724, "learning_rate": 1.5601834161253345e-05, "loss": 0.1825, "step": 4086 }, { "epoch": 1.2494649954142465, "grad_norm": 0.8691996932029724, "learning_rate": 1.5605655330531144e-05, "loss": 0.1763, "step": 4087 }, { "epoch": 1.2497707123203914, "grad_norm": 1.6303462982177734, "learning_rate": 1.5609476499808943e-05, "loss": 0.2129, "step": 4088 }, { "epoch": 1.2500764292265363, "grad_norm": 0.7432484030723572, "learning_rate": 1.561329766908674e-05, "loss": 0.184, "step": 4089 }, { "epoch": 1.2503821461326812, "grad_norm": 0.9804303646087646, "learning_rate": 1.561711883836454e-05, "loss": 0.2328, "step": 4090 }, { "epoch": 1.250687863038826, "grad_norm": 0.916713535785675, "learning_rate": 1.562094000764234e-05, "loss": 0.2478, "step": 4091 }, { "epoch": 1.250993579944971, "grad_norm": 0.9228897094726562, "learning_rate": 1.562476117692014e-05, "loss": 0.2253, "step": 4092 }, { "epoch": 1.2512992968511158, "grad_norm": 1.2800862789154053, "learning_rate": 1.562858234619794e-05, "loss": 0.2342, "step": 4093 }, { "epoch": 1.2516050137572607, "grad_norm": 1.794782042503357, "learning_rate": 1.563240351547574e-05, "loss": 0.2258, "step": 4094 }, { "epoch": 1.2519107306634056, "grad_norm": 2.126619577407837, "learning_rate": 1.5636224684753537e-05, "loss": 0.2701, "step": 4095 }, { "epoch": 1.2522164475695505, "grad_norm": 2.9247608184814453, "learning_rate": 1.5640045854031336e-05, "loss": 0.3891, "step": 4096 }, { "epoch": 1.2525221644756956, "grad_norm": 0.6144734025001526, "learning_rate": 1.5643867023309135e-05, "loss": 0.224, "step": 4097 }, { "epoch": 1.2528278813818403, "grad_norm": 0.2890974283218384, "learning_rate": 1.5647688192586933e-05, "loss": 0.1093, "step": 4098 }, { "epoch": 1.2531335982879854, "grad_norm": 0.4625438153743744, "learning_rate": 1.5651509361864732e-05, "loss": 0.089, "step": 4099 }, { "epoch": 1.25343931519413, "grad_norm": 0.25873515009880066, "learning_rate": 1.565533053114253e-05, "loss": 0.0858, "step": 4100 }, { "epoch": 1.2537450321002752, "grad_norm": 0.3356883227825165, "learning_rate": 1.565915170042033e-05, "loss": 0.091, "step": 4101 }, { "epoch": 1.25405074900642, "grad_norm": 0.2696295380592346, "learning_rate": 1.566297286969813e-05, "loss": 0.081, "step": 4102 }, { "epoch": 1.254356465912565, "grad_norm": 0.34146955609321594, "learning_rate": 1.5666794038975927e-05, "loss": 0.0688, "step": 4103 }, { "epoch": 1.25466218281871, "grad_norm": 0.5468454957008362, "learning_rate": 1.5670615208253726e-05, "loss": 0.1067, "step": 4104 }, { "epoch": 1.2549678997248548, "grad_norm": 0.37113285064697266, "learning_rate": 1.5674436377531525e-05, "loss": 0.0789, "step": 4105 }, { "epoch": 1.2552736166309997, "grad_norm": 0.3883320987224579, "learning_rate": 1.5678257546809323e-05, "loss": 0.0882, "step": 4106 }, { "epoch": 1.2555793335371446, "grad_norm": 0.3791731894016266, "learning_rate": 1.5682078716087125e-05, "loss": 0.0975, "step": 4107 }, { "epoch": 1.2558850504432895, "grad_norm": 0.37353911995887756, "learning_rate": 1.5685899885364924e-05, "loss": 0.0969, "step": 4108 }, { "epoch": 1.2561907673494344, "grad_norm": 0.4966721832752228, "learning_rate": 1.5689721054642723e-05, "loss": 0.123, "step": 4109 }, { "epoch": 1.2564964842555795, "grad_norm": 0.7007614374160767, "learning_rate": 1.569354222392052e-05, "loss": 0.1496, "step": 4110 }, { "epoch": 1.2568022011617241, "grad_norm": 0.7053175568580627, "learning_rate": 1.569736339319832e-05, "loss": 0.1541, "step": 4111 }, { "epoch": 1.2571079180678693, "grad_norm": 0.6480387449264526, "learning_rate": 1.570118456247612e-05, "loss": 0.1696, "step": 4112 }, { "epoch": 1.257413634974014, "grad_norm": 0.9772491455078125, "learning_rate": 1.5705005731753918e-05, "loss": 0.1846, "step": 4113 }, { "epoch": 1.257719351880159, "grad_norm": 0.7127219438552856, "learning_rate": 1.5708826901031717e-05, "loss": 0.2115, "step": 4114 }, { "epoch": 1.258025068786304, "grad_norm": 0.8794042468070984, "learning_rate": 1.5712648070309515e-05, "loss": 0.2125, "step": 4115 }, { "epoch": 1.2583307856924488, "grad_norm": 0.8075908422470093, "learning_rate": 1.5716469239587314e-05, "loss": 0.1917, "step": 4116 }, { "epoch": 1.2586365025985937, "grad_norm": 1.4756178855895996, "learning_rate": 1.5720290408865113e-05, "loss": 0.2322, "step": 4117 }, { "epoch": 1.2589422195047386, "grad_norm": 1.264198899269104, "learning_rate": 1.572411157814291e-05, "loss": 0.3023, "step": 4118 }, { "epoch": 1.2592479364108835, "grad_norm": 1.5908557176589966, "learning_rate": 1.572793274742071e-05, "loss": 0.2609, "step": 4119 }, { "epoch": 1.2595536533170284, "grad_norm": 2.5790021419525146, "learning_rate": 1.573175391669851e-05, "loss": 0.2788, "step": 4120 }, { "epoch": 1.2598593702231733, "grad_norm": 1.7864441871643066, "learning_rate": 1.5735575085976308e-05, "loss": 0.2699, "step": 4121 }, { "epoch": 1.2601650871293182, "grad_norm": 0.6901083588600159, "learning_rate": 1.5739396255254107e-05, "loss": 0.1736, "step": 4122 }, { "epoch": 1.2604708040354633, "grad_norm": 0.40693119168281555, "learning_rate": 1.574321742453191e-05, "loss": 0.1087, "step": 4123 }, { "epoch": 1.260776520941608, "grad_norm": 0.35905924439430237, "learning_rate": 1.5747038593809707e-05, "loss": 0.1024, "step": 4124 }, { "epoch": 1.261082237847753, "grad_norm": 0.45487916469573975, "learning_rate": 1.5750859763087506e-05, "loss": 0.0595, "step": 4125 }, { "epoch": 1.2613879547538978, "grad_norm": 0.32345059514045715, "learning_rate": 1.5754680932365305e-05, "loss": 0.0619, "step": 4126 }, { "epoch": 1.2616936716600429, "grad_norm": 0.5829877257347107, "learning_rate": 1.5758502101643104e-05, "loss": 0.1001, "step": 4127 }, { "epoch": 1.2619993885661878, "grad_norm": 0.4490090608596802, "learning_rate": 1.5762323270920902e-05, "loss": 0.094, "step": 4128 }, { "epoch": 1.2623051054723327, "grad_norm": 0.40385374426841736, "learning_rate": 1.57661444401987e-05, "loss": 0.0804, "step": 4129 }, { "epoch": 1.2626108223784775, "grad_norm": 0.46193644404411316, "learning_rate": 1.57699656094765e-05, "loss": 0.1037, "step": 4130 }, { "epoch": 1.2629165392846224, "grad_norm": 0.584024965763092, "learning_rate": 1.57737867787543e-05, "loss": 0.0992, "step": 4131 }, { "epoch": 1.2632222561907673, "grad_norm": 0.45870113372802734, "learning_rate": 1.5777607948032097e-05, "loss": 0.1293, "step": 4132 }, { "epoch": 1.2635279730969122, "grad_norm": 0.6734806895256042, "learning_rate": 1.5781429117309896e-05, "loss": 0.1128, "step": 4133 }, { "epoch": 1.2638336900030571, "grad_norm": 0.6526630520820618, "learning_rate": 1.5785250286587695e-05, "loss": 0.1104, "step": 4134 }, { "epoch": 1.264139406909202, "grad_norm": 0.8923491835594177, "learning_rate": 1.5789071455865494e-05, "loss": 0.1439, "step": 4135 }, { "epoch": 1.2644451238153471, "grad_norm": 0.6380122900009155, "learning_rate": 1.5792892625143292e-05, "loss": 0.1726, "step": 4136 }, { "epoch": 1.2647508407214918, "grad_norm": 0.543453574180603, "learning_rate": 1.579671379442109e-05, "loss": 0.1569, "step": 4137 }, { "epoch": 1.265056557627637, "grad_norm": 1.418705940246582, "learning_rate": 1.5800534963698893e-05, "loss": 0.2151, "step": 4138 }, { "epoch": 1.2653622745337816, "grad_norm": 0.8538497090339661, "learning_rate": 1.5804356132976692e-05, "loss": 0.2634, "step": 4139 }, { "epoch": 1.2656679914399267, "grad_norm": 0.9455171227455139, "learning_rate": 1.580817730225449e-05, "loss": 0.2539, "step": 4140 }, { "epoch": 1.2659737083460716, "grad_norm": 2.373990297317505, "learning_rate": 1.581199847153229e-05, "loss": 0.1978, "step": 4141 }, { "epoch": 1.2662794252522165, "grad_norm": 0.9486874341964722, "learning_rate": 1.5815819640810088e-05, "loss": 0.2261, "step": 4142 }, { "epoch": 1.2665851421583614, "grad_norm": 1.4061384201049805, "learning_rate": 1.5819640810087887e-05, "loss": 0.2286, "step": 4143 }, { "epoch": 1.2668908590645063, "grad_norm": 1.1332674026489258, "learning_rate": 1.5823461979365686e-05, "loss": 0.2719, "step": 4144 }, { "epoch": 1.2671965759706512, "grad_norm": 1.1130053997039795, "learning_rate": 1.5827283148643484e-05, "loss": 0.2721, "step": 4145 }, { "epoch": 1.267502292876796, "grad_norm": 1.4446762800216675, "learning_rate": 1.5831104317921283e-05, "loss": 0.3066, "step": 4146 }, { "epoch": 1.267808009782941, "grad_norm": 0.3990901708602905, "learning_rate": 1.5834925487199082e-05, "loss": 0.1651, "step": 4147 }, { "epoch": 1.2681137266890858, "grad_norm": 0.4220849275588989, "learning_rate": 1.583874665647688e-05, "loss": 0.1195, "step": 4148 }, { "epoch": 1.2684194435952307, "grad_norm": 0.4450114965438843, "learning_rate": 1.584256782575468e-05, "loss": 0.1231, "step": 4149 }, { "epoch": 1.2687251605013756, "grad_norm": 0.28868818283081055, "learning_rate": 1.5846388995032478e-05, "loss": 0.0754, "step": 4150 }, { "epoch": 1.2690308774075207, "grad_norm": 0.43125924468040466, "learning_rate": 1.5850210164310277e-05, "loss": 0.0719, "step": 4151 }, { "epoch": 1.2693365943136654, "grad_norm": 0.3544538617134094, "learning_rate": 1.5854031333588076e-05, "loss": 0.0937, "step": 4152 }, { "epoch": 1.2696423112198105, "grad_norm": 0.33841386437416077, "learning_rate": 1.5857852502865874e-05, "loss": 0.0658, "step": 4153 }, { "epoch": 1.2699480281259554, "grad_norm": 0.3981311619281769, "learning_rate": 1.5861673672143677e-05, "loss": 0.1024, "step": 4154 }, { "epoch": 1.2702537450321003, "grad_norm": 0.37528732419013977, "learning_rate": 1.5865494841421475e-05, "loss": 0.1083, "step": 4155 }, { "epoch": 1.2705594619382452, "grad_norm": 0.27345675230026245, "learning_rate": 1.5869316010699274e-05, "loss": 0.0759, "step": 4156 }, { "epoch": 1.27086517884439, "grad_norm": 0.4272817075252533, "learning_rate": 1.5873137179977073e-05, "loss": 0.1336, "step": 4157 }, { "epoch": 1.271170895750535, "grad_norm": 0.36158037185668945, "learning_rate": 1.587695834925487e-05, "loss": 0.118, "step": 4158 }, { "epoch": 1.2714766126566799, "grad_norm": 0.5444701313972473, "learning_rate": 1.588077951853267e-05, "loss": 0.1113, "step": 4159 }, { "epoch": 1.2717823295628248, "grad_norm": 0.5106292963027954, "learning_rate": 1.588460068781047e-05, "loss": 0.1504, "step": 4160 }, { "epoch": 1.2720880464689697, "grad_norm": 0.5116832256317139, "learning_rate": 1.5888421857088268e-05, "loss": 0.1553, "step": 4161 }, { "epoch": 1.2723937633751146, "grad_norm": 0.8638631105422974, "learning_rate": 1.589224302636607e-05, "loss": 0.2132, "step": 4162 }, { "epoch": 1.2726994802812595, "grad_norm": 0.6956072449684143, "learning_rate": 1.589606419564387e-05, "loss": 0.159, "step": 4163 }, { "epoch": 1.2730051971874046, "grad_norm": 0.834652841091156, "learning_rate": 1.5899885364921667e-05, "loss": 0.2295, "step": 4164 }, { "epoch": 1.2733109140935492, "grad_norm": 0.6146866083145142, "learning_rate": 1.5903706534199466e-05, "loss": 0.1877, "step": 4165 }, { "epoch": 1.2736166309996944, "grad_norm": 0.7087380290031433, "learning_rate": 1.5907527703477265e-05, "loss": 0.2117, "step": 4166 }, { "epoch": 1.2739223479058392, "grad_norm": 0.887996256351471, "learning_rate": 1.5911348872755064e-05, "loss": 0.2406, "step": 4167 }, { "epoch": 1.2742280648119841, "grad_norm": 1.0135014057159424, "learning_rate": 1.5915170042032862e-05, "loss": 0.2554, "step": 4168 }, { "epoch": 1.274533781718129, "grad_norm": 1.827491044998169, "learning_rate": 1.591899121131066e-05, "loss": 0.2597, "step": 4169 }, { "epoch": 1.274839498624274, "grad_norm": 1.2871718406677246, "learning_rate": 1.5922812380588463e-05, "loss": 0.2514, "step": 4170 }, { "epoch": 1.2751452155304188, "grad_norm": 1.491492748260498, "learning_rate": 1.5926633549866262e-05, "loss": 0.326, "step": 4171 }, { "epoch": 1.2754509324365637, "grad_norm": 0.4316195249557495, "learning_rate": 1.593045471914406e-05, "loss": 0.1804, "step": 4172 }, { "epoch": 1.2757566493427086, "grad_norm": 0.5131692290306091, "learning_rate": 1.593427588842186e-05, "loss": 0.102, "step": 4173 }, { "epoch": 1.2760623662488535, "grad_norm": 1.006194829940796, "learning_rate": 1.5938097057699658e-05, "loss": 0.0741, "step": 4174 }, { "epoch": 1.2763680831549984, "grad_norm": 0.37987223267555237, "learning_rate": 1.5941918226977457e-05, "loss": 0.0678, "step": 4175 }, { "epoch": 1.2766738000611433, "grad_norm": 0.37949803471565247, "learning_rate": 1.5945739396255256e-05, "loss": 0.0813, "step": 4176 }, { "epoch": 1.2769795169672884, "grad_norm": 0.36859527230262756, "learning_rate": 1.5949560565533054e-05, "loss": 0.0851, "step": 4177 }, { "epoch": 1.277285233873433, "grad_norm": 0.38190022110939026, "learning_rate": 1.5953381734810853e-05, "loss": 0.1121, "step": 4178 }, { "epoch": 1.2775909507795782, "grad_norm": 0.3767496943473816, "learning_rate": 1.5957202904088652e-05, "loss": 0.0773, "step": 4179 }, { "epoch": 1.277896667685723, "grad_norm": 0.6740476489067078, "learning_rate": 1.596102407336645e-05, "loss": 0.0833, "step": 4180 }, { "epoch": 1.278202384591868, "grad_norm": 0.42833760380744934, "learning_rate": 1.596484524264425e-05, "loss": 0.0805, "step": 4181 }, { "epoch": 1.2785081014980129, "grad_norm": 0.41021302342414856, "learning_rate": 1.5968666411922048e-05, "loss": 0.1213, "step": 4182 }, { "epoch": 1.2788138184041578, "grad_norm": 0.4863133132457733, "learning_rate": 1.5972487581199847e-05, "loss": 0.1073, "step": 4183 }, { "epoch": 1.2791195353103026, "grad_norm": 0.49638915061950684, "learning_rate": 1.5976308750477646e-05, "loss": 0.1116, "step": 4184 }, { "epoch": 1.2794252522164475, "grad_norm": 0.4518325626850128, "learning_rate": 1.5980129919755448e-05, "loss": 0.1171, "step": 4185 }, { "epoch": 1.2797309691225924, "grad_norm": 0.6344385147094727, "learning_rate": 1.5983951089033247e-05, "loss": 0.1539, "step": 4186 }, { "epoch": 1.2800366860287373, "grad_norm": 0.9830924272537231, "learning_rate": 1.5987772258311045e-05, "loss": 0.1659, "step": 4187 }, { "epoch": 1.2803424029348822, "grad_norm": 0.7832651734352112, "learning_rate": 1.5991593427588844e-05, "loss": 0.1919, "step": 4188 }, { "epoch": 1.280648119841027, "grad_norm": 0.9067022800445557, "learning_rate": 1.5995414596866643e-05, "loss": 0.2329, "step": 4189 }, { "epoch": 1.2809538367471722, "grad_norm": 0.8604720830917358, "learning_rate": 1.599923576614444e-05, "loss": 0.2015, "step": 4190 }, { "epoch": 1.281259553653317, "grad_norm": 2.857802391052246, "learning_rate": 1.600305693542224e-05, "loss": 0.2175, "step": 4191 }, { "epoch": 1.281565270559462, "grad_norm": 1.1489886045455933, "learning_rate": 1.600687810470004e-05, "loss": 0.1946, "step": 4192 }, { "epoch": 1.281870987465607, "grad_norm": 1.3914732933044434, "learning_rate": 1.6010699273977838e-05, "loss": 0.2718, "step": 4193 }, { "epoch": 1.2821767043717518, "grad_norm": 0.9937353730201721, "learning_rate": 1.6014520443255637e-05, "loss": 0.2428, "step": 4194 }, { "epoch": 1.2824824212778967, "grad_norm": 1.3116315603256226, "learning_rate": 1.6018341612533435e-05, "loss": 0.3077, "step": 4195 }, { "epoch": 1.2827881381840416, "grad_norm": 1.6045912504196167, "learning_rate": 1.6022162781811234e-05, "loss": 0.3749, "step": 4196 }, { "epoch": 1.2830938550901865, "grad_norm": 0.5939640998840332, "learning_rate": 1.6025983951089033e-05, "loss": 0.1628, "step": 4197 }, { "epoch": 1.2833995719963314, "grad_norm": 0.4478054344654083, "learning_rate": 1.602980512036683e-05, "loss": 0.1, "step": 4198 }, { "epoch": 1.2837052889024763, "grad_norm": 0.3189269006252289, "learning_rate": 1.603362628964463e-05, "loss": 0.0891, "step": 4199 }, { "epoch": 1.2840110058086212, "grad_norm": 0.6877144575119019, "learning_rate": 1.603744745892243e-05, "loss": 0.095, "step": 4200 }, { "epoch": 1.284316722714766, "grad_norm": 0.3309599757194519, "learning_rate": 1.604126862820023e-05, "loss": 0.0757, "step": 4201 }, { "epoch": 1.284622439620911, "grad_norm": 0.3564668297767639, "learning_rate": 1.604508979747803e-05, "loss": 0.0901, "step": 4202 }, { "epoch": 1.284928156527056, "grad_norm": 0.5964449644088745, "learning_rate": 1.604891096675583e-05, "loss": 0.086, "step": 4203 }, { "epoch": 1.2852338734332007, "grad_norm": 0.6139293909072876, "learning_rate": 1.6052732136033627e-05, "loss": 0.081, "step": 4204 }, { "epoch": 1.2855395903393458, "grad_norm": 0.45233115553855896, "learning_rate": 1.6056553305311426e-05, "loss": 0.1243, "step": 4205 }, { "epoch": 1.2858453072454907, "grad_norm": 0.5634717345237732, "learning_rate": 1.6060374474589225e-05, "loss": 0.1227, "step": 4206 }, { "epoch": 1.2861510241516356, "grad_norm": 1.0095669031143188, "learning_rate": 1.6064195643867024e-05, "loss": 0.0951, "step": 4207 }, { "epoch": 1.2864567410577805, "grad_norm": 0.5161130428314209, "learning_rate": 1.6068016813144822e-05, "loss": 0.1223, "step": 4208 }, { "epoch": 1.2867624579639254, "grad_norm": 0.7067562937736511, "learning_rate": 1.607183798242262e-05, "loss": 0.1483, "step": 4209 }, { "epoch": 1.2870681748700703, "grad_norm": 0.6007308959960938, "learning_rate": 1.607565915170042e-05, "loss": 0.1107, "step": 4210 }, { "epoch": 1.2873738917762152, "grad_norm": 0.6930946707725525, "learning_rate": 1.607948032097822e-05, "loss": 0.1851, "step": 4211 }, { "epoch": 1.28767960868236, "grad_norm": 0.7180156111717224, "learning_rate": 1.6083301490256017e-05, "loss": 0.1966, "step": 4212 }, { "epoch": 1.287985325588505, "grad_norm": 0.7618272304534912, "learning_rate": 1.6087122659533816e-05, "loss": 0.2189, "step": 4213 }, { "epoch": 1.2882910424946499, "grad_norm": 1.3145208358764648, "learning_rate": 1.6090943828811615e-05, "loss": 0.2766, "step": 4214 }, { "epoch": 1.2885967594007948, "grad_norm": 0.710412859916687, "learning_rate": 1.6094764998089414e-05, "loss": 0.2134, "step": 4215 }, { "epoch": 1.2889024763069399, "grad_norm": 0.8298164010047913, "learning_rate": 1.6098586167367216e-05, "loss": 0.2058, "step": 4216 }, { "epoch": 1.2892081932130846, "grad_norm": 0.8479624390602112, "learning_rate": 1.6102407336645014e-05, "loss": 0.2483, "step": 4217 }, { "epoch": 1.2895139101192297, "grad_norm": 0.9438287019729614, "learning_rate": 1.6106228505922813e-05, "loss": 0.2475, "step": 4218 }, { "epoch": 1.2898196270253746, "grad_norm": 1.483426809310913, "learning_rate": 1.6110049675200612e-05, "loss": 0.2648, "step": 4219 }, { "epoch": 1.2901253439315195, "grad_norm": 2.221639394760132, "learning_rate": 1.611387084447841e-05, "loss": 0.2477, "step": 4220 }, { "epoch": 1.2904310608376643, "grad_norm": 1.8103686571121216, "learning_rate": 1.611769201375621e-05, "loss": 0.3383, "step": 4221 }, { "epoch": 1.2907367777438092, "grad_norm": 0.4505755305290222, "learning_rate": 1.6121513183034008e-05, "loss": 0.1695, "step": 4222 }, { "epoch": 1.2910424946499541, "grad_norm": 0.3504200875759125, "learning_rate": 1.6125334352311807e-05, "loss": 0.1076, "step": 4223 }, { "epoch": 1.291348211556099, "grad_norm": 0.5245065689086914, "learning_rate": 1.6129155521589606e-05, "loss": 0.1144, "step": 4224 }, { "epoch": 1.291653928462244, "grad_norm": 0.4144364297389984, "learning_rate": 1.6132976690867404e-05, "loss": 0.0695, "step": 4225 }, { "epoch": 1.2919596453683888, "grad_norm": 0.342710018157959, "learning_rate": 1.6136797860145203e-05, "loss": 0.0711, "step": 4226 }, { "epoch": 1.2922653622745337, "grad_norm": 0.2826671302318573, "learning_rate": 1.6140619029423002e-05, "loss": 0.0684, "step": 4227 }, { "epoch": 1.2925710791806786, "grad_norm": 0.5440459251403809, "learning_rate": 1.61444401987008e-05, "loss": 0.0819, "step": 4228 }, { "epoch": 1.2928767960868237, "grad_norm": 0.2779577672481537, "learning_rate": 1.61482613679786e-05, "loss": 0.0742, "step": 4229 }, { "epoch": 1.2931825129929684, "grad_norm": 0.48501530289649963, "learning_rate": 1.6152082537256398e-05, "loss": 0.1114, "step": 4230 }, { "epoch": 1.2934882298991135, "grad_norm": 0.3702336251735687, "learning_rate": 1.61559037065342e-05, "loss": 0.1056, "step": 4231 }, { "epoch": 1.2937939468052584, "grad_norm": 0.43722736835479736, "learning_rate": 1.6159724875812e-05, "loss": 0.1052, "step": 4232 }, { "epoch": 1.2940996637114033, "grad_norm": 0.4143064320087433, "learning_rate": 1.6163546045089798e-05, "loss": 0.1134, "step": 4233 }, { "epoch": 1.2944053806175482, "grad_norm": 0.54735267162323, "learning_rate": 1.6167367214367596e-05, "loss": 0.1343, "step": 4234 }, { "epoch": 1.294711097523693, "grad_norm": 0.6242133378982544, "learning_rate": 1.6171188383645395e-05, "loss": 0.1981, "step": 4235 }, { "epoch": 1.295016814429838, "grad_norm": 0.45344120264053345, "learning_rate": 1.6175009552923194e-05, "loss": 0.1317, "step": 4236 }, { "epoch": 1.2953225313359829, "grad_norm": 0.6744596362113953, "learning_rate": 1.6178830722200993e-05, "loss": 0.2516, "step": 4237 }, { "epoch": 1.2956282482421277, "grad_norm": 0.665655255317688, "learning_rate": 1.618265189147879e-05, "loss": 0.1981, "step": 4238 }, { "epoch": 1.2959339651482726, "grad_norm": 0.6251403093338013, "learning_rate": 1.618647306075659e-05, "loss": 0.2182, "step": 4239 }, { "epoch": 1.2962396820544175, "grad_norm": 0.7848618030548096, "learning_rate": 1.6190294230034392e-05, "loss": 0.2442, "step": 4240 }, { "epoch": 1.2965453989605624, "grad_norm": 0.872360348701477, "learning_rate": 1.619411539931219e-05, "loss": 0.2446, "step": 4241 }, { "epoch": 1.2968511158667075, "grad_norm": 0.8194075226783752, "learning_rate": 1.619793656858999e-05, "loss": 0.2339, "step": 4242 }, { "epoch": 1.2971568327728522, "grad_norm": 0.7862655520439148, "learning_rate": 1.620175773786779e-05, "loss": 0.242, "step": 4243 }, { "epoch": 1.2974625496789973, "grad_norm": 1.1580733060836792, "learning_rate": 1.6205578907145587e-05, "loss": 0.2816, "step": 4244 }, { "epoch": 1.2977682665851422, "grad_norm": 0.999839723110199, "learning_rate": 1.6209400076423386e-05, "loss": 0.2388, "step": 4245 }, { "epoch": 1.298073983491287, "grad_norm": 1.636009931564331, "learning_rate": 1.6213221245701185e-05, "loss": 0.3099, "step": 4246 }, { "epoch": 1.298379700397432, "grad_norm": 0.4082302153110504, "learning_rate": 1.6217042414978987e-05, "loss": 0.1529, "step": 4247 }, { "epoch": 1.298685417303577, "grad_norm": 0.3818858861923218, "learning_rate": 1.6220863584256786e-05, "loss": 0.1007, "step": 4248 }, { "epoch": 1.2989911342097218, "grad_norm": 0.4326591491699219, "learning_rate": 1.6224684753534584e-05, "loss": 0.0832, "step": 4249 }, { "epoch": 1.2992968511158667, "grad_norm": 0.5648101568222046, "learning_rate": 1.6228505922812383e-05, "loss": 0.0765, "step": 4250 }, { "epoch": 1.2996025680220116, "grad_norm": 0.27544474601745605, "learning_rate": 1.6232327092090182e-05, "loss": 0.0647, "step": 4251 }, { "epoch": 1.2999082849281565, "grad_norm": 0.4549129605293274, "learning_rate": 1.623614826136798e-05, "loss": 0.1189, "step": 4252 }, { "epoch": 1.3002140018343014, "grad_norm": 0.43791383504867554, "learning_rate": 1.623996943064578e-05, "loss": 0.0892, "step": 4253 }, { "epoch": 1.3005197187404463, "grad_norm": 0.334665983915329, "learning_rate": 1.6243790599923578e-05, "loss": 0.073, "step": 4254 }, { "epoch": 1.3008254356465914, "grad_norm": 0.7160771489143372, "learning_rate": 1.6247611769201377e-05, "loss": 0.1478, "step": 4255 }, { "epoch": 1.301131152552736, "grad_norm": 0.533931314945221, "learning_rate": 1.6251432938479176e-05, "loss": 0.1092, "step": 4256 }, { "epoch": 1.3014368694588812, "grad_norm": 0.42462098598480225, "learning_rate": 1.6255254107756974e-05, "loss": 0.0902, "step": 4257 }, { "epoch": 1.301742586365026, "grad_norm": 0.5780829191207886, "learning_rate": 1.6259075277034773e-05, "loss": 0.1449, "step": 4258 }, { "epoch": 1.302048303271171, "grad_norm": 0.47487106919288635, "learning_rate": 1.6262896446312572e-05, "loss": 0.1176, "step": 4259 }, { "epoch": 1.3023540201773158, "grad_norm": 0.6973261833190918, "learning_rate": 1.626671761559037e-05, "loss": 0.1408, "step": 4260 }, { "epoch": 1.3026597370834607, "grad_norm": 0.6700336337089539, "learning_rate": 1.627053878486817e-05, "loss": 0.1406, "step": 4261 }, { "epoch": 1.3029654539896056, "grad_norm": 0.6828939318656921, "learning_rate": 1.6274359954145968e-05, "loss": 0.1588, "step": 4262 }, { "epoch": 1.3032711708957505, "grad_norm": 1.9856464862823486, "learning_rate": 1.627818112342377e-05, "loss": 0.1807, "step": 4263 }, { "epoch": 1.3035768878018954, "grad_norm": 1.7725322246551514, "learning_rate": 1.628200229270157e-05, "loss": 0.2068, "step": 4264 }, { "epoch": 1.3038826047080403, "grad_norm": 1.2852046489715576, "learning_rate": 1.6285823461979368e-05, "loss": 0.2072, "step": 4265 }, { "epoch": 1.3041883216141852, "grad_norm": 1.1067461967468262, "learning_rate": 1.6289644631257166e-05, "loss": 0.24, "step": 4266 }, { "epoch": 1.30449403852033, "grad_norm": 1.1525146961212158, "learning_rate": 1.6293465800534965e-05, "loss": 0.2586, "step": 4267 }, { "epoch": 1.3047997554264752, "grad_norm": 1.1664133071899414, "learning_rate": 1.6297286969812764e-05, "loss": 0.2685, "step": 4268 }, { "epoch": 1.3051054723326199, "grad_norm": 1.9363532066345215, "learning_rate": 1.6301108139090563e-05, "loss": 0.2769, "step": 4269 }, { "epoch": 1.305411189238765, "grad_norm": 1.3658287525177002, "learning_rate": 1.630492930836836e-05, "loss": 0.2736, "step": 4270 }, { "epoch": 1.3057169061449099, "grad_norm": 2.9449501037597656, "learning_rate": 1.630875047764616e-05, "loss": 0.3793, "step": 4271 }, { "epoch": 1.3060226230510548, "grad_norm": 0.47758424282073975, "learning_rate": 1.631257164692396e-05, "loss": 0.1812, "step": 4272 }, { "epoch": 1.3063283399571997, "grad_norm": 0.42777350544929504, "learning_rate": 1.6316392816201758e-05, "loss": 0.0978, "step": 4273 }, { "epoch": 1.3066340568633446, "grad_norm": 0.3374486565589905, "learning_rate": 1.6320213985479556e-05, "loss": 0.091, "step": 4274 }, { "epoch": 1.3069397737694894, "grad_norm": 0.5066138505935669, "learning_rate": 1.6324035154757355e-05, "loss": 0.0849, "step": 4275 }, { "epoch": 1.3072454906756343, "grad_norm": 0.3090674579143524, "learning_rate": 1.6327856324035154e-05, "loss": 0.0713, "step": 4276 }, { "epoch": 1.3075512075817792, "grad_norm": 0.429105669260025, "learning_rate": 1.6331677493312953e-05, "loss": 0.0839, "step": 4277 }, { "epoch": 1.3078569244879241, "grad_norm": 0.6032496690750122, "learning_rate": 1.6335498662590755e-05, "loss": 0.0673, "step": 4278 }, { "epoch": 1.308162641394069, "grad_norm": 0.5613090395927429, "learning_rate": 1.6339319831868554e-05, "loss": 0.0639, "step": 4279 }, { "epoch": 1.308468358300214, "grad_norm": 0.458722859621048, "learning_rate": 1.6343141001146352e-05, "loss": 0.1116, "step": 4280 }, { "epoch": 1.308774075206359, "grad_norm": 0.48220011591911316, "learning_rate": 1.634696217042415e-05, "loss": 0.0799, "step": 4281 }, { "epoch": 1.3090797921125037, "grad_norm": 0.4863326847553253, "learning_rate": 1.635078333970195e-05, "loss": 0.1121, "step": 4282 }, { "epoch": 1.3093855090186488, "grad_norm": 0.5389482975006104, "learning_rate": 1.635460450897975e-05, "loss": 0.1149, "step": 4283 }, { "epoch": 1.3096912259247937, "grad_norm": 0.5480302572250366, "learning_rate": 1.6358425678257547e-05, "loss": 0.1156, "step": 4284 }, { "epoch": 1.3099969428309386, "grad_norm": 0.7444908022880554, "learning_rate": 1.6362246847535346e-05, "loss": 0.14, "step": 4285 }, { "epoch": 1.3103026597370835, "grad_norm": 0.5457068085670471, "learning_rate": 1.6366068016813145e-05, "loss": 0.1586, "step": 4286 }, { "epoch": 1.3106083766432284, "grad_norm": 0.7477599382400513, "learning_rate": 1.6369889186090944e-05, "loss": 0.1566, "step": 4287 }, { "epoch": 1.3109140935493733, "grad_norm": 0.7039673328399658, "learning_rate": 1.6373710355368742e-05, "loss": 0.2056, "step": 4288 }, { "epoch": 1.3112198104555182, "grad_norm": 0.8118571639060974, "learning_rate": 1.637753152464654e-05, "loss": 0.2116, "step": 4289 }, { "epoch": 1.311525527361663, "grad_norm": 0.701614260673523, "learning_rate": 1.638135269392434e-05, "loss": 0.2195, "step": 4290 }, { "epoch": 1.311831244267808, "grad_norm": 0.6155600547790527, "learning_rate": 1.638517386320214e-05, "loss": 0.2184, "step": 4291 }, { "epoch": 1.3121369611739528, "grad_norm": 1.2075769901275635, "learning_rate": 1.6388995032479937e-05, "loss": 0.1998, "step": 4292 }, { "epoch": 1.3124426780800977, "grad_norm": 0.9551590085029602, "learning_rate": 1.6392816201757736e-05, "loss": 0.2459, "step": 4293 }, { "epoch": 1.3127483949862429, "grad_norm": 0.9131357669830322, "learning_rate": 1.6396637371035538e-05, "loss": 0.2325, "step": 4294 }, { "epoch": 1.3130541118923875, "grad_norm": 2.4984371662139893, "learning_rate": 1.6400458540313337e-05, "loss": 0.2658, "step": 4295 }, { "epoch": 1.3133598287985326, "grad_norm": 2.3338918685913086, "learning_rate": 1.6404279709591136e-05, "loss": 0.3664, "step": 4296 }, { "epoch": 1.3136655457046775, "grad_norm": 0.5880306959152222, "learning_rate": 1.6408100878868934e-05, "loss": 0.1903, "step": 4297 }, { "epoch": 1.3139712626108224, "grad_norm": 0.3588714003562927, "learning_rate": 1.6411922048146733e-05, "loss": 0.1199, "step": 4298 }, { "epoch": 1.3142769795169673, "grad_norm": 0.32108643651008606, "learning_rate": 1.6415743217424532e-05, "loss": 0.0937, "step": 4299 }, { "epoch": 1.3145826964231122, "grad_norm": 0.34557491540908813, "learning_rate": 1.641956438670233e-05, "loss": 0.0994, "step": 4300 }, { "epoch": 1.314888413329257, "grad_norm": 0.6049674153327942, "learning_rate": 1.642338555598013e-05, "loss": 0.0999, "step": 4301 }, { "epoch": 1.315194130235402, "grad_norm": 0.32579824328422546, "learning_rate": 1.6427206725257928e-05, "loss": 0.0951, "step": 4302 }, { "epoch": 1.3154998471415469, "grad_norm": 0.36553266644477844, "learning_rate": 1.6431027894535727e-05, "loss": 0.0682, "step": 4303 }, { "epoch": 1.3158055640476918, "grad_norm": 0.39298632740974426, "learning_rate": 1.6434849063813526e-05, "loss": 0.095, "step": 4304 }, { "epoch": 1.3161112809538367, "grad_norm": 0.43242940306663513, "learning_rate": 1.6438670233091324e-05, "loss": 0.0726, "step": 4305 }, { "epoch": 1.3164169978599816, "grad_norm": 0.4386289417743683, "learning_rate": 1.6442491402369123e-05, "loss": 0.0913, "step": 4306 }, { "epoch": 1.3167227147661267, "grad_norm": 0.6126132607460022, "learning_rate": 1.6446312571646922e-05, "loss": 0.0845, "step": 4307 }, { "epoch": 1.3170284316722713, "grad_norm": 0.5504536032676697, "learning_rate": 1.645013374092472e-05, "loss": 0.1073, "step": 4308 }, { "epoch": 1.3173341485784165, "grad_norm": 1.1543693542480469, "learning_rate": 1.6453954910202523e-05, "loss": 0.1596, "step": 4309 }, { "epoch": 1.3176398654845614, "grad_norm": 0.43652305006980896, "learning_rate": 1.645777607948032e-05, "loss": 0.1377, "step": 4310 }, { "epoch": 1.3179455823907062, "grad_norm": 0.5005825161933899, "learning_rate": 1.646159724875812e-05, "loss": 0.1279, "step": 4311 }, { "epoch": 1.3182512992968511, "grad_norm": 0.6188924312591553, "learning_rate": 1.646541841803592e-05, "loss": 0.1665, "step": 4312 }, { "epoch": 1.318557016202996, "grad_norm": 1.5773636102676392, "learning_rate": 1.6469239587313718e-05, "loss": 0.212, "step": 4313 }, { "epoch": 1.318862733109141, "grad_norm": 0.8269373774528503, "learning_rate": 1.6473060756591516e-05, "loss": 0.2171, "step": 4314 }, { "epoch": 1.3191684500152858, "grad_norm": 0.9545656442642212, "learning_rate": 1.6476881925869315e-05, "loss": 0.2022, "step": 4315 }, { "epoch": 1.3194741669214307, "grad_norm": 2.165027379989624, "learning_rate": 1.6480703095147114e-05, "loss": 0.2584, "step": 4316 }, { "epoch": 1.3197798838275756, "grad_norm": 1.4532722234725952, "learning_rate": 1.6484524264424913e-05, "loss": 0.2459, "step": 4317 }, { "epoch": 1.3200856007337205, "grad_norm": 1.1318527460098267, "learning_rate": 1.6488345433702715e-05, "loss": 0.2628, "step": 4318 }, { "epoch": 1.3203913176398654, "grad_norm": 1.2358843088150024, "learning_rate": 1.6492166602980513e-05, "loss": 0.2823, "step": 4319 }, { "epoch": 1.3206970345460105, "grad_norm": 2.7148478031158447, "learning_rate": 1.6495987772258312e-05, "loss": 0.2964, "step": 4320 }, { "epoch": 1.3210027514521552, "grad_norm": 2.7043585777282715, "learning_rate": 1.649980894153611e-05, "loss": 0.3183, "step": 4321 }, { "epoch": 1.3213084683583003, "grad_norm": 0.750073254108429, "learning_rate": 1.650363011081391e-05, "loss": 0.163, "step": 4322 }, { "epoch": 1.3216141852644452, "grad_norm": 0.414447158575058, "learning_rate": 1.650745128009171e-05, "loss": 0.1128, "step": 4323 }, { "epoch": 1.32191990217059, "grad_norm": 0.3874996304512024, "learning_rate": 1.6511272449369507e-05, "loss": 0.082, "step": 4324 }, { "epoch": 1.322225619076735, "grad_norm": 0.42432349920272827, "learning_rate": 1.651509361864731e-05, "loss": 0.0761, "step": 4325 }, { "epoch": 1.3225313359828799, "grad_norm": 0.6645374894142151, "learning_rate": 1.6518914787925108e-05, "loss": 0.1224, "step": 4326 }, { "epoch": 1.3228370528890248, "grad_norm": 0.4477441906929016, "learning_rate": 1.6522735957202907e-05, "loss": 0.0942, "step": 4327 }, { "epoch": 1.3231427697951696, "grad_norm": 0.30093973875045776, "learning_rate": 1.6526557126480706e-05, "loss": 0.0647, "step": 4328 }, { "epoch": 1.3234484867013145, "grad_norm": 0.4194156527519226, "learning_rate": 1.6530378295758504e-05, "loss": 0.0592, "step": 4329 }, { "epoch": 1.3237542036074594, "grad_norm": 0.3571324944496155, "learning_rate": 1.6534199465036303e-05, "loss": 0.0944, "step": 4330 }, { "epoch": 1.3240599205136043, "grad_norm": 0.4157802164554596, "learning_rate": 1.6538020634314102e-05, "loss": 0.0694, "step": 4331 }, { "epoch": 1.3243656374197492, "grad_norm": 0.466672420501709, "learning_rate": 1.65418418035919e-05, "loss": 0.1185, "step": 4332 }, { "epoch": 1.3246713543258943, "grad_norm": 0.5717076659202576, "learning_rate": 1.65456629728697e-05, "loss": 0.1162, "step": 4333 }, { "epoch": 1.324977071232039, "grad_norm": 0.40371379256248474, "learning_rate": 1.6549484142147498e-05, "loss": 0.1144, "step": 4334 }, { "epoch": 1.3252827881381841, "grad_norm": 0.6642513275146484, "learning_rate": 1.6553305311425297e-05, "loss": 0.1751, "step": 4335 }, { "epoch": 1.325588505044329, "grad_norm": 0.6191803812980652, "learning_rate": 1.6557126480703096e-05, "loss": 0.1644, "step": 4336 }, { "epoch": 1.325894221950474, "grad_norm": 0.49579718708992004, "learning_rate": 1.6560947649980894e-05, "loss": 0.1875, "step": 4337 }, { "epoch": 1.3261999388566188, "grad_norm": 1.1201220750808716, "learning_rate": 1.6564768819258693e-05, "loss": 0.1953, "step": 4338 }, { "epoch": 1.3265056557627637, "grad_norm": 1.1902496814727783, "learning_rate": 1.6568589988536492e-05, "loss": 0.2118, "step": 4339 }, { "epoch": 1.3268113726689086, "grad_norm": 1.1501621007919312, "learning_rate": 1.6572411157814294e-05, "loss": 0.2276, "step": 4340 }, { "epoch": 1.3271170895750535, "grad_norm": 1.1200960874557495, "learning_rate": 1.6576232327092093e-05, "loss": 0.2258, "step": 4341 }, { "epoch": 1.3274228064811984, "grad_norm": 0.8742534518241882, "learning_rate": 1.658005349636989e-05, "loss": 0.2109, "step": 4342 }, { "epoch": 1.3277285233873433, "grad_norm": 1.0558254718780518, "learning_rate": 1.658387466564769e-05, "loss": 0.2403, "step": 4343 }, { "epoch": 1.3280342402934882, "grad_norm": 1.3200479745864868, "learning_rate": 1.658769583492549e-05, "loss": 0.3144, "step": 4344 }, { "epoch": 1.328339957199633, "grad_norm": 1.8608819246292114, "learning_rate": 1.6591517004203288e-05, "loss": 0.3131, "step": 4345 }, { "epoch": 1.3286456741057782, "grad_norm": 3.8289437294006348, "learning_rate": 1.6595338173481086e-05, "loss": 0.3556, "step": 4346 }, { "epoch": 1.3289513910119228, "grad_norm": 0.4084533452987671, "learning_rate": 1.6599159342758885e-05, "loss": 0.1837, "step": 4347 }, { "epoch": 1.329257107918068, "grad_norm": 0.4112188518047333, "learning_rate": 1.6602980512036684e-05, "loss": 0.0885, "step": 4348 }, { "epoch": 1.3295628248242128, "grad_norm": 0.29666778445243835, "learning_rate": 1.6606801681314483e-05, "loss": 0.088, "step": 4349 }, { "epoch": 1.3298685417303577, "grad_norm": 0.33214810490608215, "learning_rate": 1.661062285059228e-05, "loss": 0.0818, "step": 4350 }, { "epoch": 1.3301742586365026, "grad_norm": 0.30293792486190796, "learning_rate": 1.661444401987008e-05, "loss": 0.1014, "step": 4351 }, { "epoch": 1.3304799755426475, "grad_norm": 0.29879051446914673, "learning_rate": 1.661826518914788e-05, "loss": 0.0812, "step": 4352 }, { "epoch": 1.3307856924487924, "grad_norm": 0.4633210301399231, "learning_rate": 1.6622086358425678e-05, "loss": 0.0758, "step": 4353 }, { "epoch": 1.3310914093549373, "grad_norm": 0.3009864091873169, "learning_rate": 1.6625907527703476e-05, "loss": 0.0916, "step": 4354 }, { "epoch": 1.3313971262610822, "grad_norm": 0.4621523916721344, "learning_rate": 1.6629728696981275e-05, "loss": 0.1008, "step": 4355 }, { "epoch": 1.331702843167227, "grad_norm": 0.4815397262573242, "learning_rate": 1.6633549866259077e-05, "loss": 0.097, "step": 4356 }, { "epoch": 1.332008560073372, "grad_norm": 0.4491938650608063, "learning_rate": 1.6637371035536876e-05, "loss": 0.1043, "step": 4357 }, { "epoch": 1.3323142769795169, "grad_norm": 0.33653590083122253, "learning_rate": 1.6641192204814675e-05, "loss": 0.0859, "step": 4358 }, { "epoch": 1.332619993885662, "grad_norm": 0.4932832419872284, "learning_rate": 1.6645013374092473e-05, "loss": 0.1354, "step": 4359 }, { "epoch": 1.3329257107918067, "grad_norm": 0.71062833070755, "learning_rate": 1.6648834543370272e-05, "loss": 0.1714, "step": 4360 }, { "epoch": 1.3332314276979518, "grad_norm": 0.6230425238609314, "learning_rate": 1.665265571264807e-05, "loss": 0.1568, "step": 4361 }, { "epoch": 1.3335371446040967, "grad_norm": 0.5210211873054504, "learning_rate": 1.665647688192587e-05, "loss": 0.1756, "step": 4362 }, { "epoch": 1.3338428615102416, "grad_norm": 0.5897666215896606, "learning_rate": 1.666029805120367e-05, "loss": 0.1949, "step": 4363 }, { "epoch": 1.3341485784163865, "grad_norm": 0.6573787927627563, "learning_rate": 1.6664119220481467e-05, "loss": 0.1821, "step": 4364 }, { "epoch": 1.3344542953225313, "grad_norm": 0.6695916056632996, "learning_rate": 1.6667940389759266e-05, "loss": 0.2473, "step": 4365 }, { "epoch": 1.3347600122286762, "grad_norm": 1.2150026559829712, "learning_rate": 1.6671761559037065e-05, "loss": 0.2129, "step": 4366 }, { "epoch": 1.3350657291348211, "grad_norm": 1.2347335815429688, "learning_rate": 1.6675582728314863e-05, "loss": 0.2342, "step": 4367 }, { "epoch": 1.335371446040966, "grad_norm": 1.306232213973999, "learning_rate": 1.6679403897592662e-05, "loss": 0.2609, "step": 4368 }, { "epoch": 1.335677162947111, "grad_norm": 1.5707966089248657, "learning_rate": 1.668322506687046e-05, "loss": 0.2982, "step": 4369 }, { "epoch": 1.3359828798532558, "grad_norm": 1.0185120105743408, "learning_rate": 1.668704623614826e-05, "loss": 0.2753, "step": 4370 }, { "epoch": 1.3362885967594007, "grad_norm": 1.165470004081726, "learning_rate": 1.6690867405426062e-05, "loss": 0.2876, "step": 4371 }, { "epoch": 1.3365943136655458, "grad_norm": 0.5372248291969299, "learning_rate": 1.669468857470386e-05, "loss": 0.1784, "step": 4372 }, { "epoch": 1.3369000305716905, "grad_norm": 0.29437947273254395, "learning_rate": 1.669850974398166e-05, "loss": 0.0963, "step": 4373 }, { "epoch": 1.3372057474778356, "grad_norm": 0.43204325437545776, "learning_rate": 1.6702330913259458e-05, "loss": 0.1331, "step": 4374 }, { "epoch": 1.3375114643839805, "grad_norm": 0.38841119408607483, "learning_rate": 1.6706152082537257e-05, "loss": 0.0894, "step": 4375 }, { "epoch": 1.3378171812901254, "grad_norm": 0.28313586115837097, "learning_rate": 1.6709973251815055e-05, "loss": 0.0783, "step": 4376 }, { "epoch": 1.3381228981962703, "grad_norm": 0.2466467022895813, "learning_rate": 1.6713794421092854e-05, "loss": 0.0631, "step": 4377 }, { "epoch": 1.3384286151024152, "grad_norm": 0.2961346507072449, "learning_rate": 1.6717615590370653e-05, "loss": 0.0801, "step": 4378 }, { "epoch": 1.33873433200856, "grad_norm": 0.5181246995925903, "learning_rate": 1.6721436759648452e-05, "loss": 0.1025, "step": 4379 }, { "epoch": 1.339040048914705, "grad_norm": 0.32267847657203674, "learning_rate": 1.672525792892625e-05, "loss": 0.0896, "step": 4380 }, { "epoch": 1.3393457658208499, "grad_norm": 0.35200878977775574, "learning_rate": 1.672907909820405e-05, "loss": 0.0768, "step": 4381 }, { "epoch": 1.3396514827269947, "grad_norm": 0.45594683289527893, "learning_rate": 1.6732900267481848e-05, "loss": 0.1076, "step": 4382 }, { "epoch": 1.3399571996331396, "grad_norm": 0.3693373203277588, "learning_rate": 1.6736721436759647e-05, "loss": 0.0888, "step": 4383 }, { "epoch": 1.3402629165392845, "grad_norm": 0.5591658353805542, "learning_rate": 1.6740542606037445e-05, "loss": 0.1558, "step": 4384 }, { "epoch": 1.3405686334454296, "grad_norm": 0.34734728932380676, "learning_rate": 1.6744363775315244e-05, "loss": 0.1073, "step": 4385 }, { "epoch": 1.3408743503515743, "grad_norm": 0.8550172448158264, "learning_rate": 1.6748184944593043e-05, "loss": 0.1251, "step": 4386 }, { "epoch": 1.3411800672577194, "grad_norm": 0.6336030960083008, "learning_rate": 1.6752006113870845e-05, "loss": 0.1835, "step": 4387 }, { "epoch": 1.3414857841638643, "grad_norm": 0.691135823726654, "learning_rate": 1.6755827283148644e-05, "loss": 0.1934, "step": 4388 }, { "epoch": 1.3417915010700092, "grad_norm": 0.5688754320144653, "learning_rate": 1.6759648452426443e-05, "loss": 0.1989, "step": 4389 }, { "epoch": 1.342097217976154, "grad_norm": 0.7300745844841003, "learning_rate": 1.676346962170424e-05, "loss": 0.2107, "step": 4390 }, { "epoch": 1.342402934882299, "grad_norm": 1.0066086053848267, "learning_rate": 1.676729079098204e-05, "loss": 0.2196, "step": 4391 }, { "epoch": 1.342708651788444, "grad_norm": 0.858579158782959, "learning_rate": 1.677111196025984e-05, "loss": 0.2135, "step": 4392 }, { "epoch": 1.3430143686945888, "grad_norm": 1.2875819206237793, "learning_rate": 1.6774933129537638e-05, "loss": 0.227, "step": 4393 }, { "epoch": 1.3433200856007337, "grad_norm": 0.9651837348937988, "learning_rate": 1.6778754298815436e-05, "loss": 0.2381, "step": 4394 }, { "epoch": 1.3436258025068786, "grad_norm": 1.1214548349380493, "learning_rate": 1.678257546809324e-05, "loss": 0.2323, "step": 4395 }, { "epoch": 1.3439315194130235, "grad_norm": 1.9222310781478882, "learning_rate": 1.6786396637371037e-05, "loss": 0.2987, "step": 4396 }, { "epoch": 1.3442372363191684, "grad_norm": 0.6019227504730225, "learning_rate": 1.6790217806648836e-05, "loss": 0.1928, "step": 4397 }, { "epoch": 1.3445429532253135, "grad_norm": 0.391345739364624, "learning_rate": 1.6794038975926635e-05, "loss": 0.1233, "step": 4398 }, { "epoch": 1.3448486701314581, "grad_norm": 0.3544301688671112, "learning_rate": 1.6797860145204433e-05, "loss": 0.1139, "step": 4399 }, { "epoch": 1.3451543870376033, "grad_norm": 0.3605884313583374, "learning_rate": 1.6801681314482232e-05, "loss": 0.078, "step": 4400 }, { "epoch": 1.3454601039437482, "grad_norm": 0.28776389360427856, "learning_rate": 1.680550248376003e-05, "loss": 0.0518, "step": 4401 }, { "epoch": 1.345765820849893, "grad_norm": 0.35393187403678894, "learning_rate": 1.6809323653037833e-05, "loss": 0.0733, "step": 4402 }, { "epoch": 1.346071537756038, "grad_norm": 0.3042962849140167, "learning_rate": 1.6813144822315632e-05, "loss": 0.0568, "step": 4403 }, { "epoch": 1.3463772546621828, "grad_norm": 0.7327079772949219, "learning_rate": 1.681696599159343e-05, "loss": 0.062, "step": 4404 }, { "epoch": 1.3466829715683277, "grad_norm": 0.5292685627937317, "learning_rate": 1.682078716087123e-05, "loss": 0.1013, "step": 4405 }, { "epoch": 1.3469886884744726, "grad_norm": 0.36294665932655334, "learning_rate": 1.6824608330149028e-05, "loss": 0.0824, "step": 4406 }, { "epoch": 1.3472944053806175, "grad_norm": 0.58321213722229, "learning_rate": 1.6828429499426827e-05, "loss": 0.1137, "step": 4407 }, { "epoch": 1.3476001222867624, "grad_norm": 0.630562424659729, "learning_rate": 1.6832250668704625e-05, "loss": 0.1081, "step": 4408 }, { "epoch": 1.3479058391929073, "grad_norm": 0.5915533900260925, "learning_rate": 1.6836071837982424e-05, "loss": 0.1304, "step": 4409 }, { "epoch": 1.3482115560990522, "grad_norm": 0.38803166151046753, "learning_rate": 1.6839893007260223e-05, "loss": 0.1071, "step": 4410 }, { "epoch": 1.3485172730051973, "grad_norm": 0.6967632174491882, "learning_rate": 1.6843714176538022e-05, "loss": 0.1808, "step": 4411 }, { "epoch": 1.348822989911342, "grad_norm": 0.7023932933807373, "learning_rate": 1.684753534581582e-05, "loss": 0.1448, "step": 4412 }, { "epoch": 1.349128706817487, "grad_norm": 0.7361250519752502, "learning_rate": 1.685135651509362e-05, "loss": 0.198, "step": 4413 }, { "epoch": 1.349434423723632, "grad_norm": 0.7336496710777283, "learning_rate": 1.6855177684371418e-05, "loss": 0.2229, "step": 4414 }, { "epoch": 1.3497401406297769, "grad_norm": 0.7739676237106323, "learning_rate": 1.6858998853649217e-05, "loss": 0.2462, "step": 4415 }, { "epoch": 1.3500458575359218, "grad_norm": 0.8196535110473633, "learning_rate": 1.6862820022927015e-05, "loss": 0.1973, "step": 4416 }, { "epoch": 1.3503515744420667, "grad_norm": 0.8046455979347229, "learning_rate": 1.6866641192204814e-05, "loss": 0.2262, "step": 4417 }, { "epoch": 1.3506572913482116, "grad_norm": 0.9596796631813049, "learning_rate": 1.6870462361482616e-05, "loss": 0.2133, "step": 4418 }, { "epoch": 1.3509630082543564, "grad_norm": 1.9076273441314697, "learning_rate": 1.6874283530760415e-05, "loss": 0.2344, "step": 4419 }, { "epoch": 1.3512687251605013, "grad_norm": 1.0052157640457153, "learning_rate": 1.6878104700038214e-05, "loss": 0.2739, "step": 4420 }, { "epoch": 1.3515744420666462, "grad_norm": 3.3264660835266113, "learning_rate": 1.6881925869316013e-05, "loss": 0.3777, "step": 4421 }, { "epoch": 1.3518801589727911, "grad_norm": 0.707154393196106, "learning_rate": 1.688574703859381e-05, "loss": 0.2261, "step": 4422 }, { "epoch": 1.352185875878936, "grad_norm": 0.38915780186653137, "learning_rate": 1.688956820787161e-05, "loss": 0.1072, "step": 4423 }, { "epoch": 1.3524915927850811, "grad_norm": 0.2976902425289154, "learning_rate": 1.689338937714941e-05, "loss": 0.0793, "step": 4424 }, { "epoch": 1.3527973096912258, "grad_norm": 0.3211630582809448, "learning_rate": 1.6897210546427208e-05, "loss": 0.0823, "step": 4425 }, { "epoch": 1.353103026597371, "grad_norm": 0.29538851976394653, "learning_rate": 1.6901031715705006e-05, "loss": 0.0591, "step": 4426 }, { "epoch": 1.3534087435035158, "grad_norm": 0.4207291901111603, "learning_rate": 1.6904852884982805e-05, "loss": 0.1267, "step": 4427 }, { "epoch": 1.3537144604096607, "grad_norm": 0.4401700496673584, "learning_rate": 1.6908674054260604e-05, "loss": 0.082, "step": 4428 }, { "epoch": 1.3540201773158056, "grad_norm": 0.35835370421409607, "learning_rate": 1.6912495223538403e-05, "loss": 0.1108, "step": 4429 }, { "epoch": 1.3543258942219505, "grad_norm": 0.3787846267223358, "learning_rate": 1.69163163928162e-05, "loss": 0.0956, "step": 4430 }, { "epoch": 1.3546316111280954, "grad_norm": 0.43501025438308716, "learning_rate": 1.6920137562094e-05, "loss": 0.097, "step": 4431 }, { "epoch": 1.3549373280342403, "grad_norm": 0.4480206072330475, "learning_rate": 1.69239587313718e-05, "loss": 0.0895, "step": 4432 }, { "epoch": 1.3552430449403852, "grad_norm": 0.5843552947044373, "learning_rate": 1.69277799006496e-05, "loss": 0.1042, "step": 4433 }, { "epoch": 1.35554876184653, "grad_norm": 0.6743726134300232, "learning_rate": 1.69316010699274e-05, "loss": 0.1248, "step": 4434 }, { "epoch": 1.355854478752675, "grad_norm": 0.44126126170158386, "learning_rate": 1.69354222392052e-05, "loss": 0.1335, "step": 4435 }, { "epoch": 1.3561601956588198, "grad_norm": 0.6063469648361206, "learning_rate": 1.6939243408482997e-05, "loss": 0.1385, "step": 4436 }, { "epoch": 1.356465912564965, "grad_norm": 0.9467818140983582, "learning_rate": 1.6943064577760796e-05, "loss": 0.174, "step": 4437 }, { "epoch": 1.3567716294711096, "grad_norm": 1.2091715335845947, "learning_rate": 1.6946885747038595e-05, "loss": 0.1899, "step": 4438 }, { "epoch": 1.3570773463772547, "grad_norm": 0.9021766781806946, "learning_rate": 1.6950706916316393e-05, "loss": 0.1681, "step": 4439 }, { "epoch": 1.3573830632833996, "grad_norm": 0.7419281601905823, "learning_rate": 1.6954528085594192e-05, "loss": 0.21, "step": 4440 }, { "epoch": 1.3576887801895445, "grad_norm": 0.8600410223007202, "learning_rate": 1.695834925487199e-05, "loss": 0.2379, "step": 4441 }, { "epoch": 1.3579944970956894, "grad_norm": 0.8883064389228821, "learning_rate": 1.696217042414979e-05, "loss": 0.2445, "step": 4442 }, { "epoch": 1.3583002140018343, "grad_norm": 1.5577998161315918, "learning_rate": 1.696599159342759e-05, "loss": 0.2672, "step": 4443 }, { "epoch": 1.3586059309079792, "grad_norm": 1.154030680656433, "learning_rate": 1.6969812762705387e-05, "loss": 0.2536, "step": 4444 }, { "epoch": 1.358911647814124, "grad_norm": 2.069190263748169, "learning_rate": 1.6973633931983186e-05, "loss": 0.3003, "step": 4445 }, { "epoch": 1.359217364720269, "grad_norm": 2.6943845748901367, "learning_rate": 1.6977455101260985e-05, "loss": 0.3121, "step": 4446 }, { "epoch": 1.3595230816264139, "grad_norm": 0.5730251669883728, "learning_rate": 1.6981276270538783e-05, "loss": 0.1571, "step": 4447 }, { "epoch": 1.3598287985325588, "grad_norm": 0.322055459022522, "learning_rate": 1.6985097439816582e-05, "loss": 0.1065, "step": 4448 }, { "epoch": 1.3601345154387037, "grad_norm": 0.28359362483024597, "learning_rate": 1.6988918609094384e-05, "loss": 0.0781, "step": 4449 }, { "epoch": 1.3604402323448488, "grad_norm": 0.3983651399612427, "learning_rate": 1.6992739778372183e-05, "loss": 0.0863, "step": 4450 }, { "epoch": 1.3607459492509935, "grad_norm": 0.3422231674194336, "learning_rate": 1.699656094764998e-05, "loss": 0.0749, "step": 4451 }, { "epoch": 1.3610516661571386, "grad_norm": 0.758406937122345, "learning_rate": 1.700038211692778e-05, "loss": 0.0645, "step": 4452 }, { "epoch": 1.3613573830632835, "grad_norm": 0.5961878895759583, "learning_rate": 1.700420328620558e-05, "loss": 0.1082, "step": 4453 }, { "epoch": 1.3616630999694284, "grad_norm": 0.38581210374832153, "learning_rate": 1.7008024455483378e-05, "loss": 0.0973, "step": 4454 }, { "epoch": 1.3619688168755733, "grad_norm": 0.4693896770477295, "learning_rate": 1.7011845624761177e-05, "loss": 0.0759, "step": 4455 }, { "epoch": 1.3622745337817181, "grad_norm": 0.33509013056755066, "learning_rate": 1.7015666794038975e-05, "loss": 0.0855, "step": 4456 }, { "epoch": 1.362580250687863, "grad_norm": 0.46634557843208313, "learning_rate": 1.7019487963316774e-05, "loss": 0.0997, "step": 4457 }, { "epoch": 1.362885967594008, "grad_norm": 0.5246798396110535, "learning_rate": 1.7023309132594573e-05, "loss": 0.108, "step": 4458 }, { "epoch": 1.3631916845001528, "grad_norm": 0.6033738851547241, "learning_rate": 1.702713030187237e-05, "loss": 0.147, "step": 4459 }, { "epoch": 1.3634974014062977, "grad_norm": 0.4958706796169281, "learning_rate": 1.703095147115017e-05, "loss": 0.1471, "step": 4460 }, { "epoch": 1.3638031183124426, "grad_norm": 0.5945125222206116, "learning_rate": 1.703477264042797e-05, "loss": 0.1512, "step": 4461 }, { "epoch": 1.3641088352185875, "grad_norm": 0.7092231512069702, "learning_rate": 1.7038593809705768e-05, "loss": 0.1971, "step": 4462 }, { "epoch": 1.3644145521247326, "grad_norm": 0.5800310969352722, "learning_rate": 1.7042414978983567e-05, "loss": 0.2439, "step": 4463 }, { "epoch": 1.3647202690308773, "grad_norm": 0.6585589647293091, "learning_rate": 1.704623614826137e-05, "loss": 0.2102, "step": 4464 }, { "epoch": 1.3650259859370224, "grad_norm": 0.6748008131980896, "learning_rate": 1.7050057317539167e-05, "loss": 0.2079, "step": 4465 }, { "epoch": 1.3653317028431673, "grad_norm": 1.525900959968567, "learning_rate": 1.7053878486816966e-05, "loss": 0.2378, "step": 4466 }, { "epoch": 1.3656374197493122, "grad_norm": 0.6209724545478821, "learning_rate": 1.7057699656094765e-05, "loss": 0.2395, "step": 4467 }, { "epoch": 1.365943136655457, "grad_norm": 0.8854621648788452, "learning_rate": 1.7061520825372564e-05, "loss": 0.2267, "step": 4468 }, { "epoch": 1.366248853561602, "grad_norm": 1.2644582986831665, "learning_rate": 1.7065341994650362e-05, "loss": 0.245, "step": 4469 }, { "epoch": 1.3665545704677469, "grad_norm": 1.3350932598114014, "learning_rate": 1.706916316392816e-05, "loss": 0.264, "step": 4470 }, { "epoch": 1.3668602873738918, "grad_norm": 1.8972976207733154, "learning_rate": 1.707298433320596e-05, "loss": 0.3322, "step": 4471 }, { "epoch": 1.3671660042800367, "grad_norm": 0.5126451253890991, "learning_rate": 1.707680550248376e-05, "loss": 0.1604, "step": 4472 }, { "epoch": 1.3674717211861815, "grad_norm": 0.3764224350452423, "learning_rate": 1.708062667176156e-05, "loss": 0.1044, "step": 4473 }, { "epoch": 1.3677774380923264, "grad_norm": 0.6206727027893066, "learning_rate": 1.708444784103936e-05, "loss": 0.1091, "step": 4474 }, { "epoch": 1.3680831549984713, "grad_norm": 0.3296270966529846, "learning_rate": 1.708826901031716e-05, "loss": 0.1055, "step": 4475 }, { "epoch": 1.3683888719046164, "grad_norm": 0.28835615515708923, "learning_rate": 1.7092090179594957e-05, "loss": 0.0706, "step": 4476 }, { "epoch": 1.3686945888107611, "grad_norm": 0.42752593755722046, "learning_rate": 1.7095911348872756e-05, "loss": 0.0792, "step": 4477 }, { "epoch": 1.3690003057169062, "grad_norm": 0.326241672039032, "learning_rate": 1.7099732518150555e-05, "loss": 0.111, "step": 4478 }, { "epoch": 1.3693060226230511, "grad_norm": 0.2647353410720825, "learning_rate": 1.7103553687428353e-05, "loss": 0.0762, "step": 4479 }, { "epoch": 1.369611739529196, "grad_norm": 0.3022434413433075, "learning_rate": 1.7107374856706155e-05, "loss": 0.075, "step": 4480 }, { "epoch": 1.369917456435341, "grad_norm": 0.45376309752464294, "learning_rate": 1.7111196025983954e-05, "loss": 0.1111, "step": 4481 }, { "epoch": 1.3702231733414858, "grad_norm": 0.5149494409561157, "learning_rate": 1.7115017195261753e-05, "loss": 0.1352, "step": 4482 }, { "epoch": 1.3705288902476307, "grad_norm": 0.44391411542892456, "learning_rate": 1.711883836453955e-05, "loss": 0.0881, "step": 4483 }, { "epoch": 1.3708346071537756, "grad_norm": 0.4206424057483673, "learning_rate": 1.712265953381735e-05, "loss": 0.1252, "step": 4484 }, { "epoch": 1.3711403240599205, "grad_norm": 0.5113542079925537, "learning_rate": 1.712648070309515e-05, "loss": 0.156, "step": 4485 }, { "epoch": 1.3714460409660654, "grad_norm": 0.6359466314315796, "learning_rate": 1.7130301872372948e-05, "loss": 0.1666, "step": 4486 }, { "epoch": 1.3717517578722103, "grad_norm": 0.6862449049949646, "learning_rate": 1.7134123041650747e-05, "loss": 0.2201, "step": 4487 }, { "epoch": 1.3720574747783552, "grad_norm": 0.7589373588562012, "learning_rate": 1.7137944210928545e-05, "loss": 0.2378, "step": 4488 }, { "epoch": 1.3723631916845003, "grad_norm": 0.9406682848930359, "learning_rate": 1.7141765380206344e-05, "loss": 0.2526, "step": 4489 }, { "epoch": 1.372668908590645, "grad_norm": 0.7832614779472351, "learning_rate": 1.7145586549484143e-05, "loss": 0.2004, "step": 4490 }, { "epoch": 1.37297462549679, "grad_norm": 0.6688991785049438, "learning_rate": 1.714940771876194e-05, "loss": 0.2332, "step": 4491 }, { "epoch": 1.373280342402935, "grad_norm": 1.2780838012695312, "learning_rate": 1.715322888803974e-05, "loss": 0.2449, "step": 4492 }, { "epoch": 1.3735860593090798, "grad_norm": 0.8404556512832642, "learning_rate": 1.715705005731754e-05, "loss": 0.2598, "step": 4493 }, { "epoch": 1.3738917762152247, "grad_norm": 1.8642058372497559, "learning_rate": 1.7160871226595338e-05, "loss": 0.2305, "step": 4494 }, { "epoch": 1.3741974931213696, "grad_norm": 1.3769936561584473, "learning_rate": 1.716469239587314e-05, "loss": 0.2934, "step": 4495 }, { "epoch": 1.3745032100275145, "grad_norm": 3.490083694458008, "learning_rate": 1.716851356515094e-05, "loss": 0.4218, "step": 4496 }, { "epoch": 1.3748089269336594, "grad_norm": 0.505414605140686, "learning_rate": 1.7172334734428737e-05, "loss": 0.1673, "step": 4497 }, { "epoch": 1.3751146438398043, "grad_norm": 0.4749702513217926, "learning_rate": 1.7176155903706536e-05, "loss": 0.1156, "step": 4498 }, { "epoch": 1.3754203607459492, "grad_norm": 0.3771167993545532, "learning_rate": 1.7179977072984335e-05, "loss": 0.1127, "step": 4499 }, { "epoch": 1.375726077652094, "grad_norm": 0.31737351417541504, "learning_rate": 1.7183798242262134e-05, "loss": 0.0683, "step": 4500 }, { "epoch": 1.376031794558239, "grad_norm": 0.413178414106369, "learning_rate": 1.7187619411539932e-05, "loss": 0.0776, "step": 4501 }, { "epoch": 1.376337511464384, "grad_norm": 0.35414883494377136, "learning_rate": 1.719144058081773e-05, "loss": 0.0648, "step": 4502 }, { "epoch": 1.3766432283705288, "grad_norm": 0.29753410816192627, "learning_rate": 1.719526175009553e-05, "loss": 0.0746, "step": 4503 }, { "epoch": 1.3769489452766739, "grad_norm": 0.4777126610279083, "learning_rate": 1.719908291937333e-05, "loss": 0.0793, "step": 4504 }, { "epoch": 1.3772546621828188, "grad_norm": 0.6301906108856201, "learning_rate": 1.7202904088651127e-05, "loss": 0.124, "step": 4505 }, { "epoch": 1.3775603790889637, "grad_norm": 0.4264582395553589, "learning_rate": 1.7206725257928926e-05, "loss": 0.0906, "step": 4506 }, { "epoch": 1.3778660959951086, "grad_norm": 0.46989935636520386, "learning_rate": 1.7210546427206725e-05, "loss": 0.1455, "step": 4507 }, { "epoch": 1.3781718129012535, "grad_norm": 0.411588191986084, "learning_rate": 1.7214367596484524e-05, "loss": 0.087, "step": 4508 }, { "epoch": 1.3784775298073983, "grad_norm": 0.4286401569843292, "learning_rate": 1.7218188765762322e-05, "loss": 0.139, "step": 4509 }, { "epoch": 1.3787832467135432, "grad_norm": 1.1199426651000977, "learning_rate": 1.722200993504012e-05, "loss": 0.1658, "step": 4510 }, { "epoch": 1.3790889636196881, "grad_norm": 0.6612309217453003, "learning_rate": 1.7225831104317923e-05, "loss": 0.1803, "step": 4511 }, { "epoch": 1.379394680525833, "grad_norm": 0.7267336845397949, "learning_rate": 1.7229652273595722e-05, "loss": 0.175, "step": 4512 }, { "epoch": 1.379700397431978, "grad_norm": 0.6504543423652649, "learning_rate": 1.723347344287352e-05, "loss": 0.2252, "step": 4513 }, { "epoch": 1.3800061143381228, "grad_norm": 0.7008295059204102, "learning_rate": 1.723729461215132e-05, "loss": 0.1889, "step": 4514 }, { "epoch": 1.380311831244268, "grad_norm": 0.6969344615936279, "learning_rate": 1.7241115781429118e-05, "loss": 0.2392, "step": 4515 }, { "epoch": 1.3806175481504126, "grad_norm": 1.403977870941162, "learning_rate": 1.7244936950706917e-05, "loss": 0.2257, "step": 4516 }, { "epoch": 1.3809232650565577, "grad_norm": 1.1869144439697266, "learning_rate": 1.7248758119984716e-05, "loss": 0.2507, "step": 4517 }, { "epoch": 1.3812289819627026, "grad_norm": 0.9607073068618774, "learning_rate": 1.7252579289262515e-05, "loss": 0.2478, "step": 4518 }, { "epoch": 1.3815346988688475, "grad_norm": 1.39920175075531, "learning_rate": 1.7256400458540313e-05, "loss": 0.3048, "step": 4519 }, { "epoch": 1.3818404157749924, "grad_norm": 1.5312728881835938, "learning_rate": 1.7260221627818112e-05, "loss": 0.2951, "step": 4520 }, { "epoch": 1.3821461326811373, "grad_norm": 1.5090358257293701, "learning_rate": 1.726404279709591e-05, "loss": 0.3675, "step": 4521 }, { "epoch": 1.3824518495872822, "grad_norm": 0.4131113588809967, "learning_rate": 1.726786396637371e-05, "loss": 0.1731, "step": 4522 }, { "epoch": 1.382757566493427, "grad_norm": 0.3637554347515106, "learning_rate": 1.7271685135651508e-05, "loss": 0.097, "step": 4523 }, { "epoch": 1.383063283399572, "grad_norm": 0.295906126499176, "learning_rate": 1.7275506304929307e-05, "loss": 0.0656, "step": 4524 }, { "epoch": 1.3833690003057169, "grad_norm": 0.2777252197265625, "learning_rate": 1.7279327474207106e-05, "loss": 0.074, "step": 4525 }, { "epoch": 1.3836747172118617, "grad_norm": 0.2690560817718506, "learning_rate": 1.7283148643484908e-05, "loss": 0.0579, "step": 4526 }, { "epoch": 1.3839804341180066, "grad_norm": 0.29579734802246094, "learning_rate": 1.7286969812762707e-05, "loss": 0.0861, "step": 4527 }, { "epoch": 1.3842861510241518, "grad_norm": 0.3289634585380554, "learning_rate": 1.7290790982040505e-05, "loss": 0.078, "step": 4528 }, { "epoch": 1.3845918679302964, "grad_norm": 0.5497490167617798, "learning_rate": 1.7294612151318304e-05, "loss": 0.0913, "step": 4529 }, { "epoch": 1.3848975848364415, "grad_norm": 0.4605760872364044, "learning_rate": 1.7298433320596103e-05, "loss": 0.1179, "step": 4530 }, { "epoch": 1.3852033017425864, "grad_norm": 0.36811983585357666, "learning_rate": 1.73022544898739e-05, "loss": 0.0847, "step": 4531 }, { "epoch": 1.3855090186487313, "grad_norm": 0.5983685851097107, "learning_rate": 1.73060756591517e-05, "loss": 0.0989, "step": 4532 }, { "epoch": 1.3858147355548762, "grad_norm": 0.3537741005420685, "learning_rate": 1.73098968284295e-05, "loss": 0.1, "step": 4533 }, { "epoch": 1.3861204524610211, "grad_norm": 0.5785964727401733, "learning_rate": 1.7313717997707298e-05, "loss": 0.1364, "step": 4534 }, { "epoch": 1.386426169367166, "grad_norm": 0.44191017746925354, "learning_rate": 1.7317539166985097e-05, "loss": 0.1199, "step": 4535 }, { "epoch": 1.386731886273311, "grad_norm": 1.4841437339782715, "learning_rate": 1.7321360336262895e-05, "loss": 0.1666, "step": 4536 }, { "epoch": 1.3870376031794558, "grad_norm": 0.6062614917755127, "learning_rate": 1.7325181505540694e-05, "loss": 0.13, "step": 4537 }, { "epoch": 1.3873433200856007, "grad_norm": 0.6443777680397034, "learning_rate": 1.7329002674818493e-05, "loss": 0.188, "step": 4538 }, { "epoch": 1.3876490369917456, "grad_norm": 1.0200345516204834, "learning_rate": 1.733282384409629e-05, "loss": 0.2032, "step": 4539 }, { "epoch": 1.3879547538978905, "grad_norm": 0.670369565486908, "learning_rate": 1.733664501337409e-05, "loss": 0.2214, "step": 4540 }, { "epoch": 1.3882604708040356, "grad_norm": 0.9262901544570923, "learning_rate": 1.734046618265189e-05, "loss": 0.25, "step": 4541 }, { "epoch": 1.3885661877101803, "grad_norm": 0.8510546088218689, "learning_rate": 1.734428735192969e-05, "loss": 0.244, "step": 4542 }, { "epoch": 1.3888719046163254, "grad_norm": 0.9853614568710327, "learning_rate": 1.734810852120749e-05, "loss": 0.2303, "step": 4543 }, { "epoch": 1.3891776215224703, "grad_norm": 1.2222201824188232, "learning_rate": 1.735192969048529e-05, "loss": 0.2233, "step": 4544 }, { "epoch": 1.3894833384286152, "grad_norm": 1.5510960817337036, "learning_rate": 1.7355750859763087e-05, "loss": 0.2789, "step": 4545 }, { "epoch": 1.38978905533476, "grad_norm": 2.976111650466919, "learning_rate": 1.7359572029040886e-05, "loss": 0.4102, "step": 4546 }, { "epoch": 1.390094772240905, "grad_norm": 0.3839532732963562, "learning_rate": 1.7363393198318685e-05, "loss": 0.1373, "step": 4547 }, { "epoch": 1.3904004891470498, "grad_norm": 0.3455788791179657, "learning_rate": 1.7367214367596484e-05, "loss": 0.1105, "step": 4548 }, { "epoch": 1.3907062060531947, "grad_norm": 0.3032214045524597, "learning_rate": 1.7371035536874282e-05, "loss": 0.0887, "step": 4549 }, { "epoch": 1.3910119229593396, "grad_norm": 0.28879889845848083, "learning_rate": 1.737485670615208e-05, "loss": 0.0647, "step": 4550 }, { "epoch": 1.3913176398654845, "grad_norm": 0.27505236864089966, "learning_rate": 1.7378677875429883e-05, "loss": 0.0697, "step": 4551 }, { "epoch": 1.3916233567716294, "grad_norm": 0.3688119649887085, "learning_rate": 1.7382499044707682e-05, "loss": 0.0607, "step": 4552 }, { "epoch": 1.3919290736777743, "grad_norm": 0.36010128259658813, "learning_rate": 1.738632021398548e-05, "loss": 0.0752, "step": 4553 }, { "epoch": 1.3922347905839194, "grad_norm": 0.2716071903705597, "learning_rate": 1.739014138326328e-05, "loss": 0.0786, "step": 4554 }, { "epoch": 1.392540507490064, "grad_norm": 0.4262457489967346, "learning_rate": 1.7393962552541078e-05, "loss": 0.0756, "step": 4555 }, { "epoch": 1.3928462243962092, "grad_norm": 0.3398357033729553, "learning_rate": 1.7397783721818877e-05, "loss": 0.0952, "step": 4556 }, { "epoch": 1.393151941302354, "grad_norm": 0.39480340480804443, "learning_rate": 1.740160489109668e-05, "loss": 0.1148, "step": 4557 }, { "epoch": 1.393457658208499, "grad_norm": 0.4079437255859375, "learning_rate": 1.7405426060374478e-05, "loss": 0.0972, "step": 4558 }, { "epoch": 1.3937633751146439, "grad_norm": 0.5739957094192505, "learning_rate": 1.7409247229652277e-05, "loss": 0.1584, "step": 4559 }, { "epoch": 1.3940690920207888, "grad_norm": 0.7562238574028015, "learning_rate": 1.7413068398930075e-05, "loss": 0.171, "step": 4560 }, { "epoch": 1.3943748089269337, "grad_norm": 0.7060211300849915, "learning_rate": 1.7416889568207874e-05, "loss": 0.1811, "step": 4561 }, { "epoch": 1.3946805258330786, "grad_norm": 0.5174992680549622, "learning_rate": 1.7420710737485673e-05, "loss": 0.1897, "step": 4562 }, { "epoch": 1.3949862427392234, "grad_norm": 0.6795030832290649, "learning_rate": 1.742453190676347e-05, "loss": 0.1865, "step": 4563 }, { "epoch": 1.3952919596453683, "grad_norm": 0.6605001091957092, "learning_rate": 1.742835307604127e-05, "loss": 0.2245, "step": 4564 }, { "epoch": 1.3955976765515132, "grad_norm": 0.7283856868743896, "learning_rate": 1.743217424531907e-05, "loss": 0.214, "step": 4565 }, { "epoch": 1.3959033934576581, "grad_norm": 0.6759514808654785, "learning_rate": 1.7435995414596868e-05, "loss": 0.1973, "step": 4566 }, { "epoch": 1.3962091103638032, "grad_norm": 0.7059095501899719, "learning_rate": 1.7439816583874667e-05, "loss": 0.2077, "step": 4567 }, { "epoch": 1.396514827269948, "grad_norm": 1.4016541242599487, "learning_rate": 1.7443637753152465e-05, "loss": 0.2638, "step": 4568 }, { "epoch": 1.396820544176093, "grad_norm": 2.399240016937256, "learning_rate": 1.7447458922430264e-05, "loss": 0.2848, "step": 4569 }, { "epoch": 1.397126261082238, "grad_norm": 1.2946960926055908, "learning_rate": 1.7451280091708063e-05, "loss": 0.2999, "step": 4570 }, { "epoch": 1.3974319779883828, "grad_norm": 1.1499744653701782, "learning_rate": 1.745510126098586e-05, "loss": 0.2943, "step": 4571 }, { "epoch": 1.3977376948945277, "grad_norm": 0.5281968712806702, "learning_rate": 1.745892243026366e-05, "loss": 0.1876, "step": 4572 }, { "epoch": 1.3980434118006726, "grad_norm": 0.32196298241615295, "learning_rate": 1.7462743599541462e-05, "loss": 0.1115, "step": 4573 }, { "epoch": 1.3983491287068175, "grad_norm": 0.3691723048686981, "learning_rate": 1.746656476881926e-05, "loss": 0.0827, "step": 4574 }, { "epoch": 1.3986548456129624, "grad_norm": 0.42651528120040894, "learning_rate": 1.747038593809706e-05, "loss": 0.0702, "step": 4575 }, { "epoch": 1.3989605625191073, "grad_norm": 0.26535725593566895, "learning_rate": 1.747420710737486e-05, "loss": 0.0669, "step": 4576 }, { "epoch": 1.3992662794252522, "grad_norm": 0.440723180770874, "learning_rate": 1.7478028276652657e-05, "loss": 0.0846, "step": 4577 }, { "epoch": 1.399571996331397, "grad_norm": 0.4409097135066986, "learning_rate": 1.7481849445930456e-05, "loss": 0.0765, "step": 4578 }, { "epoch": 1.399877713237542, "grad_norm": 0.3460056781768799, "learning_rate": 1.7485670615208255e-05, "loss": 0.0622, "step": 4579 }, { "epoch": 1.400183430143687, "grad_norm": 0.4511181712150574, "learning_rate": 1.7489491784486054e-05, "loss": 0.115, "step": 4580 }, { "epoch": 1.4004891470498317, "grad_norm": 0.43723994493484497, "learning_rate": 1.7493312953763852e-05, "loss": 0.0699, "step": 4581 }, { "epoch": 1.4007948639559769, "grad_norm": 0.457193523645401, "learning_rate": 1.749713412304165e-05, "loss": 0.1243, "step": 4582 }, { "epoch": 1.4011005808621217, "grad_norm": 0.37365850806236267, "learning_rate": 1.750095529231945e-05, "loss": 0.1121, "step": 4583 }, { "epoch": 1.4014062977682666, "grad_norm": 0.4125983715057373, "learning_rate": 1.750477646159725e-05, "loss": 0.1132, "step": 4584 }, { "epoch": 1.4017120146744115, "grad_norm": 0.4373261034488678, "learning_rate": 1.7508597630875047e-05, "loss": 0.1361, "step": 4585 }, { "epoch": 1.4020177315805564, "grad_norm": 0.6425427198410034, "learning_rate": 1.7512418800152846e-05, "loss": 0.1525, "step": 4586 }, { "epoch": 1.4023234484867013, "grad_norm": 0.815712034702301, "learning_rate": 1.7516239969430645e-05, "loss": 0.1772, "step": 4587 }, { "epoch": 1.4026291653928462, "grad_norm": 0.7891057729721069, "learning_rate": 1.7520061138708447e-05, "loss": 0.1798, "step": 4588 }, { "epoch": 1.402934882298991, "grad_norm": 1.1625258922576904, "learning_rate": 1.7523882307986246e-05, "loss": 0.1885, "step": 4589 }, { "epoch": 1.403240599205136, "grad_norm": 0.6484590172767639, "learning_rate": 1.7527703477264044e-05, "loss": 0.1996, "step": 4590 }, { "epoch": 1.403546316111281, "grad_norm": 0.6809478998184204, "learning_rate": 1.7531524646541843e-05, "loss": 0.188, "step": 4591 }, { "epoch": 1.4038520330174258, "grad_norm": 0.7825261950492859, "learning_rate": 1.7535345815819642e-05, "loss": 0.2532, "step": 4592 }, { "epoch": 1.404157749923571, "grad_norm": 1.576154351234436, "learning_rate": 1.753916698509744e-05, "loss": 0.2186, "step": 4593 }, { "epoch": 1.4044634668297156, "grad_norm": 0.8604038953781128, "learning_rate": 1.754298815437524e-05, "loss": 0.2221, "step": 4594 }, { "epoch": 1.4047691837358607, "grad_norm": 1.469631552696228, "learning_rate": 1.7546809323653038e-05, "loss": 0.2334, "step": 4595 }, { "epoch": 1.4050749006420056, "grad_norm": 8.019762992858887, "learning_rate": 1.7550630492930837e-05, "loss": 0.3739, "step": 4596 }, { "epoch": 1.4053806175481505, "grad_norm": 0.9803339838981628, "learning_rate": 1.7554451662208636e-05, "loss": 0.208, "step": 4597 }, { "epoch": 1.4056863344542954, "grad_norm": 0.3570702373981476, "learning_rate": 1.7558272831486434e-05, "loss": 0.0973, "step": 4598 }, { "epoch": 1.4059920513604403, "grad_norm": 0.35581618547439575, "learning_rate": 1.7562094000764233e-05, "loss": 0.0784, "step": 4599 }, { "epoch": 1.4062977682665851, "grad_norm": 0.4615285396575928, "learning_rate": 1.7565915170042032e-05, "loss": 0.0816, "step": 4600 }, { "epoch": 1.40660348517273, "grad_norm": 0.398980975151062, "learning_rate": 1.756973633931983e-05, "loss": 0.1022, "step": 4601 }, { "epoch": 1.406909202078875, "grad_norm": 0.3214224576950073, "learning_rate": 1.757355750859763e-05, "loss": 0.0799, "step": 4602 }, { "epoch": 1.4072149189850198, "grad_norm": 0.3896554410457611, "learning_rate": 1.7577378677875428e-05, "loss": 0.1071, "step": 4603 }, { "epoch": 1.4075206358911647, "grad_norm": 0.3923465311527252, "learning_rate": 1.758119984715323e-05, "loss": 0.0863, "step": 4604 }, { "epoch": 1.4078263527973096, "grad_norm": 0.3423735201358795, "learning_rate": 1.758502101643103e-05, "loss": 0.0955, "step": 4605 }, { "epoch": 1.4081320697034547, "grad_norm": 0.5946043729782104, "learning_rate": 1.7588842185708828e-05, "loss": 0.0956, "step": 4606 }, { "epoch": 1.4084377866095994, "grad_norm": 0.6458670496940613, "learning_rate": 1.7592663354986626e-05, "loss": 0.1307, "step": 4607 }, { "epoch": 1.4087435035157445, "grad_norm": 0.378852516412735, "learning_rate": 1.7596484524264425e-05, "loss": 0.0964, "step": 4608 }, { "epoch": 1.4090492204218894, "grad_norm": 0.45373937487602234, "learning_rate": 1.7600305693542224e-05, "loss": 0.13, "step": 4609 }, { "epoch": 1.4093549373280343, "grad_norm": 0.4395017921924591, "learning_rate": 1.7604126862820023e-05, "loss": 0.1509, "step": 4610 }, { "epoch": 1.4096606542341792, "grad_norm": 0.6549898982048035, "learning_rate": 1.760794803209782e-05, "loss": 0.1454, "step": 4611 }, { "epoch": 1.409966371140324, "grad_norm": 0.5252149105072021, "learning_rate": 1.761176920137562e-05, "loss": 0.1573, "step": 4612 }, { "epoch": 1.410272088046469, "grad_norm": 0.6094899773597717, "learning_rate": 1.761559037065342e-05, "loss": 0.1736, "step": 4613 }, { "epoch": 1.4105778049526139, "grad_norm": 0.6282908320426941, "learning_rate": 1.7619411539931218e-05, "loss": 0.2005, "step": 4614 }, { "epoch": 1.4108835218587588, "grad_norm": 0.6267551183700562, "learning_rate": 1.7623232709209016e-05, "loss": 0.1767, "step": 4615 }, { "epoch": 1.4111892387649037, "grad_norm": 1.6510655879974365, "learning_rate": 1.7627053878486815e-05, "loss": 0.2059, "step": 4616 }, { "epoch": 1.4114949556710485, "grad_norm": 2.5590100288391113, "learning_rate": 1.7630875047764614e-05, "loss": 0.2425, "step": 4617 }, { "epoch": 1.4118006725771934, "grad_norm": 1.0575021505355835, "learning_rate": 1.7634696217042413e-05, "loss": 0.2349, "step": 4618 }, { "epoch": 1.4121063894833386, "grad_norm": 1.2519627809524536, "learning_rate": 1.7638517386320215e-05, "loss": 0.2194, "step": 4619 }, { "epoch": 1.4124121063894832, "grad_norm": 1.1152992248535156, "learning_rate": 1.7642338555598014e-05, "loss": 0.2794, "step": 4620 }, { "epoch": 1.4127178232956283, "grad_norm": 1.5955861806869507, "learning_rate": 1.7646159724875812e-05, "loss": 0.3279, "step": 4621 }, { "epoch": 1.4130235402017732, "grad_norm": 0.5006139278411865, "learning_rate": 1.764998089415361e-05, "loss": 0.1852, "step": 4622 }, { "epoch": 1.4133292571079181, "grad_norm": 0.4028668701648712, "learning_rate": 1.765380206343141e-05, "loss": 0.1137, "step": 4623 }, { "epoch": 1.413634974014063, "grad_norm": 0.31953856348991394, "learning_rate": 1.765762323270921e-05, "loss": 0.0788, "step": 4624 }, { "epoch": 1.413940690920208, "grad_norm": 1.2448453903198242, "learning_rate": 1.7661444401987007e-05, "loss": 0.1185, "step": 4625 }, { "epoch": 1.4142464078263528, "grad_norm": 0.3105084002017975, "learning_rate": 1.7665265571264806e-05, "loss": 0.0723, "step": 4626 }, { "epoch": 1.4145521247324977, "grad_norm": 0.4628404378890991, "learning_rate": 1.7669086740542605e-05, "loss": 0.0645, "step": 4627 }, { "epoch": 1.4148578416386426, "grad_norm": 0.35880762338638306, "learning_rate": 1.7672907909820404e-05, "loss": 0.0883, "step": 4628 }, { "epoch": 1.4151635585447875, "grad_norm": 0.3016505837440491, "learning_rate": 1.7676729079098206e-05, "loss": 0.0922, "step": 4629 }, { "epoch": 1.4154692754509324, "grad_norm": 0.3509964644908905, "learning_rate": 1.7680550248376004e-05, "loss": 0.085, "step": 4630 }, { "epoch": 1.4157749923570773, "grad_norm": 0.4818178415298462, "learning_rate": 1.7684371417653803e-05, "loss": 0.1224, "step": 4631 }, { "epoch": 1.4160807092632224, "grad_norm": 0.4501022696495056, "learning_rate": 1.7688192586931602e-05, "loss": 0.1358, "step": 4632 }, { "epoch": 1.416386426169367, "grad_norm": 0.3604118824005127, "learning_rate": 1.76920137562094e-05, "loss": 0.0906, "step": 4633 }, { "epoch": 1.4166921430755122, "grad_norm": 0.548575222492218, "learning_rate": 1.76958349254872e-05, "loss": 0.13, "step": 4634 }, { "epoch": 1.416997859981657, "grad_norm": 0.48882654309272766, "learning_rate": 1.7699656094765e-05, "loss": 0.1649, "step": 4635 }, { "epoch": 1.417303576887802, "grad_norm": 0.5676580667495728, "learning_rate": 1.77034772640428e-05, "loss": 0.1688, "step": 4636 }, { "epoch": 1.4176092937939468, "grad_norm": 0.4961376488208771, "learning_rate": 1.77072984333206e-05, "loss": 0.1708, "step": 4637 }, { "epoch": 1.4179150107000917, "grad_norm": 0.663480281829834, "learning_rate": 1.7711119602598398e-05, "loss": 0.2069, "step": 4638 }, { "epoch": 1.4182207276062366, "grad_norm": 2.816117525100708, "learning_rate": 1.7714940771876196e-05, "loss": 0.2417, "step": 4639 }, { "epoch": 1.4185264445123815, "grad_norm": 0.7621625661849976, "learning_rate": 1.7718761941153995e-05, "loss": 0.2328, "step": 4640 }, { "epoch": 1.4188321614185264, "grad_norm": 0.7593342065811157, "learning_rate": 1.7722583110431794e-05, "loss": 0.2016, "step": 4641 }, { "epoch": 1.4191378783246713, "grad_norm": 0.9401252865791321, "learning_rate": 1.7726404279709593e-05, "loss": 0.233, "step": 4642 }, { "epoch": 1.4194435952308162, "grad_norm": 1.0323154926300049, "learning_rate": 1.773022544898739e-05, "loss": 0.2165, "step": 4643 }, { "epoch": 1.419749312136961, "grad_norm": 1.115531086921692, "learning_rate": 1.773404661826519e-05, "loss": 0.2523, "step": 4644 }, { "epoch": 1.4200550290431062, "grad_norm": 1.3900129795074463, "learning_rate": 1.773786778754299e-05, "loss": 0.2828, "step": 4645 }, { "epoch": 1.4203607459492509, "grad_norm": 2.177528142929077, "learning_rate": 1.7741688956820788e-05, "loss": 0.3225, "step": 4646 }, { "epoch": 1.420666462855396, "grad_norm": 0.6194936633110046, "learning_rate": 1.7745510126098586e-05, "loss": 0.1618, "step": 4647 }, { "epoch": 1.4209721797615409, "grad_norm": 0.3996970057487488, "learning_rate": 1.7749331295376385e-05, "loss": 0.1008, "step": 4648 }, { "epoch": 1.4212778966676858, "grad_norm": 0.3539332151412964, "learning_rate": 1.7753152464654184e-05, "loss": 0.0762, "step": 4649 }, { "epoch": 1.4215836135738307, "grad_norm": 0.4072423577308655, "learning_rate": 1.7756973633931983e-05, "loss": 0.0654, "step": 4650 }, { "epoch": 1.4218893304799756, "grad_norm": 0.4960806965827942, "learning_rate": 1.7760794803209785e-05, "loss": 0.0741, "step": 4651 }, { "epoch": 1.4221950473861205, "grad_norm": 0.4044349789619446, "learning_rate": 1.7764615972487584e-05, "loss": 0.0758, "step": 4652 }, { "epoch": 1.4225007642922654, "grad_norm": 0.27864620089530945, "learning_rate": 1.7768437141765382e-05, "loss": 0.0707, "step": 4653 }, { "epoch": 1.4228064811984102, "grad_norm": 0.943559467792511, "learning_rate": 1.777225831104318e-05, "loss": 0.0854, "step": 4654 }, { "epoch": 1.4231121981045551, "grad_norm": 0.37523677945137024, "learning_rate": 1.777607948032098e-05, "loss": 0.1059, "step": 4655 }, { "epoch": 1.4234179150107, "grad_norm": 0.4394600987434387, "learning_rate": 1.777990064959878e-05, "loss": 0.0875, "step": 4656 }, { "epoch": 1.423723631916845, "grad_norm": 0.5006345510482788, "learning_rate": 1.7783721818876577e-05, "loss": 0.1499, "step": 4657 }, { "epoch": 1.42402934882299, "grad_norm": 0.5094952583312988, "learning_rate": 1.7787542988154376e-05, "loss": 0.1131, "step": 4658 }, { "epoch": 1.4243350657291347, "grad_norm": 0.5018875598907471, "learning_rate": 1.7791364157432175e-05, "loss": 0.1011, "step": 4659 }, { "epoch": 1.4246407826352798, "grad_norm": 0.5363076329231262, "learning_rate": 1.7795185326709974e-05, "loss": 0.1365, "step": 4660 }, { "epoch": 1.4249464995414247, "grad_norm": 0.5808166265487671, "learning_rate": 1.7799006495987772e-05, "loss": 0.1398, "step": 4661 }, { "epoch": 1.4252522164475696, "grad_norm": 0.5310324430465698, "learning_rate": 1.780282766526557e-05, "loss": 0.1834, "step": 4662 }, { "epoch": 1.4255579333537145, "grad_norm": 0.6702636480331421, "learning_rate": 1.780664883454337e-05, "loss": 0.2072, "step": 4663 }, { "epoch": 1.4258636502598594, "grad_norm": 0.8907561898231506, "learning_rate": 1.781047000382117e-05, "loss": 0.2287, "step": 4664 }, { "epoch": 1.4261693671660043, "grad_norm": 0.7627171277999878, "learning_rate": 1.7814291173098967e-05, "loss": 0.2369, "step": 4665 }, { "epoch": 1.4264750840721492, "grad_norm": 0.9001628160476685, "learning_rate": 1.781811234237677e-05, "loss": 0.2141, "step": 4666 }, { "epoch": 1.426780800978294, "grad_norm": 0.7132755517959595, "learning_rate": 1.7821933511654568e-05, "loss": 0.2116, "step": 4667 }, { "epoch": 1.427086517884439, "grad_norm": 1.1921051740646362, "learning_rate": 1.7825754680932367e-05, "loss": 0.2468, "step": 4668 }, { "epoch": 1.4273922347905839, "grad_norm": 1.0621750354766846, "learning_rate": 1.7829575850210166e-05, "loss": 0.2893, "step": 4669 }, { "epoch": 1.4276979516967288, "grad_norm": 1.695202112197876, "learning_rate": 1.7833397019487964e-05, "loss": 0.3076, "step": 4670 }, { "epoch": 1.4280036686028739, "grad_norm": 1.5169199705123901, "learning_rate": 1.7837218188765763e-05, "loss": 0.358, "step": 4671 }, { "epoch": 1.4283093855090185, "grad_norm": 0.8725921511650085, "learning_rate": 1.7841039358043562e-05, "loss": 0.2195, "step": 4672 }, { "epoch": 1.4286151024151637, "grad_norm": 0.6261831521987915, "learning_rate": 1.784486052732136e-05, "loss": 0.1047, "step": 4673 }, { "epoch": 1.4289208193213085, "grad_norm": 0.5940900444984436, "learning_rate": 1.784868169659916e-05, "loss": 0.0891, "step": 4674 }, { "epoch": 1.4292265362274534, "grad_norm": 0.46754786372184753, "learning_rate": 1.7852502865876958e-05, "loss": 0.0964, "step": 4675 }, { "epoch": 1.4295322531335983, "grad_norm": 0.4088723957538605, "learning_rate": 1.7856324035154757e-05, "loss": 0.0727, "step": 4676 }, { "epoch": 1.4298379700397432, "grad_norm": 0.32776710391044617, "learning_rate": 1.7860145204432556e-05, "loss": 0.0583, "step": 4677 }, { "epoch": 1.4301436869458881, "grad_norm": 0.3067162334918976, "learning_rate": 1.7863966373710354e-05, "loss": 0.0764, "step": 4678 }, { "epoch": 1.430449403852033, "grad_norm": 0.3914501965045929, "learning_rate": 1.7867787542988153e-05, "loss": 0.0908, "step": 4679 }, { "epoch": 1.430755120758178, "grad_norm": 0.3783588409423828, "learning_rate": 1.7871608712265952e-05, "loss": 0.0916, "step": 4680 }, { "epoch": 1.4310608376643228, "grad_norm": 0.48495280742645264, "learning_rate": 1.7875429881543754e-05, "loss": 0.0917, "step": 4681 }, { "epoch": 1.4313665545704677, "grad_norm": 0.657866895198822, "learning_rate": 1.7879251050821553e-05, "loss": 0.1494, "step": 4682 }, { "epoch": 1.4316722714766126, "grad_norm": 0.4402891993522644, "learning_rate": 1.788307222009935e-05, "loss": 0.1356, "step": 4683 }, { "epoch": 1.4319779883827577, "grad_norm": 0.5953347682952881, "learning_rate": 1.788689338937715e-05, "loss": 0.142, "step": 4684 }, { "epoch": 1.4322837052889024, "grad_norm": 0.5702515840530396, "learning_rate": 1.789071455865495e-05, "loss": 0.1298, "step": 4685 }, { "epoch": 1.4325894221950475, "grad_norm": 0.6593410968780518, "learning_rate": 1.7894535727932748e-05, "loss": 0.1471, "step": 4686 }, { "epoch": 1.4328951391011924, "grad_norm": 0.8355332016944885, "learning_rate": 1.7898356897210546e-05, "loss": 0.2018, "step": 4687 }, { "epoch": 1.4332008560073373, "grad_norm": 0.7185826897621155, "learning_rate": 1.7902178066488345e-05, "loss": 0.1744, "step": 4688 }, { "epoch": 1.4335065729134822, "grad_norm": 1.0748711824417114, "learning_rate": 1.7905999235766144e-05, "loss": 0.1843, "step": 4689 }, { "epoch": 1.433812289819627, "grad_norm": 1.2485425472259521, "learning_rate": 1.7909820405043943e-05, "loss": 0.2015, "step": 4690 }, { "epoch": 1.434118006725772, "grad_norm": 0.7532191872596741, "learning_rate": 1.791364157432174e-05, "loss": 0.2437, "step": 4691 }, { "epoch": 1.4344237236319168, "grad_norm": 1.2150012254714966, "learning_rate": 1.791746274359954e-05, "loss": 0.2123, "step": 4692 }, { "epoch": 1.4347294405380617, "grad_norm": 1.4534136056900024, "learning_rate": 1.792128391287734e-05, "loss": 0.2541, "step": 4693 }, { "epoch": 1.4350351574442066, "grad_norm": 1.3246939182281494, "learning_rate": 1.7925105082155138e-05, "loss": 0.2229, "step": 4694 }, { "epoch": 1.4353408743503515, "grad_norm": 1.219861626625061, "learning_rate": 1.7928926251432936e-05, "loss": 0.2236, "step": 4695 }, { "epoch": 1.4356465912564964, "grad_norm": 2.3420095443725586, "learning_rate": 1.7932747420710735e-05, "loss": 0.3318, "step": 4696 }, { "epoch": 1.4359523081626415, "grad_norm": 0.5687587857246399, "learning_rate": 1.7936568589988537e-05, "loss": 0.1684, "step": 4697 }, { "epoch": 1.4362580250687862, "grad_norm": 0.39435166120529175, "learning_rate": 1.7940389759266336e-05, "loss": 0.0871, "step": 4698 }, { "epoch": 1.4365637419749313, "grad_norm": 0.3659171462059021, "learning_rate": 1.7944210928544135e-05, "loss": 0.115, "step": 4699 }, { "epoch": 1.4368694588810762, "grad_norm": 0.3536880314350128, "learning_rate": 1.7948032097821933e-05, "loss": 0.094, "step": 4700 }, { "epoch": 1.437175175787221, "grad_norm": 0.3956393301486969, "learning_rate": 1.7951853267099732e-05, "loss": 0.08, "step": 4701 }, { "epoch": 1.437480892693366, "grad_norm": 0.2964639365673065, "learning_rate": 1.795567443637753e-05, "loss": 0.0918, "step": 4702 }, { "epoch": 1.4377866095995109, "grad_norm": 0.31973251700401306, "learning_rate": 1.795949560565533e-05, "loss": 0.0704, "step": 4703 }, { "epoch": 1.4380923265056558, "grad_norm": 0.3275454044342041, "learning_rate": 1.796331677493313e-05, "loss": 0.0732, "step": 4704 }, { "epoch": 1.4383980434118007, "grad_norm": 0.3812078535556793, "learning_rate": 1.7967137944210927e-05, "loss": 0.113, "step": 4705 }, { "epoch": 1.4387037603179456, "grad_norm": 0.35720640420913696, "learning_rate": 1.797095911348873e-05, "loss": 0.095, "step": 4706 }, { "epoch": 1.4390094772240904, "grad_norm": 0.49700450897216797, "learning_rate": 1.7974780282766528e-05, "loss": 0.1391, "step": 4707 }, { "epoch": 1.4393151941302353, "grad_norm": 1.0787618160247803, "learning_rate": 1.7978601452044327e-05, "loss": 0.0969, "step": 4708 }, { "epoch": 1.4396209110363802, "grad_norm": 0.7504636645317078, "learning_rate": 1.7982422621322126e-05, "loss": 0.1372, "step": 4709 }, { "epoch": 1.4399266279425254, "grad_norm": 0.49464157223701477, "learning_rate": 1.7986243790599924e-05, "loss": 0.1291, "step": 4710 }, { "epoch": 1.44023234484867, "grad_norm": 0.7107617259025574, "learning_rate": 1.7990064959877723e-05, "loss": 0.1667, "step": 4711 }, { "epoch": 1.4405380617548151, "grad_norm": 0.5649733543395996, "learning_rate": 1.7993886129155522e-05, "loss": 0.1949, "step": 4712 }, { "epoch": 1.44084377866096, "grad_norm": 1.209567666053772, "learning_rate": 1.7997707298433324e-05, "loss": 0.1592, "step": 4713 }, { "epoch": 1.441149495567105, "grad_norm": 0.6078881621360779, "learning_rate": 1.8001528467711123e-05, "loss": 0.2143, "step": 4714 }, { "epoch": 1.4414552124732498, "grad_norm": 0.8542225360870361, "learning_rate": 1.800534963698892e-05, "loss": 0.2109, "step": 4715 }, { "epoch": 1.4417609293793947, "grad_norm": 0.8394019603729248, "learning_rate": 1.800917080626672e-05, "loss": 0.217, "step": 4716 }, { "epoch": 1.4420666462855396, "grad_norm": 0.7614290714263916, "learning_rate": 1.801299197554452e-05, "loss": 0.235, "step": 4717 }, { "epoch": 1.4423723631916845, "grad_norm": 1.1122053861618042, "learning_rate": 1.8016813144822318e-05, "loss": 0.2569, "step": 4718 }, { "epoch": 1.4426780800978294, "grad_norm": 0.9465113878250122, "learning_rate": 1.8020634314100116e-05, "loss": 0.2441, "step": 4719 }, { "epoch": 1.4429837970039743, "grad_norm": 1.1661317348480225, "learning_rate": 1.8024455483377915e-05, "loss": 0.2438, "step": 4720 }, { "epoch": 1.4432895139101192, "grad_norm": 1.5605053901672363, "learning_rate": 1.8028276652655714e-05, "loss": 0.3453, "step": 4721 }, { "epoch": 1.443595230816264, "grad_norm": 0.5038012266159058, "learning_rate": 1.8032097821933513e-05, "loss": 0.1726, "step": 4722 }, { "epoch": 1.4439009477224092, "grad_norm": 0.3254985809326172, "learning_rate": 1.803591899121131e-05, "loss": 0.0934, "step": 4723 }, { "epoch": 1.4442066646285538, "grad_norm": 0.7147448062896729, "learning_rate": 1.803974016048911e-05, "loss": 0.0847, "step": 4724 }, { "epoch": 1.444512381534699, "grad_norm": 0.45063379406929016, "learning_rate": 1.804356132976691e-05, "loss": 0.0957, "step": 4725 }, { "epoch": 1.4448180984408439, "grad_norm": 0.3793570399284363, "learning_rate": 1.8047382499044708e-05, "loss": 0.0665, "step": 4726 }, { "epoch": 1.4451238153469887, "grad_norm": 0.5148293972015381, "learning_rate": 1.8051203668322506e-05, "loss": 0.0714, "step": 4727 }, { "epoch": 1.4454295322531336, "grad_norm": 0.41508421301841736, "learning_rate": 1.805502483760031e-05, "loss": 0.0756, "step": 4728 }, { "epoch": 1.4457352491592785, "grad_norm": 0.38273510336875916, "learning_rate": 1.8058846006878107e-05, "loss": 0.0829, "step": 4729 }, { "epoch": 1.4460409660654234, "grad_norm": 0.43832889199256897, "learning_rate": 1.8062667176155906e-05, "loss": 0.0997, "step": 4730 }, { "epoch": 1.4463466829715683, "grad_norm": 0.412302166223526, "learning_rate": 1.8066488345433705e-05, "loss": 0.0943, "step": 4731 }, { "epoch": 1.4466523998777132, "grad_norm": 0.5135859251022339, "learning_rate": 1.8070309514711503e-05, "loss": 0.1339, "step": 4732 }, { "epoch": 1.446958116783858, "grad_norm": 0.47645649313926697, "learning_rate": 1.8074130683989302e-05, "loss": 0.1025, "step": 4733 }, { "epoch": 1.447263833690003, "grad_norm": 0.3564198911190033, "learning_rate": 1.80779518532671e-05, "loss": 0.1321, "step": 4734 }, { "epoch": 1.447569550596148, "grad_norm": 0.6105384230613708, "learning_rate": 1.80817730225449e-05, "loss": 0.1807, "step": 4735 }, { "epoch": 1.447875267502293, "grad_norm": 0.7059176564216614, "learning_rate": 1.80855941918227e-05, "loss": 0.1843, "step": 4736 }, { "epoch": 1.4481809844084377, "grad_norm": 0.8438656330108643, "learning_rate": 1.8089415361100497e-05, "loss": 0.1982, "step": 4737 }, { "epoch": 1.4484867013145828, "grad_norm": 0.6564850807189941, "learning_rate": 1.8093236530378296e-05, "loss": 0.1955, "step": 4738 }, { "epoch": 1.4487924182207277, "grad_norm": 0.801173210144043, "learning_rate": 1.8097057699656095e-05, "loss": 0.2411, "step": 4739 }, { "epoch": 1.4490981351268726, "grad_norm": 0.5952441692352295, "learning_rate": 1.8100878868933893e-05, "loss": 0.1978, "step": 4740 }, { "epoch": 1.4494038520330175, "grad_norm": 1.0777703523635864, "learning_rate": 1.8104700038211692e-05, "loss": 0.2441, "step": 4741 }, { "epoch": 1.4497095689391624, "grad_norm": 1.1632254123687744, "learning_rate": 1.810852120748949e-05, "loss": 0.1968, "step": 4742 }, { "epoch": 1.4500152858453073, "grad_norm": 1.1337885856628418, "learning_rate": 1.811234237676729e-05, "loss": 0.2237, "step": 4743 }, { "epoch": 1.4503210027514521, "grad_norm": 0.9987307786941528, "learning_rate": 1.8116163546045092e-05, "loss": 0.2197, "step": 4744 }, { "epoch": 1.450626719657597, "grad_norm": 1.3166805505752563, "learning_rate": 1.811998471532289e-05, "loss": 0.2723, "step": 4745 }, { "epoch": 1.450932436563742, "grad_norm": 1.9564414024353027, "learning_rate": 1.812380588460069e-05, "loss": 0.3209, "step": 4746 }, { "epoch": 1.4512381534698868, "grad_norm": 0.4108181297779083, "learning_rate": 1.8127627053878488e-05, "loss": 0.1745, "step": 4747 }, { "epoch": 1.4515438703760317, "grad_norm": 0.3787122070789337, "learning_rate": 1.8131448223156287e-05, "loss": 0.1146, "step": 4748 }, { "epoch": 1.4518495872821768, "grad_norm": 0.5030591487884521, "learning_rate": 1.8135269392434086e-05, "loss": 0.1064, "step": 4749 }, { "epoch": 1.4521553041883215, "grad_norm": 0.31488728523254395, "learning_rate": 1.8139090561711884e-05, "loss": 0.0821, "step": 4750 }, { "epoch": 1.4524610210944666, "grad_norm": 0.9717572331428528, "learning_rate": 1.8142911730989683e-05, "loss": 0.0653, "step": 4751 }, { "epoch": 1.4527667380006115, "grad_norm": 0.3432055711746216, "learning_rate": 1.8146732900267482e-05, "loss": 0.0735, "step": 4752 }, { "epoch": 1.4530724549067564, "grad_norm": 0.2905144989490509, "learning_rate": 1.815055406954528e-05, "loss": 0.0558, "step": 4753 }, { "epoch": 1.4533781718129013, "grad_norm": 0.3459246754646301, "learning_rate": 1.815437523882308e-05, "loss": 0.0862, "step": 4754 }, { "epoch": 1.4536838887190462, "grad_norm": 0.432081013917923, "learning_rate": 1.8158196408100878e-05, "loss": 0.0783, "step": 4755 }, { "epoch": 1.453989605625191, "grad_norm": 0.4055587649345398, "learning_rate": 1.8162017577378677e-05, "loss": 0.0804, "step": 4756 }, { "epoch": 1.454295322531336, "grad_norm": 0.5259325504302979, "learning_rate": 1.8165838746656475e-05, "loss": 0.1419, "step": 4757 }, { "epoch": 1.4546010394374809, "grad_norm": 0.5198450684547424, "learning_rate": 1.8169659915934274e-05, "loss": 0.1311, "step": 4758 }, { "epoch": 1.4549067563436258, "grad_norm": 0.5156076550483704, "learning_rate": 1.8173481085212076e-05, "loss": 0.1033, "step": 4759 }, { "epoch": 1.4552124732497707, "grad_norm": 1.8685141801834106, "learning_rate": 1.8177302254489875e-05, "loss": 0.1362, "step": 4760 }, { "epoch": 1.4555181901559155, "grad_norm": 0.7285293936729431, "learning_rate": 1.8181123423767674e-05, "loss": 0.1705, "step": 4761 }, { "epoch": 1.4558239070620607, "grad_norm": 0.6753525137901306, "learning_rate": 1.8184944593045473e-05, "loss": 0.1747, "step": 4762 }, { "epoch": 1.4561296239682053, "grad_norm": 0.7339290380477905, "learning_rate": 1.818876576232327e-05, "loss": 0.1902, "step": 4763 }, { "epoch": 1.4564353408743504, "grad_norm": 0.7651944160461426, "learning_rate": 1.819258693160107e-05, "loss": 0.2124, "step": 4764 }, { "epoch": 1.4567410577804953, "grad_norm": 0.7182425856590271, "learning_rate": 1.819640810087887e-05, "loss": 0.2082, "step": 4765 }, { "epoch": 1.4570467746866402, "grad_norm": 0.9341498017311096, "learning_rate": 1.8200229270156668e-05, "loss": 0.2436, "step": 4766 }, { "epoch": 1.4573524915927851, "grad_norm": 1.0129387378692627, "learning_rate": 1.8204050439434466e-05, "loss": 0.2253, "step": 4767 }, { "epoch": 1.45765820849893, "grad_norm": 1.851635456085205, "learning_rate": 1.8207871608712265e-05, "loss": 0.2394, "step": 4768 }, { "epoch": 1.457963925405075, "grad_norm": NaN, "learning_rate": 1.8207871608712265e-05, "loss": 0.2607, "step": 4769 }, { "epoch": 1.4582696423112198, "grad_norm": 1.373154878616333, "learning_rate": 1.8211692777990064e-05, "loss": 0.2412, "step": 4770 }, { "epoch": 1.4585753592173647, "grad_norm": 1.4282996654510498, "learning_rate": 1.8215513947267863e-05, "loss": 0.3537, "step": 4771 }, { "epoch": 1.4588810761235096, "grad_norm": 0.5283370018005371, "learning_rate": 1.821933511654566e-05, "loss": 0.1909, "step": 4772 }, { "epoch": 1.4591867930296545, "grad_norm": 0.28918400406837463, "learning_rate": 1.822315628582346e-05, "loss": 0.1045, "step": 4773 }, { "epoch": 1.4594925099357994, "grad_norm": 0.4846096634864807, "learning_rate": 1.822697745510126e-05, "loss": 0.1091, "step": 4774 }, { "epoch": 1.4597982268419445, "grad_norm": 0.45258641242980957, "learning_rate": 1.8230798624379058e-05, "loss": 0.0838, "step": 4775 }, { "epoch": 1.4601039437480892, "grad_norm": 0.2858901619911194, "learning_rate": 1.823461979365686e-05, "loss": 0.0681, "step": 4776 }, { "epoch": 1.4604096606542343, "grad_norm": 0.3028877377510071, "learning_rate": 1.823844096293466e-05, "loss": 0.0838, "step": 4777 }, { "epoch": 1.4607153775603792, "grad_norm": 0.34226107597351074, "learning_rate": 1.8242262132212457e-05, "loss": 0.0814, "step": 4778 }, { "epoch": 1.461021094466524, "grad_norm": 0.31792035698890686, "learning_rate": 1.8246083301490256e-05, "loss": 0.082, "step": 4779 }, { "epoch": 1.461326811372669, "grad_norm": 0.2835521996021271, "learning_rate": 1.8249904470768055e-05, "loss": 0.0699, "step": 4780 }, { "epoch": 1.4616325282788138, "grad_norm": 0.38287580013275146, "learning_rate": 1.8253725640045853e-05, "loss": 0.0779, "step": 4781 }, { "epoch": 1.4619382451849587, "grad_norm": 0.8683164715766907, "learning_rate": 1.8257546809323652e-05, "loss": 0.1196, "step": 4782 }, { "epoch": 1.4622439620911036, "grad_norm": 0.4105708599090576, "learning_rate": 1.826136797860145e-05, "loss": 0.0922, "step": 4783 }, { "epoch": 1.4625496789972485, "grad_norm": 0.33246150612831116, "learning_rate": 1.826518914787925e-05, "loss": 0.1069, "step": 4784 }, { "epoch": 1.4628553959033934, "grad_norm": 0.40324217081069946, "learning_rate": 1.8269010317157052e-05, "loss": 0.0926, "step": 4785 }, { "epoch": 1.4631611128095383, "grad_norm": 0.5590804815292358, "learning_rate": 1.827283148643485e-05, "loss": 0.1634, "step": 4786 }, { "epoch": 1.4634668297156832, "grad_norm": 1.5065131187438965, "learning_rate": 1.827665265571265e-05, "loss": 0.1556, "step": 4787 }, { "epoch": 1.4637725466218283, "grad_norm": 0.7673102021217346, "learning_rate": 1.8280473824990448e-05, "loss": 0.2292, "step": 4788 }, { "epoch": 1.464078263527973, "grad_norm": 0.8140377998352051, "learning_rate": 1.8284294994268247e-05, "loss": 0.2352, "step": 4789 }, { "epoch": 1.464383980434118, "grad_norm": 0.7831479907035828, "learning_rate": 1.8288116163546045e-05, "loss": 0.2242, "step": 4790 }, { "epoch": 1.464689697340263, "grad_norm": 1.2605187892913818, "learning_rate": 1.8291937332823848e-05, "loss": 0.216, "step": 4791 }, { "epoch": 1.464995414246408, "grad_norm": 0.731024980545044, "learning_rate": 1.8295758502101646e-05, "loss": 0.2336, "step": 4792 }, { "epoch": 1.4653011311525528, "grad_norm": 0.991027295589447, "learning_rate": 1.8299579671379445e-05, "loss": 0.2442, "step": 4793 }, { "epoch": 1.4656068480586977, "grad_norm": 1.166697382926941, "learning_rate": 1.8303400840657244e-05, "loss": 0.277, "step": 4794 }, { "epoch": 1.4659125649648426, "grad_norm": 1.496653437614441, "learning_rate": 1.8307222009935043e-05, "loss": 0.2962, "step": 4795 }, { "epoch": 1.4662182818709875, "grad_norm": 1.649477243423462, "learning_rate": 1.831104317921284e-05, "loss": 0.3094, "step": 4796 }, { "epoch": 1.4665239987771324, "grad_norm": 1.0101054906845093, "learning_rate": 1.831486434849064e-05, "loss": 0.1758, "step": 4797 }, { "epoch": 1.4668297156832772, "grad_norm": 0.37762030959129333, "learning_rate": 1.831868551776844e-05, "loss": 0.1093, "step": 4798 }, { "epoch": 1.4671354325894221, "grad_norm": 0.34446755051612854, "learning_rate": 1.8322506687046238e-05, "loss": 0.0825, "step": 4799 }, { "epoch": 1.467441149495567, "grad_norm": 0.2805548906326294, "learning_rate": 1.8326327856324036e-05, "loss": 0.0668, "step": 4800 }, { "epoch": 1.4677468664017121, "grad_norm": 0.4186827540397644, "learning_rate": 1.8330149025601835e-05, "loss": 0.0824, "step": 4801 }, { "epoch": 1.4680525833078568, "grad_norm": 0.480983704328537, "learning_rate": 1.8333970194879634e-05, "loss": 0.1105, "step": 4802 }, { "epoch": 1.468358300214002, "grad_norm": 0.36676761507987976, "learning_rate": 1.8337791364157433e-05, "loss": 0.0829, "step": 4803 }, { "epoch": 1.4686640171201468, "grad_norm": 0.34841346740722656, "learning_rate": 1.834161253343523e-05, "loss": 0.0773, "step": 4804 }, { "epoch": 1.4689697340262917, "grad_norm": 0.4758620262145996, "learning_rate": 1.834543370271303e-05, "loss": 0.1017, "step": 4805 }, { "epoch": 1.4692754509324366, "grad_norm": 0.3121950030326843, "learning_rate": 1.834925487199083e-05, "loss": 0.0639, "step": 4806 }, { "epoch": 1.4695811678385815, "grad_norm": 0.7359133362770081, "learning_rate": 1.835307604126863e-05, "loss": 0.1289, "step": 4807 }, { "epoch": 1.4698868847447264, "grad_norm": 0.48026543855667114, "learning_rate": 1.835689721054643e-05, "loss": 0.132, "step": 4808 }, { "epoch": 1.4701926016508713, "grad_norm": 0.49495434761047363, "learning_rate": 1.836071837982423e-05, "loss": 0.1255, "step": 4809 }, { "epoch": 1.4704983185570162, "grad_norm": 0.7765718102455139, "learning_rate": 1.8364539549102027e-05, "loss": 0.1694, "step": 4810 }, { "epoch": 1.470804035463161, "grad_norm": 0.7124277949333191, "learning_rate": 1.8368360718379826e-05, "loss": 0.1499, "step": 4811 }, { "epoch": 1.471109752369306, "grad_norm": 0.8905240893363953, "learning_rate": 1.8372181887657625e-05, "loss": 0.1566, "step": 4812 }, { "epoch": 1.4714154692754509, "grad_norm": 1.144364833831787, "learning_rate": 1.8376003056935423e-05, "loss": 0.167, "step": 4813 }, { "epoch": 1.471721186181596, "grad_norm": 0.9015721082687378, "learning_rate": 1.8379824226213222e-05, "loss": 0.2014, "step": 4814 }, { "epoch": 1.4720269030877406, "grad_norm": 1.0717793703079224, "learning_rate": 1.838364539549102e-05, "loss": 0.2282, "step": 4815 }, { "epoch": 1.4723326199938858, "grad_norm": 1.016456127166748, "learning_rate": 1.838746656476882e-05, "loss": 0.226, "step": 4816 }, { "epoch": 1.4726383369000307, "grad_norm": 1.0589911937713623, "learning_rate": 1.839128773404662e-05, "loss": 0.2253, "step": 4817 }, { "epoch": 1.4729440538061755, "grad_norm": 1.33281409740448, "learning_rate": 1.8395108903324417e-05, "loss": 0.2789, "step": 4818 }, { "epoch": 1.4732497707123204, "grad_norm": 1.4702435731887817, "learning_rate": 1.8398930072602216e-05, "loss": 0.2056, "step": 4819 }, { "epoch": 1.4735554876184653, "grad_norm": 1.7266262769699097, "learning_rate": 1.8402751241880015e-05, "loss": 0.3096, "step": 4820 }, { "epoch": 1.4738612045246102, "grad_norm": 1.950448751449585, "learning_rate": 1.8406572411157813e-05, "loss": 0.3734, "step": 4821 }, { "epoch": 1.4741669214307551, "grad_norm": 0.45004546642303467, "learning_rate": 1.8410393580435615e-05, "loss": 0.1796, "step": 4822 }, { "epoch": 1.4744726383369, "grad_norm": 0.3101680278778076, "learning_rate": 1.8414214749713414e-05, "loss": 0.1156, "step": 4823 }, { "epoch": 1.474778355243045, "grad_norm": 0.4812849164009094, "learning_rate": 1.8418035918991213e-05, "loss": 0.1056, "step": 4824 }, { "epoch": 1.4750840721491898, "grad_norm": 0.2626362442970276, "learning_rate": 1.842185708826901e-05, "loss": 0.0793, "step": 4825 }, { "epoch": 1.4753897890553347, "grad_norm": 0.31946036219596863, "learning_rate": 1.842567825754681e-05, "loss": 0.0842, "step": 4826 }, { "epoch": 1.4756955059614798, "grad_norm": 0.44712305068969727, "learning_rate": 1.842949942682461e-05, "loss": 0.0657, "step": 4827 }, { "epoch": 1.4760012228676245, "grad_norm": 0.35728663206100464, "learning_rate": 1.8433320596102408e-05, "loss": 0.0795, "step": 4828 }, { "epoch": 1.4763069397737696, "grad_norm": 0.3156641125679016, "learning_rate": 1.8437141765380207e-05, "loss": 0.0612, "step": 4829 }, { "epoch": 1.4766126566799145, "grad_norm": 0.44164153933525085, "learning_rate": 1.8440962934658005e-05, "loss": 0.1066, "step": 4830 }, { "epoch": 1.4769183735860594, "grad_norm": 0.24689769744873047, "learning_rate": 1.8444784103935804e-05, "loss": 0.0611, "step": 4831 }, { "epoch": 1.4772240904922043, "grad_norm": 0.7551746368408203, "learning_rate": 1.8448605273213603e-05, "loss": 0.1038, "step": 4832 }, { "epoch": 1.4775298073983492, "grad_norm": 0.6952165961265564, "learning_rate": 1.84524264424914e-05, "loss": 0.1019, "step": 4833 }, { "epoch": 1.477835524304494, "grad_norm": 0.5390605330467224, "learning_rate": 1.84562476117692e-05, "loss": 0.1512, "step": 4834 }, { "epoch": 1.478141241210639, "grad_norm": 0.5842426419258118, "learning_rate": 1.8460068781047e-05, "loss": 0.1599, "step": 4835 }, { "epoch": 1.4784469581167838, "grad_norm": 0.5541746020317078, "learning_rate": 1.8463889950324798e-05, "loss": 0.1371, "step": 4836 }, { "epoch": 1.4787526750229287, "grad_norm": 0.545012354850769, "learning_rate": 1.8467711119602597e-05, "loss": 0.1831, "step": 4837 }, { "epoch": 1.4790583919290736, "grad_norm": 0.6738123893737793, "learning_rate": 1.84715322888804e-05, "loss": 0.2117, "step": 4838 }, { "epoch": 1.4793641088352185, "grad_norm": 0.83856600522995, "learning_rate": 1.8475353458158197e-05, "loss": 0.1918, "step": 4839 }, { "epoch": 1.4796698257413636, "grad_norm": 0.7526463866233826, "learning_rate": 1.8479174627435996e-05, "loss": 0.2347, "step": 4840 }, { "epoch": 1.4799755426475083, "grad_norm": 0.9721493124961853, "learning_rate": 1.8482995796713795e-05, "loss": 0.2277, "step": 4841 }, { "epoch": 1.4802812595536534, "grad_norm": 1.1178644895553589, "learning_rate": 1.8486816965991594e-05, "loss": 0.2383, "step": 4842 }, { "epoch": 1.4805869764597983, "grad_norm": 0.938744843006134, "learning_rate": 1.8490638135269392e-05, "loss": 0.2323, "step": 4843 }, { "epoch": 1.4808926933659432, "grad_norm": 1.1528502702713013, "learning_rate": 1.849445930454719e-05, "loss": 0.26, "step": 4844 }, { "epoch": 1.481198410272088, "grad_norm": 1.1671078205108643, "learning_rate": 1.849828047382499e-05, "loss": 0.2935, "step": 4845 }, { "epoch": 1.481504127178233, "grad_norm": 1.6182255744934082, "learning_rate": 1.850210164310279e-05, "loss": 0.3417, "step": 4846 }, { "epoch": 1.4818098440843779, "grad_norm": 0.9071745872497559, "learning_rate": 1.8505922812380587e-05, "loss": 0.1758, "step": 4847 }, { "epoch": 1.4821155609905228, "grad_norm": 0.3342526853084564, "learning_rate": 1.8509743981658386e-05, "loss": 0.1076, "step": 4848 }, { "epoch": 1.4824212778966677, "grad_norm": 0.6939647197723389, "learning_rate": 1.8513565150936185e-05, "loss": 0.0976, "step": 4849 }, { "epoch": 1.4827269948028126, "grad_norm": 0.2860855162143707, "learning_rate": 1.8517386320213984e-05, "loss": 0.0742, "step": 4850 }, { "epoch": 1.4830327117089575, "grad_norm": 0.29412582516670227, "learning_rate": 1.8521207489491782e-05, "loss": 0.0756, "step": 4851 }, { "epoch": 1.4833384286151023, "grad_norm": 0.6230106949806213, "learning_rate": 1.852502865876958e-05, "loss": 0.0903, "step": 4852 }, { "epoch": 1.4836441455212475, "grad_norm": 0.41926309466362, "learning_rate": 1.8528849828047383e-05, "loss": 0.0782, "step": 4853 }, { "epoch": 1.4839498624273921, "grad_norm": 0.3342480957508087, "learning_rate": 1.8532670997325182e-05, "loss": 0.0847, "step": 4854 }, { "epoch": 1.4842555793335372, "grad_norm": 0.4860488772392273, "learning_rate": 1.853649216660298e-05, "loss": 0.1146, "step": 4855 }, { "epoch": 1.484561296239682, "grad_norm": 0.3631926476955414, "learning_rate": 1.854031333588078e-05, "loss": 0.1051, "step": 4856 }, { "epoch": 1.484867013145827, "grad_norm": 0.8193742632865906, "learning_rate": 1.8544134505158578e-05, "loss": 0.1064, "step": 4857 }, { "epoch": 1.485172730051972, "grad_norm": 0.42979657649993896, "learning_rate": 1.8547955674436377e-05, "loss": 0.0783, "step": 4858 }, { "epoch": 1.4854784469581168, "grad_norm": 0.9498456716537476, "learning_rate": 1.8551776843714176e-05, "loss": 0.1276, "step": 4859 }, { "epoch": 1.4857841638642617, "grad_norm": 0.4821285605430603, "learning_rate": 1.8555598012991975e-05, "loss": 0.1806, "step": 4860 }, { "epoch": 1.4860898807704066, "grad_norm": 0.4794308841228485, "learning_rate": 1.8559419182269773e-05, "loss": 0.1412, "step": 4861 }, { "epoch": 1.4863955976765515, "grad_norm": 1.2683507204055786, "learning_rate": 1.8563240351547572e-05, "loss": 0.1881, "step": 4862 }, { "epoch": 1.4867013145826964, "grad_norm": 0.6553133726119995, "learning_rate": 1.8567061520825374e-05, "loss": 0.1872, "step": 4863 }, { "epoch": 1.4870070314888413, "grad_norm": 0.8719310164451599, "learning_rate": 1.8570882690103173e-05, "loss": 0.2009, "step": 4864 }, { "epoch": 1.4873127483949862, "grad_norm": 0.8890965580940247, "learning_rate": 1.857470385938097e-05, "loss": 0.2267, "step": 4865 }, { "epoch": 1.4876184653011313, "grad_norm": 0.6508729457855225, "learning_rate": 1.857852502865877e-05, "loss": 0.2092, "step": 4866 }, { "epoch": 1.487924182207276, "grad_norm": 0.8981496691703796, "learning_rate": 1.858234619793657e-05, "loss": 0.2112, "step": 4867 }, { "epoch": 1.488229899113421, "grad_norm": 0.8823046088218689, "learning_rate": 1.8586167367214368e-05, "loss": 0.2379, "step": 4868 }, { "epoch": 1.4885356160195657, "grad_norm": 0.9172952771186829, "learning_rate": 1.858998853649217e-05, "loss": 0.2357, "step": 4869 }, { "epoch": 1.4888413329257109, "grad_norm": 0.9687058329582214, "learning_rate": 1.859380970576997e-05, "loss": 0.2475, "step": 4870 }, { "epoch": 1.4891470498318558, "grad_norm": 2.1104469299316406, "learning_rate": 1.8597630875047767e-05, "loss": 0.3003, "step": 4871 }, { "epoch": 1.4894527667380006, "grad_norm": 0.435661256313324, "learning_rate": 1.8601452044325566e-05, "loss": 0.1953, "step": 4872 }, { "epoch": 1.4897584836441455, "grad_norm": 0.44318875670433044, "learning_rate": 1.8605273213603365e-05, "loss": 0.119, "step": 4873 }, { "epoch": 1.4900642005502904, "grad_norm": 0.3693072497844696, "learning_rate": 1.8609094382881164e-05, "loss": 0.0941, "step": 4874 }, { "epoch": 1.4903699174564353, "grad_norm": 0.267522931098938, "learning_rate": 1.8612915552158962e-05, "loss": 0.079, "step": 4875 }, { "epoch": 1.4906756343625802, "grad_norm": 0.351070761680603, "learning_rate": 1.861673672143676e-05, "loss": 0.0745, "step": 4876 }, { "epoch": 1.490981351268725, "grad_norm": 0.3657132089138031, "learning_rate": 1.862055789071456e-05, "loss": 0.0758, "step": 4877 }, { "epoch": 1.49128706817487, "grad_norm": 0.38672521710395813, "learning_rate": 1.862437905999236e-05, "loss": 0.0625, "step": 4878 }, { "epoch": 1.4915927850810151, "grad_norm": 0.3551631569862366, "learning_rate": 1.8628200229270157e-05, "loss": 0.064, "step": 4879 }, { "epoch": 1.4918985019871598, "grad_norm": 0.6282119750976562, "learning_rate": 1.8632021398547956e-05, "loss": 0.1179, "step": 4880 }, { "epoch": 1.492204218893305, "grad_norm": 0.8504628539085388, "learning_rate": 1.8635842567825755e-05, "loss": 0.0991, "step": 4881 }, { "epoch": 1.4925099357994496, "grad_norm": 0.4439716637134552, "learning_rate": 1.8639663737103554e-05, "loss": 0.0998, "step": 4882 }, { "epoch": 1.4928156527055947, "grad_norm": 0.5012518167495728, "learning_rate": 1.8643484906381352e-05, "loss": 0.121, "step": 4883 }, { "epoch": 1.4931213696117396, "grad_norm": 0.5956999659538269, "learning_rate": 1.8647306075659155e-05, "loss": 0.1318, "step": 4884 }, { "epoch": 1.4934270865178845, "grad_norm": 0.6755030155181885, "learning_rate": 1.8651127244936953e-05, "loss": 0.1752, "step": 4885 }, { "epoch": 1.4937328034240294, "grad_norm": 0.8299668431282043, "learning_rate": 1.8654948414214752e-05, "loss": 0.1497, "step": 4886 }, { "epoch": 1.4940385203301743, "grad_norm": 0.592824399471283, "learning_rate": 1.865876958349255e-05, "loss": 0.1655, "step": 4887 }, { "epoch": 1.4943442372363192, "grad_norm": 0.5742682814598083, "learning_rate": 1.866259075277035e-05, "loss": 0.1844, "step": 4888 }, { "epoch": 1.494649954142464, "grad_norm": 0.9308561086654663, "learning_rate": 1.8666411922048148e-05, "loss": 0.2329, "step": 4889 }, { "epoch": 1.494955671048609, "grad_norm": 0.6772736310958862, "learning_rate": 1.8670233091325947e-05, "loss": 0.185, "step": 4890 }, { "epoch": 1.4952613879547538, "grad_norm": 1.0678201913833618, "learning_rate": 1.8674054260603746e-05, "loss": 0.2221, "step": 4891 }, { "epoch": 1.495567104860899, "grad_norm": 1.006230354309082, "learning_rate": 1.8677875429881545e-05, "loss": 0.2217, "step": 4892 }, { "epoch": 1.4958728217670436, "grad_norm": 1.1349449157714844, "learning_rate": 1.8681696599159343e-05, "loss": 0.2458, "step": 4893 }, { "epoch": 1.4961785386731887, "grad_norm": 1.3316899538040161, "learning_rate": 1.8685517768437142e-05, "loss": 0.2171, "step": 4894 }, { "epoch": 1.4964842555793334, "grad_norm": 1.3718008995056152, "learning_rate": 1.868933893771494e-05, "loss": 0.2427, "step": 4895 }, { "epoch": 1.4967899724854785, "grad_norm": 1.9860987663269043, "learning_rate": 1.869316010699274e-05, "loss": 0.3148, "step": 4896 }, { "epoch": 1.4970956893916234, "grad_norm": 0.5136041045188904, "learning_rate": 1.8696981276270538e-05, "loss": 0.2001, "step": 4897 }, { "epoch": 1.4974014062977683, "grad_norm": 0.2837669551372528, "learning_rate": 1.8700802445548337e-05, "loss": 0.0967, "step": 4898 }, { "epoch": 1.4977071232039132, "grad_norm": 0.42063987255096436, "learning_rate": 1.8704623614826136e-05, "loss": 0.1003, "step": 4899 }, { "epoch": 1.498012840110058, "grad_norm": 0.35481446981430054, "learning_rate": 1.8708444784103938e-05, "loss": 0.0868, "step": 4900 }, { "epoch": 1.498318557016203, "grad_norm": 0.4224824011325836, "learning_rate": 1.8712265953381737e-05, "loss": 0.0679, "step": 4901 }, { "epoch": 1.4986242739223479, "grad_norm": 0.2965565025806427, "learning_rate": 1.8716087122659535e-05, "loss": 0.0474, "step": 4902 }, { "epoch": 1.4989299908284928, "grad_norm": 0.29887518286705017, "learning_rate": 1.8719908291937334e-05, "loss": 0.0566, "step": 4903 }, { "epoch": 1.4992357077346377, "grad_norm": 0.44939103722572327, "learning_rate": 1.8723729461215133e-05, "loss": 0.0874, "step": 4904 }, { "epoch": 1.4995414246407828, "grad_norm": 0.3187306523323059, "learning_rate": 1.872755063049293e-05, "loss": 0.0838, "step": 4905 }, { "epoch": 1.4998471415469274, "grad_norm": 0.3342761993408203, "learning_rate": 1.873137179977073e-05, "loss": 0.0955, "step": 4906 }, { "epoch": 1.5001528584530726, "grad_norm": 0.7481163144111633, "learning_rate": 1.873519296904853e-05, "loss": 0.0963, "step": 4907 }, { "epoch": 1.5004585753592172, "grad_norm": 0.40372052788734436, "learning_rate": 1.8739014138326328e-05, "loss": 0.1205, "step": 4908 }, { "epoch": 1.5007642922653623, "grad_norm": 0.5347312092781067, "learning_rate": 1.8742835307604127e-05, "loss": 0.1927, "step": 4909 }, { "epoch": 1.5010700091715072, "grad_norm": 0.5675156712532043, "learning_rate": 1.8746656476881925e-05, "loss": 0.1308, "step": 4910 }, { "epoch": 1.5013757260776521, "grad_norm": 0.468757688999176, "learning_rate": 1.8750477646159724e-05, "loss": 0.1532, "step": 4911 }, { "epoch": 1.501681442983797, "grad_norm": 0.5646935701370239, "learning_rate": 1.8754298815437523e-05, "loss": 0.2075, "step": 4912 }, { "epoch": 1.501987159889942, "grad_norm": 0.7390662431716919, "learning_rate": 1.875811998471532e-05, "loss": 0.2009, "step": 4913 }, { "epoch": 1.5022928767960868, "grad_norm": 3.3626301288604736, "learning_rate": 1.876194115399312e-05, "loss": 0.2085, "step": 4914 }, { "epoch": 1.5025985937022317, "grad_norm": 0.7420992851257324, "learning_rate": 1.8765762323270922e-05, "loss": 0.233, "step": 4915 }, { "epoch": 1.5029043106083766, "grad_norm": 0.9263088703155518, "learning_rate": 1.876958349254872e-05, "loss": 0.1942, "step": 4916 }, { "epoch": 1.5032100275145215, "grad_norm": 0.7841703295707703, "learning_rate": 1.877340466182652e-05, "loss": 0.2224, "step": 4917 }, { "epoch": 1.5035157444206666, "grad_norm": 0.8830205798149109, "learning_rate": 1.877722583110432e-05, "loss": 0.2073, "step": 4918 }, { "epoch": 1.5038214613268113, "grad_norm": 0.9138540029525757, "learning_rate": 1.8781047000382117e-05, "loss": 0.2277, "step": 4919 }, { "epoch": 1.5041271782329564, "grad_norm": 1.011960506439209, "learning_rate": 1.8784868169659916e-05, "loss": 0.3039, "step": 4920 }, { "epoch": 1.504432895139101, "grad_norm": 1.372729778289795, "learning_rate": 1.8788689338937715e-05, "loss": 0.2953, "step": 4921 }, { "epoch": 1.5047386120452462, "grad_norm": 0.5363647937774658, "learning_rate": 1.8792510508215514e-05, "loss": 0.1686, "step": 4922 }, { "epoch": 1.505044328951391, "grad_norm": 0.37383154034614563, "learning_rate": 1.8796331677493312e-05, "loss": 0.1195, "step": 4923 }, { "epoch": 1.505350045857536, "grad_norm": 0.38161203265190125, "learning_rate": 1.880015284677111e-05, "loss": 0.0807, "step": 4924 }, { "epoch": 1.5056557627636808, "grad_norm": 0.6028904318809509, "learning_rate": 1.880397401604891e-05, "loss": 0.0811, "step": 4925 }, { "epoch": 1.5059614796698257, "grad_norm": 1.5465846061706543, "learning_rate": 1.880779518532671e-05, "loss": 0.0865, "step": 4926 }, { "epoch": 1.5062671965759706, "grad_norm": 0.4263383448123932, "learning_rate": 1.8811616354604507e-05, "loss": 0.0821, "step": 4927 }, { "epoch": 1.5065729134821155, "grad_norm": 0.3597123622894287, "learning_rate": 1.8815437523882306e-05, "loss": 0.0871, "step": 4928 }, { "epoch": 1.5068786303882604, "grad_norm": 0.30315977334976196, "learning_rate": 1.8819258693160105e-05, "loss": 0.0594, "step": 4929 }, { "epoch": 1.5071843472944053, "grad_norm": 0.4478484094142914, "learning_rate": 1.8823079862437904e-05, "loss": 0.1104, "step": 4930 }, { "epoch": 1.5074900642005504, "grad_norm": 0.4360978305339813, "learning_rate": 1.8826901031715706e-05, "loss": 0.1024, "step": 4931 }, { "epoch": 1.507795781106695, "grad_norm": 0.27256324887275696, "learning_rate": 1.8830722200993504e-05, "loss": 0.084, "step": 4932 }, { "epoch": 1.5081014980128402, "grad_norm": 0.5102108716964722, "learning_rate": 1.8834543370271303e-05, "loss": 0.0828, "step": 4933 }, { "epoch": 1.5084072149189849, "grad_norm": 0.5726320147514343, "learning_rate": 1.8838364539549102e-05, "loss": 0.1217, "step": 4934 }, { "epoch": 1.50871293182513, "grad_norm": 0.6264142990112305, "learning_rate": 1.88421857088269e-05, "loss": 0.1888, "step": 4935 }, { "epoch": 1.509018648731275, "grad_norm": 0.4801129996776581, "learning_rate": 1.88460068781047e-05, "loss": 0.1435, "step": 4936 }, { "epoch": 1.5093243656374198, "grad_norm": 0.7151480913162231, "learning_rate": 1.8849828047382498e-05, "loss": 0.1989, "step": 4937 }, { "epoch": 1.5096300825435647, "grad_norm": 0.5343462824821472, "learning_rate": 1.8853649216660297e-05, "loss": 0.1855, "step": 4938 }, { "epoch": 1.5099357994497096, "grad_norm": 1.206960678100586, "learning_rate": 1.8857470385938096e-05, "loss": 0.2303, "step": 4939 }, { "epoch": 1.5102415163558545, "grad_norm": 0.6851158142089844, "learning_rate": 1.8861291555215894e-05, "loss": 0.2104, "step": 4940 }, { "epoch": 1.5105472332619994, "grad_norm": 1.0621110200881958, "learning_rate": 1.8865112724493697e-05, "loss": 0.2295, "step": 4941 }, { "epoch": 1.5108529501681442, "grad_norm": 1.4210673570632935, "learning_rate": 1.8868933893771495e-05, "loss": 0.205, "step": 4942 }, { "epoch": 1.5111586670742891, "grad_norm": 0.76194167137146, "learning_rate": 1.8872755063049294e-05, "loss": 0.2295, "step": 4943 }, { "epoch": 1.5114643839804343, "grad_norm": 1.1102235317230225, "learning_rate": 1.8876576232327093e-05, "loss": 0.2706, "step": 4944 }, { "epoch": 1.511770100886579, "grad_norm": 1.637995719909668, "learning_rate": 1.888039740160489e-05, "loss": 0.2842, "step": 4945 }, { "epoch": 1.512075817792724, "grad_norm": 1.5571255683898926, "learning_rate": 1.8884218570882694e-05, "loss": 0.3023, "step": 4946 }, { "epoch": 1.5123815346988687, "grad_norm": 0.38821372389793396, "learning_rate": 1.8888039740160492e-05, "loss": 0.1833, "step": 4947 }, { "epoch": 1.5126872516050138, "grad_norm": 0.36260664463043213, "learning_rate": 1.889186090943829e-05, "loss": 0.1132, "step": 4948 }, { "epoch": 1.5129929685111587, "grad_norm": 0.3587612211704254, "learning_rate": 1.889568207871609e-05, "loss": 0.0956, "step": 4949 }, { "epoch": 1.5132986854173036, "grad_norm": 1.148869514465332, "learning_rate": 1.889950324799389e-05, "loss": 0.0933, "step": 4950 }, { "epoch": 1.5136044023234485, "grad_norm": 0.5895735025405884, "learning_rate": 1.8903324417271687e-05, "loss": 0.075, "step": 4951 }, { "epoch": 1.5139101192295934, "grad_norm": 0.29207852482795715, "learning_rate": 1.8907145586549486e-05, "loss": 0.0557, "step": 4952 }, { "epoch": 1.5142158361357383, "grad_norm": 0.40057939291000366, "learning_rate": 1.8910966755827285e-05, "loss": 0.0748, "step": 4953 }, { "epoch": 1.5145215530418832, "grad_norm": 0.32331568002700806, "learning_rate": 1.8914787925105084e-05, "loss": 0.0806, "step": 4954 }, { "epoch": 1.514827269948028, "grad_norm": 0.34926503896713257, "learning_rate": 1.8918609094382882e-05, "loss": 0.0733, "step": 4955 }, { "epoch": 1.515132986854173, "grad_norm": 0.3299182057380676, "learning_rate": 1.892243026366068e-05, "loss": 0.0763, "step": 4956 }, { "epoch": 1.515438703760318, "grad_norm": 0.5612989068031311, "learning_rate": 1.892625143293848e-05, "loss": 0.1342, "step": 4957 }, { "epoch": 1.5157444206664628, "grad_norm": 0.7178888916969299, "learning_rate": 1.893007260221628e-05, "loss": 0.1031, "step": 4958 }, { "epoch": 1.5160501375726079, "grad_norm": 0.5304023623466492, "learning_rate": 1.8933893771494077e-05, "loss": 0.1101, "step": 4959 }, { "epoch": 1.5163558544787525, "grad_norm": 0.5711047053337097, "learning_rate": 1.8937714940771876e-05, "loss": 0.1415, "step": 4960 }, { "epoch": 1.5166615713848977, "grad_norm": 0.44714391231536865, "learning_rate": 1.8941536110049675e-05, "loss": 0.1234, "step": 4961 }, { "epoch": 1.5169672882910425, "grad_norm": 0.6735883355140686, "learning_rate": 1.8945357279327477e-05, "loss": 0.1673, "step": 4962 }, { "epoch": 1.5172730051971874, "grad_norm": 0.7732380628585815, "learning_rate": 1.8949178448605276e-05, "loss": 0.1786, "step": 4963 }, { "epoch": 1.5175787221033323, "grad_norm": 0.5790312886238098, "learning_rate": 1.8952999617883074e-05, "loss": 0.2603, "step": 4964 }, { "epoch": 1.5178844390094772, "grad_norm": 0.7518157958984375, "learning_rate": 1.8956820787160873e-05, "loss": 0.214, "step": 4965 }, { "epoch": 1.5181901559156221, "grad_norm": 0.8717190027236938, "learning_rate": 1.8960641956438672e-05, "loss": 0.1952, "step": 4966 }, { "epoch": 1.518495872821767, "grad_norm": 0.7249079346656799, "learning_rate": 1.896446312571647e-05, "loss": 0.2108, "step": 4967 }, { "epoch": 1.518801589727912, "grad_norm": 0.8577327728271484, "learning_rate": 1.896828429499427e-05, "loss": 0.2838, "step": 4968 }, { "epoch": 1.5191073066340568, "grad_norm": 1.3590738773345947, "learning_rate": 1.8972105464272068e-05, "loss": 0.225, "step": 4969 }, { "epoch": 1.519413023540202, "grad_norm": 1.5222363471984863, "learning_rate": 1.8975926633549867e-05, "loss": 0.2348, "step": 4970 }, { "epoch": 1.5197187404463466, "grad_norm": 3.0668606758117676, "learning_rate": 1.8979747802827666e-05, "loss": 0.333, "step": 4971 }, { "epoch": 1.5200244573524917, "grad_norm": 0.4727480411529541, "learning_rate": 1.8983568972105464e-05, "loss": 0.1565, "step": 4972 }, { "epoch": 1.5203301742586364, "grad_norm": 0.3431470990180969, "learning_rate": 1.8987390141383263e-05, "loss": 0.0782, "step": 4973 }, { "epoch": 1.5206358911647815, "grad_norm": 0.39911338686943054, "learning_rate": 1.8991211310661062e-05, "loss": 0.0923, "step": 4974 }, { "epoch": 1.5209416080709264, "grad_norm": 0.23813456296920776, "learning_rate": 1.899503247993886e-05, "loss": 0.0621, "step": 4975 }, { "epoch": 1.5212473249770713, "grad_norm": 0.3880382478237152, "learning_rate": 1.899885364921666e-05, "loss": 0.0774, "step": 4976 }, { "epoch": 1.5215530418832162, "grad_norm": 0.3030521273612976, "learning_rate": 1.900267481849446e-05, "loss": 0.0674, "step": 4977 }, { "epoch": 1.521858758789361, "grad_norm": 0.2954086661338806, "learning_rate": 1.900649598777226e-05, "loss": 0.0801, "step": 4978 }, { "epoch": 1.522164475695506, "grad_norm": 0.35800549387931824, "learning_rate": 1.901031715705006e-05, "loss": 0.086, "step": 4979 }, { "epoch": 1.5224701926016508, "grad_norm": 0.4349730312824249, "learning_rate": 1.9014138326327858e-05, "loss": 0.1065, "step": 4980 }, { "epoch": 1.5227759095077957, "grad_norm": 0.351866751909256, "learning_rate": 1.9017959495605657e-05, "loss": 0.0789, "step": 4981 }, { "epoch": 1.5230816264139406, "grad_norm": 0.391336053609848, "learning_rate": 1.9021780664883455e-05, "loss": 0.1181, "step": 4982 }, { "epoch": 1.5233873433200857, "grad_norm": 0.43385666608810425, "learning_rate": 1.9025601834161254e-05, "loss": 0.0877, "step": 4983 }, { "epoch": 1.5236930602262304, "grad_norm": 0.7044318318367004, "learning_rate": 1.9029423003439053e-05, "loss": 0.1109, "step": 4984 }, { "epoch": 1.5239987771323755, "grad_norm": 0.489425927400589, "learning_rate": 1.903324417271685e-05, "loss": 0.1654, "step": 4985 }, { "epoch": 1.5243044940385202, "grad_norm": 0.6873739361763, "learning_rate": 1.903706534199465e-05, "loss": 0.1568, "step": 4986 }, { "epoch": 1.5246102109446653, "grad_norm": 0.5043721199035645, "learning_rate": 1.904088651127245e-05, "loss": 0.1509, "step": 4987 }, { "epoch": 1.5249159278508102, "grad_norm": 0.8650913238525391, "learning_rate": 1.9044707680550248e-05, "loss": 0.1898, "step": 4988 }, { "epoch": 1.525221644756955, "grad_norm": 0.6952762603759766, "learning_rate": 1.9048528849828046e-05, "loss": 0.1875, "step": 4989 }, { "epoch": 1.5255273616631, "grad_norm": 1.081365942955017, "learning_rate": 1.9052350019105845e-05, "loss": 0.1945, "step": 4990 }, { "epoch": 1.5258330785692449, "grad_norm": 0.7554844617843628, "learning_rate": 1.9056171188383644e-05, "loss": 0.2416, "step": 4991 }, { "epoch": 1.5261387954753898, "grad_norm": 1.2621569633483887, "learning_rate": 1.9059992357661443e-05, "loss": 0.219, "step": 4992 }, { "epoch": 1.5264445123815347, "grad_norm": 0.9922550320625305, "learning_rate": 1.9063813526939245e-05, "loss": 0.1941, "step": 4993 }, { "epoch": 1.5267502292876796, "grad_norm": 1.2834727764129639, "learning_rate": 1.9067634696217044e-05, "loss": 0.2276, "step": 4994 }, { "epoch": 1.5270559461938245, "grad_norm": 1.2329216003417969, "learning_rate": 1.9071455865494842e-05, "loss": 0.2547, "step": 4995 }, { "epoch": 1.5273616630999696, "grad_norm": 3.148815631866455, "learning_rate": 1.907527703477264e-05, "loss": 0.327, "step": 4996 }, { "epoch": 1.5276673800061142, "grad_norm": 0.44290104508399963, "learning_rate": 1.907909820405044e-05, "loss": 0.165, "step": 4997 }, { "epoch": 1.5279730969122594, "grad_norm": 0.38635727763175964, "learning_rate": 1.908291937332824e-05, "loss": 0.1246, "step": 4998 }, { "epoch": 1.528278813818404, "grad_norm": 0.48612073063850403, "learning_rate": 1.9086740542606037e-05, "loss": 0.091, "step": 4999 }, { "epoch": 1.5285845307245491, "grad_norm": 0.24528491497039795, "learning_rate": 1.9090561711883836e-05, "loss": 0.0831, "step": 5000 }, { "epoch": 1.5285845307245491, "eval_cer": 0.19423259467775195, "eval_loss": 0.2664967179298401, "eval_runtime": 19.8431, "eval_samples_per_second": 228.694, "eval_steps_per_second": 0.756, "eval_wer": 0.3554369989665093, "step": 5000 }, { "epoch": 1.528890247630694, "grad_norm": 0.26022011041641235, "learning_rate": 1.9094382881161635e-05, "loss": 0.0728, "step": 5001 }, { "epoch": 1.529195964536839, "grad_norm": 0.33723950386047363, "learning_rate": 1.9098204050439434e-05, "loss": 0.0865, "step": 5002 }, { "epoch": 1.5295016814429838, "grad_norm": 0.6045982241630554, "learning_rate": 1.9102025219717232e-05, "loss": 0.0999, "step": 5003 }, { "epoch": 1.5298073983491287, "grad_norm": 0.371838241815567, "learning_rate": 1.910584638899503e-05, "loss": 0.0758, "step": 5004 }, { "epoch": 1.5301131152552736, "grad_norm": 0.3353448808193207, "learning_rate": 1.910966755827283e-05, "loss": 0.0898, "step": 5005 }, { "epoch": 1.5304188321614185, "grad_norm": 0.33614182472229004, "learning_rate": 1.911348872755063e-05, "loss": 0.0792, "step": 5006 }, { "epoch": 1.5307245490675634, "grad_norm": 0.45001932978630066, "learning_rate": 1.9117309896828427e-05, "loss": 0.1234, "step": 5007 }, { "epoch": 1.5310302659737083, "grad_norm": 0.3648279309272766, "learning_rate": 1.912113106610623e-05, "loss": 0.1189, "step": 5008 }, { "epoch": 1.5313359828798534, "grad_norm": 0.4051543176174164, "learning_rate": 1.9124952235384028e-05, "loss": 0.1072, "step": 5009 }, { "epoch": 1.531641699785998, "grad_norm": 0.9530391693115234, "learning_rate": 1.9128773404661827e-05, "loss": 0.1511, "step": 5010 }, { "epoch": 1.5319474166921432, "grad_norm": 0.7970960140228271, "learning_rate": 1.9132594573939626e-05, "loss": 0.1475, "step": 5011 }, { "epoch": 1.5322531335982879, "grad_norm": 0.6350681185722351, "learning_rate": 1.9136415743217424e-05, "loss": 0.1531, "step": 5012 }, { "epoch": 1.532558850504433, "grad_norm": 0.6647037863731384, "learning_rate": 1.9140236912495223e-05, "loss": 0.1954, "step": 5013 }, { "epoch": 1.5328645674105776, "grad_norm": 1.2845869064331055, "learning_rate": 1.9144058081773022e-05, "loss": 0.2006, "step": 5014 }, { "epoch": 1.5331702843167228, "grad_norm": 0.6773020029067993, "learning_rate": 1.914787925105082e-05, "loss": 0.1747, "step": 5015 }, { "epoch": 1.5334760012228676, "grad_norm": 0.7549684047698975, "learning_rate": 1.915170042032862e-05, "loss": 0.2088, "step": 5016 }, { "epoch": 1.5337817181290125, "grad_norm": 0.8783038854598999, "learning_rate": 1.9155521589606418e-05, "loss": 0.251, "step": 5017 }, { "epoch": 1.5340874350351574, "grad_norm": 1.4937266111373901, "learning_rate": 1.915934275888422e-05, "loss": 0.2361, "step": 5018 }, { "epoch": 1.5343931519413023, "grad_norm": 1.2812259197235107, "learning_rate": 1.916316392816202e-05, "loss": 0.2216, "step": 5019 }, { "epoch": 1.5346988688474472, "grad_norm": 3.4741451740264893, "learning_rate": 1.9166985097439818e-05, "loss": 0.266, "step": 5020 }, { "epoch": 1.535004585753592, "grad_norm": 1.8746196031570435, "learning_rate": 1.9170806266717616e-05, "loss": 0.3542, "step": 5021 }, { "epoch": 1.5353103026597372, "grad_norm": 0.751751720905304, "learning_rate": 1.9174627435995415e-05, "loss": 0.1609, "step": 5022 }, { "epoch": 1.535616019565882, "grad_norm": 0.47577935457229614, "learning_rate": 1.9178448605273214e-05, "loss": 0.0919, "step": 5023 }, { "epoch": 1.535921736472027, "grad_norm": 0.33609145879745483, "learning_rate": 1.9182269774551016e-05, "loss": 0.0849, "step": 5024 }, { "epoch": 1.5362274533781717, "grad_norm": 0.3044179379940033, "learning_rate": 1.9186090943828815e-05, "loss": 0.0602, "step": 5025 }, { "epoch": 1.5365331702843168, "grad_norm": 0.3211282193660736, "learning_rate": 1.9189912113106614e-05, "loss": 0.0826, "step": 5026 }, { "epoch": 1.5368388871904615, "grad_norm": 0.27924609184265137, "learning_rate": 1.9193733282384412e-05, "loss": 0.0551, "step": 5027 }, { "epoch": 1.5371446040966066, "grad_norm": 0.36021167039871216, "learning_rate": 1.919755445166221e-05, "loss": 0.0701, "step": 5028 }, { "epoch": 1.5374503210027515, "grad_norm": 0.43041202425956726, "learning_rate": 1.920137562094001e-05, "loss": 0.0828, "step": 5029 }, { "epoch": 1.5377560379088964, "grad_norm": 0.4870952069759369, "learning_rate": 1.920519679021781e-05, "loss": 0.0721, "step": 5030 }, { "epoch": 1.5380617548150413, "grad_norm": 0.45996716618537903, "learning_rate": 1.9209017959495607e-05, "loss": 0.0935, "step": 5031 }, { "epoch": 1.5383674717211862, "grad_norm": 0.49633684754371643, "learning_rate": 1.9212839128773406e-05, "loss": 0.137, "step": 5032 }, { "epoch": 1.538673188627331, "grad_norm": 0.42615729570388794, "learning_rate": 1.9216660298051205e-05, "loss": 0.1098, "step": 5033 }, { "epoch": 1.538978905533476, "grad_norm": 0.6506653428077698, "learning_rate": 1.9220481467329004e-05, "loss": 0.1312, "step": 5034 }, { "epoch": 1.539284622439621, "grad_norm": 0.6678722500801086, "learning_rate": 1.9224302636606802e-05, "loss": 0.1369, "step": 5035 }, { "epoch": 1.5395903393457657, "grad_norm": 0.6212034225463867, "learning_rate": 1.92281238058846e-05, "loss": 0.128, "step": 5036 }, { "epoch": 1.5398960562519108, "grad_norm": 0.5702893733978271, "learning_rate": 1.92319449751624e-05, "loss": 0.1648, "step": 5037 }, { "epoch": 1.5402017731580555, "grad_norm": 0.7908552885055542, "learning_rate": 1.92357661444402e-05, "loss": 0.213, "step": 5038 }, { "epoch": 1.5405074900642006, "grad_norm": 0.8467127680778503, "learning_rate": 1.9239587313718e-05, "loss": 0.1999, "step": 5039 }, { "epoch": 1.5408132069703453, "grad_norm": 0.7696900367736816, "learning_rate": 1.92434084829958e-05, "loss": 0.205, "step": 5040 }, { "epoch": 1.5411189238764904, "grad_norm": 1.2953177690505981, "learning_rate": 1.9247229652273598e-05, "loss": 0.2317, "step": 5041 }, { "epoch": 1.5414246407826353, "grad_norm": 5.688636302947998, "learning_rate": 1.9251050821551397e-05, "loss": 0.2468, "step": 5042 }, { "epoch": 1.5417303576887802, "grad_norm": 0.9428754448890686, "learning_rate": 1.9254871990829196e-05, "loss": 0.2542, "step": 5043 }, { "epoch": 1.542036074594925, "grad_norm": 1.133868932723999, "learning_rate": 1.9258693160106994e-05, "loss": 0.2663, "step": 5044 }, { "epoch": 1.54234179150107, "grad_norm": 1.019255518913269, "learning_rate": 1.9262514329384793e-05, "loss": 0.2101, "step": 5045 }, { "epoch": 1.5426475084072149, "grad_norm": 5.923823833465576, "learning_rate": 1.9266335498662592e-05, "loss": 0.329, "step": 5046 }, { "epoch": 1.5429532253133598, "grad_norm": 0.6197856664657593, "learning_rate": 1.927015666794039e-05, "loss": 0.2207, "step": 5047 }, { "epoch": 1.5432589422195049, "grad_norm": 0.34277698397636414, "learning_rate": 1.927397783721819e-05, "loss": 0.0897, "step": 5048 }, { "epoch": 1.5435646591256496, "grad_norm": 0.3203091621398926, "learning_rate": 1.9277799006495988e-05, "loss": 0.0991, "step": 5049 }, { "epoch": 1.5438703760317947, "grad_norm": 0.4086128771305084, "learning_rate": 1.9281620175773787e-05, "loss": 0.0788, "step": 5050 }, { "epoch": 1.5441760929379393, "grad_norm": 0.3898080289363861, "learning_rate": 1.9285441345051586e-05, "loss": 0.0835, "step": 5051 }, { "epoch": 1.5444818098440845, "grad_norm": 0.29863685369491577, "learning_rate": 1.9289262514329384e-05, "loss": 0.0575, "step": 5052 }, { "epoch": 1.5447875267502291, "grad_norm": 0.38095057010650635, "learning_rate": 1.9293083683607183e-05, "loss": 0.0871, "step": 5053 }, { "epoch": 1.5450932436563742, "grad_norm": 0.28735846281051636, "learning_rate": 1.9296904852884982e-05, "loss": 0.055, "step": 5054 }, { "epoch": 1.5453989605625191, "grad_norm": 0.4409816861152649, "learning_rate": 1.9300726022162784e-05, "loss": 0.0942, "step": 5055 }, { "epoch": 1.545704677468664, "grad_norm": 0.29672375321388245, "learning_rate": 1.9304547191440583e-05, "loss": 0.0644, "step": 5056 }, { "epoch": 1.546010394374809, "grad_norm": 0.43669456243515015, "learning_rate": 1.930836836071838e-05, "loss": 0.1148, "step": 5057 }, { "epoch": 1.5463161112809538, "grad_norm": 0.43646520376205444, "learning_rate": 1.931218952999618e-05, "loss": 0.1118, "step": 5058 }, { "epoch": 1.5466218281870987, "grad_norm": 0.5107153654098511, "learning_rate": 1.931601069927398e-05, "loss": 0.1106, "step": 5059 }, { "epoch": 1.5469275450932436, "grad_norm": 0.5184035897254944, "learning_rate": 1.9319831868551778e-05, "loss": 0.1346, "step": 5060 }, { "epoch": 1.5472332619993887, "grad_norm": 0.7768357992172241, "learning_rate": 1.9323653037829576e-05, "loss": 0.1672, "step": 5061 }, { "epoch": 1.5475389789055334, "grad_norm": 0.6040539741516113, "learning_rate": 1.9327474207107375e-05, "loss": 0.2024, "step": 5062 }, { "epoch": 1.5478446958116785, "grad_norm": 0.6710765957832336, "learning_rate": 1.9331295376385174e-05, "loss": 0.2084, "step": 5063 }, { "epoch": 1.5481504127178232, "grad_norm": 0.9016476273536682, "learning_rate": 1.9335116545662973e-05, "loss": 0.1961, "step": 5064 }, { "epoch": 1.5484561296239683, "grad_norm": 0.9498496651649475, "learning_rate": 1.933893771494077e-05, "loss": 0.221, "step": 5065 }, { "epoch": 1.548761846530113, "grad_norm": 1.0945305824279785, "learning_rate": 1.934275888421857e-05, "loss": 0.2423, "step": 5066 }, { "epoch": 1.549067563436258, "grad_norm": 0.809063732624054, "learning_rate": 1.934658005349637e-05, "loss": 0.1986, "step": 5067 }, { "epoch": 1.549373280342403, "grad_norm": 1.23871910572052, "learning_rate": 1.9350401222774168e-05, "loss": 0.2597, "step": 5068 }, { "epoch": 1.5496789972485479, "grad_norm": 1.0463440418243408, "learning_rate": 1.9354222392051966e-05, "loss": 0.2353, "step": 5069 }, { "epoch": 1.5499847141546927, "grad_norm": 1.0745391845703125, "learning_rate": 1.935804356132977e-05, "loss": 0.2968, "step": 5070 }, { "epoch": 1.5502904310608376, "grad_norm": 5.404260635375977, "learning_rate": 1.9361864730607567e-05, "loss": 0.2656, "step": 5071 }, { "epoch": 1.5505961479669825, "grad_norm": 0.3206956088542938, "learning_rate": 1.9365685899885366e-05, "loss": 0.1677, "step": 5072 }, { "epoch": 1.5509018648731274, "grad_norm": 0.2982710599899292, "learning_rate": 1.9369507069163165e-05, "loss": 0.1114, "step": 5073 }, { "epoch": 1.5512075817792725, "grad_norm": 0.47494742274284363, "learning_rate": 1.9373328238440963e-05, "loss": 0.0919, "step": 5074 }, { "epoch": 1.5515132986854172, "grad_norm": 0.3186666667461395, "learning_rate": 1.9377149407718762e-05, "loss": 0.096, "step": 5075 }, { "epoch": 1.5518190155915623, "grad_norm": 0.4480159282684326, "learning_rate": 1.938097057699656e-05, "loss": 0.0636, "step": 5076 }, { "epoch": 1.552124732497707, "grad_norm": 0.33549806475639343, "learning_rate": 1.938479174627436e-05, "loss": 0.0714, "step": 5077 }, { "epoch": 1.552430449403852, "grad_norm": 0.2994624972343445, "learning_rate": 1.938861291555216e-05, "loss": 0.0853, "step": 5078 }, { "epoch": 1.5527361663099968, "grad_norm": 0.4775316119194031, "learning_rate": 1.9392434084829957e-05, "loss": 0.0867, "step": 5079 }, { "epoch": 1.553041883216142, "grad_norm": 0.25942039489746094, "learning_rate": 1.9396255254107756e-05, "loss": 0.0651, "step": 5080 }, { "epoch": 1.5533476001222868, "grad_norm": 0.5041511654853821, "learning_rate": 1.9400076423385555e-05, "loss": 0.0862, "step": 5081 }, { "epoch": 1.5536533170284317, "grad_norm": 0.4237000346183777, "learning_rate": 1.9403897592663353e-05, "loss": 0.108, "step": 5082 }, { "epoch": 1.5539590339345766, "grad_norm": 0.3624696135520935, "learning_rate": 1.9407718761941152e-05, "loss": 0.0944, "step": 5083 }, { "epoch": 1.5542647508407215, "grad_norm": 0.7934198379516602, "learning_rate": 1.941153993121895e-05, "loss": 0.1332, "step": 5084 }, { "epoch": 1.5545704677468664, "grad_norm": 0.4816489517688751, "learning_rate": 1.941536110049675e-05, "loss": 0.156, "step": 5085 }, { "epoch": 1.5548761846530113, "grad_norm": 0.6687336564064026, "learning_rate": 1.9419182269774552e-05, "loss": 0.1402, "step": 5086 }, { "epoch": 1.5551819015591564, "grad_norm": 0.9034313559532166, "learning_rate": 1.942300343905235e-05, "loss": 0.1668, "step": 5087 }, { "epoch": 1.555487618465301, "grad_norm": 1.0727120637893677, "learning_rate": 1.942682460833015e-05, "loss": 0.2141, "step": 5088 }, { "epoch": 1.5557933353714462, "grad_norm": 0.6677419543266296, "learning_rate": 1.9430645777607948e-05, "loss": 0.214, "step": 5089 }, { "epoch": 1.5560990522775908, "grad_norm": 0.734214186668396, "learning_rate": 1.9434466946885747e-05, "loss": 0.2098, "step": 5090 }, { "epoch": 1.556404769183736, "grad_norm": 1.036514401435852, "learning_rate": 1.9438288116163546e-05, "loss": 0.2599, "step": 5091 }, { "epoch": 1.5567104860898806, "grad_norm": 1.1986281871795654, "learning_rate": 1.9442109285441344e-05, "loss": 0.1832, "step": 5092 }, { "epoch": 1.5570162029960257, "grad_norm": 1.0099387168884277, "learning_rate": 1.9445930454719143e-05, "loss": 0.2721, "step": 5093 }, { "epoch": 1.5573219199021706, "grad_norm": 0.8530307412147522, "learning_rate": 1.9449751623996942e-05, "loss": 0.235, "step": 5094 }, { "epoch": 1.5576276368083155, "grad_norm": 0.9974703788757324, "learning_rate": 1.945357279327474e-05, "loss": 0.2683, "step": 5095 }, { "epoch": 1.5579333537144604, "grad_norm": 2.3149476051330566, "learning_rate": 1.9457393962552543e-05, "loss": 0.3484, "step": 5096 }, { "epoch": 1.5582390706206053, "grad_norm": 0.6527627110481262, "learning_rate": 1.946121513183034e-05, "loss": 0.1626, "step": 5097 }, { "epoch": 1.5585447875267502, "grad_norm": 0.3839696943759918, "learning_rate": 1.946503630110814e-05, "loss": 0.1077, "step": 5098 }, { "epoch": 1.558850504432895, "grad_norm": 0.4403168261051178, "learning_rate": 1.946885747038594e-05, "loss": 0.1134, "step": 5099 }, { "epoch": 1.5591562213390402, "grad_norm": 0.35995373129844666, "learning_rate": 1.9472678639663738e-05, "loss": 0.073, "step": 5100 }, { "epoch": 1.5594619382451849, "grad_norm": 0.4127315580844879, "learning_rate": 1.9476499808941536e-05, "loss": 0.0985, "step": 5101 }, { "epoch": 1.55976765515133, "grad_norm": 0.4558067321777344, "learning_rate": 1.948032097821934e-05, "loss": 0.0647, "step": 5102 }, { "epoch": 1.5600733720574746, "grad_norm": 0.4747166335582733, "learning_rate": 1.9484142147497137e-05, "loss": 0.0759, "step": 5103 }, { "epoch": 1.5603790889636198, "grad_norm": 0.39999502897262573, "learning_rate": 1.9487963316774936e-05, "loss": 0.0797, "step": 5104 }, { "epoch": 1.5606848058697644, "grad_norm": 0.7318262457847595, "learning_rate": 1.9491784486052735e-05, "loss": 0.0833, "step": 5105 }, { "epoch": 1.5609905227759096, "grad_norm": 0.5549700856208801, "learning_rate": 1.9495605655330533e-05, "loss": 0.0841, "step": 5106 }, { "epoch": 1.5612962396820544, "grad_norm": 0.7885621190071106, "learning_rate": 1.9499426824608332e-05, "loss": 0.1343, "step": 5107 }, { "epoch": 1.5616019565881993, "grad_norm": 0.5142400860786438, "learning_rate": 1.950324799388613e-05, "loss": 0.0958, "step": 5108 }, { "epoch": 1.5619076734943442, "grad_norm": 0.7923698425292969, "learning_rate": 1.950706916316393e-05, "loss": 0.1102, "step": 5109 }, { "epoch": 1.5622133904004891, "grad_norm": 0.8715043663978577, "learning_rate": 1.951089033244173e-05, "loss": 0.1131, "step": 5110 }, { "epoch": 1.562519107306634, "grad_norm": 0.9259697794914246, "learning_rate": 1.9514711501719527e-05, "loss": 0.1885, "step": 5111 }, { "epoch": 1.562824824212779, "grad_norm": 0.9401617646217346, "learning_rate": 1.9518532670997326e-05, "loss": 0.1994, "step": 5112 }, { "epoch": 1.563130541118924, "grad_norm": 1.1270047426223755, "learning_rate": 1.9522353840275125e-05, "loss": 0.214, "step": 5113 }, { "epoch": 1.5634362580250687, "grad_norm": 0.7003158926963806, "learning_rate": 1.9526175009552923e-05, "loss": 0.1926, "step": 5114 }, { "epoch": 1.5637419749312138, "grad_norm": 0.8065859079360962, "learning_rate": 1.9529996178830722e-05, "loss": 0.1875, "step": 5115 }, { "epoch": 1.5640476918373585, "grad_norm": 0.9945093989372253, "learning_rate": 1.953381734810852e-05, "loss": 0.2123, "step": 5116 }, { "epoch": 1.5643534087435036, "grad_norm": 0.9198736548423767, "learning_rate": 1.9537638517386323e-05, "loss": 0.2267, "step": 5117 }, { "epoch": 1.5646591256496483, "grad_norm": 1.3606183528900146, "learning_rate": 1.9541459686664122e-05, "loss": 0.2154, "step": 5118 }, { "epoch": 1.5649648425557934, "grad_norm": 1.378411054611206, "learning_rate": 1.954528085594192e-05, "loss": 0.2259, "step": 5119 }, { "epoch": 1.5652705594619383, "grad_norm": 1.3437275886535645, "learning_rate": 1.954910202521972e-05, "loss": 0.2414, "step": 5120 }, { "epoch": 1.5655762763680832, "grad_norm": 2.5488944053649902, "learning_rate": 1.9552923194497518e-05, "loss": 0.3065, "step": 5121 }, { "epoch": 1.565881993274228, "grad_norm": 0.552987277507782, "learning_rate": 1.9556744363775317e-05, "loss": 0.1559, "step": 5122 }, { "epoch": 1.566187710180373, "grad_norm": 0.4341469407081604, "learning_rate": 1.9560565533053116e-05, "loss": 0.1008, "step": 5123 }, { "epoch": 1.5664934270865178, "grad_norm": 0.33807510137557983, "learning_rate": 1.9564386702330914e-05, "loss": 0.0736, "step": 5124 }, { "epoch": 1.5667991439926627, "grad_norm": 0.34213986992836, "learning_rate": 1.9568207871608713e-05, "loss": 0.0714, "step": 5125 }, { "epoch": 1.5671048608988078, "grad_norm": 0.3848195970058441, "learning_rate": 1.9572029040886512e-05, "loss": 0.0581, "step": 5126 }, { "epoch": 1.5674105778049525, "grad_norm": 0.6637454628944397, "learning_rate": 1.957585021016431e-05, "loss": 0.1004, "step": 5127 }, { "epoch": 1.5677162947110976, "grad_norm": 0.31985658407211304, "learning_rate": 1.957967137944211e-05, "loss": 0.0723, "step": 5128 }, { "epoch": 1.5680220116172423, "grad_norm": 0.414608895778656, "learning_rate": 1.9583492548719908e-05, "loss": 0.1066, "step": 5129 }, { "epoch": 1.5683277285233874, "grad_norm": 0.53715580701828, "learning_rate": 1.9587313717997707e-05, "loss": 0.0888, "step": 5130 }, { "epoch": 1.568633445429532, "grad_norm": 0.4653162956237793, "learning_rate": 1.9591134887275505e-05, "loss": 0.1084, "step": 5131 }, { "epoch": 1.5689391623356772, "grad_norm": 0.3439481556415558, "learning_rate": 1.9594956056553308e-05, "loss": 0.0958, "step": 5132 }, { "epoch": 1.569244879241822, "grad_norm": 0.5508539080619812, "learning_rate": 1.9598777225831106e-05, "loss": 0.1005, "step": 5133 }, { "epoch": 1.569550596147967, "grad_norm": 0.49559256434440613, "learning_rate": 1.9602598395108905e-05, "loss": 0.1374, "step": 5134 }, { "epoch": 1.5698563130541119, "grad_norm": 0.5176662802696228, "learning_rate": 1.9606419564386704e-05, "loss": 0.1091, "step": 5135 }, { "epoch": 1.5701620299602568, "grad_norm": 0.5294964909553528, "learning_rate": 1.9610240733664503e-05, "loss": 0.1413, "step": 5136 }, { "epoch": 1.5704677468664017, "grad_norm": 0.5865715146064758, "learning_rate": 1.96140619029423e-05, "loss": 0.1562, "step": 5137 }, { "epoch": 1.5707734637725466, "grad_norm": 0.8366187214851379, "learning_rate": 1.96178830722201e-05, "loss": 0.1957, "step": 5138 }, { "epoch": 1.5710791806786917, "grad_norm": 0.9046799540519714, "learning_rate": 1.96217042414979e-05, "loss": 0.1895, "step": 5139 }, { "epoch": 1.5713848975848363, "grad_norm": 0.9595535397529602, "learning_rate": 1.9625525410775698e-05, "loss": 0.2115, "step": 5140 }, { "epoch": 1.5716906144909815, "grad_norm": 0.8770933747291565, "learning_rate": 1.9629346580053496e-05, "loss": 0.2227, "step": 5141 }, { "epoch": 1.5719963313971261, "grad_norm": 0.7744597792625427, "learning_rate": 1.9633167749331295e-05, "loss": 0.2267, "step": 5142 }, { "epoch": 1.5723020483032712, "grad_norm": 1.4036036729812622, "learning_rate": 1.9636988918609094e-05, "loss": 0.23, "step": 5143 }, { "epoch": 1.572607765209416, "grad_norm": 1.0056453943252563, "learning_rate": 1.9640810087886893e-05, "loss": 0.2329, "step": 5144 }, { "epoch": 1.572913482115561, "grad_norm": 0.9469918012619019, "learning_rate": 1.964463125716469e-05, "loss": 0.2279, "step": 5145 }, { "epoch": 1.573219199021706, "grad_norm": 11.412364959716797, "learning_rate": 1.964845242644249e-05, "loss": 0.3656, "step": 5146 }, { "epoch": 1.5735249159278508, "grad_norm": 0.4329998791217804, "learning_rate": 1.965227359572029e-05, "loss": 0.1518, "step": 5147 }, { "epoch": 1.5738306328339957, "grad_norm": 0.35677170753479004, "learning_rate": 1.965609476499809e-05, "loss": 0.0885, "step": 5148 }, { "epoch": 1.5741363497401406, "grad_norm": 0.36903485655784607, "learning_rate": 1.965991593427589e-05, "loss": 0.0946, "step": 5149 }, { "epoch": 1.5744420666462855, "grad_norm": 0.34405362606048584, "learning_rate": 1.966373710355369e-05, "loss": 0.0798, "step": 5150 }, { "epoch": 1.5747477835524304, "grad_norm": 0.4204351007938385, "learning_rate": 1.9667558272831487e-05, "loss": 0.0698, "step": 5151 }, { "epoch": 1.5750535004585755, "grad_norm": 0.31410858035087585, "learning_rate": 1.9671379442109286e-05, "loss": 0.0672, "step": 5152 }, { "epoch": 1.5753592173647202, "grad_norm": 0.37890374660491943, "learning_rate": 1.9675200611387085e-05, "loss": 0.0703, "step": 5153 }, { "epoch": 1.5756649342708653, "grad_norm": 0.2670494318008423, "learning_rate": 1.9679021780664883e-05, "loss": 0.0753, "step": 5154 }, { "epoch": 1.57597065117701, "grad_norm": 0.4318024814128876, "learning_rate": 1.9682842949942682e-05, "loss": 0.1278, "step": 5155 }, { "epoch": 1.576276368083155, "grad_norm": 0.4576842784881592, "learning_rate": 1.968666411922048e-05, "loss": 0.0837, "step": 5156 }, { "epoch": 1.5765820849892997, "grad_norm": 0.34508785605430603, "learning_rate": 1.969048528849828e-05, "loss": 0.1101, "step": 5157 }, { "epoch": 1.5768878018954449, "grad_norm": 0.38483238220214844, "learning_rate": 1.969430645777608e-05, "loss": 0.0892, "step": 5158 }, { "epoch": 1.5771935188015898, "grad_norm": 0.5354366302490234, "learning_rate": 1.9698127627053877e-05, "loss": 0.1214, "step": 5159 }, { "epoch": 1.5774992357077346, "grad_norm": 0.6502383351325989, "learning_rate": 1.9701948796331676e-05, "loss": 0.1894, "step": 5160 }, { "epoch": 1.5778049526138795, "grad_norm": 0.5885339379310608, "learning_rate": 1.9705769965609475e-05, "loss": 0.1588, "step": 5161 }, { "epoch": 1.5781106695200244, "grad_norm": 0.5468118786811829, "learning_rate": 1.9709591134887273e-05, "loss": 0.1625, "step": 5162 }, { "epoch": 1.5784163864261693, "grad_norm": 0.6816175580024719, "learning_rate": 1.9713412304165075e-05, "loss": 0.1988, "step": 5163 }, { "epoch": 1.5787221033323142, "grad_norm": 0.7627678513526917, "learning_rate": 1.9717233473442874e-05, "loss": 0.1703, "step": 5164 }, { "epoch": 1.5790278202384593, "grad_norm": 0.6883099675178528, "learning_rate": 1.9721054642720673e-05, "loss": 0.1974, "step": 5165 }, { "epoch": 1.579333537144604, "grad_norm": 1.2252095937728882, "learning_rate": 1.972487581199847e-05, "loss": 0.2493, "step": 5166 }, { "epoch": 1.5796392540507491, "grad_norm": 1.0352284908294678, "learning_rate": 1.972869698127627e-05, "loss": 0.2356, "step": 5167 }, { "epoch": 1.5799449709568938, "grad_norm": 1.2120381593704224, "learning_rate": 1.973251815055407e-05, "loss": 0.2507, "step": 5168 }, { "epoch": 1.580250687863039, "grad_norm": 1.0180649757385254, "learning_rate": 1.9736339319831868e-05, "loss": 0.216, "step": 5169 }, { "epoch": 1.5805564047691836, "grad_norm": 1.2662690877914429, "learning_rate": 1.9740160489109667e-05, "loss": 0.2783, "step": 5170 }, { "epoch": 1.5808621216753287, "grad_norm": 1.677207589149475, "learning_rate": 1.9743981658387465e-05, "loss": 0.3091, "step": 5171 }, { "epoch": 1.5811678385814736, "grad_norm": 0.4580239951610565, "learning_rate": 1.9747802827665264e-05, "loss": 0.1633, "step": 5172 }, { "epoch": 1.5814735554876185, "grad_norm": 0.529797375202179, "learning_rate": 1.9751623996943063e-05, "loss": 0.1135, "step": 5173 }, { "epoch": 1.5817792723937634, "grad_norm": 0.808387815952301, "learning_rate": 1.9755445166220865e-05, "loss": 0.1383, "step": 5174 }, { "epoch": 1.5820849892999083, "grad_norm": 0.6126211881637573, "learning_rate": 1.9759266335498664e-05, "loss": 0.0811, "step": 5175 }, { "epoch": 1.5823907062060532, "grad_norm": 0.3759809732437134, "learning_rate": 1.9763087504776463e-05, "loss": 0.0681, "step": 5176 }, { "epoch": 1.582696423112198, "grad_norm": 0.3158976137638092, "learning_rate": 1.976690867405426e-05, "loss": 0.0833, "step": 5177 }, { "epoch": 1.5830021400183432, "grad_norm": 0.3789902329444885, "learning_rate": 1.977072984333206e-05, "loss": 0.0612, "step": 5178 }, { "epoch": 1.5833078569244878, "grad_norm": 0.46442320942878723, "learning_rate": 1.9774551012609862e-05, "loss": 0.0853, "step": 5179 }, { "epoch": 1.583613573830633, "grad_norm": 0.3315175771713257, "learning_rate": 1.977837218188766e-05, "loss": 0.0978, "step": 5180 }, { "epoch": 1.5839192907367776, "grad_norm": 0.31555598974227905, "learning_rate": 1.978219335116546e-05, "loss": 0.0727, "step": 5181 }, { "epoch": 1.5842250076429227, "grad_norm": 0.7469016313552856, "learning_rate": 1.978601452044326e-05, "loss": 0.1123, "step": 5182 }, { "epoch": 1.5845307245490674, "grad_norm": 0.47336244583129883, "learning_rate": 1.9789835689721057e-05, "loss": 0.1141, "step": 5183 }, { "epoch": 1.5848364414552125, "grad_norm": 0.45940735936164856, "learning_rate": 1.9793656858998856e-05, "loss": 0.1217, "step": 5184 }, { "epoch": 1.5851421583613574, "grad_norm": 0.5958283543586731, "learning_rate": 1.9797478028276655e-05, "loss": 0.1265, "step": 5185 }, { "epoch": 1.5854478752675023, "grad_norm": 0.6028069853782654, "learning_rate": 1.9801299197554453e-05, "loss": 0.1279, "step": 5186 }, { "epoch": 1.5857535921736472, "grad_norm": 0.7923275828361511, "learning_rate": 1.9805120366832252e-05, "loss": 0.1663, "step": 5187 }, { "epoch": 1.586059309079792, "grad_norm": 0.854086697101593, "learning_rate": 1.980894153611005e-05, "loss": 0.1978, "step": 5188 }, { "epoch": 1.586365025985937, "grad_norm": 0.8878032565116882, "learning_rate": 1.981276270538785e-05, "loss": 0.2073, "step": 5189 }, { "epoch": 1.5866707428920819, "grad_norm": 1.125856637954712, "learning_rate": 1.981658387466565e-05, "loss": 0.2161, "step": 5190 }, { "epoch": 1.586976459798227, "grad_norm": 0.8809707760810852, "learning_rate": 1.9820405043943447e-05, "loss": 0.259, "step": 5191 }, { "epoch": 1.5872821767043717, "grad_norm": 0.9908668994903564, "learning_rate": 1.9824226213221246e-05, "loss": 0.2397, "step": 5192 }, { "epoch": 1.5875878936105168, "grad_norm": 1.1273525953292847, "learning_rate": 1.9828047382499045e-05, "loss": 0.2487, "step": 5193 }, { "epoch": 1.5878936105166614, "grad_norm": 0.8710575699806213, "learning_rate": 1.9831868551776843e-05, "loss": 0.22, "step": 5194 }, { "epoch": 1.5881993274228066, "grad_norm": 1.4557217359542847, "learning_rate": 1.9835689721054645e-05, "loss": 0.2807, "step": 5195 }, { "epoch": 1.5885050443289512, "grad_norm": 1.7816076278686523, "learning_rate": 1.9839510890332444e-05, "loss": 0.3127, "step": 5196 }, { "epoch": 1.5888107612350963, "grad_norm": 0.364400178194046, "learning_rate": 1.9843332059610243e-05, "loss": 0.1431, "step": 5197 }, { "epoch": 1.5891164781412412, "grad_norm": 0.35690945386886597, "learning_rate": 1.984715322888804e-05, "loss": 0.0951, "step": 5198 }, { "epoch": 1.5894221950473861, "grad_norm": 0.32802245020866394, "learning_rate": 1.985097439816584e-05, "loss": 0.1092, "step": 5199 }, { "epoch": 1.589727911953531, "grad_norm": 0.36352214217185974, "learning_rate": 1.985479556744364e-05, "loss": 0.0657, "step": 5200 }, { "epoch": 1.590033628859676, "grad_norm": 0.2642286717891693, "learning_rate": 1.9858616736721438e-05, "loss": 0.0685, "step": 5201 }, { "epoch": 1.5903393457658208, "grad_norm": 0.26784536242485046, "learning_rate": 1.9862437905999237e-05, "loss": 0.0755, "step": 5202 }, { "epoch": 1.5906450626719657, "grad_norm": 0.25656798481941223, "learning_rate": 1.9866259075277035e-05, "loss": 0.0635, "step": 5203 }, { "epoch": 1.5909507795781108, "grad_norm": 0.4321632385253906, "learning_rate": 1.9870080244554834e-05, "loss": 0.0943, "step": 5204 }, { "epoch": 1.5912564964842555, "grad_norm": 0.31822681427001953, "learning_rate": 1.9873901413832633e-05, "loss": 0.0866, "step": 5205 }, { "epoch": 1.5915622133904006, "grad_norm": 1.6250436305999756, "learning_rate": 1.987772258311043e-05, "loss": 0.0835, "step": 5206 }, { "epoch": 1.5918679302965453, "grad_norm": 0.619614839553833, "learning_rate": 1.988154375238823e-05, "loss": 0.1159, "step": 5207 }, { "epoch": 1.5921736472026904, "grad_norm": 0.31867238879203796, "learning_rate": 1.988536492166603e-05, "loss": 0.0892, "step": 5208 }, { "epoch": 1.592479364108835, "grad_norm": 0.43935397267341614, "learning_rate": 1.9889186090943828e-05, "loss": 0.1069, "step": 5209 }, { "epoch": 1.5927850810149802, "grad_norm": 0.4024848937988281, "learning_rate": 1.989300726022163e-05, "loss": 0.159, "step": 5210 }, { "epoch": 1.593090797921125, "grad_norm": 0.620707631111145, "learning_rate": 1.989682842949943e-05, "loss": 0.1892, "step": 5211 }, { "epoch": 1.59339651482727, "grad_norm": 0.5763331055641174, "learning_rate": 1.9900649598777227e-05, "loss": 0.1701, "step": 5212 }, { "epoch": 1.5937022317334149, "grad_norm": 0.548140287399292, "learning_rate": 1.9904470768055026e-05, "loss": 0.2053, "step": 5213 }, { "epoch": 1.5940079486395597, "grad_norm": 0.7360578179359436, "learning_rate": 1.9908291937332825e-05, "loss": 0.2011, "step": 5214 }, { "epoch": 1.5943136655457046, "grad_norm": 0.6497951745986938, "learning_rate": 1.9912113106610624e-05, "loss": 0.2097, "step": 5215 }, { "epoch": 1.5946193824518495, "grad_norm": 1.0631632804870605, "learning_rate": 1.9915934275888422e-05, "loss": 0.2735, "step": 5216 }, { "epoch": 1.5949250993579946, "grad_norm": 0.7492690682411194, "learning_rate": 1.991975544516622e-05, "loss": 0.252, "step": 5217 }, { "epoch": 1.5952308162641393, "grad_norm": 0.8418283462524414, "learning_rate": 1.992357661444402e-05, "loss": 0.2011, "step": 5218 }, { "epoch": 1.5955365331702844, "grad_norm": 1.3622785806655884, "learning_rate": 1.992739778372182e-05, "loss": 0.2792, "step": 5219 }, { "epoch": 1.595842250076429, "grad_norm": 1.240796685218811, "learning_rate": 1.9931218952999617e-05, "loss": 0.2552, "step": 5220 }, { "epoch": 1.5961479669825742, "grad_norm": 1.7607814073562622, "learning_rate": 1.9935040122277416e-05, "loss": 0.3475, "step": 5221 }, { "epoch": 1.5964536838887189, "grad_norm": 0.3743898272514343, "learning_rate": 1.9938861291555215e-05, "loss": 0.1619, "step": 5222 }, { "epoch": 1.596759400794864, "grad_norm": 0.6053632497787476, "learning_rate": 1.9942682460833014e-05, "loss": 0.0893, "step": 5223 }, { "epoch": 1.597065117701009, "grad_norm": 0.43107256293296814, "learning_rate": 1.9946503630110812e-05, "loss": 0.0888, "step": 5224 }, { "epoch": 1.5973708346071538, "grad_norm": 0.45823168754577637, "learning_rate": 1.995032479938861e-05, "loss": 0.1168, "step": 5225 }, { "epoch": 1.5976765515132987, "grad_norm": 0.3061533570289612, "learning_rate": 1.9954145968666413e-05, "loss": 0.0908, "step": 5226 }, { "epoch": 1.5979822684194436, "grad_norm": 0.23541688919067383, "learning_rate": 1.9957967137944212e-05, "loss": 0.0449, "step": 5227 }, { "epoch": 1.5982879853255885, "grad_norm": 0.4210800230503082, "learning_rate": 1.996178830722201e-05, "loss": 0.0675, "step": 5228 }, { "epoch": 1.5985937022317334, "grad_norm": 0.29302600026130676, "learning_rate": 1.996560947649981e-05, "loss": 0.0786, "step": 5229 }, { "epoch": 1.5988994191378785, "grad_norm": 0.31581175327301025, "learning_rate": 1.9969430645777608e-05, "loss": 0.0914, "step": 5230 }, { "epoch": 1.5992051360440231, "grad_norm": 0.31966260075569153, "learning_rate": 1.9973251815055407e-05, "loss": 0.0982, "step": 5231 }, { "epoch": 1.5995108529501683, "grad_norm": 0.34516486525535583, "learning_rate": 1.9977072984333206e-05, "loss": 0.1065, "step": 5232 }, { "epoch": 1.599816569856313, "grad_norm": 0.35250529646873474, "learning_rate": 1.9980894153611005e-05, "loss": 0.1096, "step": 5233 }, { "epoch": 1.600122286762458, "grad_norm": 0.41582608222961426, "learning_rate": 1.9984715322888803e-05, "loss": 0.1201, "step": 5234 }, { "epoch": 1.6004280036686027, "grad_norm": 0.6054863333702087, "learning_rate": 1.9988536492166602e-05, "loss": 0.1217, "step": 5235 }, { "epoch": 1.6007337205747478, "grad_norm": 0.4615755081176758, "learning_rate": 1.99923576614444e-05, "loss": 0.149, "step": 5236 }, { "epoch": 1.6010394374808927, "grad_norm": 0.4937666058540344, "learning_rate": 1.99961788307222e-05, "loss": 0.1419, "step": 5237 }, { "epoch": 1.6013451543870376, "grad_norm": 0.7876324653625488, "learning_rate": 1.9999999999999998e-05, "loss": 0.1719, "step": 5238 }, { "epoch": 1.6016508712931825, "grad_norm": 0.8609725832939148, "learning_rate": 2.0003821169277797e-05, "loss": 0.1918, "step": 5239 }, { "epoch": 1.6019565881993274, "grad_norm": 0.9229646921157837, "learning_rate": 2.0007642338555596e-05, "loss": 0.2502, "step": 5240 }, { "epoch": 1.6022623051054723, "grad_norm": 1.6613225936889648, "learning_rate": 2.0011463507833398e-05, "loss": 0.2143, "step": 5241 }, { "epoch": 1.6025680220116172, "grad_norm": 0.6843658089637756, "learning_rate": 2.0015284677111197e-05, "loss": 0.2083, "step": 5242 }, { "epoch": 1.6028737389177623, "grad_norm": 0.7946720719337463, "learning_rate": 2.0019105846388995e-05, "loss": 0.2314, "step": 5243 }, { "epoch": 1.603179455823907, "grad_norm": 1.1567304134368896, "learning_rate": 2.0022927015666794e-05, "loss": 0.3187, "step": 5244 }, { "epoch": 1.603485172730052, "grad_norm": 0.9488333463668823, "learning_rate": 2.0026748184944593e-05, "loss": 0.2895, "step": 5245 }, { "epoch": 1.6037908896361968, "grad_norm": 1.824425458908081, "learning_rate": 2.003056935422239e-05, "loss": 0.3405, "step": 5246 }, { "epoch": 1.6040966065423419, "grad_norm": 0.4891558289527893, "learning_rate": 2.003439052350019e-05, "loss": 0.1579, "step": 5247 }, { "epoch": 1.6044023234484865, "grad_norm": 0.3794075846672058, "learning_rate": 2.003821169277799e-05, "loss": 0.1052, "step": 5248 }, { "epoch": 1.6047080403546317, "grad_norm": 0.28803837299346924, "learning_rate": 2.0042032862055788e-05, "loss": 0.0849, "step": 5249 }, { "epoch": 1.6050137572607766, "grad_norm": 0.37349218130111694, "learning_rate": 2.0045854031333587e-05, "loss": 0.1039, "step": 5250 }, { "epoch": 1.6053194741669214, "grad_norm": 0.2517586648464203, "learning_rate": 2.004967520061139e-05, "loss": 0.0539, "step": 5251 }, { "epoch": 1.6056251910730663, "grad_norm": 0.33480945229530334, "learning_rate": 2.0053496369889187e-05, "loss": 0.0708, "step": 5252 }, { "epoch": 1.6059309079792112, "grad_norm": 0.34712570905685425, "learning_rate": 2.0057317539166986e-05, "loss": 0.0848, "step": 5253 }, { "epoch": 1.6062366248853561, "grad_norm": 0.6146636605262756, "learning_rate": 2.0061138708444785e-05, "loss": 0.0887, "step": 5254 }, { "epoch": 1.606542341791501, "grad_norm": 0.426160603761673, "learning_rate": 2.0064959877722584e-05, "loss": 0.1173, "step": 5255 }, { "epoch": 1.6068480586976461, "grad_norm": 0.4997640550136566, "learning_rate": 2.0068781047000382e-05, "loss": 0.1011, "step": 5256 }, { "epoch": 1.6071537756037908, "grad_norm": 0.3988439738750458, "learning_rate": 2.0072602216278185e-05, "loss": 0.1205, "step": 5257 }, { "epoch": 1.607459492509936, "grad_norm": 0.43986204266548157, "learning_rate": 2.0076423385555983e-05, "loss": 0.109, "step": 5258 }, { "epoch": 1.6077652094160806, "grad_norm": 0.481719046831131, "learning_rate": 2.0080244554833782e-05, "loss": 0.1286, "step": 5259 }, { "epoch": 1.6080709263222257, "grad_norm": 0.6460108160972595, "learning_rate": 2.008406572411158e-05, "loss": 0.1337, "step": 5260 }, { "epoch": 1.6083766432283704, "grad_norm": 0.7487947940826416, "learning_rate": 2.008788689338938e-05, "loss": 0.1684, "step": 5261 }, { "epoch": 1.6086823601345155, "grad_norm": 0.5517814755439758, "learning_rate": 2.0091708062667178e-05, "loss": 0.1658, "step": 5262 }, { "epoch": 1.6089880770406604, "grad_norm": 0.5011700987815857, "learning_rate": 2.0095529231944977e-05, "loss": 0.1912, "step": 5263 }, { "epoch": 1.6092937939468053, "grad_norm": 1.1970725059509277, "learning_rate": 2.0099350401222776e-05, "loss": 0.2058, "step": 5264 }, { "epoch": 1.6095995108529502, "grad_norm": 0.752602219581604, "learning_rate": 2.0103171570500575e-05, "loss": 0.2344, "step": 5265 }, { "epoch": 1.609905227759095, "grad_norm": 1.7644014358520508, "learning_rate": 2.0106992739778373e-05, "loss": 0.2364, "step": 5266 }, { "epoch": 1.61021094466524, "grad_norm": 1.065018653869629, "learning_rate": 2.0110813909056172e-05, "loss": 0.2104, "step": 5267 }, { "epoch": 1.6105166615713848, "grad_norm": 2.0660953521728516, "learning_rate": 2.011463507833397e-05, "loss": 0.2642, "step": 5268 }, { "epoch": 1.61082237847753, "grad_norm": 0.8333101868629456, "learning_rate": 2.011845624761177e-05, "loss": 0.2059, "step": 5269 }, { "epoch": 1.6111280953836746, "grad_norm": 1.8377692699432373, "learning_rate": 2.0122277416889568e-05, "loss": 0.2608, "step": 5270 }, { "epoch": 1.6114338122898197, "grad_norm": 1.609156608581543, "learning_rate": 2.0126098586167367e-05, "loss": 0.354, "step": 5271 }, { "epoch": 1.6117395291959644, "grad_norm": 0.45084241032600403, "learning_rate": 2.012991975544517e-05, "loss": 0.2275, "step": 5272 }, { "epoch": 1.6120452461021095, "grad_norm": 0.3653039336204529, "learning_rate": 2.0133740924722968e-05, "loss": 0.0922, "step": 5273 }, { "epoch": 1.6123509630082542, "grad_norm": 0.4420746862888336, "learning_rate": 2.0137562094000767e-05, "loss": 0.1037, "step": 5274 }, { "epoch": 1.6126566799143993, "grad_norm": 0.42616599798202515, "learning_rate": 2.0141383263278565e-05, "loss": 0.0782, "step": 5275 }, { "epoch": 1.6129623968205442, "grad_norm": 0.28830140829086304, "learning_rate": 2.0145204432556364e-05, "loss": 0.0758, "step": 5276 }, { "epoch": 1.613268113726689, "grad_norm": 0.2918660640716553, "learning_rate": 2.0149025601834163e-05, "loss": 0.0497, "step": 5277 }, { "epoch": 1.613573830632834, "grad_norm": 0.3946269452571869, "learning_rate": 2.015284677111196e-05, "loss": 0.0797, "step": 5278 }, { "epoch": 1.6138795475389789, "grad_norm": 0.45897507667541504, "learning_rate": 2.015666794038976e-05, "loss": 0.0979, "step": 5279 }, { "epoch": 1.6141852644451238, "grad_norm": 0.4528775215148926, "learning_rate": 2.016048910966756e-05, "loss": 0.1081, "step": 5280 }, { "epoch": 1.6144909813512687, "grad_norm": 0.6288054585456848, "learning_rate": 2.0164310278945358e-05, "loss": 0.1344, "step": 5281 }, { "epoch": 1.6147966982574138, "grad_norm": 0.7428374290466309, "learning_rate": 2.0168131448223157e-05, "loss": 0.135, "step": 5282 }, { "epoch": 1.6151024151635585, "grad_norm": 0.38430655002593994, "learning_rate": 2.0171952617500955e-05, "loss": 0.0795, "step": 5283 }, { "epoch": 1.6154081320697036, "grad_norm": 0.5850895047187805, "learning_rate": 2.0175773786778754e-05, "loss": 0.1394, "step": 5284 }, { "epoch": 1.6157138489758482, "grad_norm": 1.0047225952148438, "learning_rate": 2.0179594956056553e-05, "loss": 0.1231, "step": 5285 }, { "epoch": 1.6160195658819934, "grad_norm": 0.6850611567497253, "learning_rate": 2.018341612533435e-05, "loss": 0.1654, "step": 5286 }, { "epoch": 1.616325282788138, "grad_norm": 0.6260771751403809, "learning_rate": 2.018723729461215e-05, "loss": 0.1479, "step": 5287 }, { "epoch": 1.6166309996942831, "grad_norm": 0.6523432731628418, "learning_rate": 2.0191058463889952e-05, "loss": 0.1628, "step": 5288 }, { "epoch": 1.616936716600428, "grad_norm": 0.8058981895446777, "learning_rate": 2.019487963316775e-05, "loss": 0.2137, "step": 5289 }, { "epoch": 1.617242433506573, "grad_norm": 0.8539188504219055, "learning_rate": 2.019870080244555e-05, "loss": 0.2255, "step": 5290 }, { "epoch": 1.6175481504127178, "grad_norm": 0.9974493384361267, "learning_rate": 2.020252197172335e-05, "loss": 0.208, "step": 5291 }, { "epoch": 1.6178538673188627, "grad_norm": 0.9780563712120056, "learning_rate": 2.0206343141001147e-05, "loss": 0.1987, "step": 5292 }, { "epoch": 1.6181595842250076, "grad_norm": 0.8859644532203674, "learning_rate": 2.0210164310278946e-05, "loss": 0.2296, "step": 5293 }, { "epoch": 1.6184653011311525, "grad_norm": 1.2698109149932861, "learning_rate": 2.0213985479556745e-05, "loss": 0.2726, "step": 5294 }, { "epoch": 1.6187710180372976, "grad_norm": 1.4103728532791138, "learning_rate": 2.0217806648834544e-05, "loss": 0.2841, "step": 5295 }, { "epoch": 1.6190767349434423, "grad_norm": 6.3820953369140625, "learning_rate": 2.0221627818112342e-05, "loss": 0.2942, "step": 5296 }, { "epoch": 1.6193824518495874, "grad_norm": 0.4586271345615387, "learning_rate": 2.022544898739014e-05, "loss": 0.1701, "step": 5297 }, { "epoch": 1.619688168755732, "grad_norm": 0.3701017498970032, "learning_rate": 2.022927015666794e-05, "loss": 0.1082, "step": 5298 }, { "epoch": 1.6199938856618772, "grad_norm": 0.5781363844871521, "learning_rate": 2.023309132594574e-05, "loss": 0.076, "step": 5299 }, { "epoch": 1.6202996025680219, "grad_norm": 0.7937321662902832, "learning_rate": 2.0236912495223537e-05, "loss": 0.088, "step": 5300 }, { "epoch": 1.620605319474167, "grad_norm": 0.31188735365867615, "learning_rate": 2.0240733664501336e-05, "loss": 0.091, "step": 5301 }, { "epoch": 1.6209110363803119, "grad_norm": 0.34442949295043945, "learning_rate": 2.0244554833779135e-05, "loss": 0.0665, "step": 5302 }, { "epoch": 1.6212167532864568, "grad_norm": 0.37690988183021545, "learning_rate": 2.0248376003056937e-05, "loss": 0.0794, "step": 5303 }, { "epoch": 1.6215224701926017, "grad_norm": 0.4447815418243408, "learning_rate": 2.0252197172334736e-05, "loss": 0.0961, "step": 5304 }, { "epoch": 1.6218281870987465, "grad_norm": 0.36255747079849243, "learning_rate": 2.0256018341612534e-05, "loss": 0.0708, "step": 5305 }, { "epoch": 1.6221339040048914, "grad_norm": 0.3918481767177582, "learning_rate": 2.0259839510890333e-05, "loss": 0.092, "step": 5306 }, { "epoch": 1.6224396209110363, "grad_norm": 0.6528555750846863, "learning_rate": 2.0263660680168132e-05, "loss": 0.0771, "step": 5307 }, { "epoch": 1.6227453378171814, "grad_norm": 0.6857051253318787, "learning_rate": 2.026748184944593e-05, "loss": 0.0901, "step": 5308 }, { "epoch": 1.6230510547233261, "grad_norm": 0.39859020709991455, "learning_rate": 2.027130301872373e-05, "loss": 0.1183, "step": 5309 }, { "epoch": 1.6233567716294712, "grad_norm": 0.5416128039360046, "learning_rate": 2.0275124188001528e-05, "loss": 0.1177, "step": 5310 }, { "epoch": 1.623662488535616, "grad_norm": 0.5974514484405518, "learning_rate": 2.0278945357279327e-05, "loss": 0.1295, "step": 5311 }, { "epoch": 1.623968205441761, "grad_norm": 0.6418554782867432, "learning_rate": 2.0282766526557126e-05, "loss": 0.1711, "step": 5312 }, { "epoch": 1.6242739223479057, "grad_norm": 0.6404048204421997, "learning_rate": 2.0286587695834924e-05, "loss": 0.1952, "step": 5313 }, { "epoch": 1.6245796392540508, "grad_norm": 1.212524175643921, "learning_rate": 2.0290408865112723e-05, "loss": 0.2102, "step": 5314 }, { "epoch": 1.6248853561601957, "grad_norm": 0.8442663550376892, "learning_rate": 2.0294230034390522e-05, "loss": 0.1793, "step": 5315 }, { "epoch": 1.6251910730663406, "grad_norm": 0.820777177810669, "learning_rate": 2.029805120366832e-05, "loss": 0.2263, "step": 5316 }, { "epoch": 1.6254967899724855, "grad_norm": 1.1918904781341553, "learning_rate": 2.030187237294612e-05, "loss": 0.2041, "step": 5317 }, { "epoch": 1.6258025068786304, "grad_norm": 3.9953386783599854, "learning_rate": 2.0305693542223918e-05, "loss": 0.3095, "step": 5318 }, { "epoch": 1.6261082237847753, "grad_norm": 0.9583189487457275, "learning_rate": 2.030951471150172e-05, "loss": 0.2502, "step": 5319 }, { "epoch": 1.6264139406909202, "grad_norm": 1.4874742031097412, "learning_rate": 2.031333588077952e-05, "loss": 0.2619, "step": 5320 }, { "epoch": 1.6267196575970653, "grad_norm": 2.098086357116699, "learning_rate": 2.0317157050057318e-05, "loss": 0.3819, "step": 5321 }, { "epoch": 1.62702537450321, "grad_norm": 0.39551234245300293, "learning_rate": 2.0320978219335117e-05, "loss": 0.1523, "step": 5322 }, { "epoch": 1.627331091409355, "grad_norm": 0.4185939431190491, "learning_rate": 2.0324799388612915e-05, "loss": 0.1118, "step": 5323 }, { "epoch": 1.6276368083154997, "grad_norm": 0.3405463397502899, "learning_rate": 2.0328620557890714e-05, "loss": 0.0904, "step": 5324 }, { "epoch": 1.6279425252216448, "grad_norm": 0.2569710612297058, "learning_rate": 2.0332441727168513e-05, "loss": 0.068, "step": 5325 }, { "epoch": 1.6282482421277895, "grad_norm": 0.419137567281723, "learning_rate": 2.033626289644631e-05, "loss": 0.1053, "step": 5326 }, { "epoch": 1.6285539590339346, "grad_norm": 0.394603431224823, "learning_rate": 2.034008406572411e-05, "loss": 0.0803, "step": 5327 }, { "epoch": 1.6288596759400795, "grad_norm": 0.38011008501052856, "learning_rate": 2.034390523500191e-05, "loss": 0.0873, "step": 5328 }, { "epoch": 1.6291653928462244, "grad_norm": 0.6186875104904175, "learning_rate": 2.034772640427971e-05, "loss": 0.0945, "step": 5329 }, { "epoch": 1.6294711097523693, "grad_norm": 0.3915429711341858, "learning_rate": 2.035154757355751e-05, "loss": 0.0904, "step": 5330 }, { "epoch": 1.6297768266585142, "grad_norm": 0.38885176181793213, "learning_rate": 2.035536874283531e-05, "loss": 0.0984, "step": 5331 }, { "epoch": 1.630082543564659, "grad_norm": 0.3859536051750183, "learning_rate": 2.0359189912113107e-05, "loss": 0.1106, "step": 5332 }, { "epoch": 1.630388260470804, "grad_norm": 0.343504399061203, "learning_rate": 2.0363011081390906e-05, "loss": 0.0891, "step": 5333 }, { "epoch": 1.630693977376949, "grad_norm": 0.42285847663879395, "learning_rate": 2.0366832250668708e-05, "loss": 0.1013, "step": 5334 }, { "epoch": 1.6309996942830938, "grad_norm": 0.9083321690559387, "learning_rate": 2.0370653419946507e-05, "loss": 0.1901, "step": 5335 }, { "epoch": 1.6313054111892389, "grad_norm": 0.5310477018356323, "learning_rate": 2.0374474589224306e-05, "loss": 0.1148, "step": 5336 }, { "epoch": 1.6316111280953836, "grad_norm": 0.5695691704750061, "learning_rate": 2.0378295758502104e-05, "loss": 0.1736, "step": 5337 }, { "epoch": 1.6319168450015287, "grad_norm": 1.0928691625595093, "learning_rate": 2.0382116927779903e-05, "loss": 0.2478, "step": 5338 }, { "epoch": 1.6322225619076733, "grad_norm": 0.8885790109634399, "learning_rate": 2.0385938097057702e-05, "loss": 0.2354, "step": 5339 }, { "epoch": 1.6325282788138185, "grad_norm": 0.8072815537452698, "learning_rate": 2.03897592663355e-05, "loss": 0.1918, "step": 5340 }, { "epoch": 1.6328339957199633, "grad_norm": 0.7128298878669739, "learning_rate": 2.03935804356133e-05, "loss": 0.2147, "step": 5341 }, { "epoch": 1.6331397126261082, "grad_norm": 1.444611668586731, "learning_rate": 2.0397401604891098e-05, "loss": 0.209, "step": 5342 }, { "epoch": 1.6334454295322531, "grad_norm": 1.2936323881149292, "learning_rate": 2.0401222774168897e-05, "loss": 0.2456, "step": 5343 }, { "epoch": 1.633751146438398, "grad_norm": 1.4360908269882202, "learning_rate": 2.0405043943446696e-05, "loss": 0.2726, "step": 5344 }, { "epoch": 1.634056863344543, "grad_norm": 1.1086095571517944, "learning_rate": 2.0408865112724494e-05, "loss": 0.2407, "step": 5345 }, { "epoch": 1.6343625802506878, "grad_norm": 2.0891036987304688, "learning_rate": 2.0412686282002293e-05, "loss": 0.2774, "step": 5346 }, { "epoch": 1.634668297156833, "grad_norm": 0.5246413946151733, "learning_rate": 2.0416507451280092e-05, "loss": 0.1869, "step": 5347 }, { "epoch": 1.6349740140629776, "grad_norm": 0.2873307764530182, "learning_rate": 2.042032862055789e-05, "loss": 0.0959, "step": 5348 }, { "epoch": 1.6352797309691227, "grad_norm": 0.27856960892677307, "learning_rate": 2.042414978983569e-05, "loss": 0.0875, "step": 5349 }, { "epoch": 1.6355854478752674, "grad_norm": 0.5445376634597778, "learning_rate": 2.042797095911349e-05, "loss": 0.1097, "step": 5350 }, { "epoch": 1.6358911647814125, "grad_norm": 0.3045123517513275, "learning_rate": 2.043179212839129e-05, "loss": 0.0542, "step": 5351 }, { "epoch": 1.6361968816875572, "grad_norm": 0.3198130130767822, "learning_rate": 2.043561329766909e-05, "loss": 0.085, "step": 5352 }, { "epoch": 1.6365025985937023, "grad_norm": 0.3053734004497528, "learning_rate": 2.0439434466946888e-05, "loss": 0.0533, "step": 5353 }, { "epoch": 1.6368083154998472, "grad_norm": 0.2832387685775757, "learning_rate": 2.0443255636224687e-05, "loss": 0.0751, "step": 5354 }, { "epoch": 1.637114032405992, "grad_norm": 0.5886026620864868, "learning_rate": 2.0447076805502485e-05, "loss": 0.1098, "step": 5355 }, { "epoch": 1.637419749312137, "grad_norm": 0.30623868107795715, "learning_rate": 2.0450897974780284e-05, "loss": 0.0864, "step": 5356 }, { "epoch": 1.6377254662182819, "grad_norm": 0.40032801032066345, "learning_rate": 2.0454719144058083e-05, "loss": 0.1114, "step": 5357 }, { "epoch": 1.6380311831244267, "grad_norm": 0.3147734999656677, "learning_rate": 2.045854031333588e-05, "loss": 0.0989, "step": 5358 }, { "epoch": 1.6383369000305716, "grad_norm": 0.5662768483161926, "learning_rate": 2.046236148261368e-05, "loss": 0.1349, "step": 5359 }, { "epoch": 1.6386426169367168, "grad_norm": 0.41652557253837585, "learning_rate": 2.046618265189148e-05, "loss": 0.1387, "step": 5360 }, { "epoch": 1.6389483338428614, "grad_norm": 0.6944168210029602, "learning_rate": 2.0470003821169278e-05, "loss": 0.156, "step": 5361 }, { "epoch": 1.6392540507490065, "grad_norm": 0.6260215640068054, "learning_rate": 2.0473824990447076e-05, "loss": 0.1375, "step": 5362 }, { "epoch": 1.6395597676551512, "grad_norm": 0.7955819368362427, "learning_rate": 2.0477646159724875e-05, "loss": 0.1754, "step": 5363 }, { "epoch": 1.6398654845612963, "grad_norm": 0.6979178786277771, "learning_rate": 2.0481467329002674e-05, "loss": 0.1888, "step": 5364 }, { "epoch": 1.640171201467441, "grad_norm": 0.9750168323516846, "learning_rate": 2.0485288498280476e-05, "loss": 0.2133, "step": 5365 }, { "epoch": 1.6404769183735861, "grad_norm": 0.8336147665977478, "learning_rate": 2.0489109667558275e-05, "loss": 0.192, "step": 5366 }, { "epoch": 1.640782635279731, "grad_norm": 0.9059227705001831, "learning_rate": 2.0492930836836074e-05, "loss": 0.2166, "step": 5367 }, { "epoch": 1.641088352185876, "grad_norm": 1.005125880241394, "learning_rate": 2.0496752006113872e-05, "loss": 0.2143, "step": 5368 }, { "epoch": 1.6413940690920208, "grad_norm": 1.4279593229293823, "learning_rate": 2.050057317539167e-05, "loss": 0.2826, "step": 5369 }, { "epoch": 1.6416997859981657, "grad_norm": 1.2369734048843384, "learning_rate": 2.050439434466947e-05, "loss": 0.2389, "step": 5370 }, { "epoch": 1.6420055029043106, "grad_norm": 1.9718960523605347, "learning_rate": 2.050821551394727e-05, "loss": 0.2762, "step": 5371 }, { "epoch": 1.6423112198104555, "grad_norm": 0.5783370733261108, "learning_rate": 2.0512036683225067e-05, "loss": 0.1745, "step": 5372 }, { "epoch": 1.6426169367166006, "grad_norm": 0.24725298583507538, "learning_rate": 2.0515857852502866e-05, "loss": 0.0778, "step": 5373 }, { "epoch": 1.6429226536227453, "grad_norm": 0.44749024510383606, "learning_rate": 2.0519679021780665e-05, "loss": 0.1124, "step": 5374 }, { "epoch": 1.6432283705288904, "grad_norm": 0.2913813292980194, "learning_rate": 2.0523500191058464e-05, "loss": 0.0751, "step": 5375 }, { "epoch": 1.643534087435035, "grad_norm": 0.3672531843185425, "learning_rate": 2.0527321360336262e-05, "loss": 0.0734, "step": 5376 }, { "epoch": 1.6438398043411802, "grad_norm": 0.3594619929790497, "learning_rate": 2.053114252961406e-05, "loss": 0.0559, "step": 5377 }, { "epoch": 1.6441455212473248, "grad_norm": 0.33547985553741455, "learning_rate": 2.053496369889186e-05, "loss": 0.0884, "step": 5378 }, { "epoch": 1.64445123815347, "grad_norm": 0.3057751953601837, "learning_rate": 2.053878486816966e-05, "loss": 0.0715, "step": 5379 }, { "epoch": 1.6447569550596148, "grad_norm": 0.4512922465801239, "learning_rate": 2.0542606037447457e-05, "loss": 0.0919, "step": 5380 }, { "epoch": 1.6450626719657597, "grad_norm": 0.4260003864765167, "learning_rate": 2.054642720672526e-05, "loss": 0.0895, "step": 5381 }, { "epoch": 1.6453683888719046, "grad_norm": 0.5418351888656616, "learning_rate": 2.0550248376003058e-05, "loss": 0.1044, "step": 5382 }, { "epoch": 1.6456741057780495, "grad_norm": 0.525894820690155, "learning_rate": 2.0554069545280857e-05, "loss": 0.092, "step": 5383 }, { "epoch": 1.6459798226841944, "grad_norm": 0.40811723470687866, "learning_rate": 2.0557890714558656e-05, "loss": 0.1085, "step": 5384 }, { "epoch": 1.6462855395903393, "grad_norm": 0.5967738628387451, "learning_rate": 2.0561711883836454e-05, "loss": 0.1428, "step": 5385 }, { "epoch": 1.6465912564964844, "grad_norm": 0.4653361737728119, "learning_rate": 2.0565533053114253e-05, "loss": 0.1301, "step": 5386 }, { "epoch": 1.646896973402629, "grad_norm": 0.6400185227394104, "learning_rate": 2.0569354222392052e-05, "loss": 0.1809, "step": 5387 }, { "epoch": 1.6472026903087742, "grad_norm": 0.865928053855896, "learning_rate": 2.057317539166985e-05, "loss": 0.1631, "step": 5388 }, { "epoch": 1.6475084072149189, "grad_norm": 0.7920725345611572, "learning_rate": 2.057699656094765e-05, "loss": 0.1839, "step": 5389 }, { "epoch": 1.647814124121064, "grad_norm": 0.8040024638175964, "learning_rate": 2.0580817730225448e-05, "loss": 0.2057, "step": 5390 }, { "epoch": 1.6481198410272087, "grad_norm": 0.8519954681396484, "learning_rate": 2.0584638899503247e-05, "loss": 0.2409, "step": 5391 }, { "epoch": 1.6484255579333538, "grad_norm": 1.2151710987091064, "learning_rate": 2.0588460068781046e-05, "loss": 0.2836, "step": 5392 }, { "epoch": 1.6487312748394987, "grad_norm": 2.7472524642944336, "learning_rate": 2.0592281238058844e-05, "loss": 0.2209, "step": 5393 }, { "epoch": 1.6490369917456436, "grad_norm": 2.394906759262085, "learning_rate": 2.0596102407336643e-05, "loss": 0.2383, "step": 5394 }, { "epoch": 1.6493427086517884, "grad_norm": 2.673557758331299, "learning_rate": 2.0599923576614442e-05, "loss": 0.2331, "step": 5395 }, { "epoch": 1.6496484255579333, "grad_norm": 1.990767240524292, "learning_rate": 2.0603744745892244e-05, "loss": 0.3149, "step": 5396 }, { "epoch": 1.6499541424640782, "grad_norm": 0.5727019906044006, "learning_rate": 2.0607565915170043e-05, "loss": 0.1792, "step": 5397 }, { "epoch": 1.6502598593702231, "grad_norm": 0.4215615391731262, "learning_rate": 2.061138708444784e-05, "loss": 0.1383, "step": 5398 }, { "epoch": 1.6505655762763682, "grad_norm": 0.3177843689918518, "learning_rate": 2.061520825372564e-05, "loss": 0.0726, "step": 5399 }, { "epoch": 1.650871293182513, "grad_norm": 0.23223517835140228, "learning_rate": 2.061902942300344e-05, "loss": 0.0571, "step": 5400 }, { "epoch": 1.651177010088658, "grad_norm": 0.42246267199516296, "learning_rate": 2.0622850592281238e-05, "loss": 0.0733, "step": 5401 }, { "epoch": 1.6514827269948027, "grad_norm": 0.41453075408935547, "learning_rate": 2.0626671761559036e-05, "loss": 0.0642, "step": 5402 }, { "epoch": 1.6517884439009478, "grad_norm": 0.3711669147014618, "learning_rate": 2.0630492930836835e-05, "loss": 0.0758, "step": 5403 }, { "epoch": 1.6520941608070925, "grad_norm": 0.32094141840934753, "learning_rate": 2.0634314100114634e-05, "loss": 0.0576, "step": 5404 }, { "epoch": 1.6523998777132376, "grad_norm": 0.5986273288726807, "learning_rate": 2.0638135269392433e-05, "loss": 0.1076, "step": 5405 }, { "epoch": 1.6527055946193825, "grad_norm": 1.0251927375793457, "learning_rate": 2.064195643867023e-05, "loss": 0.0743, "step": 5406 }, { "epoch": 1.6530113115255274, "grad_norm": 0.44772300124168396, "learning_rate": 2.0645777607948034e-05, "loss": 0.119, "step": 5407 }, { "epoch": 1.6533170284316723, "grad_norm": 0.399240106344223, "learning_rate": 2.0649598777225832e-05, "loss": 0.1379, "step": 5408 }, { "epoch": 1.6536227453378172, "grad_norm": 0.408916175365448, "learning_rate": 2.065341994650363e-05, "loss": 0.0936, "step": 5409 }, { "epoch": 1.653928462243962, "grad_norm": 0.5156513452529907, "learning_rate": 2.065724111578143e-05, "loss": 0.1718, "step": 5410 }, { "epoch": 1.654234179150107, "grad_norm": 0.8524584770202637, "learning_rate": 2.066106228505923e-05, "loss": 0.1511, "step": 5411 }, { "epoch": 1.654539896056252, "grad_norm": 1.0220321416854858, "learning_rate": 2.066488345433703e-05, "loss": 0.1974, "step": 5412 }, { "epoch": 1.6548456129623967, "grad_norm": 1.0437049865722656, "learning_rate": 2.066870462361483e-05, "loss": 0.2174, "step": 5413 }, { "epoch": 1.6551513298685419, "grad_norm": 0.6667874455451965, "learning_rate": 2.0672525792892628e-05, "loss": 0.2247, "step": 5414 }, { "epoch": 1.6554570467746865, "grad_norm": 0.665537416934967, "learning_rate": 2.0676346962170427e-05, "loss": 0.1762, "step": 5415 }, { "epoch": 1.6557627636808316, "grad_norm": 0.9752854108810425, "learning_rate": 2.0680168131448226e-05, "loss": 0.2196, "step": 5416 }, { "epoch": 1.6560684805869763, "grad_norm": 1.0499892234802246, "learning_rate": 2.0683989300726024e-05, "loss": 0.2762, "step": 5417 }, { "epoch": 1.6563741974931214, "grad_norm": 0.8499460816383362, "learning_rate": 2.0687810470003823e-05, "loss": 0.2605, "step": 5418 }, { "epoch": 1.6566799143992663, "grad_norm": 0.7770063877105713, "learning_rate": 2.0691631639281622e-05, "loss": 0.2668, "step": 5419 }, { "epoch": 1.6569856313054112, "grad_norm": 1.0846312046051025, "learning_rate": 2.069545280855942e-05, "loss": 0.2702, "step": 5420 }, { "epoch": 1.657291348211556, "grad_norm": 1.7249999046325684, "learning_rate": 2.069927397783722e-05, "loss": 0.3677, "step": 5421 }, { "epoch": 1.657597065117701, "grad_norm": 0.6421895623207092, "learning_rate": 2.0703095147115018e-05, "loss": 0.1978, "step": 5422 }, { "epoch": 1.657902782023846, "grad_norm": 0.32055971026420593, "learning_rate": 2.0706916316392817e-05, "loss": 0.0904, "step": 5423 }, { "epoch": 1.6582084989299908, "grad_norm": 0.46219709515571594, "learning_rate": 2.0710737485670616e-05, "loss": 0.1063, "step": 5424 }, { "epoch": 1.658514215836136, "grad_norm": 0.39012303948402405, "learning_rate": 2.0714558654948414e-05, "loss": 0.0808, "step": 5425 }, { "epoch": 1.6588199327422806, "grad_norm": 0.33020925521850586, "learning_rate": 2.0718379824226213e-05, "loss": 0.0634, "step": 5426 }, { "epoch": 1.6591256496484257, "grad_norm": 0.2851020395755768, "learning_rate": 2.0722200993504015e-05, "loss": 0.0627, "step": 5427 }, { "epoch": 1.6594313665545704, "grad_norm": 0.2929774224758148, "learning_rate": 2.0726022162781814e-05, "loss": 0.0647, "step": 5428 }, { "epoch": 1.6597370834607155, "grad_norm": 0.5521993041038513, "learning_rate": 2.0729843332059613e-05, "loss": 0.1099, "step": 5429 }, { "epoch": 1.6600428003668601, "grad_norm": 0.5974448919296265, "learning_rate": 2.073366450133741e-05, "loss": 0.0888, "step": 5430 }, { "epoch": 1.6603485172730053, "grad_norm": 0.29131758213043213, "learning_rate": 2.073748567061521e-05, "loss": 0.0591, "step": 5431 }, { "epoch": 1.6606542341791501, "grad_norm": 0.4510475695133209, "learning_rate": 2.074130683989301e-05, "loss": 0.1123, "step": 5432 }, { "epoch": 1.660959951085295, "grad_norm": 0.5876105427742004, "learning_rate": 2.0745128009170808e-05, "loss": 0.1069, "step": 5433 }, { "epoch": 1.66126566799144, "grad_norm": 0.853573739528656, "learning_rate": 2.0748949178448606e-05, "loss": 0.1794, "step": 5434 }, { "epoch": 1.6615713848975848, "grad_norm": 0.5302228331565857, "learning_rate": 2.0752770347726405e-05, "loss": 0.159, "step": 5435 }, { "epoch": 1.6618771018037297, "grad_norm": 0.998449981212616, "learning_rate": 2.0756591517004204e-05, "loss": 0.139, "step": 5436 }, { "epoch": 1.6621828187098746, "grad_norm": 0.7153952121734619, "learning_rate": 2.0760412686282003e-05, "loss": 0.162, "step": 5437 }, { "epoch": 1.6624885356160197, "grad_norm": 0.5309622287750244, "learning_rate": 2.07642338555598e-05, "loss": 0.1999, "step": 5438 }, { "epoch": 1.6627942525221644, "grad_norm": 1.0210449695587158, "learning_rate": 2.07680550248376e-05, "loss": 0.2264, "step": 5439 }, { "epoch": 1.6630999694283095, "grad_norm": 0.7759552001953125, "learning_rate": 2.07718761941154e-05, "loss": 0.2123, "step": 5440 }, { "epoch": 1.6634056863344542, "grad_norm": 0.6714988350868225, "learning_rate": 2.0775697363393198e-05, "loss": 0.1948, "step": 5441 }, { "epoch": 1.6637114032405993, "grad_norm": 1.0095198154449463, "learning_rate": 2.0779518532670996e-05, "loss": 0.271, "step": 5442 }, { "epoch": 1.664017120146744, "grad_norm": 0.9101609587669373, "learning_rate": 2.07833397019488e-05, "loss": 0.1786, "step": 5443 }, { "epoch": 1.664322837052889, "grad_norm": 1.52555513381958, "learning_rate": 2.0787160871226597e-05, "loss": 0.2253, "step": 5444 }, { "epoch": 1.664628553959034, "grad_norm": 3.0977015495300293, "learning_rate": 2.0790982040504396e-05, "loss": 0.2228, "step": 5445 }, { "epoch": 1.6649342708651789, "grad_norm": 1.514047622680664, "learning_rate": 2.0794803209782195e-05, "loss": 0.3116, "step": 5446 }, { "epoch": 1.6652399877713238, "grad_norm": 0.5746108889579773, "learning_rate": 2.0798624379059993e-05, "loss": 0.1568, "step": 5447 }, { "epoch": 1.6655457046774687, "grad_norm": 0.35507890582084656, "learning_rate": 2.0802445548337792e-05, "loss": 0.0829, "step": 5448 }, { "epoch": 1.6658514215836135, "grad_norm": 0.26868048310279846, "learning_rate": 2.080626671761559e-05, "loss": 0.0766, "step": 5449 }, { "epoch": 1.6661571384897584, "grad_norm": 0.3212776482105255, "learning_rate": 2.081008788689339e-05, "loss": 0.0788, "step": 5450 }, { "epoch": 1.6664628553959036, "grad_norm": 0.3473459780216217, "learning_rate": 2.081390905617119e-05, "loss": 0.0842, "step": 5451 }, { "epoch": 1.6667685723020482, "grad_norm": 0.2726270854473114, "learning_rate": 2.0817730225448987e-05, "loss": 0.0535, "step": 5452 }, { "epoch": 1.6670742892081933, "grad_norm": 0.3071592450141907, "learning_rate": 2.0821551394726786e-05, "loss": 0.0802, "step": 5453 }, { "epoch": 1.667380006114338, "grad_norm": 0.5885387659072876, "learning_rate": 2.0825372564004585e-05, "loss": 0.076, "step": 5454 }, { "epoch": 1.6676857230204831, "grad_norm": 0.5084386467933655, "learning_rate": 2.0829193733282383e-05, "loss": 0.092, "step": 5455 }, { "epoch": 1.6679914399266278, "grad_norm": 0.2853013873100281, "learning_rate": 2.0833014902560182e-05, "loss": 0.072, "step": 5456 }, { "epoch": 1.668297156832773, "grad_norm": 0.7523764371871948, "learning_rate": 2.083683607183798e-05, "loss": 0.1561, "step": 5457 }, { "epoch": 1.6686028737389178, "grad_norm": 0.6618975400924683, "learning_rate": 2.0840657241115783e-05, "loss": 0.0927, "step": 5458 }, { "epoch": 1.6689085906450627, "grad_norm": 0.38830938935279846, "learning_rate": 2.0844478410393582e-05, "loss": 0.0973, "step": 5459 }, { "epoch": 1.6692143075512076, "grad_norm": 0.8017683029174805, "learning_rate": 2.084829957967138e-05, "loss": 0.1371, "step": 5460 }, { "epoch": 1.6695200244573525, "grad_norm": 0.43316248059272766, "learning_rate": 2.085212074894918e-05, "loss": 0.1766, "step": 5461 }, { "epoch": 1.6698257413634974, "grad_norm": 0.7467924356460571, "learning_rate": 2.0855941918226978e-05, "loss": 0.1772, "step": 5462 }, { "epoch": 1.6701314582696423, "grad_norm": 1.0791795253753662, "learning_rate": 2.0859763087504777e-05, "loss": 0.1608, "step": 5463 }, { "epoch": 1.6704371751757874, "grad_norm": 0.6235798597335815, "learning_rate": 2.0863584256782576e-05, "loss": 0.2139, "step": 5464 }, { "epoch": 1.670742892081932, "grad_norm": 0.9633647203445435, "learning_rate": 2.0867405426060374e-05, "loss": 0.188, "step": 5465 }, { "epoch": 1.6710486089880772, "grad_norm": 2.708256959915161, "learning_rate": 2.0871226595338173e-05, "loss": 0.2225, "step": 5466 }, { "epoch": 1.6713543258942218, "grad_norm": 1.0581326484680176, "learning_rate": 2.0875047764615972e-05, "loss": 0.2431, "step": 5467 }, { "epoch": 1.671660042800367, "grad_norm": 0.9014422297477722, "learning_rate": 2.087886893389377e-05, "loss": 0.2143, "step": 5468 }, { "epoch": 1.6719657597065116, "grad_norm": 2.645918369293213, "learning_rate": 2.088269010317157e-05, "loss": 0.2213, "step": 5469 }, { "epoch": 1.6722714766126567, "grad_norm": 1.5184452533721924, "learning_rate": 2.0886511272449368e-05, "loss": 0.2576, "step": 5470 }, { "epoch": 1.6725771935188016, "grad_norm": 1.8510912656784058, "learning_rate": 2.0890332441727167e-05, "loss": 0.3575, "step": 5471 }, { "epoch": 1.6728829104249465, "grad_norm": 0.5531582236289978, "learning_rate": 2.0894153611004965e-05, "loss": 0.1476, "step": 5472 }, { "epoch": 1.6731886273310914, "grad_norm": 0.3425605595111847, "learning_rate": 2.0897974780282764e-05, "loss": 0.0802, "step": 5473 }, { "epoch": 1.6734943442372363, "grad_norm": 0.3631783127784729, "learning_rate": 2.0901795949560566e-05, "loss": 0.0982, "step": 5474 }, { "epoch": 1.6738000611433812, "grad_norm": 0.31759434938430786, "learning_rate": 2.0905617118838365e-05, "loss": 0.0865, "step": 5475 }, { "epoch": 1.674105778049526, "grad_norm": 0.3954648971557617, "learning_rate": 2.0909438288116164e-05, "loss": 0.0975, "step": 5476 }, { "epoch": 1.6744114949556712, "grad_norm": 0.22085098922252655, "learning_rate": 2.0913259457393963e-05, "loss": 0.0582, "step": 5477 }, { "epoch": 1.6747172118618159, "grad_norm": 0.39137157797813416, "learning_rate": 2.091708062667176e-05, "loss": 0.0652, "step": 5478 }, { "epoch": 1.675022928767961, "grad_norm": 0.24013039469718933, "learning_rate": 2.092090179594956e-05, "loss": 0.0581, "step": 5479 }, { "epoch": 1.6753286456741057, "grad_norm": 0.5696359276771545, "learning_rate": 2.092472296522736e-05, "loss": 0.1682, "step": 5480 }, { "epoch": 1.6756343625802508, "grad_norm": 0.33158230781555176, "learning_rate": 2.0928544134505158e-05, "loss": 0.0736, "step": 5481 }, { "epoch": 1.6759400794863955, "grad_norm": 0.8852296471595764, "learning_rate": 2.0932365303782956e-05, "loss": 0.118, "step": 5482 }, { "epoch": 1.6762457963925406, "grad_norm": 0.36237555742263794, "learning_rate": 2.0936186473060755e-05, "loss": 0.0774, "step": 5483 }, { "epoch": 1.6765515132986855, "grad_norm": 0.3034752607345581, "learning_rate": 2.0940007642338554e-05, "loss": 0.1056, "step": 5484 }, { "epoch": 1.6768572302048304, "grad_norm": 0.4531189799308777, "learning_rate": 2.0943828811616356e-05, "loss": 0.1523, "step": 5485 }, { "epoch": 1.6771629471109752, "grad_norm": 0.4736090898513794, "learning_rate": 2.0947649980894155e-05, "loss": 0.1456, "step": 5486 }, { "epoch": 1.6774686640171201, "grad_norm": 0.8298685550689697, "learning_rate": 2.0951471150171953e-05, "loss": 0.1706, "step": 5487 }, { "epoch": 1.677774380923265, "grad_norm": 0.48166024684906006, "learning_rate": 2.0955292319449752e-05, "loss": 0.1733, "step": 5488 }, { "epoch": 1.67808009782941, "grad_norm": 0.8239328861236572, "learning_rate": 2.0959113488727554e-05, "loss": 0.2046, "step": 5489 }, { "epoch": 1.678385814735555, "grad_norm": 0.619482159614563, "learning_rate": 2.0962934658005353e-05, "loss": 0.1905, "step": 5490 }, { "epoch": 1.6786915316416997, "grad_norm": 0.9169145226478577, "learning_rate": 2.0966755827283152e-05, "loss": 0.2227, "step": 5491 }, { "epoch": 1.6789972485478448, "grad_norm": 0.6089756488800049, "learning_rate": 2.097057699656095e-05, "loss": 0.2128, "step": 5492 }, { "epoch": 1.6793029654539895, "grad_norm": 0.8691756129264832, "learning_rate": 2.097439816583875e-05, "loss": 0.252, "step": 5493 }, { "epoch": 1.6796086823601346, "grad_norm": 0.8952469825744629, "learning_rate": 2.0978219335116548e-05, "loss": 0.2684, "step": 5494 }, { "epoch": 1.6799143992662793, "grad_norm": 0.9534359574317932, "learning_rate": 2.0982040504394347e-05, "loss": 0.2409, "step": 5495 }, { "epoch": 1.6802201161724244, "grad_norm": 2.0033771991729736, "learning_rate": 2.0985861673672146e-05, "loss": 0.3426, "step": 5496 }, { "epoch": 1.6805258330785693, "grad_norm": 0.3552451729774475, "learning_rate": 2.0989682842949944e-05, "loss": 0.1683, "step": 5497 }, { "epoch": 1.6808315499847142, "grad_norm": 0.45043712854385376, "learning_rate": 2.0993504012227743e-05, "loss": 0.107, "step": 5498 }, { "epoch": 1.681137266890859, "grad_norm": 0.31540483236312866, "learning_rate": 2.0997325181505542e-05, "loss": 0.0974, "step": 5499 }, { "epoch": 1.681442983797004, "grad_norm": 0.5143747329711914, "learning_rate": 2.100114635078334e-05, "loss": 0.1127, "step": 5500 }, { "epoch": 1.6817487007031489, "grad_norm": 0.30648231506347656, "learning_rate": 2.100496752006114e-05, "loss": 0.0767, "step": 5501 }, { "epoch": 1.6820544176092938, "grad_norm": 0.2634066045284271, "learning_rate": 2.1008788689338938e-05, "loss": 0.0985, "step": 5502 }, { "epoch": 1.6823601345154389, "grad_norm": 0.3244017958641052, "learning_rate": 2.1012609858616737e-05, "loss": 0.0763, "step": 5503 }, { "epoch": 1.6826658514215835, "grad_norm": 0.4545248746871948, "learning_rate": 2.1016431027894535e-05, "loss": 0.0734, "step": 5504 }, { "epoch": 1.6829715683277287, "grad_norm": 0.8149879574775696, "learning_rate": 2.1020252197172338e-05, "loss": 0.1087, "step": 5505 }, { "epoch": 1.6832772852338733, "grad_norm": 0.5142156481742859, "learning_rate": 2.1024073366450136e-05, "loss": 0.0834, "step": 5506 }, { "epoch": 1.6835830021400184, "grad_norm": 0.41197118163108826, "learning_rate": 2.1027894535727935e-05, "loss": 0.1022, "step": 5507 }, { "epoch": 1.683888719046163, "grad_norm": 0.46664318442344666, "learning_rate": 2.1031715705005734e-05, "loss": 0.1223, "step": 5508 }, { "epoch": 1.6841944359523082, "grad_norm": 0.450512170791626, "learning_rate": 2.1035536874283533e-05, "loss": 0.1119, "step": 5509 }, { "epoch": 1.6845001528584531, "grad_norm": 0.6240183711051941, "learning_rate": 2.103935804356133e-05, "loss": 0.1463, "step": 5510 }, { "epoch": 1.684805869764598, "grad_norm": 0.8740485310554504, "learning_rate": 2.104317921283913e-05, "loss": 0.1771, "step": 5511 }, { "epoch": 1.685111586670743, "grad_norm": 0.5594334006309509, "learning_rate": 2.104700038211693e-05, "loss": 0.1733, "step": 5512 }, { "epoch": 1.6854173035768878, "grad_norm": 0.7083232998847961, "learning_rate": 2.1050821551394728e-05, "loss": 0.19, "step": 5513 }, { "epoch": 1.6857230204830327, "grad_norm": 0.96764075756073, "learning_rate": 2.1054642720672526e-05, "loss": 0.2088, "step": 5514 }, { "epoch": 1.6860287373891776, "grad_norm": 0.8555235266685486, "learning_rate": 2.1058463889950325e-05, "loss": 0.2284, "step": 5515 }, { "epoch": 1.6863344542953227, "grad_norm": 0.7857459783554077, "learning_rate": 2.1062285059228124e-05, "loss": 0.2063, "step": 5516 }, { "epoch": 1.6866401712014674, "grad_norm": 0.6772654056549072, "learning_rate": 2.1066106228505923e-05, "loss": 0.2185, "step": 5517 }, { "epoch": 1.6869458881076125, "grad_norm": 0.848495364189148, "learning_rate": 2.106992739778372e-05, "loss": 0.2053, "step": 5518 }, { "epoch": 1.6872516050137571, "grad_norm": 1.2560343742370605, "learning_rate": 2.107374856706152e-05, "loss": 0.2426, "step": 5519 }, { "epoch": 1.6875573219199023, "grad_norm": 1.8744415044784546, "learning_rate": 2.1077569736339322e-05, "loss": 0.255, "step": 5520 }, { "epoch": 1.687863038826047, "grad_norm": 1.7080767154693604, "learning_rate": 2.108139090561712e-05, "loss": 0.316, "step": 5521 }, { "epoch": 1.688168755732192, "grad_norm": 0.4344256520271301, "learning_rate": 2.108521207489492e-05, "loss": 0.1401, "step": 5522 }, { "epoch": 1.688474472638337, "grad_norm": 0.32798057794570923, "learning_rate": 2.108903324417272e-05, "loss": 0.0795, "step": 5523 }, { "epoch": 1.6887801895444818, "grad_norm": 0.4523467719554901, "learning_rate": 2.1092854413450517e-05, "loss": 0.1101, "step": 5524 }, { "epoch": 1.6890859064506267, "grad_norm": 0.49572813510894775, "learning_rate": 2.1096675582728316e-05, "loss": 0.0548, "step": 5525 }, { "epoch": 1.6893916233567716, "grad_norm": 0.36275923252105713, "learning_rate": 2.1100496752006115e-05, "loss": 0.0723, "step": 5526 }, { "epoch": 1.6896973402629165, "grad_norm": 0.7452584505081177, "learning_rate": 2.1104317921283913e-05, "loss": 0.0876, "step": 5527 }, { "epoch": 1.6900030571690614, "grad_norm": 0.37326139211654663, "learning_rate": 2.1108139090561712e-05, "loss": 0.0652, "step": 5528 }, { "epoch": 1.6903087740752065, "grad_norm": 0.46251073479652405, "learning_rate": 2.111196025983951e-05, "loss": 0.0853, "step": 5529 }, { "epoch": 1.6906144909813512, "grad_norm": 0.38829073309898376, "learning_rate": 2.111578142911731e-05, "loss": 0.1058, "step": 5530 }, { "epoch": 1.6909202078874963, "grad_norm": 0.35976317524909973, "learning_rate": 2.111960259839511e-05, "loss": 0.0735, "step": 5531 }, { "epoch": 1.691225924793641, "grad_norm": 0.5438142418861389, "learning_rate": 2.1123423767672907e-05, "loss": 0.1237, "step": 5532 }, { "epoch": 1.691531641699786, "grad_norm": 0.30121952295303345, "learning_rate": 2.1127244936950706e-05, "loss": 0.0933, "step": 5533 }, { "epoch": 1.6918373586059308, "grad_norm": 0.666731059551239, "learning_rate": 2.1131066106228505e-05, "loss": 0.137, "step": 5534 }, { "epoch": 1.6921430755120759, "grad_norm": 0.690792977809906, "learning_rate": 2.1134887275506303e-05, "loss": 0.1444, "step": 5535 }, { "epoch": 1.6924487924182208, "grad_norm": 0.5410357713699341, "learning_rate": 2.1138708444784105e-05, "loss": 0.1495, "step": 5536 }, { "epoch": 1.6927545093243657, "grad_norm": 0.7982252836227417, "learning_rate": 2.1142529614061904e-05, "loss": 0.1567, "step": 5537 }, { "epoch": 1.6930602262305106, "grad_norm": 0.5118891596794128, "learning_rate": 2.1146350783339703e-05, "loss": 0.1613, "step": 5538 }, { "epoch": 1.6933659431366554, "grad_norm": 0.8464818596839905, "learning_rate": 2.1150171952617502e-05, "loss": 0.1818, "step": 5539 }, { "epoch": 1.6936716600428003, "grad_norm": 0.898289144039154, "learning_rate": 2.11539931218953e-05, "loss": 0.1875, "step": 5540 }, { "epoch": 1.6939773769489452, "grad_norm": 1.0735888481140137, "learning_rate": 2.11578142911731e-05, "loss": 0.2452, "step": 5541 }, { "epoch": 1.6942830938550903, "grad_norm": 1.1051886081695557, "learning_rate": 2.1161635460450898e-05, "loss": 0.2168, "step": 5542 }, { "epoch": 1.694588810761235, "grad_norm": 2.4751670360565186, "learning_rate": 2.1165456629728697e-05, "loss": 0.2631, "step": 5543 }, { "epoch": 1.6948945276673801, "grad_norm": 1.0150935649871826, "learning_rate": 2.1169277799006495e-05, "loss": 0.2276, "step": 5544 }, { "epoch": 1.6952002445735248, "grad_norm": 1.3351691961288452, "learning_rate": 2.1173098968284294e-05, "loss": 0.2681, "step": 5545 }, { "epoch": 1.69550596147967, "grad_norm": 1.2458231449127197, "learning_rate": 2.1176920137562093e-05, "loss": 0.3093, "step": 5546 }, { "epoch": 1.6958116783858146, "grad_norm": 0.48589229583740234, "learning_rate": 2.118074130683989e-05, "loss": 0.1619, "step": 5547 }, { "epoch": 1.6961173952919597, "grad_norm": 0.3470019996166229, "learning_rate": 2.118456247611769e-05, "loss": 0.1011, "step": 5548 }, { "epoch": 1.6964231121981046, "grad_norm": 0.3930821716785431, "learning_rate": 2.118838364539549e-05, "loss": 0.0836, "step": 5549 }, { "epoch": 1.6967288291042495, "grad_norm": 0.23035676777362823, "learning_rate": 2.1192204814673288e-05, "loss": 0.0561, "step": 5550 }, { "epoch": 1.6970345460103944, "grad_norm": 0.2744598686695099, "learning_rate": 2.119602598395109e-05, "loss": 0.0679, "step": 5551 }, { "epoch": 1.6973402629165393, "grad_norm": 0.33996573090553284, "learning_rate": 2.119984715322889e-05, "loss": 0.1072, "step": 5552 }, { "epoch": 1.6976459798226842, "grad_norm": 0.4470658302307129, "learning_rate": 2.1203668322506688e-05, "loss": 0.055, "step": 5553 }, { "epoch": 1.697951696728829, "grad_norm": 0.3949526846408844, "learning_rate": 2.1207489491784486e-05, "loss": 0.0575, "step": 5554 }, { "epoch": 1.6982574136349742, "grad_norm": 0.4201253056526184, "learning_rate": 2.1211310661062285e-05, "loss": 0.1168, "step": 5555 }, { "epoch": 1.6985631305411188, "grad_norm": 0.4633316993713379, "learning_rate": 2.1215131830340084e-05, "loss": 0.0764, "step": 5556 }, { "epoch": 1.698868847447264, "grad_norm": 0.6479843854904175, "learning_rate": 2.1218952999617882e-05, "loss": 0.1253, "step": 5557 }, { "epoch": 1.6991745643534086, "grad_norm": 0.6085544228553772, "learning_rate": 2.122277416889568e-05, "loss": 0.082, "step": 5558 }, { "epoch": 1.6994802812595537, "grad_norm": 0.4898277819156647, "learning_rate": 2.122659533817348e-05, "loss": 0.12, "step": 5559 }, { "epoch": 1.6997859981656984, "grad_norm": 0.47607114911079407, "learning_rate": 2.123041650745128e-05, "loss": 0.1111, "step": 5560 }, { "epoch": 1.7000917150718435, "grad_norm": 0.6986544728279114, "learning_rate": 2.1234237676729077e-05, "loss": 0.1709, "step": 5561 }, { "epoch": 1.7003974319779884, "grad_norm": 0.591979444026947, "learning_rate": 2.123805884600688e-05, "loss": 0.1367, "step": 5562 }, { "epoch": 1.7007031488841333, "grad_norm": 0.8471728563308716, "learning_rate": 2.124188001528468e-05, "loss": 0.1831, "step": 5563 }, { "epoch": 1.7010088657902782, "grad_norm": 1.0042495727539062, "learning_rate": 2.1245701184562477e-05, "loss": 0.1972, "step": 5564 }, { "epoch": 1.701314582696423, "grad_norm": 0.9070826768875122, "learning_rate": 2.1249522353840276e-05, "loss": 0.2098, "step": 5565 }, { "epoch": 1.701620299602568, "grad_norm": 1.5602749586105347, "learning_rate": 2.1253343523118075e-05, "loss": 0.2049, "step": 5566 }, { "epoch": 1.701926016508713, "grad_norm": 0.9293088316917419, "learning_rate": 2.1257164692395877e-05, "loss": 0.236, "step": 5567 }, { "epoch": 1.702231733414858, "grad_norm": 0.7002514004707336, "learning_rate": 2.1260985861673675e-05, "loss": 0.2311, "step": 5568 }, { "epoch": 1.7025374503210027, "grad_norm": 1.164824366569519, "learning_rate": 2.1264807030951474e-05, "loss": 0.2649, "step": 5569 }, { "epoch": 1.7028431672271478, "grad_norm": 1.0753871202468872, "learning_rate": 2.1268628200229273e-05, "loss": 0.2524, "step": 5570 }, { "epoch": 1.7031488841332925, "grad_norm": 1.4242374897003174, "learning_rate": 2.127244936950707e-05, "loss": 0.3471, "step": 5571 }, { "epoch": 1.7034546010394376, "grad_norm": 0.39294883608818054, "learning_rate": 2.127627053878487e-05, "loss": 0.1512, "step": 5572 }, { "epoch": 1.7037603179455822, "grad_norm": 0.40431079268455505, "learning_rate": 2.128009170806267e-05, "loss": 0.0936, "step": 5573 }, { "epoch": 1.7040660348517274, "grad_norm": 0.317741721868515, "learning_rate": 2.1283912877340468e-05, "loss": 0.0775, "step": 5574 }, { "epoch": 1.7043717517578723, "grad_norm": 0.4278109073638916, "learning_rate": 2.1287734046618267e-05, "loss": 0.1065, "step": 5575 }, { "epoch": 1.7046774686640171, "grad_norm": 0.35720837116241455, "learning_rate": 2.1291555215896065e-05, "loss": 0.0807, "step": 5576 }, { "epoch": 1.704983185570162, "grad_norm": 0.2665511667728424, "learning_rate": 2.1295376385173864e-05, "loss": 0.0413, "step": 5577 }, { "epoch": 1.705288902476307, "grad_norm": 0.3836120367050171, "learning_rate": 2.1299197554451663e-05, "loss": 0.0807, "step": 5578 }, { "epoch": 1.7055946193824518, "grad_norm": 0.3452036380767822, "learning_rate": 2.130301872372946e-05, "loss": 0.0914, "step": 5579 }, { "epoch": 1.7059003362885967, "grad_norm": 0.34173569083213806, "learning_rate": 2.130683989300726e-05, "loss": 0.0751, "step": 5580 }, { "epoch": 1.7062060531947418, "grad_norm": 0.33171045780181885, "learning_rate": 2.131066106228506e-05, "loss": 0.0861, "step": 5581 }, { "epoch": 1.7065117701008865, "grad_norm": 0.35060253739356995, "learning_rate": 2.131448223156286e-05, "loss": 0.0936, "step": 5582 }, { "epoch": 1.7068174870070316, "grad_norm": 0.5109509229660034, "learning_rate": 2.131830340084066e-05, "loss": 0.0937, "step": 5583 }, { "epoch": 1.7071232039131763, "grad_norm": 0.6761054992675781, "learning_rate": 2.132212457011846e-05, "loss": 0.1181, "step": 5584 }, { "epoch": 1.7074289208193214, "grad_norm": 0.5414308309555054, "learning_rate": 2.1325945739396258e-05, "loss": 0.1665, "step": 5585 }, { "epoch": 1.707734637725466, "grad_norm": 0.5457049608230591, "learning_rate": 2.1329766908674056e-05, "loss": 0.1445, "step": 5586 }, { "epoch": 1.7080403546316112, "grad_norm": 0.5657439827919006, "learning_rate": 2.1333588077951855e-05, "loss": 0.1833, "step": 5587 }, { "epoch": 1.708346071537756, "grad_norm": 0.9322499632835388, "learning_rate": 2.1337409247229654e-05, "loss": 0.1845, "step": 5588 }, { "epoch": 1.708651788443901, "grad_norm": 0.9393545389175415, "learning_rate": 2.1341230416507452e-05, "loss": 0.2137, "step": 5589 }, { "epoch": 1.7089575053500459, "grad_norm": 0.8745341300964355, "learning_rate": 2.134505158578525e-05, "loss": 0.1749, "step": 5590 }, { "epoch": 1.7092632222561908, "grad_norm": 1.1631219387054443, "learning_rate": 2.134887275506305e-05, "loss": 0.2152, "step": 5591 }, { "epoch": 1.7095689391623357, "grad_norm": 1.4014629125595093, "learning_rate": 2.135269392434085e-05, "loss": 0.2513, "step": 5592 }, { "epoch": 1.7098746560684805, "grad_norm": 2.447058916091919, "learning_rate": 2.1356515093618647e-05, "loss": 0.1914, "step": 5593 }, { "epoch": 1.7101803729746257, "grad_norm": 1.263744831085205, "learning_rate": 2.1360336262896446e-05, "loss": 0.2285, "step": 5594 }, { "epoch": 1.7104860898807703, "grad_norm": 1.2904667854309082, "learning_rate": 2.1364157432174245e-05, "loss": 0.2544, "step": 5595 }, { "epoch": 1.7107918067869154, "grad_norm": 2.1790578365325928, "learning_rate": 2.1367978601452044e-05, "loss": 0.2832, "step": 5596 }, { "epoch": 1.7110975236930601, "grad_norm": 0.5585746765136719, "learning_rate": 2.1371799770729842e-05, "loss": 0.138, "step": 5597 }, { "epoch": 1.7114032405992052, "grad_norm": 0.4854622781276703, "learning_rate": 2.1375620940007645e-05, "loss": 0.0891, "step": 5598 }, { "epoch": 1.71170895750535, "grad_norm": 0.3170180916786194, "learning_rate": 2.1379442109285443e-05, "loss": 0.0854, "step": 5599 }, { "epoch": 1.712014674411495, "grad_norm": 0.2891576588153839, "learning_rate": 2.1383263278563242e-05, "loss": 0.0608, "step": 5600 }, { "epoch": 1.71232039131764, "grad_norm": 0.28927767276763916, "learning_rate": 2.138708444784104e-05, "loss": 0.0942, "step": 5601 }, { "epoch": 1.7126261082237848, "grad_norm": 0.4725033938884735, "learning_rate": 2.139090561711884e-05, "loss": 0.0883, "step": 5602 }, { "epoch": 1.7129318251299297, "grad_norm": 0.4101293683052063, "learning_rate": 2.1394726786396638e-05, "loss": 0.092, "step": 5603 }, { "epoch": 1.7132375420360746, "grad_norm": 0.39536014199256897, "learning_rate": 2.1398547955674437e-05, "loss": 0.0797, "step": 5604 }, { "epoch": 1.7135432589422195, "grad_norm": 0.3696434497833252, "learning_rate": 2.1402369124952236e-05, "loss": 0.0731, "step": 5605 }, { "epoch": 1.7138489758483644, "grad_norm": 1.0323359966278076, "learning_rate": 2.1406190294230035e-05, "loss": 0.0806, "step": 5606 }, { "epoch": 1.7141546927545095, "grad_norm": 0.362203449010849, "learning_rate": 2.1410011463507833e-05, "loss": 0.1151, "step": 5607 }, { "epoch": 1.7144604096606542, "grad_norm": 0.3702980875968933, "learning_rate": 2.1413832632785632e-05, "loss": 0.1005, "step": 5608 }, { "epoch": 1.7147661265667993, "grad_norm": 0.4963300824165344, "learning_rate": 2.141765380206343e-05, "loss": 0.1441, "step": 5609 }, { "epoch": 1.715071843472944, "grad_norm": 0.5163498520851135, "learning_rate": 2.142147497134123e-05, "loss": 0.14, "step": 5610 }, { "epoch": 1.715377560379089, "grad_norm": 0.6304136514663696, "learning_rate": 2.1425296140619028e-05, "loss": 0.1737, "step": 5611 }, { "epoch": 1.7156832772852337, "grad_norm": 0.927107572555542, "learning_rate": 2.1429117309896827e-05, "loss": 0.1858, "step": 5612 }, { "epoch": 1.7159889941913788, "grad_norm": 0.8758786916732788, "learning_rate": 2.143293847917463e-05, "loss": 0.1921, "step": 5613 }, { "epoch": 1.7162947110975237, "grad_norm": 1.3945125341415405, "learning_rate": 2.1436759648452428e-05, "loss": 0.1782, "step": 5614 }, { "epoch": 1.7166004280036686, "grad_norm": 0.6937887072563171, "learning_rate": 2.1440580817730227e-05, "loss": 0.2314, "step": 5615 }, { "epoch": 1.7169061449098135, "grad_norm": 1.185513973236084, "learning_rate": 2.1444401987008025e-05, "loss": 0.2203, "step": 5616 }, { "epoch": 1.7172118618159584, "grad_norm": 1.1801124811172485, "learning_rate": 2.1448223156285824e-05, "loss": 0.2312, "step": 5617 }, { "epoch": 1.7175175787221033, "grad_norm": 1.0294996500015259, "learning_rate": 2.1452044325563623e-05, "loss": 0.2275, "step": 5618 }, { "epoch": 1.7178232956282482, "grad_norm": 0.9014450311660767, "learning_rate": 2.145586549484142e-05, "loss": 0.2711, "step": 5619 }, { "epoch": 1.7181290125343933, "grad_norm": 1.2333005666732788, "learning_rate": 2.145968666411922e-05, "loss": 0.2439, "step": 5620 }, { "epoch": 1.718434729440538, "grad_norm": 1.5229058265686035, "learning_rate": 2.146350783339702e-05, "loss": 0.2807, "step": 5621 }, { "epoch": 1.718740446346683, "grad_norm": 0.5060397982597351, "learning_rate": 2.1467329002674818e-05, "loss": 0.1636, "step": 5622 }, { "epoch": 1.7190461632528278, "grad_norm": 0.3251456022262573, "learning_rate": 2.1471150171952617e-05, "loss": 0.0975, "step": 5623 }, { "epoch": 1.719351880158973, "grad_norm": 0.37979257106781006, "learning_rate": 2.1474971341230415e-05, "loss": 0.1027, "step": 5624 }, { "epoch": 1.7196575970651176, "grad_norm": 0.2888834774494171, "learning_rate": 2.1478792510508214e-05, "loss": 0.0768, "step": 5625 }, { "epoch": 1.7199633139712627, "grad_norm": 0.26076197624206543, "learning_rate": 2.1482613679786013e-05, "loss": 0.0692, "step": 5626 }, { "epoch": 1.7202690308774076, "grad_norm": 0.31250864267349243, "learning_rate": 2.148643484906381e-05, "loss": 0.0658, "step": 5627 }, { "epoch": 1.7205747477835525, "grad_norm": 0.2783905267715454, "learning_rate": 2.149025601834161e-05, "loss": 0.0513, "step": 5628 }, { "epoch": 1.7208804646896974, "grad_norm": 0.3287924826145172, "learning_rate": 2.1494077187619412e-05, "loss": 0.0873, "step": 5629 }, { "epoch": 1.7211861815958422, "grad_norm": 0.5134536623954773, "learning_rate": 2.149789835689721e-05, "loss": 0.0911, "step": 5630 }, { "epoch": 1.7214918985019871, "grad_norm": 0.3100675940513611, "learning_rate": 2.150171952617501e-05, "loss": 0.0968, "step": 5631 }, { "epoch": 1.721797615408132, "grad_norm": 0.8241308331489563, "learning_rate": 2.150554069545281e-05, "loss": 0.1509, "step": 5632 }, { "epoch": 1.722103332314277, "grad_norm": 0.3246043026447296, "learning_rate": 2.1509361864730607e-05, "loss": 0.0921, "step": 5633 }, { "epoch": 1.7224090492204218, "grad_norm": 0.5367054343223572, "learning_rate": 2.1513183034008406e-05, "loss": 0.0884, "step": 5634 }, { "epoch": 1.722714766126567, "grad_norm": 0.4897212088108063, "learning_rate": 2.1517004203286205e-05, "loss": 0.1439, "step": 5635 }, { "epoch": 1.7230204830327116, "grad_norm": 0.548246443271637, "learning_rate": 2.1520825372564004e-05, "loss": 0.1297, "step": 5636 }, { "epoch": 1.7233261999388567, "grad_norm": 0.5265418291091919, "learning_rate": 2.1524646541841802e-05, "loss": 0.1654, "step": 5637 }, { "epoch": 1.7236319168450014, "grad_norm": 0.7853013277053833, "learning_rate": 2.15284677111196e-05, "loss": 0.1625, "step": 5638 }, { "epoch": 1.7239376337511465, "grad_norm": 0.5049507021903992, "learning_rate": 2.15322888803974e-05, "loss": 0.1827, "step": 5639 }, { "epoch": 1.7242433506572914, "grad_norm": 0.7875084280967712, "learning_rate": 2.1536110049675202e-05, "loss": 0.1985, "step": 5640 }, { "epoch": 1.7245490675634363, "grad_norm": 1.3204009532928467, "learning_rate": 2.1539931218953e-05, "loss": 0.2188, "step": 5641 }, { "epoch": 1.7248547844695812, "grad_norm": 0.6860550045967102, "learning_rate": 2.15437523882308e-05, "loss": 0.1939, "step": 5642 }, { "epoch": 1.725160501375726, "grad_norm": 0.7932855486869812, "learning_rate": 2.1547573557508598e-05, "loss": 0.2007, "step": 5643 }, { "epoch": 1.725466218281871, "grad_norm": 1.0030012130737305, "learning_rate": 2.1551394726786397e-05, "loss": 0.2481, "step": 5644 }, { "epoch": 1.7257719351880159, "grad_norm": 1.0500574111938477, "learning_rate": 2.15552158960642e-05, "loss": 0.2314, "step": 5645 }, { "epoch": 1.7260776520941608, "grad_norm": 2.762281656265259, "learning_rate": 2.1559037065341998e-05, "loss": 0.3486, "step": 5646 }, { "epoch": 1.7263833690003056, "grad_norm": 0.4745698869228363, "learning_rate": 2.1562858234619797e-05, "loss": 0.1572, "step": 5647 }, { "epoch": 1.7266890859064508, "grad_norm": 0.3307827413082123, "learning_rate": 2.1566679403897595e-05, "loss": 0.0995, "step": 5648 }, { "epoch": 1.7269948028125954, "grad_norm": 0.5351890325546265, "learning_rate": 2.1570500573175394e-05, "loss": 0.1318, "step": 5649 }, { "epoch": 1.7273005197187405, "grad_norm": 0.2855494022369385, "learning_rate": 2.1574321742453193e-05, "loss": 0.0914, "step": 5650 }, { "epoch": 1.7276062366248852, "grad_norm": 0.3247462511062622, "learning_rate": 2.157814291173099e-05, "loss": 0.0771, "step": 5651 }, { "epoch": 1.7279119535310303, "grad_norm": 0.5176777243614197, "learning_rate": 2.158196408100879e-05, "loss": 0.0526, "step": 5652 }, { "epoch": 1.7282176704371752, "grad_norm": 0.387445867061615, "learning_rate": 2.158578525028659e-05, "loss": 0.065, "step": 5653 }, { "epoch": 1.7285233873433201, "grad_norm": 0.5060367584228516, "learning_rate": 2.1589606419564388e-05, "loss": 0.0785, "step": 5654 }, { "epoch": 1.728829104249465, "grad_norm": 0.6130343079566956, "learning_rate": 2.1593427588842187e-05, "loss": 0.087, "step": 5655 }, { "epoch": 1.72913482115561, "grad_norm": 1.5518805980682373, "learning_rate": 2.1597248758119985e-05, "loss": 0.0991, "step": 5656 }, { "epoch": 1.7294405380617548, "grad_norm": 0.3673887252807617, "learning_rate": 2.1601069927397784e-05, "loss": 0.1221, "step": 5657 }, { "epoch": 1.7297462549678997, "grad_norm": 0.6009191274642944, "learning_rate": 2.1604891096675583e-05, "loss": 0.0998, "step": 5658 }, { "epoch": 1.7300519718740446, "grad_norm": 0.4554536044597626, "learning_rate": 2.160871226595338e-05, "loss": 0.1054, "step": 5659 }, { "epoch": 1.7303576887801895, "grad_norm": 0.533351480960846, "learning_rate": 2.1612533435231184e-05, "loss": 0.1162, "step": 5660 }, { "epoch": 1.7306634056863346, "grad_norm": 0.5623989701271057, "learning_rate": 2.1616354604508982e-05, "loss": 0.1181, "step": 5661 }, { "epoch": 1.7309691225924793, "grad_norm": 1.0945971012115479, "learning_rate": 2.162017577378678e-05, "loss": 0.1605, "step": 5662 }, { "epoch": 1.7312748394986244, "grad_norm": 0.6717748641967773, "learning_rate": 2.162399694306458e-05, "loss": 0.1856, "step": 5663 }, { "epoch": 1.731580556404769, "grad_norm": 0.7039706110954285, "learning_rate": 2.162781811234238e-05, "loss": 0.1559, "step": 5664 }, { "epoch": 1.7318862733109142, "grad_norm": 1.0255331993103027, "learning_rate": 2.1631639281620177e-05, "loss": 0.2047, "step": 5665 }, { "epoch": 1.732191990217059, "grad_norm": 0.7662827968597412, "learning_rate": 2.1635460450897976e-05, "loss": 0.1901, "step": 5666 }, { "epoch": 1.732497707123204, "grad_norm": 1.2615309953689575, "learning_rate": 2.1639281620175775e-05, "loss": 0.2207, "step": 5667 }, { "epoch": 1.7328034240293488, "grad_norm": 1.223204255104065, "learning_rate": 2.1643102789453574e-05, "loss": 0.2143, "step": 5668 }, { "epoch": 1.7331091409354937, "grad_norm": 2.278918981552124, "learning_rate": 2.1646923958731372e-05, "loss": 0.2508, "step": 5669 }, { "epoch": 1.7334148578416386, "grad_norm": 0.8716095089912415, "learning_rate": 2.165074512800917e-05, "loss": 0.195, "step": 5670 }, { "epoch": 1.7337205747477835, "grad_norm": 2.7689759731292725, "learning_rate": 2.165456629728697e-05, "loss": 0.3409, "step": 5671 }, { "epoch": 1.7340262916539284, "grad_norm": 0.40993455052375793, "learning_rate": 2.165838746656477e-05, "loss": 0.1461, "step": 5672 }, { "epoch": 1.7343320085600733, "grad_norm": 0.8662793040275574, "learning_rate": 2.1662208635842567e-05, "loss": 0.1132, "step": 5673 }, { "epoch": 1.7346377254662184, "grad_norm": 0.303072065114975, "learning_rate": 2.1666029805120366e-05, "loss": 0.0792, "step": 5674 }, { "epoch": 1.734943442372363, "grad_norm": 0.34591323137283325, "learning_rate": 2.1669850974398165e-05, "loss": 0.0701, "step": 5675 }, { "epoch": 1.7352491592785082, "grad_norm": 0.25517743825912476, "learning_rate": 2.1673672143675967e-05, "loss": 0.0746, "step": 5676 }, { "epoch": 1.7355548761846529, "grad_norm": 0.25950026512145996, "learning_rate": 2.1677493312953766e-05, "loss": 0.0504, "step": 5677 }, { "epoch": 1.735860593090798, "grad_norm": 0.28562745451927185, "learning_rate": 2.1681314482231564e-05, "loss": 0.0743, "step": 5678 }, { "epoch": 1.7361663099969429, "grad_norm": 0.2792230546474457, "learning_rate": 2.1685135651509363e-05, "loss": 0.0513, "step": 5679 }, { "epoch": 1.7364720269030878, "grad_norm": 0.373216450214386, "learning_rate": 2.1688956820787162e-05, "loss": 0.1033, "step": 5680 }, { "epoch": 1.7367777438092327, "grad_norm": 0.3384038209915161, "learning_rate": 2.169277799006496e-05, "loss": 0.0813, "step": 5681 }, { "epoch": 1.7370834607153776, "grad_norm": 0.5056743621826172, "learning_rate": 2.169659915934276e-05, "loss": 0.1074, "step": 5682 }, { "epoch": 1.7373891776215225, "grad_norm": 0.6662783026695251, "learning_rate": 2.1700420328620558e-05, "loss": 0.0865, "step": 5683 }, { "epoch": 1.7376948945276673, "grad_norm": 0.36440980434417725, "learning_rate": 2.1704241497898357e-05, "loss": 0.1034, "step": 5684 }, { "epoch": 1.7380006114338122, "grad_norm": 0.7505896687507629, "learning_rate": 2.1708062667176156e-05, "loss": 0.1625, "step": 5685 }, { "epoch": 1.7383063283399571, "grad_norm": 0.5804044008255005, "learning_rate": 2.1711883836453954e-05, "loss": 0.1484, "step": 5686 }, { "epoch": 1.7386120452461022, "grad_norm": 0.8451066017150879, "learning_rate": 2.1715705005731753e-05, "loss": 0.158, "step": 5687 }, { "epoch": 1.738917762152247, "grad_norm": 0.8569531440734863, "learning_rate": 2.1719526175009552e-05, "loss": 0.1864, "step": 5688 }, { "epoch": 1.739223479058392, "grad_norm": 0.5921459794044495, "learning_rate": 2.172334734428735e-05, "loss": 0.1654, "step": 5689 }, { "epoch": 1.7395291959645367, "grad_norm": 1.682094693183899, "learning_rate": 2.172716851356515e-05, "loss": 0.2242, "step": 5690 }, { "epoch": 1.7398349128706818, "grad_norm": 1.0114341974258423, "learning_rate": 2.173098968284295e-05, "loss": 0.2167, "step": 5691 }, { "epoch": 1.7401406297768267, "grad_norm": 1.3110108375549316, "learning_rate": 2.173481085212075e-05, "loss": 0.1983, "step": 5692 }, { "epoch": 1.7404463466829716, "grad_norm": 1.1317652463912964, "learning_rate": 2.173863202139855e-05, "loss": 0.2313, "step": 5693 }, { "epoch": 1.7407520635891165, "grad_norm": 1.1687287092208862, "learning_rate": 2.1742453190676348e-05, "loss": 0.2364, "step": 5694 }, { "epoch": 1.7410577804952614, "grad_norm": 0.9792826175689697, "learning_rate": 2.1746274359954147e-05, "loss": 0.2572, "step": 5695 }, { "epoch": 1.7413634974014063, "grad_norm": 1.7653956413269043, "learning_rate": 2.1750095529231945e-05, "loss": 0.2664, "step": 5696 }, { "epoch": 1.7416692143075512, "grad_norm": 0.39695286750793457, "learning_rate": 2.1753916698509744e-05, "loss": 0.1944, "step": 5697 }, { "epoch": 1.741974931213696, "grad_norm": 0.2974017262458801, "learning_rate": 2.1757737867787543e-05, "loss": 0.0965, "step": 5698 }, { "epoch": 1.742280648119841, "grad_norm": 0.6425020694732666, "learning_rate": 2.176155903706534e-05, "loss": 0.1131, "step": 5699 }, { "epoch": 1.742586365025986, "grad_norm": 0.29942378401756287, "learning_rate": 2.176538020634314e-05, "loss": 0.0555, "step": 5700 }, { "epoch": 1.7428920819321307, "grad_norm": 0.3879173994064331, "learning_rate": 2.176920137562094e-05, "loss": 0.0911, "step": 5701 }, { "epoch": 1.7431977988382759, "grad_norm": 0.2753501236438751, "learning_rate": 2.1773022544898738e-05, "loss": 0.0671, "step": 5702 }, { "epoch": 1.7435035157444205, "grad_norm": 0.5660889744758606, "learning_rate": 2.1776843714176536e-05, "loss": 0.092, "step": 5703 }, { "epoch": 1.7438092326505656, "grad_norm": 0.4409513771533966, "learning_rate": 2.1780664883454335e-05, "loss": 0.0835, "step": 5704 }, { "epoch": 1.7441149495567105, "grad_norm": 0.5306555032730103, "learning_rate": 2.1784486052732134e-05, "loss": 0.1103, "step": 5705 }, { "epoch": 1.7444206664628554, "grad_norm": 0.6961706280708313, "learning_rate": 2.1788307222009936e-05, "loss": 0.0805, "step": 5706 }, { "epoch": 1.7447263833690003, "grad_norm": 0.3978375494480133, "learning_rate": 2.1792128391287735e-05, "loss": 0.1114, "step": 5707 }, { "epoch": 1.7450321002751452, "grad_norm": 0.3658231496810913, "learning_rate": 2.1795949560565534e-05, "loss": 0.0825, "step": 5708 }, { "epoch": 1.74533781718129, "grad_norm": 0.43397724628448486, "learning_rate": 2.1799770729843332e-05, "loss": 0.1046, "step": 5709 }, { "epoch": 1.745643534087435, "grad_norm": 0.7690596580505371, "learning_rate": 2.180359189912113e-05, "loss": 0.1663, "step": 5710 }, { "epoch": 1.74594925099358, "grad_norm": 0.6958594918251038, "learning_rate": 2.180741306839893e-05, "loss": 0.1699, "step": 5711 }, { "epoch": 1.7462549678997248, "grad_norm": 0.7851148247718811, "learning_rate": 2.181123423767673e-05, "loss": 0.1405, "step": 5712 }, { "epoch": 1.74656068480587, "grad_norm": 0.8404000997543335, "learning_rate": 2.1815055406954527e-05, "loss": 0.201, "step": 5713 }, { "epoch": 1.7468664017120146, "grad_norm": 1.160452961921692, "learning_rate": 2.1818876576232326e-05, "loss": 0.1762, "step": 5714 }, { "epoch": 1.7471721186181597, "grad_norm": 1.048768401145935, "learning_rate": 2.1822697745510125e-05, "loss": 0.2142, "step": 5715 }, { "epoch": 1.7474778355243044, "grad_norm": 1.0011056661605835, "learning_rate": 2.1826518914787924e-05, "loss": 0.2307, "step": 5716 }, { "epoch": 1.7477835524304495, "grad_norm": 0.9738185405731201, "learning_rate": 2.1830340084065722e-05, "loss": 0.2222, "step": 5717 }, { "epoch": 1.7480892693365944, "grad_norm": 1.4707037210464478, "learning_rate": 2.1834161253343524e-05, "loss": 0.2588, "step": 5718 }, { "epoch": 1.7483949862427393, "grad_norm": 1.1459094285964966, "learning_rate": 2.1837982422621323e-05, "loss": 0.209, "step": 5719 }, { "epoch": 1.7487007031488842, "grad_norm": 1.1123753786087036, "learning_rate": 2.1841803591899122e-05, "loss": 0.2898, "step": 5720 }, { "epoch": 1.749006420055029, "grad_norm": 1.8321588039398193, "learning_rate": 2.184562476117692e-05, "loss": 0.2686, "step": 5721 }, { "epoch": 1.749312136961174, "grad_norm": 0.3907623291015625, "learning_rate": 2.1849445930454723e-05, "loss": 0.1564, "step": 5722 }, { "epoch": 1.7496178538673188, "grad_norm": 0.2748531103134155, "learning_rate": 2.185326709973252e-05, "loss": 0.0911, "step": 5723 }, { "epoch": 1.7499235707734637, "grad_norm": 0.6297683119773865, "learning_rate": 2.185708826901032e-05, "loss": 0.0868, "step": 5724 }, { "epoch": 1.7502292876796086, "grad_norm": 0.3354841470718384, "learning_rate": 2.186090943828812e-05, "loss": 0.0711, "step": 5725 }, { "epoch": 1.7505350045857537, "grad_norm": 0.2792954444885254, "learning_rate": 2.1864730607565918e-05, "loss": 0.0692, "step": 5726 }, { "epoch": 1.7508407214918984, "grad_norm": 0.4237620234489441, "learning_rate": 2.1868551776843717e-05, "loss": 0.0782, "step": 5727 }, { "epoch": 1.7511464383980435, "grad_norm": 0.3057308793067932, "learning_rate": 2.1872372946121515e-05, "loss": 0.0553, "step": 5728 }, { "epoch": 1.7514521553041882, "grad_norm": 0.3477180004119873, "learning_rate": 2.1876194115399314e-05, "loss": 0.0788, "step": 5729 }, { "epoch": 1.7517578722103333, "grad_norm": 1.0016857385635376, "learning_rate": 2.1880015284677113e-05, "loss": 0.0923, "step": 5730 }, { "epoch": 1.7520635891164782, "grad_norm": 1.3033428192138672, "learning_rate": 2.188383645395491e-05, "loss": 0.0791, "step": 5731 }, { "epoch": 1.752369306022623, "grad_norm": 0.5236884951591492, "learning_rate": 2.188765762323271e-05, "loss": 0.1066, "step": 5732 }, { "epoch": 1.752675022928768, "grad_norm": 0.5087285041809082, "learning_rate": 2.189147879251051e-05, "loss": 0.1158, "step": 5733 }, { "epoch": 1.7529807398349129, "grad_norm": 0.5178178548812866, "learning_rate": 2.1895299961788308e-05, "loss": 0.1178, "step": 5734 }, { "epoch": 1.7532864567410578, "grad_norm": 0.5082042813301086, "learning_rate": 2.1899121131066106e-05, "loss": 0.1397, "step": 5735 }, { "epoch": 1.7535921736472027, "grad_norm": 0.5361162424087524, "learning_rate": 2.1902942300343905e-05, "loss": 0.1242, "step": 5736 }, { "epoch": 1.7538978905533475, "grad_norm": 0.6213585734367371, "learning_rate": 2.1906763469621704e-05, "loss": 0.1765, "step": 5737 }, { "epoch": 1.7542036074594924, "grad_norm": 0.8464465737342834, "learning_rate": 2.1910584638899506e-05, "loss": 0.204, "step": 5738 }, { "epoch": 1.7545093243656376, "grad_norm": 0.5493717193603516, "learning_rate": 2.1914405808177305e-05, "loss": 0.1641, "step": 5739 }, { "epoch": 1.7548150412717822, "grad_norm": 0.7009583115577698, "learning_rate": 2.1918226977455104e-05, "loss": 0.202, "step": 5740 }, { "epoch": 1.7551207581779273, "grad_norm": 1.103044867515564, "learning_rate": 2.1922048146732902e-05, "loss": 0.2096, "step": 5741 }, { "epoch": 1.755426475084072, "grad_norm": 1.4629658460617065, "learning_rate": 2.19258693160107e-05, "loss": 0.2726, "step": 5742 }, { "epoch": 1.7557321919902171, "grad_norm": 1.0703415870666504, "learning_rate": 2.19296904852885e-05, "loss": 0.2527, "step": 5743 }, { "epoch": 1.756037908896362, "grad_norm": 0.8744505047798157, "learning_rate": 2.19335116545663e-05, "loss": 0.2207, "step": 5744 }, { "epoch": 1.756343625802507, "grad_norm": 1.093031883239746, "learning_rate": 2.1937332823844097e-05, "loss": 0.2744, "step": 5745 }, { "epoch": 1.7566493427086518, "grad_norm": 3.1259727478027344, "learning_rate": 2.1941153993121896e-05, "loss": 0.2899, "step": 5746 }, { "epoch": 1.7569550596147967, "grad_norm": 0.3849698305130005, "learning_rate": 2.1944975162399695e-05, "loss": 0.1673, "step": 5747 }, { "epoch": 1.7572607765209416, "grad_norm": 0.4546962380409241, "learning_rate": 2.1948796331677494e-05, "loss": 0.1013, "step": 5748 }, { "epoch": 1.7575664934270865, "grad_norm": 0.33535438776016235, "learning_rate": 2.1952617500955292e-05, "loss": 0.0875, "step": 5749 }, { "epoch": 1.7578722103332314, "grad_norm": 0.23353707790374756, "learning_rate": 2.195643867023309e-05, "loss": 0.0856, "step": 5750 }, { "epoch": 1.7581779272393763, "grad_norm": 0.350478857755661, "learning_rate": 2.196025983951089e-05, "loss": 0.0528, "step": 5751 }, { "epoch": 1.7584836441455214, "grad_norm": 0.3675864338874817, "learning_rate": 2.196408100878869e-05, "loss": 0.0579, "step": 5752 }, { "epoch": 1.758789361051666, "grad_norm": 0.34734463691711426, "learning_rate": 2.196790217806649e-05, "loss": 0.0644, "step": 5753 }, { "epoch": 1.7590950779578112, "grad_norm": 0.39533767104148865, "learning_rate": 2.197172334734429e-05, "loss": 0.0852, "step": 5754 }, { "epoch": 1.7594007948639558, "grad_norm": 0.331017404794693, "learning_rate": 2.1975544516622088e-05, "loss": 0.077, "step": 5755 }, { "epoch": 1.759706511770101, "grad_norm": 0.4384509325027466, "learning_rate": 2.1979365685899887e-05, "loss": 0.1066, "step": 5756 }, { "epoch": 1.7600122286762458, "grad_norm": 0.41984060406684875, "learning_rate": 2.1983186855177686e-05, "loss": 0.1133, "step": 5757 }, { "epoch": 1.7603179455823907, "grad_norm": 0.39927464723587036, "learning_rate": 2.1987008024455484e-05, "loss": 0.0973, "step": 5758 }, { "epoch": 1.7606236624885356, "grad_norm": 0.6045545935630798, "learning_rate": 2.1990829193733283e-05, "loss": 0.1327, "step": 5759 }, { "epoch": 1.7609293793946805, "grad_norm": 0.528374969959259, "learning_rate": 2.1994650363011082e-05, "loss": 0.1722, "step": 5760 }, { "epoch": 1.7612350963008254, "grad_norm": 0.6691190600395203, "learning_rate": 2.199847153228888e-05, "loss": 0.1303, "step": 5761 }, { "epoch": 1.7615408132069703, "grad_norm": 0.5270003080368042, "learning_rate": 2.200229270156668e-05, "loss": 0.1452, "step": 5762 }, { "epoch": 1.7618465301131152, "grad_norm": 0.6498697996139526, "learning_rate": 2.2006113870844478e-05, "loss": 0.1827, "step": 5763 }, { "epoch": 1.76215224701926, "grad_norm": 0.8281863331794739, "learning_rate": 2.2009935040122277e-05, "loss": 0.2353, "step": 5764 }, { "epoch": 1.7624579639254052, "grad_norm": 0.5852746367454529, "learning_rate": 2.2013756209400076e-05, "loss": 0.1784, "step": 5765 }, { "epoch": 1.7627636808315499, "grad_norm": 1.44893479347229, "learning_rate": 2.2017577378677874e-05, "loss": 0.2208, "step": 5766 }, { "epoch": 1.763069397737695, "grad_norm": 1.574366807937622, "learning_rate": 2.2021398547955673e-05, "loss": 0.2384, "step": 5767 }, { "epoch": 1.7633751146438397, "grad_norm": 1.0876821279525757, "learning_rate": 2.2025219717233472e-05, "loss": 0.2221, "step": 5768 }, { "epoch": 1.7636808315499848, "grad_norm": 1.5580813884735107, "learning_rate": 2.2029040886511274e-05, "loss": 0.2529, "step": 5769 }, { "epoch": 1.7639865484561297, "grad_norm": 2.756408214569092, "learning_rate": 2.2032862055789073e-05, "loss": 0.2962, "step": 5770 }, { "epoch": 1.7642922653622746, "grad_norm": 16.54250144958496, "learning_rate": 2.203668322506687e-05, "loss": 0.2944, "step": 5771 }, { "epoch": 1.7645979822684195, "grad_norm": 0.5081109404563904, "learning_rate": 2.204050439434467e-05, "loss": 0.1844, "step": 5772 }, { "epoch": 1.7649036991745644, "grad_norm": 0.33686140179634094, "learning_rate": 2.204432556362247e-05, "loss": 0.105, "step": 5773 }, { "epoch": 1.7652094160807092, "grad_norm": 0.3446650803089142, "learning_rate": 2.2048146732900268e-05, "loss": 0.0877, "step": 5774 }, { "epoch": 1.7655151329868541, "grad_norm": 0.9472780227661133, "learning_rate": 2.2051967902178066e-05, "loss": 0.0688, "step": 5775 }, { "epoch": 1.765820849892999, "grad_norm": 0.25679129362106323, "learning_rate": 2.2055789071455865e-05, "loss": 0.0661, "step": 5776 }, { "epoch": 1.766126566799144, "grad_norm": 0.2900333106517792, "learning_rate": 2.2059610240733664e-05, "loss": 0.0673, "step": 5777 }, { "epoch": 1.766432283705289, "grad_norm": 0.3807234466075897, "learning_rate": 2.2063431410011463e-05, "loss": 0.0895, "step": 5778 }, { "epoch": 1.7667380006114337, "grad_norm": 0.49225151538848877, "learning_rate": 2.206725257928926e-05, "loss": 0.0863, "step": 5779 }, { "epoch": 1.7670437175175788, "grad_norm": 0.361872136592865, "learning_rate": 2.207107374856706e-05, "loss": 0.0915, "step": 5780 }, { "epoch": 1.7673494344237235, "grad_norm": 0.24563533067703247, "learning_rate": 2.207489491784486e-05, "loss": 0.0712, "step": 5781 }, { "epoch": 1.7676551513298686, "grad_norm": 0.4747968912124634, "learning_rate": 2.2078716087122658e-05, "loss": 0.1155, "step": 5782 }, { "epoch": 1.7679608682360135, "grad_norm": 0.5836007595062256, "learning_rate": 2.2082537256400456e-05, "loss": 0.1425, "step": 5783 }, { "epoch": 1.7682665851421584, "grad_norm": 0.4401434063911438, "learning_rate": 2.208635842567826e-05, "loss": 0.1464, "step": 5784 }, { "epoch": 1.7685723020483033, "grad_norm": 0.7328463196754456, "learning_rate": 2.2090179594956057e-05, "loss": 0.1186, "step": 5785 }, { "epoch": 1.7688780189544482, "grad_norm": 0.6518155336380005, "learning_rate": 2.2094000764233856e-05, "loss": 0.1495, "step": 5786 }, { "epoch": 1.769183735860593, "grad_norm": 0.5142185091972351, "learning_rate": 2.2097821933511655e-05, "loss": 0.1393, "step": 5787 }, { "epoch": 1.769489452766738, "grad_norm": 0.8337002396583557, "learning_rate": 2.2101643102789453e-05, "loss": 0.1625, "step": 5788 }, { "epoch": 1.7697951696728829, "grad_norm": 0.6420124173164368, "learning_rate": 2.2105464272067252e-05, "loss": 0.186, "step": 5789 }, { "epoch": 1.7701008865790278, "grad_norm": 0.5537239909172058, "learning_rate": 2.210928544134505e-05, "loss": 0.1718, "step": 5790 }, { "epoch": 1.7704066034851729, "grad_norm": 0.6893072724342346, "learning_rate": 2.211310661062285e-05, "loss": 0.1664, "step": 5791 }, { "epoch": 1.7707123203913175, "grad_norm": 1.0649255514144897, "learning_rate": 2.211692777990065e-05, "loss": 0.2657, "step": 5792 }, { "epoch": 1.7710180372974627, "grad_norm": 1.7234448194503784, "learning_rate": 2.2120748949178447e-05, "loss": 0.2051, "step": 5793 }, { "epoch": 1.7713237542036073, "grad_norm": 0.9869505763053894, "learning_rate": 2.2124570118456246e-05, "loss": 0.2326, "step": 5794 }, { "epoch": 1.7716294711097524, "grad_norm": 1.4931960105895996, "learning_rate": 2.2128391287734045e-05, "loss": 0.2881, "step": 5795 }, { "epoch": 1.7719351880158973, "grad_norm": 2.2789371013641357, "learning_rate": 2.2132212457011847e-05, "loss": 0.3849, "step": 5796 }, { "epoch": 1.7722409049220422, "grad_norm": 0.5379346609115601, "learning_rate": 2.2136033626289646e-05, "loss": 0.164, "step": 5797 }, { "epoch": 1.7725466218281871, "grad_norm": 0.415505051612854, "learning_rate": 2.2139854795567444e-05, "loss": 0.0871, "step": 5798 }, { "epoch": 1.772852338734332, "grad_norm": 0.30465567111968994, "learning_rate": 2.2143675964845243e-05, "loss": 0.079, "step": 5799 }, { "epoch": 1.773158055640477, "grad_norm": 0.33706241846084595, "learning_rate": 2.2147497134123045e-05, "loss": 0.0788, "step": 5800 }, { "epoch": 1.7734637725466218, "grad_norm": 0.3500748872756958, "learning_rate": 2.2151318303400844e-05, "loss": 0.0693, "step": 5801 }, { "epoch": 1.7737694894527667, "grad_norm": 0.3958848714828491, "learning_rate": 2.2155139472678643e-05, "loss": 0.0873, "step": 5802 }, { "epoch": 1.7740752063589116, "grad_norm": 0.3024647533893585, "learning_rate": 2.215896064195644e-05, "loss": 0.0618, "step": 5803 }, { "epoch": 1.7743809232650567, "grad_norm": 0.4242079555988312, "learning_rate": 2.216278181123424e-05, "loss": 0.0649, "step": 5804 }, { "epoch": 1.7746866401712014, "grad_norm": 0.5025117993354797, "learning_rate": 2.216660298051204e-05, "loss": 0.1154, "step": 5805 }, { "epoch": 1.7749923570773465, "grad_norm": 0.3082609176635742, "learning_rate": 2.2170424149789838e-05, "loss": 0.076, "step": 5806 }, { "epoch": 1.7752980739834912, "grad_norm": 0.3318847119808197, "learning_rate": 2.2174245319067636e-05, "loss": 0.1059, "step": 5807 }, { "epoch": 1.7756037908896363, "grad_norm": 0.30241677165031433, "learning_rate": 2.2178066488345435e-05, "loss": 0.095, "step": 5808 }, { "epoch": 1.7759095077957812, "grad_norm": 0.6339489817619324, "learning_rate": 2.2181887657623234e-05, "loss": 0.1008, "step": 5809 }, { "epoch": 1.776215224701926, "grad_norm": 1.7381994724273682, "learning_rate": 2.2185708826901033e-05, "loss": 0.1354, "step": 5810 }, { "epoch": 1.776520941608071, "grad_norm": 1.2239830493927002, "learning_rate": 2.218952999617883e-05, "loss": 0.1557, "step": 5811 }, { "epoch": 1.7768266585142158, "grad_norm": 0.649924635887146, "learning_rate": 2.219335116545663e-05, "loss": 0.1998, "step": 5812 }, { "epoch": 1.7771323754203607, "grad_norm": 0.9315193891525269, "learning_rate": 2.219717233473443e-05, "loss": 0.1636, "step": 5813 }, { "epoch": 1.7774380923265056, "grad_norm": 1.1722546815872192, "learning_rate": 2.2200993504012228e-05, "loss": 0.2205, "step": 5814 }, { "epoch": 1.7777438092326505, "grad_norm": 0.967088520526886, "learning_rate": 2.220481467329003e-05, "loss": 0.2192, "step": 5815 }, { "epoch": 1.7780495261387954, "grad_norm": 0.7863870859146118, "learning_rate": 2.220863584256783e-05, "loss": 0.2018, "step": 5816 }, { "epoch": 1.7783552430449405, "grad_norm": 0.978602945804596, "learning_rate": 2.2212457011845627e-05, "loss": 0.1953, "step": 5817 }, { "epoch": 1.7786609599510852, "grad_norm": 1.6236393451690674, "learning_rate": 2.2216278181123426e-05, "loss": 0.2641, "step": 5818 }, { "epoch": 1.7789666768572303, "grad_norm": 0.9734055995941162, "learning_rate": 2.2220099350401225e-05, "loss": 0.1778, "step": 5819 }, { "epoch": 1.779272393763375, "grad_norm": 1.7033063173294067, "learning_rate": 2.2223920519679023e-05, "loss": 0.2647, "step": 5820 }, { "epoch": 1.77957811066952, "grad_norm": 5.2076640129089355, "learning_rate": 2.2227741688956822e-05, "loss": 0.2854, "step": 5821 }, { "epoch": 1.779883827575665, "grad_norm": 0.6002863049507141, "learning_rate": 2.223156285823462e-05, "loss": 0.164, "step": 5822 }, { "epoch": 1.7801895444818099, "grad_norm": 0.35652729868888855, "learning_rate": 2.223538402751242e-05, "loss": 0.109, "step": 5823 }, { "epoch": 1.7804952613879548, "grad_norm": 0.2575831115245819, "learning_rate": 2.223920519679022e-05, "loss": 0.0933, "step": 5824 }, { "epoch": 1.7808009782940997, "grad_norm": 0.267812579870224, "learning_rate": 2.2243026366068017e-05, "loss": 0.1046, "step": 5825 }, { "epoch": 1.7811066952002446, "grad_norm": 0.2703970968723297, "learning_rate": 2.2246847535345816e-05, "loss": 0.0645, "step": 5826 }, { "epoch": 1.7814124121063895, "grad_norm": 0.4692670702934265, "learning_rate": 2.2250668704623615e-05, "loss": 0.0769, "step": 5827 }, { "epoch": 1.7817181290125343, "grad_norm": 0.48072925209999084, "learning_rate": 2.2254489873901413e-05, "loss": 0.0622, "step": 5828 }, { "epoch": 1.7820238459186792, "grad_norm": 0.3060126304626465, "learning_rate": 2.2258311043179212e-05, "loss": 0.0941, "step": 5829 }, { "epoch": 1.7823295628248244, "grad_norm": 0.3701476752758026, "learning_rate": 2.226213221245701e-05, "loss": 0.1383, "step": 5830 }, { "epoch": 1.782635279730969, "grad_norm": 0.31028029322624207, "learning_rate": 2.2265953381734813e-05, "loss": 0.0938, "step": 5831 }, { "epoch": 1.7829409966371141, "grad_norm": 0.5716168880462646, "learning_rate": 2.2269774551012612e-05, "loss": 0.096, "step": 5832 }, { "epoch": 1.7832467135432588, "grad_norm": 2.1408395767211914, "learning_rate": 2.227359572029041e-05, "loss": 0.1052, "step": 5833 }, { "epoch": 1.783552430449404, "grad_norm": 0.5453529953956604, "learning_rate": 2.227741688956821e-05, "loss": 0.1141, "step": 5834 }, { "epoch": 1.7838581473555486, "grad_norm": 0.6052824854850769, "learning_rate": 2.2281238058846008e-05, "loss": 0.1375, "step": 5835 }, { "epoch": 1.7841638642616937, "grad_norm": 0.6152344942092896, "learning_rate": 2.2285059228123807e-05, "loss": 0.1367, "step": 5836 }, { "epoch": 1.7844695811678386, "grad_norm": 0.6694599390029907, "learning_rate": 2.2288880397401606e-05, "loss": 0.1837, "step": 5837 }, { "epoch": 1.7847752980739835, "grad_norm": 0.894462525844574, "learning_rate": 2.2292701566679404e-05, "loss": 0.1773, "step": 5838 }, { "epoch": 1.7850810149801284, "grad_norm": 0.8183159828186035, "learning_rate": 2.2296522735957203e-05, "loss": 0.1707, "step": 5839 }, { "epoch": 1.7853867318862733, "grad_norm": 0.9135761260986328, "learning_rate": 2.2300343905235002e-05, "loss": 0.2192, "step": 5840 }, { "epoch": 1.7856924487924182, "grad_norm": 0.7747254371643066, "learning_rate": 2.23041650745128e-05, "loss": 0.2034, "step": 5841 }, { "epoch": 1.785998165698563, "grad_norm": 1.0594807863235474, "learning_rate": 2.23079862437906e-05, "loss": 0.2417, "step": 5842 }, { "epoch": 1.7863038826047082, "grad_norm": 0.990418016910553, "learning_rate": 2.2311807413068398e-05, "loss": 0.2252, "step": 5843 }, { "epoch": 1.7866095995108529, "grad_norm": 1.3138294219970703, "learning_rate": 2.2315628582346197e-05, "loss": 0.2766, "step": 5844 }, { "epoch": 1.786915316416998, "grad_norm": 1.2702687978744507, "learning_rate": 2.2319449751623995e-05, "loss": 0.3141, "step": 5845 }, { "epoch": 1.7872210333231426, "grad_norm": 2.0702455043792725, "learning_rate": 2.2323270920901798e-05, "loss": 0.3074, "step": 5846 }, { "epoch": 1.7875267502292878, "grad_norm": 0.5697043538093567, "learning_rate": 2.2327092090179596e-05, "loss": 0.1593, "step": 5847 }, { "epoch": 1.7878324671354324, "grad_norm": 0.35456329584121704, "learning_rate": 2.2330913259457395e-05, "loss": 0.0896, "step": 5848 }, { "epoch": 1.7881381840415775, "grad_norm": 0.5906016826629639, "learning_rate": 2.2334734428735194e-05, "loss": 0.1043, "step": 5849 }, { "epoch": 1.7884439009477224, "grad_norm": 0.3309856057167053, "learning_rate": 2.2338555598012993e-05, "loss": 0.065, "step": 5850 }, { "epoch": 1.7887496178538673, "grad_norm": 0.2765483856201172, "learning_rate": 2.234237676729079e-05, "loss": 0.0515, "step": 5851 }, { "epoch": 1.7890553347600122, "grad_norm": 0.5722607970237732, "learning_rate": 2.234619793656859e-05, "loss": 0.0834, "step": 5852 }, { "epoch": 1.789361051666157, "grad_norm": 1.8791929483413696, "learning_rate": 2.235001910584639e-05, "loss": 0.063, "step": 5853 }, { "epoch": 1.789666768572302, "grad_norm": 0.43426334857940674, "learning_rate": 2.2353840275124188e-05, "loss": 0.0972, "step": 5854 }, { "epoch": 1.789972485478447, "grad_norm": 0.359157532453537, "learning_rate": 2.2357661444401986e-05, "loss": 0.0845, "step": 5855 }, { "epoch": 1.790278202384592, "grad_norm": 0.45966804027557373, "learning_rate": 2.2361482613679785e-05, "loss": 0.0888, "step": 5856 }, { "epoch": 1.7905839192907367, "grad_norm": 0.7142115831375122, "learning_rate": 2.2365303782957584e-05, "loss": 0.1062, "step": 5857 }, { "epoch": 1.7908896361968818, "grad_norm": 0.4482080042362213, "learning_rate": 2.2369124952235383e-05, "loss": 0.1236, "step": 5858 }, { "epoch": 1.7911953531030265, "grad_norm": 0.6693216562271118, "learning_rate": 2.237294612151318e-05, "loss": 0.124, "step": 5859 }, { "epoch": 1.7915010700091716, "grad_norm": 0.49907010793685913, "learning_rate": 2.237676729079098e-05, "loss": 0.1477, "step": 5860 }, { "epoch": 1.7918067869153163, "grad_norm": 0.7414682507514954, "learning_rate": 2.238058846006878e-05, "loss": 0.1233, "step": 5861 }, { "epoch": 1.7921125038214614, "grad_norm": 0.588272213935852, "learning_rate": 2.238440962934658e-05, "loss": 0.1434, "step": 5862 }, { "epoch": 1.7924182207276063, "grad_norm": 1.088295340538025, "learning_rate": 2.238823079862438e-05, "loss": 0.2174, "step": 5863 }, { "epoch": 1.7927239376337512, "grad_norm": 0.6777440309524536, "learning_rate": 2.239205196790218e-05, "loss": 0.17, "step": 5864 }, { "epoch": 1.793029654539896, "grad_norm": 1.035912036895752, "learning_rate": 2.2395873137179977e-05, "loss": 0.2203, "step": 5865 }, { "epoch": 1.793335371446041, "grad_norm": 1.0097694396972656, "learning_rate": 2.2399694306457776e-05, "loss": 0.2382, "step": 5866 }, { "epoch": 1.7936410883521858, "grad_norm": 0.8431397676467896, "learning_rate": 2.2403515475735575e-05, "loss": 0.1937, "step": 5867 }, { "epoch": 1.7939468052583307, "grad_norm": 1.335750937461853, "learning_rate": 2.2407336645013373e-05, "loss": 0.1995, "step": 5868 }, { "epoch": 1.7942525221644758, "grad_norm": 1.3541984558105469, "learning_rate": 2.2411157814291172e-05, "loss": 0.2231, "step": 5869 }, { "epoch": 1.7945582390706205, "grad_norm": 1.721065878868103, "learning_rate": 2.241497898356897e-05, "loss": 0.2235, "step": 5870 }, { "epoch": 1.7948639559767656, "grad_norm": 1.5981593132019043, "learning_rate": 2.241880015284677e-05, "loss": 0.3167, "step": 5871 }, { "epoch": 1.7951696728829103, "grad_norm": 0.6422430872917175, "learning_rate": 2.242262132212457e-05, "loss": 0.1603, "step": 5872 }, { "epoch": 1.7954753897890554, "grad_norm": 0.3531329333782196, "learning_rate": 2.242644249140237e-05, "loss": 0.1102, "step": 5873 }, { "epoch": 1.7957811066952, "grad_norm": 0.4634765088558197, "learning_rate": 2.243026366068017e-05, "loss": 0.1041, "step": 5874 }, { "epoch": 1.7960868236013452, "grad_norm": 0.4391295313835144, "learning_rate": 2.2434084829957968e-05, "loss": 0.0636, "step": 5875 }, { "epoch": 1.79639254050749, "grad_norm": 0.9989938735961914, "learning_rate": 2.2437905999235767e-05, "loss": 0.0704, "step": 5876 }, { "epoch": 1.796698257413635, "grad_norm": 0.42166298627853394, "learning_rate": 2.244172716851357e-05, "loss": 0.0574, "step": 5877 }, { "epoch": 1.7970039743197799, "grad_norm": 0.4236437678337097, "learning_rate": 2.2445548337791368e-05, "loss": 0.1136, "step": 5878 }, { "epoch": 1.7973096912259248, "grad_norm": 0.4987923800945282, "learning_rate": 2.2449369507069166e-05, "loss": 0.093, "step": 5879 }, { "epoch": 1.7976154081320697, "grad_norm": 0.44421297311782837, "learning_rate": 2.2453190676346965e-05, "loss": 0.1016, "step": 5880 }, { "epoch": 1.7979211250382146, "grad_norm": 0.5402381420135498, "learning_rate": 2.2457011845624764e-05, "loss": 0.1065, "step": 5881 }, { "epoch": 1.7982268419443597, "grad_norm": 0.5198651552200317, "learning_rate": 2.2460833014902563e-05, "loss": 0.1192, "step": 5882 }, { "epoch": 1.7985325588505043, "grad_norm": 0.5478948354721069, "learning_rate": 2.246465418418036e-05, "loss": 0.0969, "step": 5883 }, { "epoch": 1.7988382757566495, "grad_norm": 0.5815957188606262, "learning_rate": 2.246847535345816e-05, "loss": 0.1031, "step": 5884 }, { "epoch": 1.7991439926627941, "grad_norm": 0.556894063949585, "learning_rate": 2.247229652273596e-05, "loss": 0.1439, "step": 5885 }, { "epoch": 1.7994497095689392, "grad_norm": 0.5779536366462708, "learning_rate": 2.2476117692013758e-05, "loss": 0.1467, "step": 5886 }, { "epoch": 1.799755426475084, "grad_norm": 0.9166035056114197, "learning_rate": 2.2479938861291556e-05, "loss": 0.2056, "step": 5887 }, { "epoch": 1.800061143381229, "grad_norm": 0.6454905867576599, "learning_rate": 2.2483760030569355e-05, "loss": 0.1755, "step": 5888 }, { "epoch": 1.800366860287374, "grad_norm": 0.6041117310523987, "learning_rate": 2.2487581199847154e-05, "loss": 0.2064, "step": 5889 }, { "epoch": 1.8006725771935188, "grad_norm": 1.2116774320602417, "learning_rate": 2.2491402369124953e-05, "loss": 0.1977, "step": 5890 }, { "epoch": 1.8009782940996637, "grad_norm": 0.7441605925559998, "learning_rate": 2.249522353840275e-05, "loss": 0.1984, "step": 5891 }, { "epoch": 1.8012840110058086, "grad_norm": 1.3444339036941528, "learning_rate": 2.249904470768055e-05, "loss": 0.1922, "step": 5892 }, { "epoch": 1.8015897279119535, "grad_norm": 1.5300246477127075, "learning_rate": 2.2502865876958352e-05, "loss": 0.246, "step": 5893 }, { "epoch": 1.8018954448180984, "grad_norm": 1.8901561498641968, "learning_rate": 2.250668704623615e-05, "loss": 0.2171, "step": 5894 }, { "epoch": 1.8022011617242435, "grad_norm": 1.2577672004699707, "learning_rate": 2.251050821551395e-05, "loss": 0.2142, "step": 5895 }, { "epoch": 1.8025068786303882, "grad_norm": 1.3615971803665161, "learning_rate": 2.251432938479175e-05, "loss": 0.288, "step": 5896 }, { "epoch": 1.8028125955365333, "grad_norm": 0.4944442808628082, "learning_rate": 2.2518150554069547e-05, "loss": 0.1497, "step": 5897 }, { "epoch": 1.803118312442678, "grad_norm": 0.7004675269126892, "learning_rate": 2.2521971723347346e-05, "loss": 0.0862, "step": 5898 }, { "epoch": 1.803424029348823, "grad_norm": 0.5106249451637268, "learning_rate": 2.2525792892625145e-05, "loss": 0.0885, "step": 5899 }, { "epoch": 1.8037297462549677, "grad_norm": 0.4771416485309601, "learning_rate": 2.2529614061902943e-05, "loss": 0.0865, "step": 5900 }, { "epoch": 1.8040354631611129, "grad_norm": 0.25663328170776367, "learning_rate": 2.2533435231180742e-05, "loss": 0.0606, "step": 5901 }, { "epoch": 1.8043411800672577, "grad_norm": 0.44941478967666626, "learning_rate": 2.253725640045854e-05, "loss": 0.0654, "step": 5902 }, { "epoch": 1.8046468969734026, "grad_norm": 0.5870100259780884, "learning_rate": 2.254107756973634e-05, "loss": 0.0538, "step": 5903 }, { "epoch": 1.8049526138795475, "grad_norm": 0.3680556118488312, "learning_rate": 2.254489873901414e-05, "loss": 0.0908, "step": 5904 }, { "epoch": 1.8052583307856924, "grad_norm": 0.2977650463581085, "learning_rate": 2.2548719908291937e-05, "loss": 0.0732, "step": 5905 }, { "epoch": 1.8055640476918373, "grad_norm": 0.39700815081596375, "learning_rate": 2.2552541077569736e-05, "loss": 0.0736, "step": 5906 }, { "epoch": 1.8058697645979822, "grad_norm": 5.255709648132324, "learning_rate": 2.2556362246847535e-05, "loss": 0.1178, "step": 5907 }, { "epoch": 1.8061754815041273, "grad_norm": 0.5550448894500732, "learning_rate": 2.2560183416125337e-05, "loss": 0.0936, "step": 5908 }, { "epoch": 1.806481198410272, "grad_norm": 0.3751942813396454, "learning_rate": 2.2564004585403135e-05, "loss": 0.1046, "step": 5909 }, { "epoch": 1.806786915316417, "grad_norm": 0.75841224193573, "learning_rate": 2.2567825754680934e-05, "loss": 0.1599, "step": 5910 }, { "epoch": 1.8070926322225618, "grad_norm": 1.12236750125885, "learning_rate": 2.2571646923958733e-05, "loss": 0.1706, "step": 5911 }, { "epoch": 1.807398349128707, "grad_norm": 0.6064581274986267, "learning_rate": 2.2575468093236532e-05, "loss": 0.1955, "step": 5912 }, { "epoch": 1.8077040660348516, "grad_norm": 0.7133928537368774, "learning_rate": 2.257928926251433e-05, "loss": 0.1773, "step": 5913 }, { "epoch": 1.8080097829409967, "grad_norm": 1.464132308959961, "learning_rate": 2.258311043179213e-05, "loss": 0.1883, "step": 5914 }, { "epoch": 1.8083154998471416, "grad_norm": 1.0504883527755737, "learning_rate": 2.2586931601069928e-05, "loss": 0.246, "step": 5915 }, { "epoch": 1.8086212167532865, "grad_norm": 0.9956822991371155, "learning_rate": 2.2590752770347727e-05, "loss": 0.2557, "step": 5916 }, { "epoch": 1.8089269336594314, "grad_norm": 2.7273988723754883, "learning_rate": 2.2594573939625525e-05, "loss": 0.2368, "step": 5917 }, { "epoch": 1.8092326505655763, "grad_norm": 1.6115093231201172, "learning_rate": 2.2598395108903324e-05, "loss": 0.2188, "step": 5918 }, { "epoch": 1.8095383674717211, "grad_norm": 1.6936719417572021, "learning_rate": 2.2602216278181123e-05, "loss": 0.2826, "step": 5919 }, { "epoch": 1.809844084377866, "grad_norm": 1.1447290182113647, "learning_rate": 2.260603744745892e-05, "loss": 0.2403, "step": 5920 }, { "epoch": 1.8101498012840112, "grad_norm": 1.4659391641616821, "learning_rate": 2.260985861673672e-05, "loss": 0.3239, "step": 5921 }, { "epoch": 1.8104555181901558, "grad_norm": 0.5760611295700073, "learning_rate": 2.261367978601452e-05, "loss": 0.1671, "step": 5922 }, { "epoch": 1.810761235096301, "grad_norm": 0.44105264544487, "learning_rate": 2.2617500955292318e-05, "loss": 0.1087, "step": 5923 }, { "epoch": 1.8110669520024456, "grad_norm": 0.5981799364089966, "learning_rate": 2.262132212457012e-05, "loss": 0.1276, "step": 5924 }, { "epoch": 1.8113726689085907, "grad_norm": 0.26397836208343506, "learning_rate": 2.262514329384792e-05, "loss": 0.0766, "step": 5925 }, { "epoch": 1.8116783858147354, "grad_norm": 0.32751360535621643, "learning_rate": 2.2628964463125718e-05, "loss": 0.0873, "step": 5926 }, { "epoch": 1.8119841027208805, "grad_norm": 0.31205564737319946, "learning_rate": 2.2632785632403516e-05, "loss": 0.0855, "step": 5927 }, { "epoch": 1.8122898196270254, "grad_norm": 0.3058909475803375, "learning_rate": 2.2636606801681315e-05, "loss": 0.0739, "step": 5928 }, { "epoch": 1.8125955365331703, "grad_norm": 0.27767452597618103, "learning_rate": 2.2640427970959114e-05, "loss": 0.0926, "step": 5929 }, { "epoch": 1.8129012534393152, "grad_norm": 0.5123608708381653, "learning_rate": 2.2644249140236912e-05, "loss": 0.1176, "step": 5930 }, { "epoch": 1.81320697034546, "grad_norm": 0.3540153205394745, "learning_rate": 2.264807030951471e-05, "loss": 0.0708, "step": 5931 }, { "epoch": 1.813512687251605, "grad_norm": 0.521935224533081, "learning_rate": 2.265189147879251e-05, "loss": 0.127, "step": 5932 }, { "epoch": 1.8138184041577499, "grad_norm": 0.6086746454238892, "learning_rate": 2.265571264807031e-05, "loss": 0.1539, "step": 5933 }, { "epoch": 1.814124121063895, "grad_norm": 0.5610914826393127, "learning_rate": 2.2659533817348107e-05, "loss": 0.1215, "step": 5934 }, { "epoch": 1.8144298379700396, "grad_norm": 0.4456064999103546, "learning_rate": 2.2663354986625906e-05, "loss": 0.1313, "step": 5935 }, { "epoch": 1.8147355548761848, "grad_norm": 0.5292341709136963, "learning_rate": 2.2667176155903705e-05, "loss": 0.1492, "step": 5936 }, { "epoch": 1.8150412717823294, "grad_norm": 0.8701665997505188, "learning_rate": 2.2670997325181504e-05, "loss": 0.1775, "step": 5937 }, { "epoch": 1.8153469886884745, "grad_norm": 0.8343828916549683, "learning_rate": 2.2674818494459302e-05, "loss": 0.1814, "step": 5938 }, { "epoch": 1.8156527055946192, "grad_norm": 0.7170079946517944, "learning_rate": 2.2678639663737105e-05, "loss": 0.1863, "step": 5939 }, { "epoch": 1.8159584225007643, "grad_norm": 2.637075185775757, "learning_rate": 2.2682460833014903e-05, "loss": 0.1764, "step": 5940 }, { "epoch": 1.8162641394069092, "grad_norm": 0.7747604846954346, "learning_rate": 2.2686282002292702e-05, "loss": 0.1802, "step": 5941 }, { "epoch": 1.8165698563130541, "grad_norm": 1.8888065814971924, "learning_rate": 2.26901031715705e-05, "loss": 0.2352, "step": 5942 }, { "epoch": 1.816875573219199, "grad_norm": 2.3458054065704346, "learning_rate": 2.26939243408483e-05, "loss": 0.2423, "step": 5943 }, { "epoch": 1.817181290125344, "grad_norm": 1.152510404586792, "learning_rate": 2.26977455101261e-05, "loss": 0.2696, "step": 5944 }, { "epoch": 1.8174870070314888, "grad_norm": 2.3650569915771484, "learning_rate": 2.2701566679403897e-05, "loss": 0.2321, "step": 5945 }, { "epoch": 1.8177927239376337, "grad_norm": 2.6973440647125244, "learning_rate": 2.2705387848681696e-05, "loss": 0.3359, "step": 5946 }, { "epoch": 1.8180984408437788, "grad_norm": 0.5127320885658264, "learning_rate": 2.2709209017959495e-05, "loss": 0.1755, "step": 5947 }, { "epoch": 1.8184041577499235, "grad_norm": 0.5036792755126953, "learning_rate": 2.2713030187237293e-05, "loss": 0.0988, "step": 5948 }, { "epoch": 1.8187098746560686, "grad_norm": 0.4437064528465271, "learning_rate": 2.2716851356515092e-05, "loss": 0.0816, "step": 5949 }, { "epoch": 1.8190155915622133, "grad_norm": 0.29185858368873596, "learning_rate": 2.272067252579289e-05, "loss": 0.0703, "step": 5950 }, { "epoch": 1.8193213084683584, "grad_norm": 0.4918385446071625, "learning_rate": 2.2724493695070693e-05, "loss": 0.1078, "step": 5951 }, { "epoch": 1.819627025374503, "grad_norm": 0.34700602293014526, "learning_rate": 2.272831486434849e-05, "loss": 0.0644, "step": 5952 }, { "epoch": 1.8199327422806482, "grad_norm": 0.3347308337688446, "learning_rate": 2.273213603362629e-05, "loss": 0.0889, "step": 5953 }, { "epoch": 1.820238459186793, "grad_norm": 0.2590063512325287, "learning_rate": 2.273595720290409e-05, "loss": 0.066, "step": 5954 }, { "epoch": 1.820544176092938, "grad_norm": 0.764342725276947, "learning_rate": 2.273977837218189e-05, "loss": 0.1036, "step": 5955 }, { "epoch": 1.8208498929990828, "grad_norm": 0.507537305355072, "learning_rate": 2.274359954145969e-05, "loss": 0.0866, "step": 5956 }, { "epoch": 1.8211556099052277, "grad_norm": 0.3078964650630951, "learning_rate": 2.274742071073749e-05, "loss": 0.0886, "step": 5957 }, { "epoch": 1.8214613268113726, "grad_norm": 0.34287068247795105, "learning_rate": 2.2751241880015288e-05, "loss": 0.0916, "step": 5958 }, { "epoch": 1.8217670437175175, "grad_norm": 0.5799626111984253, "learning_rate": 2.2755063049293086e-05, "loss": 0.1319, "step": 5959 }, { "epoch": 1.8220727606236626, "grad_norm": 0.4959411323070526, "learning_rate": 2.2758884218570885e-05, "loss": 0.134, "step": 5960 }, { "epoch": 1.8223784775298073, "grad_norm": 0.621892511844635, "learning_rate": 2.2762705387848684e-05, "loss": 0.1388, "step": 5961 }, { "epoch": 1.8226841944359524, "grad_norm": 0.5363364219665527, "learning_rate": 2.2766526557126482e-05, "loss": 0.1565, "step": 5962 }, { "epoch": 1.822989911342097, "grad_norm": 0.9225705862045288, "learning_rate": 2.277034772640428e-05, "loss": 0.1562, "step": 5963 }, { "epoch": 1.8232956282482422, "grad_norm": 0.6279116272926331, "learning_rate": 2.277416889568208e-05, "loss": 0.2125, "step": 5964 }, { "epoch": 1.8236013451543869, "grad_norm": 0.9207148551940918, "learning_rate": 2.277799006495988e-05, "loss": 0.1961, "step": 5965 }, { "epoch": 1.823907062060532, "grad_norm": 0.8382583856582642, "learning_rate": 2.2781811234237677e-05, "loss": 0.2054, "step": 5966 }, { "epoch": 1.8242127789666769, "grad_norm": 0.8043766617774963, "learning_rate": 2.2785632403515476e-05, "loss": 0.1841, "step": 5967 }, { "epoch": 1.8245184958728218, "grad_norm": 1.0227431058883667, "learning_rate": 2.2789453572793275e-05, "loss": 0.232, "step": 5968 }, { "epoch": 1.8248242127789667, "grad_norm": 0.9342317581176758, "learning_rate": 2.2793274742071074e-05, "loss": 0.2293, "step": 5969 }, { "epoch": 1.8251299296851116, "grad_norm": 1.569667935371399, "learning_rate": 2.2797095911348876e-05, "loss": 0.2864, "step": 5970 }, { "epoch": 1.8254356465912565, "grad_norm": 2.5787160396575928, "learning_rate": 2.2800917080626675e-05, "loss": 0.3317, "step": 5971 }, { "epoch": 1.8257413634974013, "grad_norm": 0.9043715596199036, "learning_rate": 2.2804738249904473e-05, "loss": 0.1725, "step": 5972 }, { "epoch": 1.8260470804035465, "grad_norm": 0.3552992045879364, "learning_rate": 2.2808559419182272e-05, "loss": 0.0994, "step": 5973 }, { "epoch": 1.8263527973096911, "grad_norm": 0.3904891610145569, "learning_rate": 2.281238058846007e-05, "loss": 0.1186, "step": 5974 }, { "epoch": 1.8266585142158362, "grad_norm": 0.5919166207313538, "learning_rate": 2.281620175773787e-05, "loss": 0.0628, "step": 5975 }, { "epoch": 1.826964231121981, "grad_norm": 0.3259316384792328, "learning_rate": 2.2820022927015668e-05, "loss": 0.0881, "step": 5976 }, { "epoch": 1.827269948028126, "grad_norm": 0.3797752857208252, "learning_rate": 2.2823844096293467e-05, "loss": 0.0611, "step": 5977 }, { "epoch": 1.8275756649342707, "grad_norm": 0.32800665497779846, "learning_rate": 2.2827665265571266e-05, "loss": 0.0726, "step": 5978 }, { "epoch": 1.8278813818404158, "grad_norm": 0.5499637722969055, "learning_rate": 2.2831486434849065e-05, "loss": 0.0831, "step": 5979 }, { "epoch": 1.8281870987465607, "grad_norm": 0.3848051130771637, "learning_rate": 2.2835307604126863e-05, "loss": 0.1028, "step": 5980 }, { "epoch": 1.8284928156527056, "grad_norm": 0.3856937289237976, "learning_rate": 2.2839128773404662e-05, "loss": 0.0623, "step": 5981 }, { "epoch": 1.8287985325588505, "grad_norm": 0.4583621025085449, "learning_rate": 2.284294994268246e-05, "loss": 0.1235, "step": 5982 }, { "epoch": 1.8291042494649954, "grad_norm": 0.39856624603271484, "learning_rate": 2.284677111196026e-05, "loss": 0.074, "step": 5983 }, { "epoch": 1.8294099663711403, "grad_norm": 0.577545702457428, "learning_rate": 2.2850592281238058e-05, "loss": 0.0899, "step": 5984 }, { "epoch": 1.8297156832772852, "grad_norm": 0.44580528140068054, "learning_rate": 2.2854413450515857e-05, "loss": 0.1132, "step": 5985 }, { "epoch": 1.8300214001834303, "grad_norm": 0.7754372358322144, "learning_rate": 2.285823461979366e-05, "loss": 0.1853, "step": 5986 }, { "epoch": 1.830327117089575, "grad_norm": 0.46062400937080383, "learning_rate": 2.2862055789071458e-05, "loss": 0.1486, "step": 5987 }, { "epoch": 1.83063283399572, "grad_norm": 0.6113187074661255, "learning_rate": 2.2865876958349257e-05, "loss": 0.2191, "step": 5988 }, { "epoch": 1.8309385509018647, "grad_norm": 0.4782533049583435, "learning_rate": 2.2869698127627055e-05, "loss": 0.1737, "step": 5989 }, { "epoch": 1.8312442678080099, "grad_norm": 0.7474223375320435, "learning_rate": 2.2873519296904854e-05, "loss": 0.187, "step": 5990 }, { "epoch": 1.8315499847141545, "grad_norm": 0.7757048606872559, "learning_rate": 2.2877340466182653e-05, "loss": 0.1994, "step": 5991 }, { "epoch": 1.8318557016202996, "grad_norm": 4.167563438415527, "learning_rate": 2.288116163546045e-05, "loss": 0.2466, "step": 5992 }, { "epoch": 1.8321614185264445, "grad_norm": 0.8352805972099304, "learning_rate": 2.288498280473825e-05, "loss": 0.187, "step": 5993 }, { "epoch": 1.8324671354325894, "grad_norm": 1.2836756706237793, "learning_rate": 2.288880397401605e-05, "loss": 0.2033, "step": 5994 }, { "epoch": 1.8327728523387343, "grad_norm": 1.5363075733184814, "learning_rate": 2.2892625143293848e-05, "loss": 0.2163, "step": 5995 }, { "epoch": 1.8330785692448792, "grad_norm": 1.9450304508209229, "learning_rate": 2.2896446312571647e-05, "loss": 0.2948, "step": 5996 }, { "epoch": 1.8333842861510241, "grad_norm": 0.5917288661003113, "learning_rate": 2.2900267481849445e-05, "loss": 0.1805, "step": 5997 }, { "epoch": 1.833690003057169, "grad_norm": 0.39563238620758057, "learning_rate": 2.2904088651127244e-05, "loss": 0.114, "step": 5998 }, { "epoch": 1.8339957199633141, "grad_norm": 0.38310977816581726, "learning_rate": 2.2907909820405043e-05, "loss": 0.0755, "step": 5999 }, { "epoch": 1.8343014368694588, "grad_norm": 0.485666960477829, "learning_rate": 2.291173098968284e-05, "loss": 0.0805, "step": 6000 }, { "epoch": 1.8343014368694588, "eval_cer": 0.19195771005859055, "eval_loss": 0.26742246747016907, "eval_runtime": 23.282, "eval_samples_per_second": 194.914, "eval_steps_per_second": 0.644, "eval_wer": 0.3433144325577498, "step": 6000 }, { "epoch": 1.834607153775604, "grad_norm": 0.24815250933170319, "learning_rate": 2.2915552158960644e-05, "loss": 0.0533, "step": 6001 }, { "epoch": 1.8349128706817486, "grad_norm": 0.23052364587783813, "learning_rate": 2.2919373328238442e-05, "loss": 0.0593, "step": 6002 }, { "epoch": 1.8352185875878937, "grad_norm": 0.7028427124023438, "learning_rate": 2.292319449751624e-05, "loss": 0.0604, "step": 6003 }, { "epoch": 1.8355243044940384, "grad_norm": 0.4023300111293793, "learning_rate": 2.292701566679404e-05, "loss": 0.109, "step": 6004 }, { "epoch": 1.8358300214001835, "grad_norm": 0.27435094118118286, "learning_rate": 2.293083683607184e-05, "loss": 0.0586, "step": 6005 }, { "epoch": 1.8361357383063284, "grad_norm": 0.36521270871162415, "learning_rate": 2.2934658005349637e-05, "loss": 0.0733, "step": 6006 }, { "epoch": 1.8364414552124733, "grad_norm": 0.4195084869861603, "learning_rate": 2.2938479174627436e-05, "loss": 0.1036, "step": 6007 }, { "epoch": 1.8367471721186182, "grad_norm": 0.4386534094810486, "learning_rate": 2.2942300343905235e-05, "loss": 0.1071, "step": 6008 }, { "epoch": 1.837052889024763, "grad_norm": 0.42417171597480774, "learning_rate": 2.2946121513183034e-05, "loss": 0.1191, "step": 6009 }, { "epoch": 1.837358605930908, "grad_norm": 0.4365463852882385, "learning_rate": 2.2949942682460832e-05, "loss": 0.1416, "step": 6010 }, { "epoch": 1.8376643228370528, "grad_norm": 0.4597840905189514, "learning_rate": 2.295376385173863e-05, "loss": 0.1613, "step": 6011 }, { "epoch": 1.837970039743198, "grad_norm": 0.5031896233558655, "learning_rate": 2.295758502101643e-05, "loss": 0.1796, "step": 6012 }, { "epoch": 1.8382757566493426, "grad_norm": 0.5469785928726196, "learning_rate": 2.296140619029423e-05, "loss": 0.1666, "step": 6013 }, { "epoch": 1.8385814735554877, "grad_norm": 0.5878551602363586, "learning_rate": 2.2965227359572027e-05, "loss": 0.1636, "step": 6014 }, { "epoch": 1.8388871904616324, "grad_norm": 2.122934579849243, "learning_rate": 2.2969048528849826e-05, "loss": 0.1957, "step": 6015 }, { "epoch": 1.8391929073677775, "grad_norm": 0.706180214881897, "learning_rate": 2.2972869698127625e-05, "loss": 0.1993, "step": 6016 }, { "epoch": 1.8394986242739222, "grad_norm": 0.7046698927879333, "learning_rate": 2.2976690867405427e-05, "loss": 0.1964, "step": 6017 }, { "epoch": 1.8398043411800673, "grad_norm": 1.353708267211914, "learning_rate": 2.2980512036683226e-05, "loss": 0.2078, "step": 6018 }, { "epoch": 1.8401100580862122, "grad_norm": 0.9611049890518188, "learning_rate": 2.2984333205961024e-05, "loss": 0.2496, "step": 6019 }, { "epoch": 1.840415774992357, "grad_norm": 1.2422980070114136, "learning_rate": 2.2988154375238823e-05, "loss": 0.2861, "step": 6020 }, { "epoch": 1.840721491898502, "grad_norm": 1.5035769939422607, "learning_rate": 2.2991975544516622e-05, "loss": 0.3464, "step": 6021 }, { "epoch": 1.8410272088046469, "grad_norm": 0.4039274752140045, "learning_rate": 2.299579671379442e-05, "loss": 0.215, "step": 6022 }, { "epoch": 1.8413329257107918, "grad_norm": 0.2809755504131317, "learning_rate": 2.299961788307222e-05, "loss": 0.1068, "step": 6023 }, { "epoch": 1.8416386426169367, "grad_norm": 0.29740795493125916, "learning_rate": 2.3003439052350018e-05, "loss": 0.0941, "step": 6024 }, { "epoch": 1.8419443595230818, "grad_norm": 0.3050239384174347, "learning_rate": 2.3007260221627817e-05, "loss": 0.0776, "step": 6025 }, { "epoch": 1.8422500764292264, "grad_norm": 0.26606670022010803, "learning_rate": 2.3011081390905616e-05, "loss": 0.0725, "step": 6026 }, { "epoch": 1.8425557933353716, "grad_norm": 0.3418686091899872, "learning_rate": 2.3014902560183414e-05, "loss": 0.0943, "step": 6027 }, { "epoch": 1.8428615102415162, "grad_norm": 0.4112890958786011, "learning_rate": 2.3018723729461213e-05, "loss": 0.0606, "step": 6028 }, { "epoch": 1.8431672271476613, "grad_norm": 0.27151647210121155, "learning_rate": 2.3022544898739015e-05, "loss": 0.0633, "step": 6029 }, { "epoch": 1.843472944053806, "grad_norm": 0.5808164477348328, "learning_rate": 2.3026366068016814e-05, "loss": 0.1206, "step": 6030 }, { "epoch": 1.8437786609599511, "grad_norm": 0.49281537532806396, "learning_rate": 2.3030187237294613e-05, "loss": 0.0907, "step": 6031 }, { "epoch": 1.844084377866096, "grad_norm": 0.5526334047317505, "learning_rate": 2.303400840657241e-05, "loss": 0.1091, "step": 6032 }, { "epoch": 1.844390094772241, "grad_norm": 0.40409213304519653, "learning_rate": 2.3037829575850214e-05, "loss": 0.1074, "step": 6033 }, { "epoch": 1.8446958116783858, "grad_norm": 0.4875679314136505, "learning_rate": 2.3041650745128012e-05, "loss": 0.1256, "step": 6034 }, { "epoch": 1.8450015285845307, "grad_norm": 0.5249391794204712, "learning_rate": 2.304547191440581e-05, "loss": 0.1225, "step": 6035 }, { "epoch": 1.8453072454906756, "grad_norm": 0.8566056489944458, "learning_rate": 2.304929308368361e-05, "loss": 0.1567, "step": 6036 }, { "epoch": 1.8456129623968205, "grad_norm": 0.5085504651069641, "learning_rate": 2.305311425296141e-05, "loss": 0.144, "step": 6037 }, { "epoch": 1.8459186793029656, "grad_norm": 0.8800804018974304, "learning_rate": 2.3056935422239207e-05, "loss": 0.2, "step": 6038 }, { "epoch": 1.8462243962091103, "grad_norm": 1.4566508531570435, "learning_rate": 2.3060756591517006e-05, "loss": 0.2287, "step": 6039 }, { "epoch": 1.8465301131152554, "grad_norm": 1.6660709381103516, "learning_rate": 2.3064577760794805e-05, "loss": 0.2142, "step": 6040 }, { "epoch": 1.8468358300214, "grad_norm": 0.7256160974502563, "learning_rate": 2.3068398930072604e-05, "loss": 0.235, "step": 6041 }, { "epoch": 1.8471415469275452, "grad_norm": 1.6429780721664429, "learning_rate": 2.3072220099350402e-05, "loss": 0.1971, "step": 6042 }, { "epoch": 1.8474472638336898, "grad_norm": 0.8803927302360535, "learning_rate": 2.30760412686282e-05, "loss": 0.2534, "step": 6043 }, { "epoch": 1.847752980739835, "grad_norm": 2.4318559169769287, "learning_rate": 2.3079862437906e-05, "loss": 0.4107, "step": 6044 }, { "epoch": 1.8480586976459799, "grad_norm": 1.1700897216796875, "learning_rate": 2.30836836071838e-05, "loss": 0.265, "step": 6045 }, { "epoch": 1.8483644145521247, "grad_norm": 1.6476821899414062, "learning_rate": 2.3087504776461597e-05, "loss": 0.3109, "step": 6046 }, { "epoch": 1.8486701314582696, "grad_norm": 0.5167468786239624, "learning_rate": 2.3091325945739396e-05, "loss": 0.1883, "step": 6047 }, { "epoch": 1.8489758483644145, "grad_norm": 0.3587963283061981, "learning_rate": 2.3095147115017198e-05, "loss": 0.1016, "step": 6048 }, { "epoch": 1.8492815652705594, "grad_norm": 0.7049528956413269, "learning_rate": 2.3098968284294997e-05, "loss": 0.0826, "step": 6049 }, { "epoch": 1.8495872821767043, "grad_norm": 0.3398105502128601, "learning_rate": 2.3102789453572796e-05, "loss": 0.08, "step": 6050 }, { "epoch": 1.8498929990828494, "grad_norm": 0.29673492908477783, "learning_rate": 2.3106610622850594e-05, "loss": 0.0531, "step": 6051 }, { "epoch": 1.850198715988994, "grad_norm": 0.3619558811187744, "learning_rate": 2.3110431792128393e-05, "loss": 0.0685, "step": 6052 }, { "epoch": 1.8505044328951392, "grad_norm": 0.5296640992164612, "learning_rate": 2.3114252961406192e-05, "loss": 0.0693, "step": 6053 }, { "epoch": 1.8508101498012839, "grad_norm": 0.2668273150920868, "learning_rate": 2.311807413068399e-05, "loss": 0.0838, "step": 6054 }, { "epoch": 1.851115866707429, "grad_norm": 0.29522639513015747, "learning_rate": 2.312189529996179e-05, "loss": 0.0613, "step": 6055 }, { "epoch": 1.8514215836135737, "grad_norm": 0.45995473861694336, "learning_rate": 2.3125716469239588e-05, "loss": 0.1064, "step": 6056 }, { "epoch": 1.8517273005197188, "grad_norm": 0.4181731641292572, "learning_rate": 2.3129537638517387e-05, "loss": 0.118, "step": 6057 }, { "epoch": 1.8520330174258637, "grad_norm": 0.3703547418117523, "learning_rate": 2.3133358807795186e-05, "loss": 0.1249, "step": 6058 }, { "epoch": 1.8523387343320086, "grad_norm": 0.5266970992088318, "learning_rate": 2.3137179977072984e-05, "loss": 0.1125, "step": 6059 }, { "epoch": 1.8526444512381535, "grad_norm": 0.42795971035957336, "learning_rate": 2.3141001146350783e-05, "loss": 0.1509, "step": 6060 }, { "epoch": 1.8529501681442984, "grad_norm": 0.573658287525177, "learning_rate": 2.3144822315628582e-05, "loss": 0.1405, "step": 6061 }, { "epoch": 1.8532558850504433, "grad_norm": 1.063928484916687, "learning_rate": 2.314864348490638e-05, "loss": 0.2056, "step": 6062 }, { "epoch": 1.8535616019565881, "grad_norm": 0.6591883897781372, "learning_rate": 2.3152464654184183e-05, "loss": 0.1895, "step": 6063 }, { "epoch": 1.8538673188627333, "grad_norm": 0.8760790228843689, "learning_rate": 2.315628582346198e-05, "loss": 0.17, "step": 6064 }, { "epoch": 1.854173035768878, "grad_norm": 1.2016613483428955, "learning_rate": 2.316010699273978e-05, "loss": 0.2067, "step": 6065 }, { "epoch": 1.854478752675023, "grad_norm": 0.7233198881149292, "learning_rate": 2.316392816201758e-05, "loss": 0.2445, "step": 6066 }, { "epoch": 1.8547844695811677, "grad_norm": 0.807338535785675, "learning_rate": 2.3167749331295378e-05, "loss": 0.2908, "step": 6067 }, { "epoch": 1.8550901864873128, "grad_norm": 3.3646016120910645, "learning_rate": 2.3171570500573177e-05, "loss": 0.1878, "step": 6068 }, { "epoch": 1.8553959033934575, "grad_norm": 0.8479036092758179, "learning_rate": 2.3175391669850975e-05, "loss": 0.2505, "step": 6069 }, { "epoch": 1.8557016202996026, "grad_norm": 1.3307287693023682, "learning_rate": 2.3179212839128774e-05, "loss": 0.2684, "step": 6070 }, { "epoch": 1.8560073372057475, "grad_norm": 2.7427031993865967, "learning_rate": 2.3183034008406573e-05, "loss": 0.3179, "step": 6071 }, { "epoch": 1.8563130541118924, "grad_norm": 0.6609840989112854, "learning_rate": 2.318685517768437e-05, "loss": 0.1835, "step": 6072 }, { "epoch": 1.8566187710180373, "grad_norm": 0.45634782314300537, "learning_rate": 2.319067634696217e-05, "loss": 0.0883, "step": 6073 }, { "epoch": 1.8569244879241822, "grad_norm": 0.41551628708839417, "learning_rate": 2.319449751623997e-05, "loss": 0.0698, "step": 6074 }, { "epoch": 1.857230204830327, "grad_norm": 0.26502111554145813, "learning_rate": 2.3198318685517768e-05, "loss": 0.0733, "step": 6075 }, { "epoch": 1.857535921736472, "grad_norm": 0.3476191461086273, "learning_rate": 2.3202139854795566e-05, "loss": 0.0989, "step": 6076 }, { "epoch": 1.857841638642617, "grad_norm": 0.35617342591285706, "learning_rate": 2.3205961024073365e-05, "loss": 0.0567, "step": 6077 }, { "epoch": 1.8581473555487618, "grad_norm": 0.38514530658721924, "learning_rate": 2.3209782193351164e-05, "loss": 0.0999, "step": 6078 }, { "epoch": 1.8584530724549069, "grad_norm": 0.5151987075805664, "learning_rate": 2.3213603362628966e-05, "loss": 0.0739, "step": 6079 }, { "epoch": 1.8587587893610515, "grad_norm": 0.33939510583877563, "learning_rate": 2.3217424531906765e-05, "loss": 0.083, "step": 6080 }, { "epoch": 1.8590645062671967, "grad_norm": 0.32553890347480774, "learning_rate": 2.3221245701184564e-05, "loss": 0.0721, "step": 6081 }, { "epoch": 1.8593702231733413, "grad_norm": 0.47049781680107117, "learning_rate": 2.3225066870462362e-05, "loss": 0.1069, "step": 6082 }, { "epoch": 1.8596759400794864, "grad_norm": 0.5272727608680725, "learning_rate": 2.322888803974016e-05, "loss": 0.1032, "step": 6083 }, { "epoch": 1.8599816569856313, "grad_norm": 0.45727282762527466, "learning_rate": 2.323270920901796e-05, "loss": 0.159, "step": 6084 }, { "epoch": 1.8602873738917762, "grad_norm": 0.5480346083641052, "learning_rate": 2.323653037829576e-05, "loss": 0.1837, "step": 6085 }, { "epoch": 1.8605930907979211, "grad_norm": 0.9625860452651978, "learning_rate": 2.3240351547573557e-05, "loss": 0.2016, "step": 6086 }, { "epoch": 1.860898807704066, "grad_norm": 0.5417125821113586, "learning_rate": 2.3244172716851356e-05, "loss": 0.145, "step": 6087 }, { "epoch": 1.861204524610211, "grad_norm": 0.5978454351425171, "learning_rate": 2.3247993886129155e-05, "loss": 0.1712, "step": 6088 }, { "epoch": 1.8615102415163558, "grad_norm": 2.2526772022247314, "learning_rate": 2.3251815055406954e-05, "loss": 0.1916, "step": 6089 }, { "epoch": 1.861815958422501, "grad_norm": 0.6484524607658386, "learning_rate": 2.3255636224684752e-05, "loss": 0.1851, "step": 6090 }, { "epoch": 1.8621216753286456, "grad_norm": 0.770784318447113, "learning_rate": 2.325945739396255e-05, "loss": 0.235, "step": 6091 }, { "epoch": 1.8624273922347907, "grad_norm": 0.9086342453956604, "learning_rate": 2.326327856324035e-05, "loss": 0.231, "step": 6092 }, { "epoch": 1.8627331091409354, "grad_norm": 1.508226990699768, "learning_rate": 2.326709973251815e-05, "loss": 0.2367, "step": 6093 }, { "epoch": 1.8630388260470805, "grad_norm": 1.518229365348816, "learning_rate": 2.327092090179595e-05, "loss": 0.2446, "step": 6094 }, { "epoch": 1.8633445429532252, "grad_norm": 1.253859281539917, "learning_rate": 2.327474207107375e-05, "loss": 0.2975, "step": 6095 }, { "epoch": 1.8636502598593703, "grad_norm": 2.4373972415924072, "learning_rate": 2.3278563240351548e-05, "loss": 0.3051, "step": 6096 }, { "epoch": 1.8639559767655152, "grad_norm": 1.1029483079910278, "learning_rate": 2.3282384409629347e-05, "loss": 0.1652, "step": 6097 }, { "epoch": 1.86426169367166, "grad_norm": 0.5927751660346985, "learning_rate": 2.3286205578907146e-05, "loss": 0.1189, "step": 6098 }, { "epoch": 1.864567410577805, "grad_norm": 0.25625884532928467, "learning_rate": 2.3290026748184944e-05, "loss": 0.0811, "step": 6099 }, { "epoch": 1.8648731274839498, "grad_norm": 0.381960928440094, "learning_rate": 2.3293847917462743e-05, "loss": 0.0791, "step": 6100 }, { "epoch": 1.8651788443900947, "grad_norm": 0.9638153314590454, "learning_rate": 2.3297669086740542e-05, "loss": 0.0644, "step": 6101 }, { "epoch": 1.8654845612962396, "grad_norm": 0.37526464462280273, "learning_rate": 2.330149025601834e-05, "loss": 0.0988, "step": 6102 }, { "epoch": 1.8657902782023847, "grad_norm": 0.379272997379303, "learning_rate": 2.330531142529614e-05, "loss": 0.094, "step": 6103 }, { "epoch": 1.8660959951085294, "grad_norm": 0.4647672772407532, "learning_rate": 2.3309132594573938e-05, "loss": 0.1185, "step": 6104 }, { "epoch": 1.8664017120146745, "grad_norm": 0.4536495506763458, "learning_rate": 2.3312953763851737e-05, "loss": 0.0636, "step": 6105 }, { "epoch": 1.8667074289208192, "grad_norm": 0.39094820618629456, "learning_rate": 2.3316774933129536e-05, "loss": 0.088, "step": 6106 }, { "epoch": 1.8670131458269643, "grad_norm": 0.4619564712047577, "learning_rate": 2.3320596102407338e-05, "loss": 0.0863, "step": 6107 }, { "epoch": 1.867318862733109, "grad_norm": 0.4612670838832855, "learning_rate": 2.3324417271685136e-05, "loss": 0.1117, "step": 6108 }, { "epoch": 1.867624579639254, "grad_norm": 0.6628933548927307, "learning_rate": 2.3328238440962935e-05, "loss": 0.1501, "step": 6109 }, { "epoch": 1.867930296545399, "grad_norm": 0.41032546758651733, "learning_rate": 2.3332059610240737e-05, "loss": 0.1169, "step": 6110 }, { "epoch": 1.8682360134515439, "grad_norm": 0.7370903491973877, "learning_rate": 2.3335880779518536e-05, "loss": 0.1443, "step": 6111 }, { "epoch": 1.8685417303576888, "grad_norm": 1.0438936948776245, "learning_rate": 2.3339701948796335e-05, "loss": 0.1432, "step": 6112 }, { "epoch": 1.8688474472638337, "grad_norm": 0.9412246346473694, "learning_rate": 2.3343523118074134e-05, "loss": 0.2095, "step": 6113 }, { "epoch": 1.8691531641699786, "grad_norm": 0.7576168775558472, "learning_rate": 2.3347344287351932e-05, "loss": 0.1934, "step": 6114 }, { "epoch": 1.8694588810761235, "grad_norm": 1.3182241916656494, "learning_rate": 2.335116545662973e-05, "loss": 0.2253, "step": 6115 }, { "epoch": 1.8697645979822686, "grad_norm": 0.7830142974853516, "learning_rate": 2.335498662590753e-05, "loss": 0.2287, "step": 6116 }, { "epoch": 1.8700703148884132, "grad_norm": 0.783761203289032, "learning_rate": 2.335880779518533e-05, "loss": 0.215, "step": 6117 }, { "epoch": 1.8703760317945584, "grad_norm": 1.917513132095337, "learning_rate": 2.3362628964463127e-05, "loss": 0.2454, "step": 6118 }, { "epoch": 1.870681748700703, "grad_norm": 1.1069287061691284, "learning_rate": 2.3366450133740926e-05, "loss": 0.2317, "step": 6119 }, { "epoch": 1.8709874656068481, "grad_norm": 1.1889102458953857, "learning_rate": 2.3370271303018725e-05, "loss": 0.2438, "step": 6120 }, { "epoch": 1.8712931825129928, "grad_norm": 1.2921924591064453, "learning_rate": 2.3374092472296524e-05, "loss": 0.2763, "step": 6121 }, { "epoch": 1.871598899419138, "grad_norm": 1.2166037559509277, "learning_rate": 2.3377913641574322e-05, "loss": 0.1839, "step": 6122 }, { "epoch": 1.8719046163252828, "grad_norm": 0.3166516125202179, "learning_rate": 2.338173481085212e-05, "loss": 0.092, "step": 6123 }, { "epoch": 1.8722103332314277, "grad_norm": 0.2885296642780304, "learning_rate": 2.338555598012992e-05, "loss": 0.0747, "step": 6124 }, { "epoch": 1.8725160501375726, "grad_norm": 0.27729353308677673, "learning_rate": 2.338937714940772e-05, "loss": 0.087, "step": 6125 }, { "epoch": 1.8728217670437175, "grad_norm": 0.3377351760864258, "learning_rate": 2.339319831868552e-05, "loss": 0.0613, "step": 6126 }, { "epoch": 1.8731274839498624, "grad_norm": 0.4633639454841614, "learning_rate": 2.339701948796332e-05, "loss": 0.0694, "step": 6127 }, { "epoch": 1.8734332008560073, "grad_norm": 0.5172451734542847, "learning_rate": 2.3400840657241118e-05, "loss": 0.0552, "step": 6128 }, { "epoch": 1.8737389177621524, "grad_norm": 0.3487568199634552, "learning_rate": 2.3404661826518917e-05, "loss": 0.0726, "step": 6129 }, { "epoch": 1.874044634668297, "grad_norm": 0.43810248374938965, "learning_rate": 2.3408482995796716e-05, "loss": 0.0931, "step": 6130 }, { "epoch": 1.8743503515744422, "grad_norm": 0.34903597831726074, "learning_rate": 2.3412304165074514e-05, "loss": 0.0836, "step": 6131 }, { "epoch": 1.8746560684805869, "grad_norm": 0.43768954277038574, "learning_rate": 2.3416125334352313e-05, "loss": 0.1144, "step": 6132 }, { "epoch": 1.874961785386732, "grad_norm": 0.5261037945747375, "learning_rate": 2.3419946503630112e-05, "loss": 0.0894, "step": 6133 }, { "epoch": 1.8752675022928766, "grad_norm": 0.40334412455558777, "learning_rate": 2.342376767290791e-05, "loss": 0.1307, "step": 6134 }, { "epoch": 1.8755732191990218, "grad_norm": 0.45711272954940796, "learning_rate": 2.342758884218571e-05, "loss": 0.1429, "step": 6135 }, { "epoch": 1.8758789361051666, "grad_norm": 0.37845832109451294, "learning_rate": 2.3431410011463508e-05, "loss": 0.1171, "step": 6136 }, { "epoch": 1.8761846530113115, "grad_norm": 0.7684354186058044, "learning_rate": 2.3435231180741307e-05, "loss": 0.1718, "step": 6137 }, { "epoch": 1.8764903699174564, "grad_norm": 0.7282891869544983, "learning_rate": 2.3439052350019106e-05, "loss": 0.2181, "step": 6138 }, { "epoch": 1.8767960868236013, "grad_norm": 0.5908802151679993, "learning_rate": 2.3442873519296904e-05, "loss": 0.1733, "step": 6139 }, { "epoch": 1.8771018037297462, "grad_norm": 0.5911440849304199, "learning_rate": 2.3446694688574703e-05, "loss": 0.2018, "step": 6140 }, { "epoch": 1.8774075206358911, "grad_norm": 0.9210339188575745, "learning_rate": 2.3450515857852505e-05, "loss": 0.2511, "step": 6141 }, { "epoch": 1.8777132375420362, "grad_norm": 0.9261724948883057, "learning_rate": 2.3454337027130304e-05, "loss": 0.2502, "step": 6142 }, { "epoch": 1.878018954448181, "grad_norm": 0.9901486039161682, "learning_rate": 2.3458158196408103e-05, "loss": 0.3018, "step": 6143 }, { "epoch": 1.878324671354326, "grad_norm": 1.091254711151123, "learning_rate": 2.34619793656859e-05, "loss": 0.3194, "step": 6144 }, { "epoch": 1.8786303882604707, "grad_norm": 1.0374457836151123, "learning_rate": 2.34658005349637e-05, "loss": 0.2874, "step": 6145 }, { "epoch": 1.8789361051666158, "grad_norm": 1.9189966917037964, "learning_rate": 2.34696217042415e-05, "loss": 0.2943, "step": 6146 }, { "epoch": 1.8792418220727605, "grad_norm": 0.33431798219680786, "learning_rate": 2.3473442873519298e-05, "loss": 0.1748, "step": 6147 }, { "epoch": 1.8795475389789056, "grad_norm": 0.2523156404495239, "learning_rate": 2.3477264042797096e-05, "loss": 0.0879, "step": 6148 }, { "epoch": 1.8798532558850505, "grad_norm": 0.4419257342815399, "learning_rate": 2.3481085212074895e-05, "loss": 0.0855, "step": 6149 }, { "epoch": 1.8801589727911954, "grad_norm": 2.228261709213257, "learning_rate": 2.3484906381352694e-05, "loss": 0.0988, "step": 6150 }, { "epoch": 1.8804646896973403, "grad_norm": 0.3143896758556366, "learning_rate": 2.3488727550630493e-05, "loss": 0.085, "step": 6151 }, { "epoch": 1.8807704066034852, "grad_norm": 0.2719569206237793, "learning_rate": 2.349254871990829e-05, "loss": 0.0809, "step": 6152 }, { "epoch": 1.88107612350963, "grad_norm": 0.39686620235443115, "learning_rate": 2.349636988918609e-05, "loss": 0.0813, "step": 6153 }, { "epoch": 1.881381840415775, "grad_norm": 0.5021384358406067, "learning_rate": 2.350019105846389e-05, "loss": 0.0771, "step": 6154 }, { "epoch": 1.88168755732192, "grad_norm": 0.36900368332862854, "learning_rate": 2.3504012227741688e-05, "loss": 0.1012, "step": 6155 }, { "epoch": 1.8819932742280647, "grad_norm": 0.4269985258579254, "learning_rate": 2.350783339701949e-05, "loss": 0.1051, "step": 6156 }, { "epoch": 1.8822989911342098, "grad_norm": 0.3077365458011627, "learning_rate": 2.351165456629729e-05, "loss": 0.0935, "step": 6157 }, { "epoch": 1.8826047080403545, "grad_norm": 0.36300331354141235, "learning_rate": 2.3515475735575087e-05, "loss": 0.0924, "step": 6158 }, { "epoch": 1.8829104249464996, "grad_norm": 0.44892674684524536, "learning_rate": 2.3519296904852886e-05, "loss": 0.1116, "step": 6159 }, { "epoch": 1.8832161418526443, "grad_norm": 0.5969120264053345, "learning_rate": 2.3523118074130685e-05, "loss": 0.1652, "step": 6160 }, { "epoch": 1.8835218587587894, "grad_norm": 0.4726010859012604, "learning_rate": 2.3526939243408483e-05, "loss": 0.1439, "step": 6161 }, { "epoch": 1.8838275756649343, "grad_norm": 0.47112664580345154, "learning_rate": 2.3530760412686282e-05, "loss": 0.1834, "step": 6162 }, { "epoch": 1.8841332925710792, "grad_norm": 0.7262343168258667, "learning_rate": 2.353458158196408e-05, "loss": 0.2177, "step": 6163 }, { "epoch": 1.884439009477224, "grad_norm": 0.9395480155944824, "learning_rate": 2.353840275124188e-05, "loss": 0.1884, "step": 6164 }, { "epoch": 1.884744726383369, "grad_norm": 1.0403962135314941, "learning_rate": 2.354222392051968e-05, "loss": 0.2113, "step": 6165 }, { "epoch": 1.8850504432895139, "grad_norm": 0.7155374884605408, "learning_rate": 2.3546045089797477e-05, "loss": 0.2346, "step": 6166 }, { "epoch": 1.8853561601956588, "grad_norm": 0.7361756563186646, "learning_rate": 2.3549866259075276e-05, "loss": 0.2144, "step": 6167 }, { "epoch": 1.8856618771018039, "grad_norm": 1.117415428161621, "learning_rate": 2.3553687428353075e-05, "loss": 0.2183, "step": 6168 }, { "epoch": 1.8859675940079486, "grad_norm": 1.2156795263290405, "learning_rate": 2.3557508597630873e-05, "loss": 0.2664, "step": 6169 }, { "epoch": 1.8862733109140937, "grad_norm": 1.8567301034927368, "learning_rate": 2.3561329766908672e-05, "loss": 0.2496, "step": 6170 }, { "epoch": 1.8865790278202383, "grad_norm": 2.3072304725646973, "learning_rate": 2.356515093618647e-05, "loss": 0.3091, "step": 6171 }, { "epoch": 1.8868847447263835, "grad_norm": 0.4018270969390869, "learning_rate": 2.3568972105464273e-05, "loss": 0.1672, "step": 6172 }, { "epoch": 1.8871904616325281, "grad_norm": 0.2849985659122467, "learning_rate": 2.3572793274742072e-05, "loss": 0.098, "step": 6173 }, { "epoch": 1.8874961785386732, "grad_norm": 0.5963870286941528, "learning_rate": 2.357661444401987e-05, "loss": 0.0793, "step": 6174 }, { "epoch": 1.8878018954448181, "grad_norm": 0.6906450390815735, "learning_rate": 2.358043561329767e-05, "loss": 0.092, "step": 6175 }, { "epoch": 1.888107612350963, "grad_norm": 0.4033399820327759, "learning_rate": 2.3584256782575468e-05, "loss": 0.062, "step": 6176 }, { "epoch": 1.888413329257108, "grad_norm": 0.4160445034503937, "learning_rate": 2.3588077951853267e-05, "loss": 0.0796, "step": 6177 }, { "epoch": 1.8887190461632528, "grad_norm": 0.9556686878204346, "learning_rate": 2.3591899121131066e-05, "loss": 0.054, "step": 6178 }, { "epoch": 1.8890247630693977, "grad_norm": 0.7496342658996582, "learning_rate": 2.3595720290408864e-05, "loss": 0.091, "step": 6179 }, { "epoch": 1.8893304799755426, "grad_norm": 0.34428226947784424, "learning_rate": 2.3599541459686663e-05, "loss": 0.0722, "step": 6180 }, { "epoch": 1.8896361968816877, "grad_norm": 0.28131604194641113, "learning_rate": 2.3603362628964462e-05, "loss": 0.0711, "step": 6181 }, { "epoch": 1.8899419137878324, "grad_norm": 0.4007118344306946, "learning_rate": 2.360718379824226e-05, "loss": 0.1142, "step": 6182 }, { "epoch": 1.8902476306939775, "grad_norm": 0.5233922004699707, "learning_rate": 2.361100496752006e-05, "loss": 0.114, "step": 6183 }, { "epoch": 1.8905533476001222, "grad_norm": 0.4069688022136688, "learning_rate": 2.361482613679786e-05, "loss": 0.1268, "step": 6184 }, { "epoch": 1.8908590645062673, "grad_norm": 0.5490656495094299, "learning_rate": 2.361864730607566e-05, "loss": 0.1501, "step": 6185 }, { "epoch": 1.891164781412412, "grad_norm": 0.9077187776565552, "learning_rate": 2.362246847535346e-05, "loss": 0.1245, "step": 6186 }, { "epoch": 1.891470498318557, "grad_norm": 0.5418707728385925, "learning_rate": 2.3626289644631258e-05, "loss": 0.1523, "step": 6187 }, { "epoch": 1.891776215224702, "grad_norm": 0.731663167476654, "learning_rate": 2.363011081390906e-05, "loss": 0.1895, "step": 6188 }, { "epoch": 1.8920819321308469, "grad_norm": 0.7320930361747742, "learning_rate": 2.363393198318686e-05, "loss": 0.1811, "step": 6189 }, { "epoch": 1.8923876490369917, "grad_norm": 0.5933003425598145, "learning_rate": 2.3637753152464657e-05, "loss": 0.1772, "step": 6190 }, { "epoch": 1.8926933659431366, "grad_norm": 0.7650347948074341, "learning_rate": 2.3641574321742456e-05, "loss": 0.1923, "step": 6191 }, { "epoch": 1.8929990828492815, "grad_norm": 1.1398084163665771, "learning_rate": 2.3645395491020255e-05, "loss": 0.2141, "step": 6192 }, { "epoch": 1.8933047997554264, "grad_norm": 1.6113300323486328, "learning_rate": 2.3649216660298053e-05, "loss": 0.2085, "step": 6193 }, { "epoch": 1.8936105166615715, "grad_norm": 1.4514436721801758, "learning_rate": 2.3653037829575852e-05, "loss": 0.2288, "step": 6194 }, { "epoch": 1.8939162335677162, "grad_norm": 1.7324949502944946, "learning_rate": 2.365685899885365e-05, "loss": 0.2611, "step": 6195 }, { "epoch": 1.8942219504738613, "grad_norm": 2.5099546909332275, "learning_rate": 2.366068016813145e-05, "loss": 0.3028, "step": 6196 }, { "epoch": 1.894527667380006, "grad_norm": 0.8243921399116516, "learning_rate": 2.366450133740925e-05, "loss": 0.2024, "step": 6197 }, { "epoch": 1.8948333842861511, "grad_norm": 0.36367306113243103, "learning_rate": 2.3668322506687047e-05, "loss": 0.085, "step": 6198 }, { "epoch": 1.8951391011922958, "grad_norm": 0.36767446994781494, "learning_rate": 2.3672143675964846e-05, "loss": 0.089, "step": 6199 }, { "epoch": 1.895444818098441, "grad_norm": 0.28965142369270325, "learning_rate": 2.3675964845242645e-05, "loss": 0.0711, "step": 6200 }, { "epoch": 1.8957505350045858, "grad_norm": 0.2355983555316925, "learning_rate": 2.3679786014520443e-05, "loss": 0.0567, "step": 6201 }, { "epoch": 1.8960562519107307, "grad_norm": 0.3679572641849518, "learning_rate": 2.3683607183798242e-05, "loss": 0.0822, "step": 6202 }, { "epoch": 1.8963619688168756, "grad_norm": 0.3021470904350281, "learning_rate": 2.3687428353076044e-05, "loss": 0.0554, "step": 6203 }, { "epoch": 1.8966676857230205, "grad_norm": 0.5227674841880798, "learning_rate": 2.3691249522353843e-05, "loss": 0.095, "step": 6204 }, { "epoch": 1.8969734026291654, "grad_norm": 0.3849219083786011, "learning_rate": 2.3695070691631642e-05, "loss": 0.0946, "step": 6205 }, { "epoch": 1.8972791195353103, "grad_norm": 0.6217184066772461, "learning_rate": 2.369889186090944e-05, "loss": 0.0819, "step": 6206 }, { "epoch": 1.8975848364414554, "grad_norm": 0.3693218231201172, "learning_rate": 2.370271303018724e-05, "loss": 0.1074, "step": 6207 }, { "epoch": 1.8978905533476, "grad_norm": 0.4755726158618927, "learning_rate": 2.3706534199465038e-05, "loss": 0.1234, "step": 6208 }, { "epoch": 1.8981962702537452, "grad_norm": 0.42122459411621094, "learning_rate": 2.3710355368742837e-05, "loss": 0.0972, "step": 6209 }, { "epoch": 1.8985019871598898, "grad_norm": 0.5083503127098083, "learning_rate": 2.3714176538020636e-05, "loss": 0.1834, "step": 6210 }, { "epoch": 1.898807704066035, "grad_norm": 0.4529668390750885, "learning_rate": 2.3717997707298434e-05, "loss": 0.1382, "step": 6211 }, { "epoch": 1.8991134209721796, "grad_norm": 0.5395588278770447, "learning_rate": 2.3721818876576233e-05, "loss": 0.1339, "step": 6212 }, { "epoch": 1.8994191378783247, "grad_norm": 0.6238298416137695, "learning_rate": 2.3725640045854032e-05, "loss": 0.1925, "step": 6213 }, { "epoch": 1.8997248547844696, "grad_norm": 1.176436185836792, "learning_rate": 2.372946121513183e-05, "loss": 0.2193, "step": 6214 }, { "epoch": 1.9000305716906145, "grad_norm": 0.8439923524856567, "learning_rate": 2.373328238440963e-05, "loss": 0.196, "step": 6215 }, { "epoch": 1.9003362885967594, "grad_norm": 0.6802811026573181, "learning_rate": 2.3737103553687428e-05, "loss": 0.2364, "step": 6216 }, { "epoch": 1.9006420055029043, "grad_norm": 0.8301566243171692, "learning_rate": 2.3740924722965227e-05, "loss": 0.2007, "step": 6217 }, { "epoch": 1.9009477224090492, "grad_norm": 1.2253658771514893, "learning_rate": 2.3744745892243025e-05, "loss": 0.2242, "step": 6218 }, { "epoch": 1.901253439315194, "grad_norm": 1.3864530324935913, "learning_rate": 2.3748567061520828e-05, "loss": 0.2326, "step": 6219 }, { "epoch": 1.9015591562213392, "grad_norm": 1.9632599353790283, "learning_rate": 2.3752388230798626e-05, "loss": 0.2964, "step": 6220 }, { "epoch": 1.9018648731274839, "grad_norm": 1.4190123081207275, "learning_rate": 2.3756209400076425e-05, "loss": 0.3207, "step": 6221 }, { "epoch": 1.902170590033629, "grad_norm": 0.37556785345077515, "learning_rate": 2.3760030569354224e-05, "loss": 0.1467, "step": 6222 }, { "epoch": 1.9024763069397737, "grad_norm": 0.43115219473838806, "learning_rate": 2.3763851738632023e-05, "loss": 0.114, "step": 6223 }, { "epoch": 1.9027820238459188, "grad_norm": 0.5242397785186768, "learning_rate": 2.376767290790982e-05, "loss": 0.0847, "step": 6224 }, { "epoch": 1.9030877407520634, "grad_norm": 0.3463306725025177, "learning_rate": 2.377149407718762e-05, "loss": 0.085, "step": 6225 }, { "epoch": 1.9033934576582086, "grad_norm": 0.3236393332481384, "learning_rate": 2.377531524646542e-05, "loss": 0.0853, "step": 6226 }, { "epoch": 1.9036991745643534, "grad_norm": 0.3594732880592346, "learning_rate": 2.3779136415743218e-05, "loss": 0.078, "step": 6227 }, { "epoch": 1.9040048914704983, "grad_norm": 0.31130871176719666, "learning_rate": 2.3782957585021016e-05, "loss": 0.0854, "step": 6228 }, { "epoch": 1.9043106083766432, "grad_norm": 0.3426867425441742, "learning_rate": 2.3786778754298815e-05, "loss": 0.0904, "step": 6229 }, { "epoch": 1.9046163252827881, "grad_norm": 0.38581734895706177, "learning_rate": 2.3790599923576614e-05, "loss": 0.0889, "step": 6230 }, { "epoch": 1.904922042188933, "grad_norm": 0.29743531346321106, "learning_rate": 2.3794421092854413e-05, "loss": 0.061, "step": 6231 }, { "epoch": 1.905227759095078, "grad_norm": 0.4988816976547241, "learning_rate": 2.379824226213221e-05, "loss": 0.1089, "step": 6232 }, { "epoch": 1.905533476001223, "grad_norm": 0.6603725552558899, "learning_rate": 2.380206343141001e-05, "loss": 0.1096, "step": 6233 }, { "epoch": 1.9058391929073677, "grad_norm": 0.36461493372917175, "learning_rate": 2.3805884600687812e-05, "loss": 0.1159, "step": 6234 }, { "epoch": 1.9061449098135128, "grad_norm": 0.45275822281837463, "learning_rate": 2.380970576996561e-05, "loss": 0.1179, "step": 6235 }, { "epoch": 1.9064506267196575, "grad_norm": 0.7151157855987549, "learning_rate": 2.381352693924341e-05, "loss": 0.1867, "step": 6236 }, { "epoch": 1.9067563436258026, "grad_norm": 0.5507813096046448, "learning_rate": 2.381734810852121e-05, "loss": 0.2041, "step": 6237 }, { "epoch": 1.9070620605319473, "grad_norm": 1.3933318853378296, "learning_rate": 2.3821169277799007e-05, "loss": 0.1709, "step": 6238 }, { "epoch": 1.9073677774380924, "grad_norm": 2.6596157550811768, "learning_rate": 2.3824990447076806e-05, "loss": 0.1939, "step": 6239 }, { "epoch": 1.9076734943442373, "grad_norm": 1.035461664199829, "learning_rate": 2.3828811616354605e-05, "loss": 0.2249, "step": 6240 }, { "epoch": 1.9079792112503822, "grad_norm": 0.7187370657920837, "learning_rate": 2.3832632785632403e-05, "loss": 0.2004, "step": 6241 }, { "epoch": 1.908284928156527, "grad_norm": 1.2733266353607178, "learning_rate": 2.3836453954910202e-05, "loss": 0.2138, "step": 6242 }, { "epoch": 1.908590645062672, "grad_norm": 1.6778502464294434, "learning_rate": 2.3840275124188e-05, "loss": 0.2296, "step": 6243 }, { "epoch": 1.9088963619688168, "grad_norm": 1.3895150423049927, "learning_rate": 2.38440962934658e-05, "loss": 0.2145, "step": 6244 }, { "epoch": 1.9092020788749617, "grad_norm": 1.4676145315170288, "learning_rate": 2.38479174627436e-05, "loss": 0.2413, "step": 6245 }, { "epoch": 1.9095077957811069, "grad_norm": 1.84783136844635, "learning_rate": 2.3851738632021397e-05, "loss": 0.283, "step": 6246 }, { "epoch": 1.9098135126872515, "grad_norm": 0.4192536175251007, "learning_rate": 2.3855559801299196e-05, "loss": 0.1816, "step": 6247 }, { "epoch": 1.9101192295933966, "grad_norm": 0.5583586096763611, "learning_rate": 2.3859380970576995e-05, "loss": 0.1105, "step": 6248 }, { "epoch": 1.9104249464995413, "grad_norm": 0.38499709963798523, "learning_rate": 2.3863202139854793e-05, "loss": 0.079, "step": 6249 }, { "epoch": 1.9107306634056864, "grad_norm": 0.43461182713508606, "learning_rate": 2.3867023309132595e-05, "loss": 0.0612, "step": 6250 }, { "epoch": 1.911036380311831, "grad_norm": 0.26779717206954956, "learning_rate": 2.3870844478410394e-05, "loss": 0.078, "step": 6251 }, { "epoch": 1.9113420972179762, "grad_norm": 0.4752318561077118, "learning_rate": 2.3874665647688193e-05, "loss": 0.056, "step": 6252 }, { "epoch": 1.911647814124121, "grad_norm": 0.3842249810695648, "learning_rate": 2.3878486816965992e-05, "loss": 0.0805, "step": 6253 }, { "epoch": 1.911953531030266, "grad_norm": 0.34528616070747375, "learning_rate": 2.388230798624379e-05, "loss": 0.0706, "step": 6254 }, { "epoch": 1.9122592479364109, "grad_norm": 0.31855717301368713, "learning_rate": 2.388612915552159e-05, "loss": 0.0792, "step": 6255 }, { "epoch": 1.9125649648425558, "grad_norm": 0.35161325335502625, "learning_rate": 2.3889950324799388e-05, "loss": 0.0789, "step": 6256 }, { "epoch": 1.9128706817487007, "grad_norm": 1.4172834157943726, "learning_rate": 2.3893771494077187e-05, "loss": 0.1112, "step": 6257 }, { "epoch": 1.9131763986548456, "grad_norm": 0.5422112345695496, "learning_rate": 2.3897592663354985e-05, "loss": 0.0957, "step": 6258 }, { "epoch": 1.9134821155609907, "grad_norm": 0.403214693069458, "learning_rate": 2.3901413832632784e-05, "loss": 0.0832, "step": 6259 }, { "epoch": 1.9137878324671354, "grad_norm": 1.6375287771224976, "learning_rate": 2.3905235001910583e-05, "loss": 0.1554, "step": 6260 }, { "epoch": 1.9140935493732805, "grad_norm": 0.5669798254966736, "learning_rate": 2.390905617118838e-05, "loss": 0.1327, "step": 6261 }, { "epoch": 1.9143992662794251, "grad_norm": 0.7527708411216736, "learning_rate": 2.3912877340466184e-05, "loss": 0.2214, "step": 6262 }, { "epoch": 1.9147049831855703, "grad_norm": 0.82986980676651, "learning_rate": 2.3916698509743983e-05, "loss": 0.1674, "step": 6263 }, { "epoch": 1.915010700091715, "grad_norm": 1.1495851278305054, "learning_rate": 2.392051967902178e-05, "loss": 0.1804, "step": 6264 }, { "epoch": 1.91531641699786, "grad_norm": 0.7946401238441467, "learning_rate": 2.3924340848299583e-05, "loss": 0.2265, "step": 6265 }, { "epoch": 1.915622133904005, "grad_norm": 0.8473116159439087, "learning_rate": 2.3928162017577382e-05, "loss": 0.2315, "step": 6266 }, { "epoch": 1.9159278508101498, "grad_norm": 1.1015578508377075, "learning_rate": 2.393198318685518e-05, "loss": 0.2454, "step": 6267 }, { "epoch": 1.9162335677162947, "grad_norm": 0.8492112159729004, "learning_rate": 2.393580435613298e-05, "loss": 0.2182, "step": 6268 }, { "epoch": 1.9165392846224396, "grad_norm": 1.3183790445327759, "learning_rate": 2.393962552541078e-05, "loss": 0.2367, "step": 6269 }, { "epoch": 1.9168450015285845, "grad_norm": 1.1667304039001465, "learning_rate": 2.3943446694688577e-05, "loss": 0.2439, "step": 6270 }, { "epoch": 1.9171507184347294, "grad_norm": 4.363717079162598, "learning_rate": 2.3947267863966376e-05, "loss": 0.3367, "step": 6271 }, { "epoch": 1.9174564353408745, "grad_norm": 0.7834454774856567, "learning_rate": 2.3951089033244175e-05, "loss": 0.1675, "step": 6272 }, { "epoch": 1.9177621522470192, "grad_norm": 0.3864767551422119, "learning_rate": 2.3954910202521973e-05, "loss": 0.0957, "step": 6273 }, { "epoch": 1.9180678691531643, "grad_norm": 0.2783229947090149, "learning_rate": 2.3958731371799772e-05, "loss": 0.0796, "step": 6274 }, { "epoch": 1.918373586059309, "grad_norm": 1.288156509399414, "learning_rate": 2.396255254107757e-05, "loss": 0.0753, "step": 6275 }, { "epoch": 1.918679302965454, "grad_norm": 0.4017045795917511, "learning_rate": 2.396637371035537e-05, "loss": 0.0744, "step": 6276 }, { "epoch": 1.9189850198715988, "grad_norm": 0.271890789270401, "learning_rate": 2.397019487963317e-05, "loss": 0.0629, "step": 6277 }, { "epoch": 1.9192907367777439, "grad_norm": 0.5696129202842712, "learning_rate": 2.3974016048910967e-05, "loss": 0.0671, "step": 6278 }, { "epoch": 1.9195964536838888, "grad_norm": 0.32505613565444946, "learning_rate": 2.3977837218188766e-05, "loss": 0.0695, "step": 6279 }, { "epoch": 1.9199021705900337, "grad_norm": 0.501740038394928, "learning_rate": 2.3981658387466565e-05, "loss": 0.071, "step": 6280 }, { "epoch": 1.9202078874961785, "grad_norm": 0.6105679869651794, "learning_rate": 2.3985479556744367e-05, "loss": 0.1209, "step": 6281 }, { "epoch": 1.9205136044023234, "grad_norm": 0.38680511713027954, "learning_rate": 2.3989300726022165e-05, "loss": 0.1011, "step": 6282 }, { "epoch": 1.9208193213084683, "grad_norm": 0.3295097351074219, "learning_rate": 2.3993121895299964e-05, "loss": 0.0696, "step": 6283 }, { "epoch": 1.9211250382146132, "grad_norm": 0.4093138873577118, "learning_rate": 2.3996943064577763e-05, "loss": 0.1215, "step": 6284 }, { "epoch": 1.9214307551207583, "grad_norm": 0.8294578790664673, "learning_rate": 2.4000764233855562e-05, "loss": 0.1487, "step": 6285 }, { "epoch": 1.921736472026903, "grad_norm": 0.45892608165740967, "learning_rate": 2.400458540313336e-05, "loss": 0.1228, "step": 6286 }, { "epoch": 1.9220421889330481, "grad_norm": 0.5335274338722229, "learning_rate": 2.400840657241116e-05, "loss": 0.1588, "step": 6287 }, { "epoch": 1.9223479058391928, "grad_norm": 2.3776092529296875, "learning_rate": 2.4012227741688958e-05, "loss": 0.1631, "step": 6288 }, { "epoch": 1.922653622745338, "grad_norm": 0.9512613415718079, "learning_rate": 2.4016048910966757e-05, "loss": 0.2086, "step": 6289 }, { "epoch": 1.9229593396514826, "grad_norm": 3.2534704208374023, "learning_rate": 2.4019870080244555e-05, "loss": 0.2272, "step": 6290 }, { "epoch": 1.9232650565576277, "grad_norm": 0.7222756743431091, "learning_rate": 2.4023691249522354e-05, "loss": 0.1756, "step": 6291 }, { "epoch": 1.9235707734637726, "grad_norm": 0.7818479537963867, "learning_rate": 2.4027512418800153e-05, "loss": 0.186, "step": 6292 }, { "epoch": 1.9238764903699175, "grad_norm": 0.9681180715560913, "learning_rate": 2.403133358807795e-05, "loss": 0.2312, "step": 6293 }, { "epoch": 1.9241822072760624, "grad_norm": 1.4741053581237793, "learning_rate": 2.403515475735575e-05, "loss": 0.2431, "step": 6294 }, { "epoch": 1.9244879241822073, "grad_norm": 1.6540642976760864, "learning_rate": 2.403897592663355e-05, "loss": 0.2601, "step": 6295 }, { "epoch": 1.9247936410883522, "grad_norm": 1.6880269050598145, "learning_rate": 2.404279709591135e-05, "loss": 0.3709, "step": 6296 }, { "epoch": 1.925099357994497, "grad_norm": 0.9517306685447693, "learning_rate": 2.404661826518915e-05, "loss": 0.1803, "step": 6297 }, { "epoch": 1.9254050749006422, "grad_norm": 0.3323294520378113, "learning_rate": 2.405043943446695e-05, "loss": 0.1042, "step": 6298 }, { "epoch": 1.9257107918067868, "grad_norm": 0.3795206546783447, "learning_rate": 2.4054260603744748e-05, "loss": 0.0771, "step": 6299 }, { "epoch": 1.926016508712932, "grad_norm": 0.3179398477077484, "learning_rate": 2.4058081773022546e-05, "loss": 0.0839, "step": 6300 }, { "epoch": 1.9263222256190766, "grad_norm": 0.31688830256462097, "learning_rate": 2.4061902942300345e-05, "loss": 0.073, "step": 6301 }, { "epoch": 1.9266279425252217, "grad_norm": 0.46821147203445435, "learning_rate": 2.4065724111578144e-05, "loss": 0.0728, "step": 6302 }, { "epoch": 1.9269336594313664, "grad_norm": 0.2907589077949524, "learning_rate": 2.4069545280855943e-05, "loss": 0.069, "step": 6303 }, { "epoch": 1.9272393763375115, "grad_norm": 0.4129140079021454, "learning_rate": 2.407336645013374e-05, "loss": 0.078, "step": 6304 }, { "epoch": 1.9275450932436564, "grad_norm": 0.3710465729236603, "learning_rate": 2.407718761941154e-05, "loss": 0.0665, "step": 6305 }, { "epoch": 1.9278508101498013, "grad_norm": 0.38220545649528503, "learning_rate": 2.408100878868934e-05, "loss": 0.0687, "step": 6306 }, { "epoch": 1.9281565270559462, "grad_norm": 0.6528335809707642, "learning_rate": 2.4084829957967137e-05, "loss": 0.0911, "step": 6307 }, { "epoch": 1.928462243962091, "grad_norm": 0.6284512281417847, "learning_rate": 2.4088651127244936e-05, "loss": 0.0971, "step": 6308 }, { "epoch": 1.928767960868236, "grad_norm": 0.7566723227500916, "learning_rate": 2.4092472296522735e-05, "loss": 0.1133, "step": 6309 }, { "epoch": 1.9290736777743809, "grad_norm": 0.9057125449180603, "learning_rate": 2.4096293465800534e-05, "loss": 0.1398, "step": 6310 }, { "epoch": 1.929379394680526, "grad_norm": 0.6125158071517944, "learning_rate": 2.4100114635078332e-05, "loss": 0.1601, "step": 6311 }, { "epoch": 1.9296851115866707, "grad_norm": 0.6731373071670532, "learning_rate": 2.4103935804356135e-05, "loss": 0.1816, "step": 6312 }, { "epoch": 1.9299908284928158, "grad_norm": 0.8910582065582275, "learning_rate": 2.4107756973633933e-05, "loss": 0.1727, "step": 6313 }, { "epoch": 1.9302965453989605, "grad_norm": 0.881213903427124, "learning_rate": 2.4111578142911732e-05, "loss": 0.1802, "step": 6314 }, { "epoch": 1.9306022623051056, "grad_norm": 0.6779544353485107, "learning_rate": 2.411539931218953e-05, "loss": 0.1983, "step": 6315 }, { "epoch": 1.9309079792112502, "grad_norm": 0.9326953291893005, "learning_rate": 2.411922048146733e-05, "loss": 0.2285, "step": 6316 }, { "epoch": 1.9312136961173954, "grad_norm": 8.224325180053711, "learning_rate": 2.412304165074513e-05, "loss": 0.2588, "step": 6317 }, { "epoch": 1.9315194130235402, "grad_norm": 0.747307538986206, "learning_rate": 2.4126862820022927e-05, "loss": 0.2246, "step": 6318 }, { "epoch": 1.9318251299296851, "grad_norm": 3.124185085296631, "learning_rate": 2.4130683989300726e-05, "loss": 0.2239, "step": 6319 }, { "epoch": 1.93213084683583, "grad_norm": 1.343912124633789, "learning_rate": 2.4134505158578525e-05, "loss": 0.2541, "step": 6320 }, { "epoch": 1.932436563741975, "grad_norm": 2.7196948528289795, "learning_rate": 2.4138326327856323e-05, "loss": 0.2788, "step": 6321 }, { "epoch": 1.9327422806481198, "grad_norm": 0.6205378770828247, "learning_rate": 2.4142147497134122e-05, "loss": 0.1978, "step": 6322 }, { "epoch": 1.9330479975542647, "grad_norm": 0.5568069219589233, "learning_rate": 2.414596866641192e-05, "loss": 0.0933, "step": 6323 }, { "epoch": 1.9333537144604098, "grad_norm": 0.6534050107002258, "learning_rate": 2.414978983568972e-05, "loss": 0.096, "step": 6324 }, { "epoch": 1.9336594313665545, "grad_norm": 0.8517934083938599, "learning_rate": 2.4153611004967518e-05, "loss": 0.0839, "step": 6325 }, { "epoch": 1.9339651482726996, "grad_norm": 0.2751946449279785, "learning_rate": 2.4157432174245317e-05, "loss": 0.0731, "step": 6326 }, { "epoch": 1.9342708651788443, "grad_norm": 0.44100239872932434, "learning_rate": 2.416125334352312e-05, "loss": 0.0718, "step": 6327 }, { "epoch": 1.9345765820849894, "grad_norm": 0.4922046959400177, "learning_rate": 2.4165074512800918e-05, "loss": 0.1308, "step": 6328 }, { "epoch": 1.934882298991134, "grad_norm": 0.31674277782440186, "learning_rate": 2.4168895682078717e-05, "loss": 0.0765, "step": 6329 }, { "epoch": 1.9351880158972792, "grad_norm": 0.37844640016555786, "learning_rate": 2.4172716851356515e-05, "loss": 0.1001, "step": 6330 }, { "epoch": 1.935493732803424, "grad_norm": 0.4124774932861328, "learning_rate": 2.4176538020634314e-05, "loss": 0.0787, "step": 6331 }, { "epoch": 1.935799449709569, "grad_norm": 0.6305738091468811, "learning_rate": 2.4180359189912113e-05, "loss": 0.1355, "step": 6332 }, { "epoch": 1.9361051666157139, "grad_norm": 0.4922633171081543, "learning_rate": 2.418418035918991e-05, "loss": 0.0803, "step": 6333 }, { "epoch": 1.9364108835218588, "grad_norm": 0.6487208008766174, "learning_rate": 2.418800152846771e-05, "loss": 0.1183, "step": 6334 }, { "epoch": 1.9367166004280036, "grad_norm": 2.2749545574188232, "learning_rate": 2.419182269774551e-05, "loss": 0.1058, "step": 6335 }, { "epoch": 1.9370223173341485, "grad_norm": 0.6920011043548584, "learning_rate": 2.4195643867023308e-05, "loss": 0.1839, "step": 6336 }, { "epoch": 1.9373280342402937, "grad_norm": 0.7670975923538208, "learning_rate": 2.4199465036301107e-05, "loss": 0.168, "step": 6337 }, { "epoch": 1.9376337511464383, "grad_norm": 1.3647668361663818, "learning_rate": 2.4203286205578905e-05, "loss": 0.1832, "step": 6338 }, { "epoch": 1.9379394680525834, "grad_norm": 1.3749945163726807, "learning_rate": 2.4207107374856704e-05, "loss": 0.1865, "step": 6339 }, { "epoch": 1.938245184958728, "grad_norm": 0.9126365184783936, "learning_rate": 2.4210928544134506e-05, "loss": 0.215, "step": 6340 }, { "epoch": 1.9385509018648732, "grad_norm": 1.209141492843628, "learning_rate": 2.4214749713412305e-05, "loss": 0.1924, "step": 6341 }, { "epoch": 1.938856618771018, "grad_norm": 1.0206501483917236, "learning_rate": 2.4218570882690104e-05, "loss": 0.262, "step": 6342 }, { "epoch": 1.939162335677163, "grad_norm": 0.6950281262397766, "learning_rate": 2.4222392051967906e-05, "loss": 0.2178, "step": 6343 }, { "epoch": 1.939468052583308, "grad_norm": 1.8762916326522827, "learning_rate": 2.4226213221245705e-05, "loss": 0.2351, "step": 6344 }, { "epoch": 1.9397737694894528, "grad_norm": 1.4030523300170898, "learning_rate": 2.4230034390523503e-05, "loss": 0.2553, "step": 6345 }, { "epoch": 1.9400794863955977, "grad_norm": 1.5757360458374023, "learning_rate": 2.4233855559801302e-05, "loss": 0.3183, "step": 6346 }, { "epoch": 1.9403852033017426, "grad_norm": 0.36212223768234253, "learning_rate": 2.42376767290791e-05, "loss": 0.1782, "step": 6347 }, { "epoch": 1.9406909202078875, "grad_norm": 0.3079048991203308, "learning_rate": 2.42414978983569e-05, "loss": 0.0779, "step": 6348 }, { "epoch": 1.9409966371140324, "grad_norm": 0.4732174277305603, "learning_rate": 2.42453190676347e-05, "loss": 0.0953, "step": 6349 }, { "epoch": 1.9413023540201775, "grad_norm": 0.5856958627700806, "learning_rate": 2.4249140236912497e-05, "loss": 0.1016, "step": 6350 }, { "epoch": 1.9416080709263221, "grad_norm": 0.2143266499042511, "learning_rate": 2.4252961406190296e-05, "loss": 0.0625, "step": 6351 }, { "epoch": 1.9419137878324673, "grad_norm": 0.3913695812225342, "learning_rate": 2.4256782575468095e-05, "loss": 0.0611, "step": 6352 }, { "epoch": 1.942219504738612, "grad_norm": 0.3697551190853119, "learning_rate": 2.4260603744745893e-05, "loss": 0.0898, "step": 6353 }, { "epoch": 1.942525221644757, "grad_norm": 0.6667912602424622, "learning_rate": 2.4264424914023692e-05, "loss": 0.0887, "step": 6354 }, { "epoch": 1.9428309385509017, "grad_norm": 0.31474700570106506, "learning_rate": 2.426824608330149e-05, "loss": 0.0799, "step": 6355 }, { "epoch": 1.9431366554570468, "grad_norm": 0.559280514717102, "learning_rate": 2.427206725257929e-05, "loss": 0.1008, "step": 6356 }, { "epoch": 1.9434423723631917, "grad_norm": 0.8460932374000549, "learning_rate": 2.4275888421857088e-05, "loss": 0.1602, "step": 6357 }, { "epoch": 1.9437480892693366, "grad_norm": 0.8333715796470642, "learning_rate": 2.427970959113489e-05, "loss": 0.0924, "step": 6358 }, { "epoch": 1.9440538061754815, "grad_norm": 0.34430694580078125, "learning_rate": 2.428353076041269e-05, "loss": 0.0888, "step": 6359 }, { "epoch": 1.9443595230816264, "grad_norm": 1.228811264038086, "learning_rate": 2.4287351929690488e-05, "loss": 0.1423, "step": 6360 }, { "epoch": 1.9446652399877713, "grad_norm": 0.3731948435306549, "learning_rate": 2.4291173098968287e-05, "loss": 0.1393, "step": 6361 }, { "epoch": 1.9449709568939162, "grad_norm": 0.6394876837730408, "learning_rate": 2.4294994268246085e-05, "loss": 0.1458, "step": 6362 }, { "epoch": 1.9452766738000613, "grad_norm": 1.1052237749099731, "learning_rate": 2.4298815437523884e-05, "loss": 0.1871, "step": 6363 }, { "epoch": 1.945582390706206, "grad_norm": 0.7339423894882202, "learning_rate": 2.4302636606801683e-05, "loss": 0.1925, "step": 6364 }, { "epoch": 1.945888107612351, "grad_norm": 1.7553906440734863, "learning_rate": 2.430645777607948e-05, "loss": 0.2219, "step": 6365 }, { "epoch": 1.9461938245184958, "grad_norm": 0.6954481601715088, "learning_rate": 2.431027894535728e-05, "loss": 0.2163, "step": 6366 }, { "epoch": 1.9464995414246409, "grad_norm": 2.6045241355895996, "learning_rate": 2.431410011463508e-05, "loss": 0.1999, "step": 6367 }, { "epoch": 1.9468052583307855, "grad_norm": 1.7923165559768677, "learning_rate": 2.4317921283912878e-05, "loss": 0.2069, "step": 6368 }, { "epoch": 1.9471109752369307, "grad_norm": 1.0012035369873047, "learning_rate": 2.4321742453190677e-05, "loss": 0.2771, "step": 6369 }, { "epoch": 1.9474166921430756, "grad_norm": 1.104827642440796, "learning_rate": 2.4325563622468475e-05, "loss": 0.2652, "step": 6370 }, { "epoch": 1.9477224090492204, "grad_norm": 1.9280731678009033, "learning_rate": 2.4329384791746274e-05, "loss": 0.2718, "step": 6371 }, { "epoch": 1.9480281259553653, "grad_norm": 0.8480847477912903, "learning_rate": 2.4333205961024073e-05, "loss": 0.1688, "step": 6372 }, { "epoch": 1.9483338428615102, "grad_norm": 0.40534651279449463, "learning_rate": 2.433702713030187e-05, "loss": 0.1162, "step": 6373 }, { "epoch": 1.9486395597676551, "grad_norm": 0.3779584765434265, "learning_rate": 2.4340848299579674e-05, "loss": 0.1049, "step": 6374 }, { "epoch": 1.9489452766738, "grad_norm": 0.3492530882358551, "learning_rate": 2.4344669468857472e-05, "loss": 0.0797, "step": 6375 }, { "epoch": 1.9492509935799451, "grad_norm": 0.2840197682380676, "learning_rate": 2.434849063813527e-05, "loss": 0.0684, "step": 6376 }, { "epoch": 1.9495567104860898, "grad_norm": 0.4590209424495697, "learning_rate": 2.435231180741307e-05, "loss": 0.0574, "step": 6377 }, { "epoch": 1.949862427392235, "grad_norm": 0.3225541114807129, "learning_rate": 2.435613297669087e-05, "loss": 0.0734, "step": 6378 }, { "epoch": 1.9501681442983796, "grad_norm": 1.0296956300735474, "learning_rate": 2.4359954145968667e-05, "loss": 0.0778, "step": 6379 }, { "epoch": 1.9504738612045247, "grad_norm": 0.3237701654434204, "learning_rate": 2.4363775315246466e-05, "loss": 0.0746, "step": 6380 }, { "epoch": 1.9507795781106694, "grad_norm": 0.4579310119152069, "learning_rate": 2.4367596484524265e-05, "loss": 0.0898, "step": 6381 }, { "epoch": 1.9510852950168145, "grad_norm": 0.493283212184906, "learning_rate": 2.4371417653802064e-05, "loss": 0.0893, "step": 6382 }, { "epoch": 1.9513910119229594, "grad_norm": 0.5194309949874878, "learning_rate": 2.4375238823079862e-05, "loss": 0.0843, "step": 6383 }, { "epoch": 1.9516967288291043, "grad_norm": 0.5661357045173645, "learning_rate": 2.437905999235766e-05, "loss": 0.1243, "step": 6384 }, { "epoch": 1.9520024457352492, "grad_norm": 0.6005623936653137, "learning_rate": 2.438288116163546e-05, "loss": 0.0981, "step": 6385 }, { "epoch": 1.952308162641394, "grad_norm": 1.210753083229065, "learning_rate": 2.438670233091326e-05, "loss": 0.12, "step": 6386 }, { "epoch": 1.952613879547539, "grad_norm": 0.5192495584487915, "learning_rate": 2.4390523500191057e-05, "loss": 0.1805, "step": 6387 }, { "epoch": 1.9529195964536838, "grad_norm": 0.7512338757514954, "learning_rate": 2.4394344669468856e-05, "loss": 0.1892, "step": 6388 }, { "epoch": 1.953225313359829, "grad_norm": 1.1896977424621582, "learning_rate": 2.4398165838746658e-05, "loss": 0.2036, "step": 6389 }, { "epoch": 1.9535310302659736, "grad_norm": 0.6247238516807556, "learning_rate": 2.4401987008024457e-05, "loss": 0.2059, "step": 6390 }, { "epoch": 1.9538367471721187, "grad_norm": 0.727080225944519, "learning_rate": 2.4405808177302256e-05, "loss": 0.2036, "step": 6391 }, { "epoch": 1.9541424640782634, "grad_norm": 0.751765787601471, "learning_rate": 2.4409629346580054e-05, "loss": 0.2185, "step": 6392 }, { "epoch": 1.9544481809844085, "grad_norm": 2.303020477294922, "learning_rate": 2.4413450515857853e-05, "loss": 0.2268, "step": 6393 }, { "epoch": 1.9547538978905532, "grad_norm": 0.8386074304580688, "learning_rate": 2.4417271685135652e-05, "loss": 0.2182, "step": 6394 }, { "epoch": 1.9550596147966983, "grad_norm": 1.1143898963928223, "learning_rate": 2.442109285441345e-05, "loss": 0.2199, "step": 6395 }, { "epoch": 1.9553653317028432, "grad_norm": 2.545201301574707, "learning_rate": 2.442491402369125e-05, "loss": 0.2863, "step": 6396 }, { "epoch": 1.955671048608988, "grad_norm": 0.5041231513023376, "learning_rate": 2.4428735192969048e-05, "loss": 0.2004, "step": 6397 }, { "epoch": 1.955976765515133, "grad_norm": 0.3508281707763672, "learning_rate": 2.4432556362246847e-05, "loss": 0.0864, "step": 6398 }, { "epoch": 1.956282482421278, "grad_norm": 0.41621634364128113, "learning_rate": 2.4436377531524646e-05, "loss": 0.09, "step": 6399 }, { "epoch": 1.9565881993274228, "grad_norm": 0.3227291703224182, "learning_rate": 2.4440198700802444e-05, "loss": 0.0738, "step": 6400 }, { "epoch": 1.9568939162335677, "grad_norm": 0.5218549370765686, "learning_rate": 2.4444019870080243e-05, "loss": 0.0788, "step": 6401 }, { "epoch": 1.9571996331397128, "grad_norm": 0.4274415075778961, "learning_rate": 2.4447841039358042e-05, "loss": 0.0772, "step": 6402 }, { "epoch": 1.9575053500458575, "grad_norm": 0.6280909776687622, "learning_rate": 2.445166220863584e-05, "loss": 0.0482, "step": 6403 }, { "epoch": 1.9578110669520026, "grad_norm": 0.595230758190155, "learning_rate": 2.445548337791364e-05, "loss": 0.0796, "step": 6404 }, { "epoch": 1.9581167838581472, "grad_norm": 0.3765832781791687, "learning_rate": 2.445930454719144e-05, "loss": 0.0952, "step": 6405 }, { "epoch": 1.9584225007642924, "grad_norm": 0.3754270374774933, "learning_rate": 2.446312571646924e-05, "loss": 0.0827, "step": 6406 }, { "epoch": 1.958728217670437, "grad_norm": 5.1250224113464355, "learning_rate": 2.446694688574704e-05, "loss": 0.0922, "step": 6407 }, { "epoch": 1.9590339345765821, "grad_norm": 0.5271421670913696, "learning_rate": 2.4470768055024838e-05, "loss": 0.1131, "step": 6408 }, { "epoch": 1.959339651482727, "grad_norm": 0.432529091835022, "learning_rate": 2.4474589224302637e-05, "loss": 0.1149, "step": 6409 }, { "epoch": 1.959645368388872, "grad_norm": 0.47268736362457275, "learning_rate": 2.4478410393580435e-05, "loss": 0.2068, "step": 6410 }, { "epoch": 1.9599510852950168, "grad_norm": 1.7345781326293945, "learning_rate": 2.4482231562858234e-05, "loss": 0.1348, "step": 6411 }, { "epoch": 1.9602568022011617, "grad_norm": 0.7981429696083069, "learning_rate": 2.4486052732136033e-05, "loss": 0.1559, "step": 6412 }, { "epoch": 1.9605625191073066, "grad_norm": 0.711208164691925, "learning_rate": 2.448987390141383e-05, "loss": 0.149, "step": 6413 }, { "epoch": 1.9608682360134515, "grad_norm": 0.597986102104187, "learning_rate": 2.449369507069163e-05, "loss": 0.1981, "step": 6414 }, { "epoch": 1.9611739529195966, "grad_norm": 0.9943981766700745, "learning_rate": 2.449751623996943e-05, "loss": 0.2283, "step": 6415 }, { "epoch": 1.9614796698257413, "grad_norm": 0.9317859411239624, "learning_rate": 2.4501337409247228e-05, "loss": 0.1986, "step": 6416 }, { "epoch": 1.9617853867318864, "grad_norm": 0.8746550679206848, "learning_rate": 2.450515857852503e-05, "loss": 0.2244, "step": 6417 }, { "epoch": 1.962091103638031, "grad_norm": 1.3172274827957153, "learning_rate": 2.450897974780283e-05, "loss": 0.212, "step": 6418 }, { "epoch": 1.9623968205441762, "grad_norm": 1.2759122848510742, "learning_rate": 2.4512800917080627e-05, "loss": 0.31, "step": 6419 }, { "epoch": 1.9627025374503209, "grad_norm": 2.442958354949951, "learning_rate": 2.451662208635843e-05, "loss": 0.2375, "step": 6420 }, { "epoch": 1.963008254356466, "grad_norm": 1.7676655054092407, "learning_rate": 2.4520443255636228e-05, "loss": 0.2957, "step": 6421 }, { "epoch": 1.9633139712626109, "grad_norm": 0.5662017464637756, "learning_rate": 2.4524264424914027e-05, "loss": 0.1715, "step": 6422 }, { "epoch": 1.9636196881687558, "grad_norm": 0.3594457805156708, "learning_rate": 2.4528085594191826e-05, "loss": 0.1174, "step": 6423 }, { "epoch": 1.9639254050749007, "grad_norm": 0.3290724754333496, "learning_rate": 2.4531906763469624e-05, "loss": 0.0768, "step": 6424 }, { "epoch": 1.9642311219810455, "grad_norm": 0.32153716683387756, "learning_rate": 2.4535727932747423e-05, "loss": 0.0644, "step": 6425 }, { "epoch": 1.9645368388871904, "grad_norm": 0.2655496895313263, "learning_rate": 2.4539549102025222e-05, "loss": 0.063, "step": 6426 }, { "epoch": 1.9648425557933353, "grad_norm": 0.27964770793914795, "learning_rate": 2.454337027130302e-05, "loss": 0.0625, "step": 6427 }, { "epoch": 1.9651482726994804, "grad_norm": 0.26895666122436523, "learning_rate": 2.454719144058082e-05, "loss": 0.06, "step": 6428 }, { "epoch": 1.9654539896056251, "grad_norm": 0.4025997817516327, "learning_rate": 2.4551012609858618e-05, "loss": 0.0727, "step": 6429 }, { "epoch": 1.9657597065117702, "grad_norm": 0.3106488287448883, "learning_rate": 2.4554833779136417e-05, "loss": 0.0739, "step": 6430 }, { "epoch": 1.966065423417915, "grad_norm": 0.9413407444953918, "learning_rate": 2.4558654948414216e-05, "loss": 0.1008, "step": 6431 }, { "epoch": 1.96637114032406, "grad_norm": 0.5722661018371582, "learning_rate": 2.4562476117692014e-05, "loss": 0.1202, "step": 6432 }, { "epoch": 1.9666768572302047, "grad_norm": 0.34863847494125366, "learning_rate": 2.4566297286969813e-05, "loss": 0.1154, "step": 6433 }, { "epoch": 1.9669825741363498, "grad_norm": 0.4721043109893799, "learning_rate": 2.4570118456247612e-05, "loss": 0.136, "step": 6434 }, { "epoch": 1.9672882910424947, "grad_norm": 0.486040323972702, "learning_rate": 2.457393962552541e-05, "loss": 0.1009, "step": 6435 }, { "epoch": 1.9675940079486396, "grad_norm": 0.9454801082611084, "learning_rate": 2.4577760794803213e-05, "loss": 0.1805, "step": 6436 }, { "epoch": 1.9678997248547845, "grad_norm": 0.6967118382453918, "learning_rate": 2.458158196408101e-05, "loss": 0.1795, "step": 6437 }, { "epoch": 1.9682054417609294, "grad_norm": 0.8694626688957214, "learning_rate": 2.458540313335881e-05, "loss": 0.2006, "step": 6438 }, { "epoch": 1.9685111586670743, "grad_norm": 0.838963508605957, "learning_rate": 2.458922430263661e-05, "loss": 0.2078, "step": 6439 }, { "epoch": 1.9688168755732192, "grad_norm": 0.6944082975387573, "learning_rate": 2.4593045471914408e-05, "loss": 0.2116, "step": 6440 }, { "epoch": 1.969122592479364, "grad_norm": 0.9179471731185913, "learning_rate": 2.4596866641192207e-05, "loss": 0.2266, "step": 6441 }, { "epoch": 1.969428309385509, "grad_norm": 1.3293588161468506, "learning_rate": 2.4600687810470005e-05, "loss": 0.2513, "step": 6442 }, { "epoch": 1.969734026291654, "grad_norm": 0.8330287933349609, "learning_rate": 2.4604508979747804e-05, "loss": 0.2233, "step": 6443 }, { "epoch": 1.9700397431977987, "grad_norm": 1.3240280151367188, "learning_rate": 2.4608330149025603e-05, "loss": 0.2256, "step": 6444 }, { "epoch": 1.9703454601039438, "grad_norm": 1.4119774103164673, "learning_rate": 2.46121513183034e-05, "loss": 0.2679, "step": 6445 }, { "epoch": 1.9706511770100885, "grad_norm": 2.992941379547119, "learning_rate": 2.46159724875812e-05, "loss": 0.3104, "step": 6446 }, { "epoch": 1.9709568939162336, "grad_norm": 0.4402484893798828, "learning_rate": 2.4619793656859e-05, "loss": 0.1709, "step": 6447 }, { "epoch": 1.9712626108223785, "grad_norm": 0.4197898209095001, "learning_rate": 2.4623614826136798e-05, "loss": 0.0892, "step": 6448 }, { "epoch": 1.9715683277285234, "grad_norm": 0.3764101266860962, "learning_rate": 2.4627435995414596e-05, "loss": 0.0831, "step": 6449 }, { "epoch": 1.9718740446346683, "grad_norm": 0.2608382999897003, "learning_rate": 2.4631257164692395e-05, "loss": 0.0601, "step": 6450 }, { "epoch": 1.9721797615408132, "grad_norm": 0.2405189722776413, "learning_rate": 2.4635078333970197e-05, "loss": 0.0466, "step": 6451 }, { "epoch": 1.972485478446958, "grad_norm": 0.3313518762588501, "learning_rate": 2.4638899503247996e-05, "loss": 0.0833, "step": 6452 }, { "epoch": 1.972791195353103, "grad_norm": 0.3600405156612396, "learning_rate": 2.4642720672525795e-05, "loss": 0.068, "step": 6453 }, { "epoch": 1.9730969122592479, "grad_norm": 0.3862208425998688, "learning_rate": 2.4646541841803594e-05, "loss": 0.0526, "step": 6454 }, { "epoch": 1.9734026291653928, "grad_norm": 0.35938259959220886, "learning_rate": 2.4650363011081392e-05, "loss": 0.1093, "step": 6455 }, { "epoch": 1.973708346071538, "grad_norm": 0.3392852544784546, "learning_rate": 2.465418418035919e-05, "loss": 0.0769, "step": 6456 }, { "epoch": 1.9740140629776826, "grad_norm": 0.47768282890319824, "learning_rate": 2.465800534963699e-05, "loss": 0.1145, "step": 6457 }, { "epoch": 1.9743197798838277, "grad_norm": 0.4273030757904053, "learning_rate": 2.466182651891479e-05, "loss": 0.1101, "step": 6458 }, { "epoch": 1.9746254967899723, "grad_norm": 0.41851380467414856, "learning_rate": 2.4665647688192587e-05, "loss": 0.1142, "step": 6459 }, { "epoch": 1.9749312136961175, "grad_norm": 0.46019068360328674, "learning_rate": 2.4669468857470386e-05, "loss": 0.1567, "step": 6460 }, { "epoch": 1.9752369306022624, "grad_norm": 0.9394958019256592, "learning_rate": 2.4673290026748185e-05, "loss": 0.1594, "step": 6461 }, { "epoch": 1.9755426475084072, "grad_norm": 0.6411716938018799, "learning_rate": 2.4677111196025984e-05, "loss": 0.1803, "step": 6462 }, { "epoch": 1.9758483644145521, "grad_norm": 3.522366762161255, "learning_rate": 2.4680932365303782e-05, "loss": 0.1781, "step": 6463 }, { "epoch": 1.976154081320697, "grad_norm": 0.8914021253585815, "learning_rate": 2.468475353458158e-05, "loss": 0.2082, "step": 6464 }, { "epoch": 1.976459798226842, "grad_norm": 0.7525367736816406, "learning_rate": 2.468857470385938e-05, "loss": 0.2372, "step": 6465 }, { "epoch": 1.9767655151329868, "grad_norm": 0.8523505330085754, "learning_rate": 2.469239587313718e-05, "loss": 0.1987, "step": 6466 }, { "epoch": 1.9770712320391317, "grad_norm": 1.0476020574569702, "learning_rate": 2.469621704241498e-05, "loss": 0.2329, "step": 6467 }, { "epoch": 1.9773769489452766, "grad_norm": 0.9454726576805115, "learning_rate": 2.470003821169278e-05, "loss": 0.2158, "step": 6468 }, { "epoch": 1.9776826658514217, "grad_norm": 1.1950355768203735, "learning_rate": 2.4703859380970578e-05, "loss": 0.2104, "step": 6469 }, { "epoch": 1.9779883827575664, "grad_norm": 1.172938585281372, "learning_rate": 2.4707680550248377e-05, "loss": 0.2241, "step": 6470 }, { "epoch": 1.9782940996637115, "grad_norm": 1.7906919717788696, "learning_rate": 2.4711501719526176e-05, "loss": 0.3473, "step": 6471 }, { "epoch": 1.9785998165698562, "grad_norm": 0.561165988445282, "learning_rate": 2.4715322888803974e-05, "loss": 0.1733, "step": 6472 }, { "epoch": 1.9789055334760013, "grad_norm": 0.23951365053653717, "learning_rate": 2.4719144058081773e-05, "loss": 0.0761, "step": 6473 }, { "epoch": 1.9792112503821462, "grad_norm": 0.49035215377807617, "learning_rate": 2.4722965227359572e-05, "loss": 0.0961, "step": 6474 }, { "epoch": 1.979516967288291, "grad_norm": 0.4401525855064392, "learning_rate": 2.472678639663737e-05, "loss": 0.0816, "step": 6475 }, { "epoch": 1.979822684194436, "grad_norm": 0.3011339604854584, "learning_rate": 2.473060756591517e-05, "loss": 0.0743, "step": 6476 }, { "epoch": 1.9801284011005809, "grad_norm": 0.384479284286499, "learning_rate": 2.4734428735192968e-05, "loss": 0.0907, "step": 6477 }, { "epoch": 1.9804341180067258, "grad_norm": 0.34582215547561646, "learning_rate": 2.4738249904470767e-05, "loss": 0.0598, "step": 6478 }, { "epoch": 1.9807398349128706, "grad_norm": 0.2936725914478302, "learning_rate": 2.4742071073748566e-05, "loss": 0.0577, "step": 6479 }, { "epoch": 1.9810455518190155, "grad_norm": 0.33807873725891113, "learning_rate": 2.4745892243026364e-05, "loss": 0.1118, "step": 6480 }, { "epoch": 1.9813512687251604, "grad_norm": 1.9860621690750122, "learning_rate": 2.4749713412304163e-05, "loss": 0.0745, "step": 6481 }, { "epoch": 1.9816569856313055, "grad_norm": 0.4665187895298004, "learning_rate": 2.4753534581581965e-05, "loss": 0.102, "step": 6482 }, { "epoch": 1.9819627025374502, "grad_norm": 0.5357497930526733, "learning_rate": 2.4757355750859764e-05, "loss": 0.0995, "step": 6483 }, { "epoch": 1.9822684194435953, "grad_norm": 0.5349212288856506, "learning_rate": 2.4761176920137563e-05, "loss": 0.106, "step": 6484 }, { "epoch": 1.98257413634974, "grad_norm": 0.4201314151287079, "learning_rate": 2.476499808941536e-05, "loss": 0.137, "step": 6485 }, { "epoch": 1.9828798532558851, "grad_norm": 0.7408223748207092, "learning_rate": 2.476881925869316e-05, "loss": 0.1588, "step": 6486 }, { "epoch": 1.98318557016203, "grad_norm": 0.7512788772583008, "learning_rate": 2.477264042797096e-05, "loss": 0.1916, "step": 6487 }, { "epoch": 1.983491287068175, "grad_norm": 0.5963612794876099, "learning_rate": 2.4776461597248758e-05, "loss": 0.1636, "step": 6488 }, { "epoch": 1.9837970039743198, "grad_norm": 1.0753231048583984, "learning_rate": 2.4780282766526556e-05, "loss": 0.2034, "step": 6489 }, { "epoch": 1.9841027208804647, "grad_norm": 0.49320194125175476, "learning_rate": 2.4784103935804355e-05, "loss": 0.1784, "step": 6490 }, { "epoch": 1.9844084377866096, "grad_norm": 0.716719925403595, "learning_rate": 2.4787925105082154e-05, "loss": 0.2101, "step": 6491 }, { "epoch": 1.9847141546927545, "grad_norm": 1.981894850730896, "learning_rate": 2.4791746274359953e-05, "loss": 0.2165, "step": 6492 }, { "epoch": 1.9850198715988994, "grad_norm": 1.0197778940200806, "learning_rate": 2.479556744363775e-05, "loss": 0.2108, "step": 6493 }, { "epoch": 1.9853255885050443, "grad_norm": 0.8590005040168762, "learning_rate": 2.479938861291555e-05, "loss": 0.2265, "step": 6494 }, { "epoch": 1.9856313054111894, "grad_norm": 3.9505279064178467, "learning_rate": 2.4803209782193352e-05, "loss": 0.2468, "step": 6495 }, { "epoch": 1.985937022317334, "grad_norm": 2.2292428016662598, "learning_rate": 2.480703095147115e-05, "loss": 0.3019, "step": 6496 }, { "epoch": 1.9862427392234792, "grad_norm": 0.5055639743804932, "learning_rate": 2.481085212074895e-05, "loss": 0.1894, "step": 6497 }, { "epoch": 1.9865484561296238, "grad_norm": 0.4407506585121155, "learning_rate": 2.4814673290026752e-05, "loss": 0.0871, "step": 6498 }, { "epoch": 1.986854173035769, "grad_norm": 0.28371864557266235, "learning_rate": 2.481849445930455e-05, "loss": 0.1059, "step": 6499 }, { "epoch": 1.9871598899419138, "grad_norm": 0.2951236069202423, "learning_rate": 2.482231562858235e-05, "loss": 0.0849, "step": 6500 }, { "epoch": 1.9874656068480587, "grad_norm": 0.31010133028030396, "learning_rate": 2.4826136797860148e-05, "loss": 0.0638, "step": 6501 }, { "epoch": 1.9877713237542036, "grad_norm": 0.5503721237182617, "learning_rate": 2.4829957967137947e-05, "loss": 0.0959, "step": 6502 }, { "epoch": 1.9880770406603485, "grad_norm": 0.27418068051338196, "learning_rate": 2.4833779136415746e-05, "loss": 0.0882, "step": 6503 }, { "epoch": 1.9883827575664934, "grad_norm": 0.27971041202545166, "learning_rate": 2.4837600305693544e-05, "loss": 0.0787, "step": 6504 }, { "epoch": 1.9886884744726383, "grad_norm": 0.41813308000564575, "learning_rate": 2.4841421474971343e-05, "loss": 0.1001, "step": 6505 }, { "epoch": 1.9889941913787832, "grad_norm": 0.29509228467941284, "learning_rate": 2.4845242644249142e-05, "loss": 0.0831, "step": 6506 }, { "epoch": 1.989299908284928, "grad_norm": 0.4498313069343567, "learning_rate": 2.484906381352694e-05, "loss": 0.1033, "step": 6507 }, { "epoch": 1.9896056251910732, "grad_norm": 0.3962278962135315, "learning_rate": 2.485288498280474e-05, "loss": 0.0683, "step": 6508 }, { "epoch": 1.9899113420972179, "grad_norm": 0.6496415734291077, "learning_rate": 2.4856706152082538e-05, "loss": 0.1135, "step": 6509 }, { "epoch": 1.990217059003363, "grad_norm": 0.4678172469139099, "learning_rate": 2.4860527321360337e-05, "loss": 0.1398, "step": 6510 }, { "epoch": 1.9905227759095077, "grad_norm": 0.45723381638526917, "learning_rate": 2.4864348490638136e-05, "loss": 0.1484, "step": 6511 }, { "epoch": 1.9908284928156528, "grad_norm": 0.8440402746200562, "learning_rate": 2.4868169659915934e-05, "loss": 0.149, "step": 6512 }, { "epoch": 1.9911342097217977, "grad_norm": 1.0793875455856323, "learning_rate": 2.4871990829193736e-05, "loss": 0.1711, "step": 6513 }, { "epoch": 1.9914399266279426, "grad_norm": 0.5682001709938049, "learning_rate": 2.4875811998471535e-05, "loss": 0.1665, "step": 6514 }, { "epoch": 1.9917456435340875, "grad_norm": 0.8755152821540833, "learning_rate": 2.4879633167749334e-05, "loss": 0.1952, "step": 6515 }, { "epoch": 1.9920513604402323, "grad_norm": 0.8324375748634338, "learning_rate": 2.4883454337027133e-05, "loss": 0.18, "step": 6516 }, { "epoch": 1.9923570773463772, "grad_norm": 1.8006850481033325, "learning_rate": 2.488727550630493e-05, "loss": 0.3639, "step": 6517 }, { "epoch": 1.9926627942525221, "grad_norm": 1.1942532062530518, "learning_rate": 2.489109667558273e-05, "loss": 0.207, "step": 6518 }, { "epoch": 1.992968511158667, "grad_norm": 1.2478790283203125, "learning_rate": 2.489491784486053e-05, "loss": 0.213, "step": 6519 }, { "epoch": 1.993274228064812, "grad_norm": 1.1627511978149414, "learning_rate": 2.4898739014138328e-05, "loss": 0.2201, "step": 6520 }, { "epoch": 1.993579944970957, "grad_norm": 1.3023996353149414, "learning_rate": 2.4902560183416126e-05, "loss": 0.2497, "step": 6521 }, { "epoch": 1.9938856618771017, "grad_norm": 0.823257327079773, "learning_rate": 2.4906381352693925e-05, "loss": 0.1338, "step": 6522 }, { "epoch": 1.9941913787832468, "grad_norm": 0.46702754497528076, "learning_rate": 2.4910202521971724e-05, "loss": 0.0889, "step": 6523 }, { "epoch": 1.9944970956893915, "grad_norm": 0.31102386116981506, "learning_rate": 2.4914023691249523e-05, "loss": 0.082, "step": 6524 }, { "epoch": 1.9948028125955366, "grad_norm": 0.2779892086982727, "learning_rate": 2.491784486052732e-05, "loss": 0.0601, "step": 6525 }, { "epoch": 1.9951085295016815, "grad_norm": 0.45097342133522034, "learning_rate": 2.492166602980512e-05, "loss": 0.0731, "step": 6526 }, { "epoch": 1.9954142464078264, "grad_norm": 0.3124569356441498, "learning_rate": 2.492548719908292e-05, "loss": 0.0624, "step": 6527 }, { "epoch": 1.9957199633139713, "grad_norm": 0.2963085472583771, "learning_rate": 2.4929308368360718e-05, "loss": 0.0719, "step": 6528 }, { "epoch": 1.9960256802201162, "grad_norm": 0.38628706336021423, "learning_rate": 2.493312953763852e-05, "loss": 0.0808, "step": 6529 }, { "epoch": 1.996331397126261, "grad_norm": 0.4163617193698883, "learning_rate": 2.493695070691632e-05, "loss": 0.1032, "step": 6530 }, { "epoch": 1.996637114032406, "grad_norm": 0.5129755139350891, "learning_rate": 2.4940771876194117e-05, "loss": 0.1048, "step": 6531 }, { "epoch": 1.9969428309385509, "grad_norm": 0.8491182327270508, "learning_rate": 2.4944593045471916e-05, "loss": 0.1265, "step": 6532 }, { "epoch": 1.9972485478446957, "grad_norm": 0.8314477205276489, "learning_rate": 2.4948414214749715e-05, "loss": 0.1428, "step": 6533 }, { "epoch": 1.9975542647508409, "grad_norm": 0.6714627742767334, "learning_rate": 2.4952235384027514e-05, "loss": 0.1642, "step": 6534 }, { "epoch": 1.9978599816569855, "grad_norm": 1.0651665925979614, "learning_rate": 2.4956056553305312e-05, "loss": 0.213, "step": 6535 }, { "epoch": 1.9981656985631306, "grad_norm": 0.6256597638130188, "learning_rate": 2.495987772258311e-05, "loss": 0.1954, "step": 6536 }, { "epoch": 1.9984714154692753, "grad_norm": 1.0650641918182373, "learning_rate": 2.496369889186091e-05, "loss": 0.2073, "step": 6537 }, { "epoch": 1.9987771323754204, "grad_norm": 1.3185615539550781, "learning_rate": 2.496752006113871e-05, "loss": 0.2569, "step": 6538 }, { "epoch": 1.9990828492815653, "grad_norm": 1.1953219175338745, "learning_rate": 2.4971341230416507e-05, "loss": 0.2179, "step": 6539 }, { "epoch": 1.9993885661877102, "grad_norm": 1.0475904941558838, "learning_rate": 2.4975162399694306e-05, "loss": 0.2283, "step": 6540 }, { "epoch": 1.999694283093855, "grad_norm": 1.6295173168182373, "learning_rate": 2.4978983568972105e-05, "loss": 0.2403, "step": 6541 }, { "epoch": 2.0, "grad_norm": 1.7590538263320923, "learning_rate": 2.4982804738249903e-05, "loss": 0.2615, "step": 6542 }, { "epoch": 2.000305716906145, "grad_norm": 0.42448705434799194, "learning_rate": 2.4986625907527702e-05, "loss": 0.1631, "step": 6543 }, { "epoch": 2.00061143381229, "grad_norm": 1.2494852542877197, "learning_rate": 2.4990447076805504e-05, "loss": 0.0812, "step": 6544 }, { "epoch": 2.000917150718435, "grad_norm": 0.4665374755859375, "learning_rate": 2.4994268246083303e-05, "loss": 0.0939, "step": 6545 }, { "epoch": 2.0012228676245796, "grad_norm": 0.8085682988166809, "learning_rate": 2.4998089415361102e-05, "loss": 0.0754, "step": 6546 }, { "epoch": 2.0015285845307247, "grad_norm": 0.21420708298683167, "learning_rate": 2.50019105846389e-05, "loss": 0.0747, "step": 6547 }, { "epoch": 2.0018343014368694, "grad_norm": 0.4175623059272766, "learning_rate": 2.50057317539167e-05, "loss": 0.064, "step": 6548 }, { "epoch": 2.0021400183430145, "grad_norm": 0.3390757739543915, "learning_rate": 2.5009552923194498e-05, "loss": 0.0599, "step": 6549 }, { "epoch": 2.002445735249159, "grad_norm": 0.47780993580818176, "learning_rate": 2.5013374092472297e-05, "loss": 0.0821, "step": 6550 }, { "epoch": 2.0027514521553043, "grad_norm": 0.2411816120147705, "learning_rate": 2.5017195261750096e-05, "loss": 0.0758, "step": 6551 }, { "epoch": 2.003057169061449, "grad_norm": 0.2889666259288788, "learning_rate": 2.5021016431027894e-05, "loss": 0.0874, "step": 6552 }, { "epoch": 2.003362885967594, "grad_norm": 0.5417318940162659, "learning_rate": 2.5024837600305693e-05, "loss": 0.1453, "step": 6553 }, { "epoch": 2.0036686028737387, "grad_norm": 0.40776246786117554, "learning_rate": 2.5028658769583492e-05, "loss": 0.1022, "step": 6554 }, { "epoch": 2.003974319779884, "grad_norm": 1.05186927318573, "learning_rate": 2.503247993886129e-05, "loss": 0.1067, "step": 6555 }, { "epoch": 2.004280036686029, "grad_norm": 0.6957182884216309, "learning_rate": 2.503630110813909e-05, "loss": 0.2001, "step": 6556 }, { "epoch": 2.0045857535921736, "grad_norm": 0.4188138544559479, "learning_rate": 2.5040122277416888e-05, "loss": 0.1274, "step": 6557 }, { "epoch": 2.0048914704983187, "grad_norm": 0.5747617483139038, "learning_rate": 2.5043943446694687e-05, "loss": 0.1621, "step": 6558 }, { "epoch": 2.0051971874044634, "grad_norm": 0.7366291880607605, "learning_rate": 2.5047764615972486e-05, "loss": 0.1656, "step": 6559 }, { "epoch": 2.0055029043106085, "grad_norm": 0.7308005094528198, "learning_rate": 2.5051585785250288e-05, "loss": 0.1686, "step": 6560 }, { "epoch": 2.005808621216753, "grad_norm": 1.587412714958191, "learning_rate": 2.5055406954528086e-05, "loss": 0.1838, "step": 6561 }, { "epoch": 2.0061143381228983, "grad_norm": 0.7659873962402344, "learning_rate": 2.5059228123805885e-05, "loss": 0.1949, "step": 6562 }, { "epoch": 2.006420055029043, "grad_norm": 0.9573981165885925, "learning_rate": 2.5063049293083684e-05, "loss": 0.2067, "step": 6563 }, { "epoch": 2.006725771935188, "grad_norm": 0.9303998947143555, "learning_rate": 2.5066870462361483e-05, "loss": 0.2399, "step": 6564 }, { "epoch": 2.0070314888413328, "grad_norm": 0.9674760103225708, "learning_rate": 2.507069163163928e-05, "loss": 0.2153, "step": 6565 }, { "epoch": 2.007337205747478, "grad_norm": 1.0996538400650024, "learning_rate": 2.507451280091708e-05, "loss": 0.2524, "step": 6566 }, { "epoch": 2.0076429226536225, "grad_norm": 3.103044271469116, "learning_rate": 2.507833397019488e-05, "loss": 0.2698, "step": 6567 }, { "epoch": 2.0079486395597677, "grad_norm": 0.3554902672767639, "learning_rate": 2.5082155139472678e-05, "loss": 0.1421, "step": 6568 }, { "epoch": 2.0082543564659128, "grad_norm": 0.346723347902298, "learning_rate": 2.5085976308750476e-05, "loss": 0.1063, "step": 6569 }, { "epoch": 2.0085600733720574, "grad_norm": 0.45869335532188416, "learning_rate": 2.5089797478028275e-05, "loss": 0.1107, "step": 6570 }, { "epoch": 2.0088657902782026, "grad_norm": 0.2325754463672638, "learning_rate": 2.5093618647306074e-05, "loss": 0.0712, "step": 6571 }, { "epoch": 2.0091715071843472, "grad_norm": 0.5147345662117004, "learning_rate": 2.5097439816583873e-05, "loss": 0.0606, "step": 6572 }, { "epoch": 2.0094772240904923, "grad_norm": 0.24115774035453796, "learning_rate": 2.5101260985861675e-05, "loss": 0.0468, "step": 6573 }, { "epoch": 2.009782940996637, "grad_norm": 0.30636242032051086, "learning_rate": 2.5105082155139473e-05, "loss": 0.0772, "step": 6574 }, { "epoch": 2.010088657902782, "grad_norm": 0.27534547448158264, "learning_rate": 2.5108903324417272e-05, "loss": 0.0792, "step": 6575 }, { "epoch": 2.010394374808927, "grad_norm": 0.2691982090473175, "learning_rate": 2.5112724493695074e-05, "loss": 0.0753, "step": 6576 }, { "epoch": 2.010700091715072, "grad_norm": 1.5399404764175415, "learning_rate": 2.5116545662972873e-05, "loss": 0.0826, "step": 6577 }, { "epoch": 2.0110058086212166, "grad_norm": 0.4233097434043884, "learning_rate": 2.5120366832250672e-05, "loss": 0.1465, "step": 6578 }, { "epoch": 2.0113115255273617, "grad_norm": 0.33689001202583313, "learning_rate": 2.512418800152847e-05, "loss": 0.0911, "step": 6579 }, { "epoch": 2.0116172424335064, "grad_norm": 0.44457757472991943, "learning_rate": 2.512800917080627e-05, "loss": 0.1284, "step": 6580 }, { "epoch": 2.0119229593396515, "grad_norm": 0.9182707071304321, "learning_rate": 2.5131830340084068e-05, "loss": 0.1374, "step": 6581 }, { "epoch": 2.0122286762457966, "grad_norm": 0.38743266463279724, "learning_rate": 2.5135651509361867e-05, "loss": 0.1482, "step": 6582 }, { "epoch": 2.0125343931519413, "grad_norm": 0.6948342323303223, "learning_rate": 2.5139472678639666e-05, "loss": 0.2198, "step": 6583 }, { "epoch": 2.0128401100580864, "grad_norm": 0.48634183406829834, "learning_rate": 2.5143293847917464e-05, "loss": 0.1767, "step": 6584 }, { "epoch": 2.013145826964231, "grad_norm": 3.848721981048584, "learning_rate": 2.5147115017195263e-05, "loss": 0.1961, "step": 6585 }, { "epoch": 2.013451543870376, "grad_norm": 1.163944959640503, "learning_rate": 2.5150936186473062e-05, "loss": 0.2249, "step": 6586 }, { "epoch": 2.013757260776521, "grad_norm": 0.4836474061012268, "learning_rate": 2.515475735575086e-05, "loss": 0.1934, "step": 6587 }, { "epoch": 2.014062977682666, "grad_norm": 0.9519907236099243, "learning_rate": 2.515857852502866e-05, "loss": 0.1782, "step": 6588 }, { "epoch": 2.0143686945888106, "grad_norm": 0.8198890089988708, "learning_rate": 2.5162399694306458e-05, "loss": 0.2094, "step": 6589 }, { "epoch": 2.0146744114949557, "grad_norm": 1.0095022916793823, "learning_rate": 2.5166220863584257e-05, "loss": 0.2067, "step": 6590 }, { "epoch": 2.0149801284011004, "grad_norm": 1.1920913457870483, "learning_rate": 2.517004203286206e-05, "loss": 0.2327, "step": 6591 }, { "epoch": 2.0152858453072455, "grad_norm": 4.356917858123779, "learning_rate": 2.5173863202139858e-05, "loss": 0.26, "step": 6592 }, { "epoch": 2.01559156221339, "grad_norm": 0.6451190710067749, "learning_rate": 2.5177684371417656e-05, "loss": 0.1821, "step": 6593 }, { "epoch": 2.0158972791195353, "grad_norm": 0.5064073204994202, "learning_rate": 2.5181505540695455e-05, "loss": 0.1012, "step": 6594 }, { "epoch": 2.0162029960256804, "grad_norm": 0.31423866748809814, "learning_rate": 2.5185326709973254e-05, "loss": 0.0799, "step": 6595 }, { "epoch": 2.016508712931825, "grad_norm": 0.3284631371498108, "learning_rate": 2.5189147879251053e-05, "loss": 0.0752, "step": 6596 }, { "epoch": 2.01681442983797, "grad_norm": 0.36403656005859375, "learning_rate": 2.519296904852885e-05, "loss": 0.0662, "step": 6597 }, { "epoch": 2.017120146744115, "grad_norm": 0.4861830174922943, "learning_rate": 2.519679021780665e-05, "loss": 0.0702, "step": 6598 }, { "epoch": 2.01742586365026, "grad_norm": 0.41574355959892273, "learning_rate": 2.520061138708445e-05, "loss": 0.1125, "step": 6599 }, { "epoch": 2.0177315805564047, "grad_norm": 0.28512826561927795, "learning_rate": 2.5204432556362248e-05, "loss": 0.0658, "step": 6600 }, { "epoch": 2.01803729746255, "grad_norm": 0.2501002252101898, "learning_rate": 2.5208253725640046e-05, "loss": 0.0677, "step": 6601 }, { "epoch": 2.0183430143686945, "grad_norm": 0.3270208537578583, "learning_rate": 2.5212074894917845e-05, "loss": 0.0631, "step": 6602 }, { "epoch": 2.0186487312748396, "grad_norm": 0.3597809970378876, "learning_rate": 2.5215896064195644e-05, "loss": 0.1289, "step": 6603 }, { "epoch": 2.0189544481809842, "grad_norm": 0.3771742582321167, "learning_rate": 2.5219717233473443e-05, "loss": 0.083, "step": 6604 }, { "epoch": 2.0192601650871294, "grad_norm": 0.4114091098308563, "learning_rate": 2.522353840275124e-05, "loss": 0.113, "step": 6605 }, { "epoch": 2.019565881993274, "grad_norm": 0.4500918984413147, "learning_rate": 2.5227359572029043e-05, "loss": 0.1487, "step": 6606 }, { "epoch": 2.019871598899419, "grad_norm": 0.6106625199317932, "learning_rate": 2.5231180741306842e-05, "loss": 0.1526, "step": 6607 }, { "epoch": 2.0201773158055643, "grad_norm": 0.5554910898208618, "learning_rate": 2.523500191058464e-05, "loss": 0.1413, "step": 6608 }, { "epoch": 2.020483032711709, "grad_norm": 0.844653308391571, "learning_rate": 2.523882307986244e-05, "loss": 0.1958, "step": 6609 }, { "epoch": 2.020788749617854, "grad_norm": 0.6111506223678589, "learning_rate": 2.524264424914024e-05, "loss": 0.1946, "step": 6610 }, { "epoch": 2.0210944665239987, "grad_norm": 0.6251888275146484, "learning_rate": 2.5246465418418037e-05, "loss": 0.1794, "step": 6611 }, { "epoch": 2.021400183430144, "grad_norm": 0.5717607140541077, "learning_rate": 2.5250286587695836e-05, "loss": 0.1715, "step": 6612 }, { "epoch": 2.0217059003362885, "grad_norm": 0.9846701622009277, "learning_rate": 2.5254107756973635e-05, "loss": 0.1869, "step": 6613 }, { "epoch": 2.0220116172424336, "grad_norm": 1.326703429222107, "learning_rate": 2.5257928926251433e-05, "loss": 0.2392, "step": 6614 }, { "epoch": 2.0223173341485783, "grad_norm": 1.2893069982528687, "learning_rate": 2.5261750095529232e-05, "loss": 0.1995, "step": 6615 }, { "epoch": 2.0226230510547234, "grad_norm": 1.456768274307251, "learning_rate": 2.526557126480703e-05, "loss": 0.2768, "step": 6616 }, { "epoch": 2.022928767960868, "grad_norm": 1.598254680633545, "learning_rate": 2.526939243408483e-05, "loss": 0.2701, "step": 6617 }, { "epoch": 2.023234484867013, "grad_norm": 0.35585662722587585, "learning_rate": 2.527321360336263e-05, "loss": 0.1495, "step": 6618 }, { "epoch": 2.023540201773158, "grad_norm": 0.35374516248703003, "learning_rate": 2.5277034772640427e-05, "loss": 0.1066, "step": 6619 }, { "epoch": 2.023845918679303, "grad_norm": 0.6218037009239197, "learning_rate": 2.5280855941918226e-05, "loss": 0.0884, "step": 6620 }, { "epoch": 2.024151635585448, "grad_norm": 0.22701416909694672, "learning_rate": 2.5284677111196025e-05, "loss": 0.0704, "step": 6621 }, { "epoch": 2.0244573524915928, "grad_norm": 0.6053125262260437, "learning_rate": 2.5288498280473827e-05, "loss": 0.0632, "step": 6622 }, { "epoch": 2.024763069397738, "grad_norm": 0.3475567102432251, "learning_rate": 2.5292319449751625e-05, "loss": 0.0679, "step": 6623 }, { "epoch": 2.0250687863038825, "grad_norm": 0.25390711426734924, "learning_rate": 2.5296140619029424e-05, "loss": 0.0761, "step": 6624 }, { "epoch": 2.0253745032100277, "grad_norm": 0.4447312355041504, "learning_rate": 2.5299961788307223e-05, "loss": 0.0602, "step": 6625 }, { "epoch": 2.0256802201161723, "grad_norm": 0.4068864583969116, "learning_rate": 2.5303782957585022e-05, "loss": 0.0925, "step": 6626 }, { "epoch": 2.0259859370223174, "grad_norm": 0.4820728600025177, "learning_rate": 2.530760412686282e-05, "loss": 0.0493, "step": 6627 }, { "epoch": 2.026291653928462, "grad_norm": 0.46115484833717346, "learning_rate": 2.531142529614062e-05, "loss": 0.0792, "step": 6628 }, { "epoch": 2.0265973708346072, "grad_norm": 0.4243253469467163, "learning_rate": 2.5315246465418418e-05, "loss": 0.0974, "step": 6629 }, { "epoch": 2.026903087740752, "grad_norm": 0.3812737464904785, "learning_rate": 2.5319067634696217e-05, "loss": 0.0929, "step": 6630 }, { "epoch": 2.027208804646897, "grad_norm": 0.4317031502723694, "learning_rate": 2.5322888803974015e-05, "loss": 0.1256, "step": 6631 }, { "epoch": 2.0275145215530417, "grad_norm": 0.47472354769706726, "learning_rate": 2.5326709973251814e-05, "loss": 0.1548, "step": 6632 }, { "epoch": 2.027820238459187, "grad_norm": 0.9468677639961243, "learning_rate": 2.5330531142529613e-05, "loss": 0.1726, "step": 6633 }, { "epoch": 2.028125955365332, "grad_norm": 0.6795588135719299, "learning_rate": 2.533435231180741e-05, "loss": 0.2161, "step": 6634 }, { "epoch": 2.0284316722714766, "grad_norm": 0.7926715016365051, "learning_rate": 2.533817348108521e-05, "loss": 0.1782, "step": 6635 }, { "epoch": 2.0287373891776217, "grad_norm": 0.5156047940254211, "learning_rate": 2.534199465036301e-05, "loss": 0.2172, "step": 6636 }, { "epoch": 2.0290431060837664, "grad_norm": 0.7451431155204773, "learning_rate": 2.534581581964081e-05, "loss": 0.1994, "step": 6637 }, { "epoch": 2.0293488229899115, "grad_norm": 0.6026883721351624, "learning_rate": 2.534963698891861e-05, "loss": 0.2362, "step": 6638 }, { "epoch": 2.029654539896056, "grad_norm": 1.2014411687850952, "learning_rate": 2.535345815819641e-05, "loss": 0.2235, "step": 6639 }, { "epoch": 2.0299602568022013, "grad_norm": 0.9143808484077454, "learning_rate": 2.5357279327474208e-05, "loss": 0.2471, "step": 6640 }, { "epoch": 2.030265973708346, "grad_norm": 1.3543305397033691, "learning_rate": 2.5361100496752006e-05, "loss": 0.2595, "step": 6641 }, { "epoch": 2.030571690614491, "grad_norm": 2.8098092079162598, "learning_rate": 2.5364921666029805e-05, "loss": 0.3058, "step": 6642 }, { "epoch": 2.0308774075206357, "grad_norm": 0.4137118458747864, "learning_rate": 2.5368742835307604e-05, "loss": 0.1734, "step": 6643 }, { "epoch": 2.031183124426781, "grad_norm": 0.32486698031425476, "learning_rate": 2.5372564004585403e-05, "loss": 0.107, "step": 6644 }, { "epoch": 2.0314888413329255, "grad_norm": 0.5530273914337158, "learning_rate": 2.53763851738632e-05, "loss": 0.0765, "step": 6645 }, { "epoch": 2.0317945582390706, "grad_norm": 0.9972677230834961, "learning_rate": 2.5380206343141e-05, "loss": 0.0595, "step": 6646 }, { "epoch": 2.0321002751452157, "grad_norm": 0.3366190493106842, "learning_rate": 2.53840275124188e-05, "loss": 0.0736, "step": 6647 }, { "epoch": 2.0324059920513604, "grad_norm": 0.30460795760154724, "learning_rate": 2.5387848681696597e-05, "loss": 0.0604, "step": 6648 }, { "epoch": 2.0327117089575055, "grad_norm": 0.21202681958675385, "learning_rate": 2.5391669850974396e-05, "loss": 0.0547, "step": 6649 }, { "epoch": 2.03301742586365, "grad_norm": 0.4312629699707031, "learning_rate": 2.5395491020252195e-05, "loss": 0.0525, "step": 6650 }, { "epoch": 2.0333231427697953, "grad_norm": 0.4186932444572449, "learning_rate": 2.5399312189529997e-05, "loss": 0.0864, "step": 6651 }, { "epoch": 2.03362885967594, "grad_norm": 0.41644033789634705, "learning_rate": 2.5403133358807796e-05, "loss": 0.082, "step": 6652 }, { "epoch": 2.033934576582085, "grad_norm": 0.323477178812027, "learning_rate": 2.5406954528085598e-05, "loss": 0.1067, "step": 6653 }, { "epoch": 2.0342402934882298, "grad_norm": 0.29261860251426697, "learning_rate": 2.5410775697363397e-05, "loss": 0.0613, "step": 6654 }, { "epoch": 2.034546010394375, "grad_norm": 0.5995584726333618, "learning_rate": 2.5414596866641195e-05, "loss": 0.1616, "step": 6655 }, { "epoch": 2.0348517273005196, "grad_norm": 0.5127672553062439, "learning_rate": 2.5418418035918994e-05, "loss": 0.1476, "step": 6656 }, { "epoch": 2.0351574442066647, "grad_norm": 0.8561289310455322, "learning_rate": 2.5422239205196793e-05, "loss": 0.1624, "step": 6657 }, { "epoch": 2.0354631611128093, "grad_norm": 0.703136146068573, "learning_rate": 2.5426060374474592e-05, "loss": 0.1358, "step": 6658 }, { "epoch": 2.0357688780189545, "grad_norm": 0.9563149809837341, "learning_rate": 2.542988154375239e-05, "loss": 0.1806, "step": 6659 }, { "epoch": 2.0360745949250996, "grad_norm": 1.0832122564315796, "learning_rate": 2.543370271303019e-05, "loss": 0.1834, "step": 6660 }, { "epoch": 2.0363803118312442, "grad_norm": 0.7496498227119446, "learning_rate": 2.5437523882307988e-05, "loss": 0.2145, "step": 6661 }, { "epoch": 2.0366860287373894, "grad_norm": 1.1329166889190674, "learning_rate": 2.5441345051585787e-05, "loss": 0.2311, "step": 6662 }, { "epoch": 2.036991745643534, "grad_norm": 1.3470814228057861, "learning_rate": 2.5445166220863585e-05, "loss": 0.2242, "step": 6663 }, { "epoch": 2.037297462549679, "grad_norm": 1.187681794166565, "learning_rate": 2.5448987390141384e-05, "loss": 0.2268, "step": 6664 }, { "epoch": 2.037603179455824, "grad_norm": 1.0603556632995605, "learning_rate": 2.5452808559419183e-05, "loss": 0.2287, "step": 6665 }, { "epoch": 2.037908896361969, "grad_norm": 1.2546635866165161, "learning_rate": 2.545662972869698e-05, "loss": 0.2549, "step": 6666 }, { "epoch": 2.0382146132681136, "grad_norm": 2.272052049636841, "learning_rate": 2.546045089797478e-05, "loss": 0.3237, "step": 6667 }, { "epoch": 2.0385203301742587, "grad_norm": 0.41471216082572937, "learning_rate": 2.546427206725258e-05, "loss": 0.1786, "step": 6668 }, { "epoch": 2.0388260470804034, "grad_norm": 0.4785451889038086, "learning_rate": 2.546809323653038e-05, "loss": 0.1037, "step": 6669 }, { "epoch": 2.0391317639865485, "grad_norm": 0.34823912382125854, "learning_rate": 2.547191440580818e-05, "loss": 0.0746, "step": 6670 }, { "epoch": 2.039437480892693, "grad_norm": 0.31744149327278137, "learning_rate": 2.547573557508598e-05, "loss": 0.073, "step": 6671 }, { "epoch": 2.0397431977988383, "grad_norm": 0.4554775655269623, "learning_rate": 2.5479556744363778e-05, "loss": 0.0669, "step": 6672 }, { "epoch": 2.0400489147049834, "grad_norm": 0.37998634576797485, "learning_rate": 2.5483377913641576e-05, "loss": 0.0974, "step": 6673 }, { "epoch": 2.040354631611128, "grad_norm": 0.5499568581581116, "learning_rate": 2.5487199082919375e-05, "loss": 0.0777, "step": 6674 }, { "epoch": 2.040660348517273, "grad_norm": 0.5215058326721191, "learning_rate": 2.5491020252197174e-05, "loss": 0.1026, "step": 6675 }, { "epoch": 2.040966065423418, "grad_norm": 0.37528353929519653, "learning_rate": 2.5494841421474973e-05, "loss": 0.1117, "step": 6676 }, { "epoch": 2.041271782329563, "grad_norm": 0.3372325003147125, "learning_rate": 2.549866259075277e-05, "loss": 0.0605, "step": 6677 }, { "epoch": 2.0415774992357076, "grad_norm": 0.44612646102905273, "learning_rate": 2.550248376003057e-05, "loss": 0.1295, "step": 6678 }, { "epoch": 2.0418832161418528, "grad_norm": 0.555840015411377, "learning_rate": 2.550630492930837e-05, "loss": 0.095, "step": 6679 }, { "epoch": 2.0421889330479974, "grad_norm": 0.4150850474834442, "learning_rate": 2.5510126098586167e-05, "loss": 0.0963, "step": 6680 }, { "epoch": 2.0424946499541425, "grad_norm": 0.5337361693382263, "learning_rate": 2.5513947267863966e-05, "loss": 0.1397, "step": 6681 }, { "epoch": 2.042800366860287, "grad_norm": 0.5864183902740479, "learning_rate": 2.5517768437141765e-05, "loss": 0.1278, "step": 6682 }, { "epoch": 2.0431060837664323, "grad_norm": 0.958971381187439, "learning_rate": 2.5521589606419564e-05, "loss": 0.1871, "step": 6683 }, { "epoch": 2.043411800672577, "grad_norm": 0.8262670040130615, "learning_rate": 2.5525410775697366e-05, "loss": 0.1717, "step": 6684 }, { "epoch": 2.043717517578722, "grad_norm": 1.295039415359497, "learning_rate": 2.5529231944975165e-05, "loss": 0.1974, "step": 6685 }, { "epoch": 2.0440232344848672, "grad_norm": 0.7447128891944885, "learning_rate": 2.5533053114252963e-05, "loss": 0.1746, "step": 6686 }, { "epoch": 2.044328951391012, "grad_norm": 1.0549190044403076, "learning_rate": 2.5536874283530762e-05, "loss": 0.1919, "step": 6687 }, { "epoch": 2.044634668297157, "grad_norm": 0.913867712020874, "learning_rate": 2.554069545280856e-05, "loss": 0.2054, "step": 6688 }, { "epoch": 2.0449403852033017, "grad_norm": 4.950313091278076, "learning_rate": 2.554451662208636e-05, "loss": 0.2372, "step": 6689 }, { "epoch": 2.045246102109447, "grad_norm": 1.1186162233352661, "learning_rate": 2.554833779136416e-05, "loss": 0.245, "step": 6690 }, { "epoch": 2.0455518190155915, "grad_norm": 1.3251960277557373, "learning_rate": 2.5552158960641957e-05, "loss": 0.2664, "step": 6691 }, { "epoch": 2.0458575359217366, "grad_norm": 1.4227339029312134, "learning_rate": 2.5555980129919756e-05, "loss": 0.3023, "step": 6692 }, { "epoch": 2.0461632528278813, "grad_norm": 0.3530927300453186, "learning_rate": 2.5559801299197555e-05, "loss": 0.1549, "step": 6693 }, { "epoch": 2.0464689697340264, "grad_norm": 0.3355972468852997, "learning_rate": 2.5563622468475353e-05, "loss": 0.0897, "step": 6694 }, { "epoch": 2.046774686640171, "grad_norm": 0.24177075922489166, "learning_rate": 2.5567443637753152e-05, "loss": 0.0721, "step": 6695 }, { "epoch": 2.047080403546316, "grad_norm": 0.2106119990348816, "learning_rate": 2.557126480703095e-05, "loss": 0.0635, "step": 6696 }, { "epoch": 2.047386120452461, "grad_norm": 0.554821789264679, "learning_rate": 2.557508597630875e-05, "loss": 0.0802, "step": 6697 }, { "epoch": 2.047691837358606, "grad_norm": 0.41410210728645325, "learning_rate": 2.5578907145586548e-05, "loss": 0.0634, "step": 6698 }, { "epoch": 2.047997554264751, "grad_norm": 0.39545631408691406, "learning_rate": 2.5582728314864347e-05, "loss": 0.1034, "step": 6699 }, { "epoch": 2.0483032711708957, "grad_norm": 0.3358781635761261, "learning_rate": 2.558654948414215e-05, "loss": 0.0645, "step": 6700 }, { "epoch": 2.048608988077041, "grad_norm": 0.29251375794410706, "learning_rate": 2.5590370653419948e-05, "loss": 0.0676, "step": 6701 }, { "epoch": 2.0489147049831855, "grad_norm": 0.5972058773040771, "learning_rate": 2.5594191822697747e-05, "loss": 0.0842, "step": 6702 }, { "epoch": 2.0492204218893306, "grad_norm": 0.5625972151756287, "learning_rate": 2.5598012991975545e-05, "loss": 0.105, "step": 6703 }, { "epoch": 2.0495261387954753, "grad_norm": 0.580680787563324, "learning_rate": 2.5601834161253344e-05, "loss": 0.119, "step": 6704 }, { "epoch": 2.0498318557016204, "grad_norm": 0.5643013715744019, "learning_rate": 2.5605655330531143e-05, "loss": 0.1138, "step": 6705 }, { "epoch": 2.050137572607765, "grad_norm": 0.7581091523170471, "learning_rate": 2.560947649980894e-05, "loss": 0.1195, "step": 6706 }, { "epoch": 2.05044328951391, "grad_norm": 0.45716002583503723, "learning_rate": 2.561329766908674e-05, "loss": 0.1063, "step": 6707 }, { "epoch": 2.050749006420055, "grad_norm": 0.590782880783081, "learning_rate": 2.561711883836454e-05, "loss": 0.1811, "step": 6708 }, { "epoch": 2.0510547233262, "grad_norm": 0.705123782157898, "learning_rate": 2.5620940007642338e-05, "loss": 0.172, "step": 6709 }, { "epoch": 2.0513604402323447, "grad_norm": 0.6412637829780579, "learning_rate": 2.5624761176920137e-05, "loss": 0.1682, "step": 6710 }, { "epoch": 2.0516661571384898, "grad_norm": 1.8113911151885986, "learning_rate": 2.5628582346197935e-05, "loss": 0.2297, "step": 6711 }, { "epoch": 2.051971874044635, "grad_norm": 1.3013262748718262, "learning_rate": 2.5632403515475734e-05, "loss": 0.2025, "step": 6712 }, { "epoch": 2.0522775909507796, "grad_norm": 2.2996582984924316, "learning_rate": 2.5636224684753533e-05, "loss": 0.1836, "step": 6713 }, { "epoch": 2.0525833078569247, "grad_norm": 1.2405356168746948, "learning_rate": 2.564004585403133e-05, "loss": 0.219, "step": 6714 }, { "epoch": 2.0528890247630693, "grad_norm": 2.099123954772949, "learning_rate": 2.5643867023309134e-05, "loss": 0.2588, "step": 6715 }, { "epoch": 2.0531947416692145, "grad_norm": 2.048529624938965, "learning_rate": 2.5647688192586932e-05, "loss": 0.244, "step": 6716 }, { "epoch": 2.053500458575359, "grad_norm": 20.280088424682617, "learning_rate": 2.565150936186473e-05, "loss": 0.3134, "step": 6717 }, { "epoch": 2.0538061754815042, "grad_norm": 0.4012318551540375, "learning_rate": 2.565533053114253e-05, "loss": 0.177, "step": 6718 }, { "epoch": 2.054111892387649, "grad_norm": 0.7433446049690247, "learning_rate": 2.565915170042033e-05, "loss": 0.1062, "step": 6719 }, { "epoch": 2.054417609293794, "grad_norm": 0.8659158945083618, "learning_rate": 2.5662972869698127e-05, "loss": 0.0726, "step": 6720 }, { "epoch": 2.0547233261999387, "grad_norm": 0.35377222299575806, "learning_rate": 2.5666794038975926e-05, "loss": 0.0741, "step": 6721 }, { "epoch": 2.055029043106084, "grad_norm": 0.33660244941711426, "learning_rate": 2.5670615208253725e-05, "loss": 0.0653, "step": 6722 }, { "epoch": 2.0553347600122285, "grad_norm": 0.5974368453025818, "learning_rate": 2.5674436377531524e-05, "loss": 0.0652, "step": 6723 }, { "epoch": 2.0556404769183736, "grad_norm": 0.7147430181503296, "learning_rate": 2.5678257546809322e-05, "loss": 0.0767, "step": 6724 }, { "epoch": 2.0559461938245187, "grad_norm": 0.806989312171936, "learning_rate": 2.568207871608712e-05, "loss": 0.0896, "step": 6725 }, { "epoch": 2.0562519107306634, "grad_norm": 0.523047685623169, "learning_rate": 2.568589988536492e-05, "loss": 0.0849, "step": 6726 }, { "epoch": 2.0565576276368085, "grad_norm": 0.4339098632335663, "learning_rate": 2.568972105464272e-05, "loss": 0.0868, "step": 6727 }, { "epoch": 2.056863344542953, "grad_norm": 1.6358782052993774, "learning_rate": 2.569354222392052e-05, "loss": 0.1299, "step": 6728 }, { "epoch": 2.0571690614490983, "grad_norm": 0.5762453079223633, "learning_rate": 2.569736339319832e-05, "loss": 0.0773, "step": 6729 }, { "epoch": 2.057474778355243, "grad_norm": 0.4934680461883545, "learning_rate": 2.5701184562476118e-05, "loss": 0.135, "step": 6730 }, { "epoch": 2.057780495261388, "grad_norm": 0.9491875767707825, "learning_rate": 2.570500573175392e-05, "loss": 0.1336, "step": 6731 }, { "epoch": 2.0580862121675327, "grad_norm": 0.4757545292377472, "learning_rate": 2.570882690103172e-05, "loss": 0.1083, "step": 6732 }, { "epoch": 2.058391929073678, "grad_norm": 0.636071503162384, "learning_rate": 2.5712648070309518e-05, "loss": 0.1241, "step": 6733 }, { "epoch": 2.0586976459798225, "grad_norm": 0.9914324879646301, "learning_rate": 2.5716469239587317e-05, "loss": 0.1638, "step": 6734 }, { "epoch": 2.0590033628859676, "grad_norm": 1.7496415376663208, "learning_rate": 2.5720290408865115e-05, "loss": 0.1865, "step": 6735 }, { "epoch": 2.0593090797921123, "grad_norm": 1.4054816961288452, "learning_rate": 2.5724111578142914e-05, "loss": 0.1902, "step": 6736 }, { "epoch": 2.0596147966982574, "grad_norm": 1.0311442613601685, "learning_rate": 2.5727932747420713e-05, "loss": 0.1881, "step": 6737 }, { "epoch": 2.0599205136044025, "grad_norm": 1.0853776931762695, "learning_rate": 2.573175391669851e-05, "loss": 0.2036, "step": 6738 }, { "epoch": 2.060226230510547, "grad_norm": 0.9772021770477295, "learning_rate": 2.573557508597631e-05, "loss": 0.2432, "step": 6739 }, { "epoch": 2.0605319474166923, "grad_norm": 0.8687800168991089, "learning_rate": 2.573939625525411e-05, "loss": 0.2385, "step": 6740 }, { "epoch": 2.060837664322837, "grad_norm": 1.16692316532135, "learning_rate": 2.5743217424531908e-05, "loss": 0.2249, "step": 6741 }, { "epoch": 2.061143381228982, "grad_norm": 1.6681692600250244, "learning_rate": 2.5747038593809707e-05, "loss": 0.3312, "step": 6742 }, { "epoch": 2.061449098135127, "grad_norm": 0.5839918255805969, "learning_rate": 2.5750859763087505e-05, "loss": 0.1341, "step": 6743 }, { "epoch": 2.061754815041272, "grad_norm": 0.374447762966156, "learning_rate": 2.5754680932365304e-05, "loss": 0.1, "step": 6744 }, { "epoch": 2.0620605319474166, "grad_norm": 0.2635928988456726, "learning_rate": 2.5758502101643103e-05, "loss": 0.0809, "step": 6745 }, { "epoch": 2.0623662488535617, "grad_norm": 0.28196486830711365, "learning_rate": 2.5762323270920905e-05, "loss": 0.0703, "step": 6746 }, { "epoch": 2.0626719657597063, "grad_norm": 0.4242561459541321, "learning_rate": 2.5766144440198704e-05, "loss": 0.0582, "step": 6747 }, { "epoch": 2.0629776826658515, "grad_norm": 0.40872278809547424, "learning_rate": 2.5769965609476502e-05, "loss": 0.0705, "step": 6748 }, { "epoch": 2.063283399571996, "grad_norm": 0.47006359696388245, "learning_rate": 2.57737867787543e-05, "loss": 0.0631, "step": 6749 }, { "epoch": 2.0635891164781412, "grad_norm": 0.5544420480728149, "learning_rate": 2.57776079480321e-05, "loss": 0.0881, "step": 6750 }, { "epoch": 2.0638948333842864, "grad_norm": 0.6994861364364624, "learning_rate": 2.57814291173099e-05, "loss": 0.0631, "step": 6751 }, { "epoch": 2.064200550290431, "grad_norm": 0.5117506980895996, "learning_rate": 2.5785250286587697e-05, "loss": 0.1073, "step": 6752 }, { "epoch": 2.064506267196576, "grad_norm": 0.45650386810302734, "learning_rate": 2.5789071455865496e-05, "loss": 0.1166, "step": 6753 }, { "epoch": 2.064811984102721, "grad_norm": 0.3439253270626068, "learning_rate": 2.5792892625143295e-05, "loss": 0.0847, "step": 6754 }, { "epoch": 2.065117701008866, "grad_norm": 0.3182777464389801, "learning_rate": 2.5796713794421094e-05, "loss": 0.1344, "step": 6755 }, { "epoch": 2.0654234179150106, "grad_norm": 0.6826333999633789, "learning_rate": 2.5800534963698892e-05, "loss": 0.1668, "step": 6756 }, { "epoch": 2.0657291348211557, "grad_norm": 0.613685667514801, "learning_rate": 2.580435613297669e-05, "loss": 0.1241, "step": 6757 }, { "epoch": 2.0660348517273004, "grad_norm": 0.6023954749107361, "learning_rate": 2.580817730225449e-05, "loss": 0.162, "step": 6758 }, { "epoch": 2.0663405686334455, "grad_norm": 0.8192209601402283, "learning_rate": 2.581199847153229e-05, "loss": 0.1667, "step": 6759 }, { "epoch": 2.06664628553959, "grad_norm": 1.5629725456237793, "learning_rate": 2.5815819640810087e-05, "loss": 0.1892, "step": 6760 }, { "epoch": 2.0669520024457353, "grad_norm": 1.0349676609039307, "learning_rate": 2.5819640810087886e-05, "loss": 0.1943, "step": 6761 }, { "epoch": 2.06725771935188, "grad_norm": 0.7384461164474487, "learning_rate": 2.5823461979365688e-05, "loss": 0.1751, "step": 6762 }, { "epoch": 2.067563436258025, "grad_norm": 0.8128156661987305, "learning_rate": 2.5827283148643487e-05, "loss": 0.2015, "step": 6763 }, { "epoch": 2.06786915316417, "grad_norm": 1.6773115396499634, "learning_rate": 2.5831104317921286e-05, "loss": 0.2018, "step": 6764 }, { "epoch": 2.068174870070315, "grad_norm": 1.4344249963760376, "learning_rate": 2.5834925487199084e-05, "loss": 0.2152, "step": 6765 }, { "epoch": 2.06848058697646, "grad_norm": 1.4908709526062012, "learning_rate": 2.5838746656476883e-05, "loss": 0.2911, "step": 6766 }, { "epoch": 2.0687863038826046, "grad_norm": 1.9756039381027222, "learning_rate": 2.5842567825754682e-05, "loss": 0.3536, "step": 6767 }, { "epoch": 2.0690920207887498, "grad_norm": 0.5096313953399658, "learning_rate": 2.584638899503248e-05, "loss": 0.1694, "step": 6768 }, { "epoch": 2.0693977376948944, "grad_norm": 0.4709843099117279, "learning_rate": 2.585021016431028e-05, "loss": 0.0825, "step": 6769 }, { "epoch": 2.0697034546010395, "grad_norm": 0.42089423537254333, "learning_rate": 2.5854031333588078e-05, "loss": 0.0902, "step": 6770 }, { "epoch": 2.070009171507184, "grad_norm": 0.4764067232608795, "learning_rate": 2.5857852502865877e-05, "loss": 0.102, "step": 6771 }, { "epoch": 2.0703148884133293, "grad_norm": 0.3251112997531891, "learning_rate": 2.5861673672143676e-05, "loss": 0.0682, "step": 6772 }, { "epoch": 2.070620605319474, "grad_norm": 0.32745903730392456, "learning_rate": 2.5865494841421474e-05, "loss": 0.069, "step": 6773 }, { "epoch": 2.070926322225619, "grad_norm": 0.3431150019168854, "learning_rate": 2.5869316010699273e-05, "loss": 0.0873, "step": 6774 }, { "epoch": 2.071232039131764, "grad_norm": 0.2877151668071747, "learning_rate": 2.5873137179977072e-05, "loss": 0.0991, "step": 6775 }, { "epoch": 2.071537756037909, "grad_norm": 0.29412123560905457, "learning_rate": 2.587695834925487e-05, "loss": 0.1136, "step": 6776 }, { "epoch": 2.071843472944054, "grad_norm": 0.3276585340499878, "learning_rate": 2.5880779518532673e-05, "loss": 0.0607, "step": 6777 }, { "epoch": 2.0721491898501987, "grad_norm": 0.31688806414604187, "learning_rate": 2.588460068781047e-05, "loss": 0.095, "step": 6778 }, { "epoch": 2.072454906756344, "grad_norm": 0.5324599742889404, "learning_rate": 2.588842185708827e-05, "loss": 0.1164, "step": 6779 }, { "epoch": 2.0727606236624885, "grad_norm": 0.5974122285842896, "learning_rate": 2.589224302636607e-05, "loss": 0.1016, "step": 6780 }, { "epoch": 2.0730663405686336, "grad_norm": 0.4422624409198761, "learning_rate": 2.5896064195643868e-05, "loss": 0.1289, "step": 6781 }, { "epoch": 2.0733720574747783, "grad_norm": 0.5407295227050781, "learning_rate": 2.5899885364921667e-05, "loss": 0.1456, "step": 6782 }, { "epoch": 2.0736777743809234, "grad_norm": 0.6863033771514893, "learning_rate": 2.5903706534199465e-05, "loss": 0.2097, "step": 6783 }, { "epoch": 2.073983491287068, "grad_norm": 0.6764746904373169, "learning_rate": 2.5907527703477264e-05, "loss": 0.2332, "step": 6784 }, { "epoch": 2.074289208193213, "grad_norm": 0.6523457765579224, "learning_rate": 2.5911348872755063e-05, "loss": 0.1873, "step": 6785 }, { "epoch": 2.074594925099358, "grad_norm": 0.9431259632110596, "learning_rate": 2.591517004203286e-05, "loss": 0.1812, "step": 6786 }, { "epoch": 2.074900642005503, "grad_norm": 0.825599193572998, "learning_rate": 2.591899121131066e-05, "loss": 0.2172, "step": 6787 }, { "epoch": 2.0752063589116476, "grad_norm": 0.7800313234329224, "learning_rate": 2.592281238058846e-05, "loss": 0.1942, "step": 6788 }, { "epoch": 2.0755120758177927, "grad_norm": 0.814951479434967, "learning_rate": 2.5926633549866258e-05, "loss": 0.2225, "step": 6789 }, { "epoch": 2.075817792723938, "grad_norm": 0.9177517890930176, "learning_rate": 2.5930454719144057e-05, "loss": 0.2182, "step": 6790 }, { "epoch": 2.0761235096300825, "grad_norm": 1.2576236724853516, "learning_rate": 2.5934275888421855e-05, "loss": 0.2783, "step": 6791 }, { "epoch": 2.0764292265362276, "grad_norm": 1.253940224647522, "learning_rate": 2.5938097057699654e-05, "loss": 0.3131, "step": 6792 }, { "epoch": 2.0767349434423723, "grad_norm": 0.4654493033885956, "learning_rate": 2.5941918226977456e-05, "loss": 0.1462, "step": 6793 }, { "epoch": 2.0770406603485174, "grad_norm": 0.28687694668769836, "learning_rate": 2.5945739396255255e-05, "loss": 0.1058, "step": 6794 }, { "epoch": 2.077346377254662, "grad_norm": 0.2946234345436096, "learning_rate": 2.5949560565533054e-05, "loss": 0.0866, "step": 6795 }, { "epoch": 2.077652094160807, "grad_norm": 0.2519511878490448, "learning_rate": 2.5953381734810852e-05, "loss": 0.0621, "step": 6796 }, { "epoch": 2.077957811066952, "grad_norm": 1.2435311079025269, "learning_rate": 2.595720290408865e-05, "loss": 0.0737, "step": 6797 }, { "epoch": 2.078263527973097, "grad_norm": 0.40179309248924255, "learning_rate": 2.596102407336645e-05, "loss": 0.0771, "step": 6798 }, { "epoch": 2.0785692448792417, "grad_norm": 0.33586451411247253, "learning_rate": 2.596484524264425e-05, "loss": 0.0591, "step": 6799 }, { "epoch": 2.0788749617853868, "grad_norm": 0.37804341316223145, "learning_rate": 2.5968666411922047e-05, "loss": 0.0737, "step": 6800 }, { "epoch": 2.0791806786915314, "grad_norm": 0.43044012784957886, "learning_rate": 2.5972487581199846e-05, "loss": 0.0923, "step": 6801 }, { "epoch": 2.0794863955976766, "grad_norm": 0.5008851289749146, "learning_rate": 2.5976308750477645e-05, "loss": 0.0693, "step": 6802 }, { "epoch": 2.0797921125038217, "grad_norm": 0.45967426896095276, "learning_rate": 2.5980129919755444e-05, "loss": 0.0969, "step": 6803 }, { "epoch": 2.0800978294099663, "grad_norm": 0.486173152923584, "learning_rate": 2.5983951089033242e-05, "loss": 0.1206, "step": 6804 }, { "epoch": 2.0804035463161115, "grad_norm": 0.36592185497283936, "learning_rate": 2.598777225831104e-05, "loss": 0.0908, "step": 6805 }, { "epoch": 2.080709263222256, "grad_norm": 0.3747681677341461, "learning_rate": 2.5991593427588843e-05, "loss": 0.1607, "step": 6806 }, { "epoch": 2.0810149801284012, "grad_norm": 0.4946690499782562, "learning_rate": 2.5995414596866642e-05, "loss": 0.1472, "step": 6807 }, { "epoch": 2.081320697034546, "grad_norm": 0.5864738821983337, "learning_rate": 2.5999235766144444e-05, "loss": 0.1549, "step": 6808 }, { "epoch": 2.081626413940691, "grad_norm": 0.43500077724456787, "learning_rate": 2.6003056935422243e-05, "loss": 0.2092, "step": 6809 }, { "epoch": 2.0819321308468357, "grad_norm": 0.4775599539279938, "learning_rate": 2.600687810470004e-05, "loss": 0.1895, "step": 6810 }, { "epoch": 2.082237847752981, "grad_norm": 1.2293078899383545, "learning_rate": 2.601069927397784e-05, "loss": 0.2018, "step": 6811 }, { "epoch": 2.0825435646591255, "grad_norm": 0.5919601917266846, "learning_rate": 2.601452044325564e-05, "loss": 0.195, "step": 6812 }, { "epoch": 2.0828492815652706, "grad_norm": 0.6490631103515625, "learning_rate": 2.6018341612533438e-05, "loss": 0.1988, "step": 6813 }, { "epoch": 2.0831549984714153, "grad_norm": 0.704454779624939, "learning_rate": 2.6022162781811237e-05, "loss": 0.2053, "step": 6814 }, { "epoch": 2.0834607153775604, "grad_norm": 0.8389390707015991, "learning_rate": 2.6025983951089035e-05, "loss": 0.2538, "step": 6815 }, { "epoch": 2.0837664322837055, "grad_norm": 0.7771681547164917, "learning_rate": 2.6029805120366834e-05, "loss": 0.2911, "step": 6816 }, { "epoch": 2.08407214918985, "grad_norm": 1.6213070154190063, "learning_rate": 2.6033626289644633e-05, "loss": 0.2642, "step": 6817 }, { "epoch": 2.0843778660959953, "grad_norm": 0.3789851665496826, "learning_rate": 2.603744745892243e-05, "loss": 0.156, "step": 6818 }, { "epoch": 2.08468358300214, "grad_norm": 0.28902509808540344, "learning_rate": 2.604126862820023e-05, "loss": 0.0838, "step": 6819 }, { "epoch": 2.084989299908285, "grad_norm": 0.2546181082725525, "learning_rate": 2.604508979747803e-05, "loss": 0.0791, "step": 6820 }, { "epoch": 2.0852950168144297, "grad_norm": 0.3216468393802643, "learning_rate": 2.6048910966755828e-05, "loss": 0.0898, "step": 6821 }, { "epoch": 2.085600733720575, "grad_norm": 0.2780543863773346, "learning_rate": 2.6052732136033626e-05, "loss": 0.0796, "step": 6822 }, { "epoch": 2.0859064506267195, "grad_norm": 0.18596826493740082, "learning_rate": 2.6056553305311425e-05, "loss": 0.0479, "step": 6823 }, { "epoch": 2.0862121675328646, "grad_norm": 0.37503182888031006, "learning_rate": 2.6060374474589227e-05, "loss": 0.0775, "step": 6824 }, { "epoch": 2.0865178844390093, "grad_norm": 0.2954326272010803, "learning_rate": 2.6064195643867026e-05, "loss": 0.0619, "step": 6825 }, { "epoch": 2.0868236013451544, "grad_norm": 0.3089320957660675, "learning_rate": 2.6068016813144825e-05, "loss": 0.0959, "step": 6826 }, { "epoch": 2.087129318251299, "grad_norm": 0.42161768674850464, "learning_rate": 2.6071837982422624e-05, "loss": 0.0934, "step": 6827 }, { "epoch": 2.087435035157444, "grad_norm": 1.0496774911880493, "learning_rate": 2.6075659151700422e-05, "loss": 0.1031, "step": 6828 }, { "epoch": 2.0877407520635893, "grad_norm": 0.3075832426548004, "learning_rate": 2.607948032097822e-05, "loss": 0.0896, "step": 6829 }, { "epoch": 2.088046468969734, "grad_norm": 0.3843171298503876, "learning_rate": 2.608330149025602e-05, "loss": 0.1112, "step": 6830 }, { "epoch": 2.088352185875879, "grad_norm": 0.4106355607509613, "learning_rate": 2.608712265953382e-05, "loss": 0.1608, "step": 6831 }, { "epoch": 2.088657902782024, "grad_norm": 0.44678619503974915, "learning_rate": 2.6090943828811617e-05, "loss": 0.1573, "step": 6832 }, { "epoch": 2.088963619688169, "grad_norm": 0.6193581819534302, "learning_rate": 2.6094764998089416e-05, "loss": 0.1552, "step": 6833 }, { "epoch": 2.0892693365943136, "grad_norm": 0.8274082541465759, "learning_rate": 2.6098586167367215e-05, "loss": 0.1637, "step": 6834 }, { "epoch": 2.0895750535004587, "grad_norm": 0.46938562393188477, "learning_rate": 2.6102407336645014e-05, "loss": 0.1608, "step": 6835 }, { "epoch": 2.0898807704066034, "grad_norm": 0.5941140651702881, "learning_rate": 2.6106228505922812e-05, "loss": 0.1888, "step": 6836 }, { "epoch": 2.0901864873127485, "grad_norm": 0.6796985268592834, "learning_rate": 2.611004967520061e-05, "loss": 0.2041, "step": 6837 }, { "epoch": 2.090492204218893, "grad_norm": 0.8892584443092346, "learning_rate": 2.611387084447841e-05, "loss": 0.2697, "step": 6838 }, { "epoch": 2.0907979211250383, "grad_norm": 0.9827401041984558, "learning_rate": 2.6117692013756212e-05, "loss": 0.1834, "step": 6839 }, { "epoch": 2.091103638031183, "grad_norm": 0.8361902236938477, "learning_rate": 2.612151318303401e-05, "loss": 0.2153, "step": 6840 }, { "epoch": 2.091409354937328, "grad_norm": 1.1609827280044556, "learning_rate": 2.612533435231181e-05, "loss": 0.2499, "step": 6841 }, { "epoch": 2.091715071843473, "grad_norm": 1.6102606058120728, "learning_rate": 2.6129155521589608e-05, "loss": 0.3166, "step": 6842 }, { "epoch": 2.092020788749618, "grad_norm": 0.5419762134552002, "learning_rate": 2.6132976690867407e-05, "loss": 0.1439, "step": 6843 }, { "epoch": 2.092326505655763, "grad_norm": 0.4955834746360779, "learning_rate": 2.6136797860145206e-05, "loss": 0.1202, "step": 6844 }, { "epoch": 2.0926322225619076, "grad_norm": 0.3700505793094635, "learning_rate": 2.6140619029423004e-05, "loss": 0.094, "step": 6845 }, { "epoch": 2.0929379394680527, "grad_norm": 0.23660808801651, "learning_rate": 2.6144440198700803e-05, "loss": 0.0666, "step": 6846 }, { "epoch": 2.0932436563741974, "grad_norm": 0.5385577082633972, "learning_rate": 2.6148261367978602e-05, "loss": 0.0771, "step": 6847 }, { "epoch": 2.0935493732803425, "grad_norm": 0.44602060317993164, "learning_rate": 2.61520825372564e-05, "loss": 0.0734, "step": 6848 }, { "epoch": 2.093855090186487, "grad_norm": 0.2640535831451416, "learning_rate": 2.61559037065342e-05, "loss": 0.059, "step": 6849 }, { "epoch": 2.0941608070926323, "grad_norm": 0.33956247568130493, "learning_rate": 2.6159724875811998e-05, "loss": 0.0687, "step": 6850 }, { "epoch": 2.094466523998777, "grad_norm": 0.4650571048259735, "learning_rate": 2.6163546045089797e-05, "loss": 0.1035, "step": 6851 }, { "epoch": 2.094772240904922, "grad_norm": 0.5701234936714172, "learning_rate": 2.6167367214367596e-05, "loss": 0.0587, "step": 6852 }, { "epoch": 2.0950779578110668, "grad_norm": 0.5277001857757568, "learning_rate": 2.6171188383645394e-05, "loss": 0.1223, "step": 6853 }, { "epoch": 2.095383674717212, "grad_norm": 0.3614307641983032, "learning_rate": 2.6175009552923193e-05, "loss": 0.108, "step": 6854 }, { "epoch": 2.095689391623357, "grad_norm": 0.6265258193016052, "learning_rate": 2.6178830722200995e-05, "loss": 0.1011, "step": 6855 }, { "epoch": 2.0959951085295017, "grad_norm": 0.41705965995788574, "learning_rate": 2.6182651891478794e-05, "loss": 0.1111, "step": 6856 }, { "epoch": 2.0963008254356468, "grad_norm": 0.5079902410507202, "learning_rate": 2.6186473060756593e-05, "loss": 0.1429, "step": 6857 }, { "epoch": 2.0966065423417914, "grad_norm": 0.5769208669662476, "learning_rate": 2.619029423003439e-05, "loss": 0.1708, "step": 6858 }, { "epoch": 2.0969122592479366, "grad_norm": 0.7282539010047913, "learning_rate": 2.619411539931219e-05, "loss": 0.1925, "step": 6859 }, { "epoch": 2.0972179761540812, "grad_norm": 0.6389527916908264, "learning_rate": 2.619793656858999e-05, "loss": 0.2026, "step": 6860 }, { "epoch": 2.0975236930602263, "grad_norm": 0.9689258337020874, "learning_rate": 2.6201757737867788e-05, "loss": 0.1992, "step": 6861 }, { "epoch": 2.097829409966371, "grad_norm": 0.7777717709541321, "learning_rate": 2.6205578907145586e-05, "loss": 0.187, "step": 6862 }, { "epoch": 2.098135126872516, "grad_norm": 0.8017659187316895, "learning_rate": 2.6209400076423385e-05, "loss": 0.2233, "step": 6863 }, { "epoch": 2.098440843778661, "grad_norm": 1.188737392425537, "learning_rate": 2.6213221245701184e-05, "loss": 0.235, "step": 6864 }, { "epoch": 2.098746560684806, "grad_norm": 0.9088931679725647, "learning_rate": 2.6217042414978983e-05, "loss": 0.2198, "step": 6865 }, { "epoch": 2.0990522775909506, "grad_norm": 1.2633986473083496, "learning_rate": 2.622086358425678e-05, "loss": 0.2209, "step": 6866 }, { "epoch": 2.0993579944970957, "grad_norm": 2.4639732837677, "learning_rate": 2.622468475353458e-05, "loss": 0.3502, "step": 6867 }, { "epoch": 2.099663711403241, "grad_norm": 0.5876261591911316, "learning_rate": 2.622850592281238e-05, "loss": 0.1844, "step": 6868 }, { "epoch": 2.0999694283093855, "grad_norm": 0.3139699101448059, "learning_rate": 2.6232327092090178e-05, "loss": 0.109, "step": 6869 }, { "epoch": 2.1002751452155306, "grad_norm": 0.5502463579177856, "learning_rate": 2.623614826136798e-05, "loss": 0.1176, "step": 6870 }, { "epoch": 2.1005808621216753, "grad_norm": 0.35877883434295654, "learning_rate": 2.623996943064578e-05, "loss": 0.0692, "step": 6871 }, { "epoch": 2.1008865790278204, "grad_norm": 0.29018354415893555, "learning_rate": 2.6243790599923577e-05, "loss": 0.0641, "step": 6872 }, { "epoch": 2.101192295933965, "grad_norm": 0.3114745020866394, "learning_rate": 2.6247611769201376e-05, "loss": 0.0822, "step": 6873 }, { "epoch": 2.10149801284011, "grad_norm": 0.42525017261505127, "learning_rate": 2.6251432938479175e-05, "loss": 0.0612, "step": 6874 }, { "epoch": 2.101803729746255, "grad_norm": 0.4253791868686676, "learning_rate": 2.6255254107756974e-05, "loss": 0.1063, "step": 6875 }, { "epoch": 2.1021094466524, "grad_norm": 0.32782065868377686, "learning_rate": 2.6259075277034772e-05, "loss": 0.0628, "step": 6876 }, { "epoch": 2.1024151635585446, "grad_norm": 0.26406991481781006, "learning_rate": 2.626289644631257e-05, "loss": 0.0789, "step": 6877 }, { "epoch": 2.1027208804646897, "grad_norm": 0.47945523262023926, "learning_rate": 2.626671761559037e-05, "loss": 0.1084, "step": 6878 }, { "epoch": 2.1030265973708344, "grad_norm": 0.41119199991226196, "learning_rate": 2.627053878486817e-05, "loss": 0.1009, "step": 6879 }, { "epoch": 2.1033323142769795, "grad_norm": 0.855890154838562, "learning_rate": 2.6274359954145967e-05, "loss": 0.0931, "step": 6880 }, { "epoch": 2.1036380311831246, "grad_norm": 1.0325350761413574, "learning_rate": 2.6278181123423766e-05, "loss": 0.1132, "step": 6881 }, { "epoch": 2.1039437480892693, "grad_norm": 0.5200135707855225, "learning_rate": 2.6282002292701565e-05, "loss": 0.1691, "step": 6882 }, { "epoch": 2.1042494649954144, "grad_norm": 0.9598063826560974, "learning_rate": 2.6285823461979363e-05, "loss": 0.1983, "step": 6883 }, { "epoch": 2.104555181901559, "grad_norm": 0.5309841632843018, "learning_rate": 2.6289644631257166e-05, "loss": 0.2096, "step": 6884 }, { "epoch": 2.104860898807704, "grad_norm": 0.5975992679595947, "learning_rate": 2.6293465800534964e-05, "loss": 0.1981, "step": 6885 }, { "epoch": 2.105166615713849, "grad_norm": 2.169896125793457, "learning_rate": 2.6297286969812766e-05, "loss": 0.2073, "step": 6886 }, { "epoch": 2.105472332619994, "grad_norm": 0.6752058267593384, "learning_rate": 2.6301108139090565e-05, "loss": 0.1934, "step": 6887 }, { "epoch": 2.1057780495261387, "grad_norm": 0.9962291121482849, "learning_rate": 2.6304929308368364e-05, "loss": 0.2238, "step": 6888 }, { "epoch": 2.106083766432284, "grad_norm": 1.1384143829345703, "learning_rate": 2.6308750477646163e-05, "loss": 0.2543, "step": 6889 }, { "epoch": 2.1063894833384285, "grad_norm": 0.9781508445739746, "learning_rate": 2.631257164692396e-05, "loss": 0.2293, "step": 6890 }, { "epoch": 2.1066952002445736, "grad_norm": 1.129151463508606, "learning_rate": 2.631639281620176e-05, "loss": 0.2356, "step": 6891 }, { "epoch": 2.1070009171507182, "grad_norm": 1.2198574542999268, "learning_rate": 2.632021398547956e-05, "loss": 0.2558, "step": 6892 }, { "epoch": 2.1073066340568634, "grad_norm": 0.5213297009468079, "learning_rate": 2.6324035154757358e-05, "loss": 0.1884, "step": 6893 }, { "epoch": 2.1076123509630085, "grad_norm": 0.2815247178077698, "learning_rate": 2.6327856324035156e-05, "loss": 0.0738, "step": 6894 }, { "epoch": 2.107918067869153, "grad_norm": 0.3913421630859375, "learning_rate": 2.6331677493312955e-05, "loss": 0.0871, "step": 6895 }, { "epoch": 2.1082237847752983, "grad_norm": 0.2869618237018585, "learning_rate": 2.6335498662590754e-05, "loss": 0.0733, "step": 6896 }, { "epoch": 2.108529501681443, "grad_norm": 0.2238268256187439, "learning_rate": 2.6339319831868553e-05, "loss": 0.0704, "step": 6897 }, { "epoch": 2.108835218587588, "grad_norm": 0.3679649233818054, "learning_rate": 2.634314100114635e-05, "loss": 0.076, "step": 6898 }, { "epoch": 2.1091409354937327, "grad_norm": 0.3120984733104706, "learning_rate": 2.634696217042415e-05, "loss": 0.0595, "step": 6899 }, { "epoch": 2.109446652399878, "grad_norm": 0.2992878556251526, "learning_rate": 2.635078333970195e-05, "loss": 0.086, "step": 6900 }, { "epoch": 2.1097523693060225, "grad_norm": 0.5389651656150818, "learning_rate": 2.635460450897975e-05, "loss": 0.1024, "step": 6901 }, { "epoch": 2.1100580862121676, "grad_norm": 0.40245169401168823, "learning_rate": 2.635842567825755e-05, "loss": 0.0708, "step": 6902 }, { "epoch": 2.1103638031183123, "grad_norm": 0.40731897950172424, "learning_rate": 2.636224684753535e-05, "loss": 0.1144, "step": 6903 }, { "epoch": 2.1106695200244574, "grad_norm": 0.35747119784355164, "learning_rate": 2.6366068016813147e-05, "loss": 0.0993, "step": 6904 }, { "epoch": 2.110975236930602, "grad_norm": 0.7321627736091614, "learning_rate": 2.6369889186090946e-05, "loss": 0.1006, "step": 6905 }, { "epoch": 2.111280953836747, "grad_norm": 0.45106983184814453, "learning_rate": 2.6373710355368745e-05, "loss": 0.1354, "step": 6906 }, { "epoch": 2.1115866707428923, "grad_norm": 0.4476831555366516, "learning_rate": 2.6377531524646544e-05, "loss": 0.1539, "step": 6907 }, { "epoch": 2.111892387649037, "grad_norm": 0.5596027374267578, "learning_rate": 2.6381352693924342e-05, "loss": 0.1449, "step": 6908 }, { "epoch": 2.112198104555182, "grad_norm": 0.5064534544944763, "learning_rate": 2.638517386320214e-05, "loss": 0.1845, "step": 6909 }, { "epoch": 2.1125038214613268, "grad_norm": 0.5583907961845398, "learning_rate": 2.638899503247994e-05, "loss": 0.1777, "step": 6910 }, { "epoch": 2.112809538367472, "grad_norm": 0.9406691193580627, "learning_rate": 2.639281620175774e-05, "loss": 0.2097, "step": 6911 }, { "epoch": 2.1131152552736165, "grad_norm": 0.8071964979171753, "learning_rate": 2.6396637371035537e-05, "loss": 0.2094, "step": 6912 }, { "epoch": 2.1134209721797617, "grad_norm": 0.8694546818733215, "learning_rate": 2.6400458540313336e-05, "loss": 0.1772, "step": 6913 }, { "epoch": 2.1137266890859063, "grad_norm": 1.0826603174209595, "learning_rate": 2.6404279709591135e-05, "loss": 0.2374, "step": 6914 }, { "epoch": 2.1140324059920514, "grad_norm": 0.8356642127037048, "learning_rate": 2.6408100878868933e-05, "loss": 0.2235, "step": 6915 }, { "epoch": 2.114338122898196, "grad_norm": 0.7580328583717346, "learning_rate": 2.6411922048146732e-05, "loss": 0.211, "step": 6916 }, { "epoch": 2.1146438398043412, "grad_norm": 2.875462770462036, "learning_rate": 2.6415743217424534e-05, "loss": 0.3244, "step": 6917 }, { "epoch": 2.114949556710486, "grad_norm": 0.45573294162750244, "learning_rate": 2.6419564386702333e-05, "loss": 0.1813, "step": 6918 }, { "epoch": 2.115255273616631, "grad_norm": 0.49825748801231384, "learning_rate": 2.6423385555980132e-05, "loss": 0.0997, "step": 6919 }, { "epoch": 2.115560990522776, "grad_norm": 0.4125383496284485, "learning_rate": 2.642720672525793e-05, "loss": 0.0629, "step": 6920 }, { "epoch": 2.115866707428921, "grad_norm": 0.24032363295555115, "learning_rate": 2.643102789453573e-05, "loss": 0.0762, "step": 6921 }, { "epoch": 2.116172424335066, "grad_norm": 0.29140233993530273, "learning_rate": 2.6434849063813528e-05, "loss": 0.059, "step": 6922 }, { "epoch": 2.1164781412412106, "grad_norm": 0.3036983907222748, "learning_rate": 2.6438670233091327e-05, "loss": 0.0718, "step": 6923 }, { "epoch": 2.1167838581473557, "grad_norm": 0.3416826128959656, "learning_rate": 2.6442491402369126e-05, "loss": 0.0612, "step": 6924 }, { "epoch": 2.1170895750535004, "grad_norm": 0.21245039999485016, "learning_rate": 2.6446312571646924e-05, "loss": 0.0743, "step": 6925 }, { "epoch": 2.1173952919596455, "grad_norm": 0.5349218845367432, "learning_rate": 2.6450133740924723e-05, "loss": 0.0901, "step": 6926 }, { "epoch": 2.11770100886579, "grad_norm": 0.3920413553714752, "learning_rate": 2.6453954910202522e-05, "loss": 0.0619, "step": 6927 }, { "epoch": 2.1180067257719353, "grad_norm": 0.330826997756958, "learning_rate": 2.645777607948032e-05, "loss": 0.1086, "step": 6928 }, { "epoch": 2.11831244267808, "grad_norm": 0.33422181010246277, "learning_rate": 2.646159724875812e-05, "loss": 0.1158, "step": 6929 }, { "epoch": 2.118618159584225, "grad_norm": 0.5168736577033997, "learning_rate": 2.6465418418035918e-05, "loss": 0.1436, "step": 6930 }, { "epoch": 2.1189238764903697, "grad_norm": 0.6560452580451965, "learning_rate": 2.6469239587313717e-05, "loss": 0.1212, "step": 6931 }, { "epoch": 2.119229593396515, "grad_norm": 0.5051134824752808, "learning_rate": 2.647306075659152e-05, "loss": 0.1875, "step": 6932 }, { "epoch": 2.11953531030266, "grad_norm": 0.5335291028022766, "learning_rate": 2.6476881925869318e-05, "loss": 0.1845, "step": 6933 }, { "epoch": 2.1198410272088046, "grad_norm": 0.9315981864929199, "learning_rate": 2.6480703095147116e-05, "loss": 0.1993, "step": 6934 }, { "epoch": 2.1201467441149497, "grad_norm": 0.7560873627662659, "learning_rate": 2.6484524264424915e-05, "loss": 0.2549, "step": 6935 }, { "epoch": 2.1204524610210944, "grad_norm": 1.0027633905410767, "learning_rate": 2.6488345433702714e-05, "loss": 0.1895, "step": 6936 }, { "epoch": 2.1207581779272395, "grad_norm": 1.0752484798431396, "learning_rate": 2.6492166602980513e-05, "loss": 0.1894, "step": 6937 }, { "epoch": 2.121063894833384, "grad_norm": 1.288644790649414, "learning_rate": 2.649598777225831e-05, "loss": 0.2433, "step": 6938 }, { "epoch": 2.1213696117395293, "grad_norm": 1.0074563026428223, "learning_rate": 2.649980894153611e-05, "loss": 0.2557, "step": 6939 }, { "epoch": 2.121675328645674, "grad_norm": 0.8319312334060669, "learning_rate": 2.650363011081391e-05, "loss": 0.2384, "step": 6940 }, { "epoch": 2.121981045551819, "grad_norm": 0.9879283308982849, "learning_rate": 2.6507451280091708e-05, "loss": 0.2356, "step": 6941 }, { "epoch": 2.1222867624579638, "grad_norm": 3.6745569705963135, "learning_rate": 2.6511272449369506e-05, "loss": 0.2969, "step": 6942 }, { "epoch": 2.122592479364109, "grad_norm": 0.45262411236763, "learning_rate": 2.6515093618647305e-05, "loss": 0.1579, "step": 6943 }, { "epoch": 2.1228981962702536, "grad_norm": 0.41507694125175476, "learning_rate": 2.6518914787925104e-05, "loss": 0.0864, "step": 6944 }, { "epoch": 2.1232039131763987, "grad_norm": 0.3539709150791168, "learning_rate": 2.6522735957202903e-05, "loss": 0.098, "step": 6945 }, { "epoch": 2.123509630082544, "grad_norm": 0.3166246712207794, "learning_rate": 2.65265571264807e-05, "loss": 0.0668, "step": 6946 }, { "epoch": 2.1238153469886885, "grad_norm": 0.26252204179763794, "learning_rate": 2.65303782957585e-05, "loss": 0.0777, "step": 6947 }, { "epoch": 2.1241210638948336, "grad_norm": 0.29856932163238525, "learning_rate": 2.6534199465036302e-05, "loss": 0.0765, "step": 6948 }, { "epoch": 2.1244267808009782, "grad_norm": 0.6439314484596252, "learning_rate": 2.65380206343141e-05, "loss": 0.0676, "step": 6949 }, { "epoch": 2.1247324977071234, "grad_norm": 0.2550975978374481, "learning_rate": 2.65418418035919e-05, "loss": 0.0508, "step": 6950 }, { "epoch": 2.125038214613268, "grad_norm": 0.2589341104030609, "learning_rate": 2.65456629728697e-05, "loss": 0.0787, "step": 6951 }, { "epoch": 2.125343931519413, "grad_norm": 0.44630974531173706, "learning_rate": 2.6549484142147497e-05, "loss": 0.0897, "step": 6952 }, { "epoch": 2.125649648425558, "grad_norm": 0.34867069125175476, "learning_rate": 2.6553305311425296e-05, "loss": 0.1132, "step": 6953 }, { "epoch": 2.125955365331703, "grad_norm": 0.7955979704856873, "learning_rate": 2.6557126480703095e-05, "loss": 0.1187, "step": 6954 }, { "epoch": 2.1262610822378476, "grad_norm": 0.4910275936126709, "learning_rate": 2.6560947649980893e-05, "loss": 0.109, "step": 6955 }, { "epoch": 2.1265667991439927, "grad_norm": 0.5488490462303162, "learning_rate": 2.6564768819258692e-05, "loss": 0.1753, "step": 6956 }, { "epoch": 2.1268725160501374, "grad_norm": 0.5692638754844666, "learning_rate": 2.656858998853649e-05, "loss": 0.1276, "step": 6957 }, { "epoch": 2.1271782329562825, "grad_norm": 0.5053918361663818, "learning_rate": 2.657241115781429e-05, "loss": 0.1408, "step": 6958 }, { "epoch": 2.1274839498624276, "grad_norm": 0.7601051926612854, "learning_rate": 2.657623232709209e-05, "loss": 0.2375, "step": 6959 }, { "epoch": 2.1277896667685723, "grad_norm": 0.5821571946144104, "learning_rate": 2.6580053496369887e-05, "loss": 0.2126, "step": 6960 }, { "epoch": 2.1280953836747174, "grad_norm": 0.7970068454742432, "learning_rate": 2.6583874665647686e-05, "loss": 0.1909, "step": 6961 }, { "epoch": 2.128401100580862, "grad_norm": 0.9319230318069458, "learning_rate": 2.6587695834925488e-05, "loss": 0.1804, "step": 6962 }, { "epoch": 2.128706817487007, "grad_norm": 0.8972347378730774, "learning_rate": 2.659151700420329e-05, "loss": 0.1838, "step": 6963 }, { "epoch": 2.129012534393152, "grad_norm": 0.700096607208252, "learning_rate": 2.659533817348109e-05, "loss": 0.1931, "step": 6964 }, { "epoch": 2.129318251299297, "grad_norm": 1.0526939630508423, "learning_rate": 2.6599159342758888e-05, "loss": 0.2011, "step": 6965 }, { "epoch": 2.1296239682054416, "grad_norm": 1.4443678855895996, "learning_rate": 2.6602980512036686e-05, "loss": 0.2565, "step": 6966 }, { "epoch": 2.1299296851115868, "grad_norm": 1.276099443435669, "learning_rate": 2.6606801681314485e-05, "loss": 0.2797, "step": 6967 }, { "epoch": 2.1302354020177314, "grad_norm": 0.5873165130615234, "learning_rate": 2.6610622850592284e-05, "loss": 0.1905, "step": 6968 }, { "epoch": 2.1305411189238765, "grad_norm": 0.2794504165649414, "learning_rate": 2.6614444019870083e-05, "loss": 0.115, "step": 6969 }, { "epoch": 2.130846835830021, "grad_norm": 0.3314143419265747, "learning_rate": 2.661826518914788e-05, "loss": 0.0793, "step": 6970 }, { "epoch": 2.1311525527361663, "grad_norm": 0.378760427236557, "learning_rate": 2.662208635842568e-05, "loss": 0.0755, "step": 6971 }, { "epoch": 2.1314582696423114, "grad_norm": 0.3675072193145752, "learning_rate": 2.662590752770348e-05, "loss": 0.0821, "step": 6972 }, { "epoch": 2.131763986548456, "grad_norm": 0.30659225583076477, "learning_rate": 2.6629728696981278e-05, "loss": 0.0629, "step": 6973 }, { "epoch": 2.1320697034546012, "grad_norm": 0.32075273990631104, "learning_rate": 2.6633549866259076e-05, "loss": 0.0735, "step": 6974 }, { "epoch": 2.132375420360746, "grad_norm": 0.71958988904953, "learning_rate": 2.6637371035536875e-05, "loss": 0.0643, "step": 6975 }, { "epoch": 2.132681137266891, "grad_norm": 0.3332720398902893, "learning_rate": 2.6641192204814674e-05, "loss": 0.1092, "step": 6976 }, { "epoch": 2.1329868541730357, "grad_norm": 0.4043932855129242, "learning_rate": 2.6645013374092473e-05, "loss": 0.071, "step": 6977 }, { "epoch": 2.133292571079181, "grad_norm": 0.4333646297454834, "learning_rate": 2.664883454337027e-05, "loss": 0.1088, "step": 6978 }, { "epoch": 2.1335982879853255, "grad_norm": 0.35331836342811584, "learning_rate": 2.6652655712648073e-05, "loss": 0.0805, "step": 6979 }, { "epoch": 2.1339040048914706, "grad_norm": 0.7745158076286316, "learning_rate": 2.6656476881925872e-05, "loss": 0.1157, "step": 6980 }, { "epoch": 2.1342097217976153, "grad_norm": 1.0917710065841675, "learning_rate": 2.666029805120367e-05, "loss": 0.1465, "step": 6981 }, { "epoch": 2.1345154387037604, "grad_norm": 0.6283608675003052, "learning_rate": 2.666411922048147e-05, "loss": 0.1434, "step": 6982 }, { "epoch": 2.134821155609905, "grad_norm": 0.8298198580741882, "learning_rate": 2.666794038975927e-05, "loss": 0.1512, "step": 6983 }, { "epoch": 2.13512687251605, "grad_norm": 0.7431654930114746, "learning_rate": 2.6671761559037067e-05, "loss": 0.1459, "step": 6984 }, { "epoch": 2.1354325894221953, "grad_norm": 1.5376642942428589, "learning_rate": 2.6675582728314866e-05, "loss": 0.2262, "step": 6985 }, { "epoch": 2.13573830632834, "grad_norm": 0.6318380832672119, "learning_rate": 2.6679403897592665e-05, "loss": 0.169, "step": 6986 }, { "epoch": 2.136044023234485, "grad_norm": 1.0633357763290405, "learning_rate": 2.6683225066870463e-05, "loss": 0.196, "step": 6987 }, { "epoch": 2.1363497401406297, "grad_norm": 2.536278009414673, "learning_rate": 2.6687046236148262e-05, "loss": 0.2783, "step": 6988 }, { "epoch": 2.136655457046775, "grad_norm": 1.0339387655258179, "learning_rate": 2.669086740542606e-05, "loss": 0.2213, "step": 6989 }, { "epoch": 2.1369611739529195, "grad_norm": 2.3659017086029053, "learning_rate": 2.669468857470386e-05, "loss": 0.231, "step": 6990 }, { "epoch": 2.1372668908590646, "grad_norm": 2.3328399658203125, "learning_rate": 2.669850974398166e-05, "loss": 0.2809, "step": 6991 }, { "epoch": 2.1375726077652093, "grad_norm": 1.7471154928207397, "learning_rate": 2.6702330913259457e-05, "loss": 0.4028, "step": 6992 }, { "epoch": 2.1378783246713544, "grad_norm": 0.4777012765407562, "learning_rate": 2.6706152082537256e-05, "loss": 0.1697, "step": 6993 }, { "epoch": 2.138184041577499, "grad_norm": 0.5532597303390503, "learning_rate": 2.6709973251815058e-05, "loss": 0.0805, "step": 6994 }, { "epoch": 2.138489758483644, "grad_norm": 0.2533096671104431, "learning_rate": 2.6713794421092857e-05, "loss": 0.0719, "step": 6995 }, { "epoch": 2.138795475389789, "grad_norm": 0.21822184324264526, "learning_rate": 2.6717615590370655e-05, "loss": 0.0721, "step": 6996 }, { "epoch": 2.139101192295934, "grad_norm": 0.8532699942588806, "learning_rate": 2.6721436759648454e-05, "loss": 0.0528, "step": 6997 }, { "epoch": 2.139406909202079, "grad_norm": 0.3337603509426117, "learning_rate": 2.6725257928926253e-05, "loss": 0.0907, "step": 6998 }, { "epoch": 2.1397126261082238, "grad_norm": 0.5079228281974792, "learning_rate": 2.6729079098204052e-05, "loss": 0.1108, "step": 6999 }, { "epoch": 2.140018343014369, "grad_norm": 0.4229185879230499, "learning_rate": 2.673290026748185e-05, "loss": 0.0662, "step": 7000 }, { "epoch": 2.140018343014369, "eval_cer": 0.19358886233947606, "eval_loss": 0.26338818669319153, "eval_runtime": 19.1111, "eval_samples_per_second": 237.454, "eval_steps_per_second": 0.785, "eval_wer": 0.34585626099829614, "step": 7000 }, { "epoch": 2.1403240599205136, "grad_norm": 0.5087936520576477, "learning_rate": 2.673672143675965e-05, "loss": 0.1152, "step": 7001 }, { "epoch": 2.1406297768266587, "grad_norm": 0.5081256031990051, "learning_rate": 2.6740542606037448e-05, "loss": 0.0598, "step": 7002 }, { "epoch": 2.1409354937328033, "grad_norm": 0.3596276342868805, "learning_rate": 2.6744363775315247e-05, "loss": 0.1186, "step": 7003 }, { "epoch": 2.1412412106389485, "grad_norm": 0.5489554405212402, "learning_rate": 2.6748184944593045e-05, "loss": 0.08, "step": 7004 }, { "epoch": 2.141546927545093, "grad_norm": 0.5320497751235962, "learning_rate": 2.6752006113870844e-05, "loss": 0.1012, "step": 7005 }, { "epoch": 2.1418526444512382, "grad_norm": 0.48432278633117676, "learning_rate": 2.6755827283148643e-05, "loss": 0.1444, "step": 7006 }, { "epoch": 2.142158361357383, "grad_norm": 1.337916612625122, "learning_rate": 2.675964845242644e-05, "loss": 0.1159, "step": 7007 }, { "epoch": 2.142464078263528, "grad_norm": 0.5910773873329163, "learning_rate": 2.676346962170424e-05, "loss": 0.1585, "step": 7008 }, { "epoch": 2.1427697951696727, "grad_norm": 0.7172471880912781, "learning_rate": 2.676729079098204e-05, "loss": 0.1684, "step": 7009 }, { "epoch": 2.143075512075818, "grad_norm": 0.561581015586853, "learning_rate": 2.677111196025984e-05, "loss": 0.1898, "step": 7010 }, { "epoch": 2.143381228981963, "grad_norm": 0.8903080821037292, "learning_rate": 2.677493312953764e-05, "loss": 0.1849, "step": 7011 }, { "epoch": 2.1436869458881076, "grad_norm": 0.9199313521385193, "learning_rate": 2.677875429881544e-05, "loss": 0.2203, "step": 7012 }, { "epoch": 2.1439926627942527, "grad_norm": 0.6157388687133789, "learning_rate": 2.6782575468093238e-05, "loss": 0.2359, "step": 7013 }, { "epoch": 2.1442983797003974, "grad_norm": 1.0105575323104858, "learning_rate": 2.6786396637371036e-05, "loss": 0.1926, "step": 7014 }, { "epoch": 2.1446040966065425, "grad_norm": 0.8844746947288513, "learning_rate": 2.6790217806648835e-05, "loss": 0.2563, "step": 7015 }, { "epoch": 2.144909813512687, "grad_norm": 1.4889322519302368, "learning_rate": 2.6794038975926634e-05, "loss": 0.2249, "step": 7016 }, { "epoch": 2.1452155304188323, "grad_norm": 1.8683258295059204, "learning_rate": 2.6797860145204433e-05, "loss": 0.3042, "step": 7017 }, { "epoch": 2.145521247324977, "grad_norm": 0.4170190095901489, "learning_rate": 2.680168131448223e-05, "loss": 0.1463, "step": 7018 }, { "epoch": 2.145826964231122, "grad_norm": 0.29822084307670593, "learning_rate": 2.680550248376003e-05, "loss": 0.0664, "step": 7019 }, { "epoch": 2.1461326811372667, "grad_norm": 0.48026132583618164, "learning_rate": 2.680932365303783e-05, "loss": 0.0767, "step": 7020 }, { "epoch": 2.146438398043412, "grad_norm": 0.5033544898033142, "learning_rate": 2.6813144822315628e-05, "loss": 0.0744, "step": 7021 }, { "epoch": 2.1467441149495565, "grad_norm": 0.3068888187408447, "learning_rate": 2.6816965991593426e-05, "loss": 0.0661, "step": 7022 }, { "epoch": 2.1470498318557016, "grad_norm": 0.3117099702358246, "learning_rate": 2.6820787160871225e-05, "loss": 0.0816, "step": 7023 }, { "epoch": 2.1473555487618468, "grad_norm": 0.324609637260437, "learning_rate": 2.6824608330149024e-05, "loss": 0.0886, "step": 7024 }, { "epoch": 2.1476612656679914, "grad_norm": 0.2682928740978241, "learning_rate": 2.6828429499426826e-05, "loss": 0.0691, "step": 7025 }, { "epoch": 2.1479669825741365, "grad_norm": 0.5692829489707947, "learning_rate": 2.6832250668704625e-05, "loss": 0.1034, "step": 7026 }, { "epoch": 2.148272699480281, "grad_norm": 0.3059932291507721, "learning_rate": 2.6836071837982423e-05, "loss": 0.0869, "step": 7027 }, { "epoch": 2.1485784163864263, "grad_norm": 0.3625854551792145, "learning_rate": 2.6839893007260222e-05, "loss": 0.1035, "step": 7028 }, { "epoch": 2.148884133292571, "grad_norm": 0.41967087984085083, "learning_rate": 2.684371417653802e-05, "loss": 0.112, "step": 7029 }, { "epoch": 2.149189850198716, "grad_norm": 0.43506041169166565, "learning_rate": 2.684753534581582e-05, "loss": 0.1443, "step": 7030 }, { "epoch": 2.149495567104861, "grad_norm": 0.4518330693244934, "learning_rate": 2.685135651509362e-05, "loss": 0.1298, "step": 7031 }, { "epoch": 2.149801284011006, "grad_norm": 0.5377371907234192, "learning_rate": 2.6855177684371417e-05, "loss": 0.1686, "step": 7032 }, { "epoch": 2.1501070009171506, "grad_norm": 0.5521318316459656, "learning_rate": 2.6858998853649216e-05, "loss": 0.1884, "step": 7033 }, { "epoch": 2.1504127178232957, "grad_norm": 0.816204309463501, "learning_rate": 2.6862820022927015e-05, "loss": 0.1954, "step": 7034 }, { "epoch": 2.1507184347294404, "grad_norm": 0.7433658838272095, "learning_rate": 2.6866641192204813e-05, "loss": 0.178, "step": 7035 }, { "epoch": 2.1510241516355855, "grad_norm": 1.4879333972930908, "learning_rate": 2.6870462361482612e-05, "loss": 0.2112, "step": 7036 }, { "epoch": 2.1513298685417306, "grad_norm": 0.8028479218482971, "learning_rate": 2.687428353076041e-05, "loss": 0.2174, "step": 7037 }, { "epoch": 2.1516355854478753, "grad_norm": 1.0387285947799683, "learning_rate": 2.687810470003821e-05, "loss": 0.2475, "step": 7038 }, { "epoch": 2.1519413023540204, "grad_norm": 0.7794132828712463, "learning_rate": 2.688192586931601e-05, "loss": 0.2286, "step": 7039 }, { "epoch": 2.152247019260165, "grad_norm": 1.8582568168640137, "learning_rate": 2.688574703859381e-05, "loss": 0.2184, "step": 7040 }, { "epoch": 2.15255273616631, "grad_norm": 0.7760523557662964, "learning_rate": 2.6889568207871613e-05, "loss": 0.2312, "step": 7041 }, { "epoch": 2.152858453072455, "grad_norm": 1.18756103515625, "learning_rate": 2.689338937714941e-05, "loss": 0.2969, "step": 7042 }, { "epoch": 2.1531641699786, "grad_norm": 0.3944629430770874, "learning_rate": 2.689721054642721e-05, "loss": 0.1567, "step": 7043 }, { "epoch": 2.1534698868847446, "grad_norm": 0.3221830129623413, "learning_rate": 2.690103171570501e-05, "loss": 0.1033, "step": 7044 }, { "epoch": 2.1537756037908897, "grad_norm": 0.30260753631591797, "learning_rate": 2.6904852884982808e-05, "loss": 0.0647, "step": 7045 }, { "epoch": 2.1540813206970344, "grad_norm": 0.39540764689445496, "learning_rate": 2.6908674054260606e-05, "loss": 0.0778, "step": 7046 }, { "epoch": 2.1543870376031795, "grad_norm": 0.30498456954956055, "learning_rate": 2.6912495223538405e-05, "loss": 0.0768, "step": 7047 }, { "epoch": 2.154692754509324, "grad_norm": 0.5253481864929199, "learning_rate": 2.6916316392816204e-05, "loss": 0.0696, "step": 7048 }, { "epoch": 2.1549984714154693, "grad_norm": 0.559775710105896, "learning_rate": 2.6920137562094003e-05, "loss": 0.0739, "step": 7049 }, { "epoch": 2.1553041883216144, "grad_norm": 0.24870631098747253, "learning_rate": 2.69239587313718e-05, "loss": 0.0586, "step": 7050 }, { "epoch": 2.155609905227759, "grad_norm": 0.29455870389938354, "learning_rate": 2.69277799006496e-05, "loss": 0.0726, "step": 7051 }, { "epoch": 2.155915622133904, "grad_norm": 0.36482682824134827, "learning_rate": 2.69316010699274e-05, "loss": 0.0658, "step": 7052 }, { "epoch": 2.156221339040049, "grad_norm": 0.4622988700866699, "learning_rate": 2.6935422239205197e-05, "loss": 0.0875, "step": 7053 }, { "epoch": 2.156527055946194, "grad_norm": 0.3651861846446991, "learning_rate": 2.6939243408482996e-05, "loss": 0.0914, "step": 7054 }, { "epoch": 2.1568327728523387, "grad_norm": 0.6283716559410095, "learning_rate": 2.6943064577760795e-05, "loss": 0.1176, "step": 7055 }, { "epoch": 2.1571384897584838, "grad_norm": 0.6162665486335754, "learning_rate": 2.6946885747038594e-05, "loss": 0.1302, "step": 7056 }, { "epoch": 2.1574442066646284, "grad_norm": 0.629937469959259, "learning_rate": 2.6950706916316396e-05, "loss": 0.1386, "step": 7057 }, { "epoch": 2.1577499235707736, "grad_norm": 0.4702097475528717, "learning_rate": 2.6954528085594195e-05, "loss": 0.157, "step": 7058 }, { "epoch": 2.1580556404769182, "grad_norm": 0.4954817295074463, "learning_rate": 2.6958349254871993e-05, "loss": 0.1691, "step": 7059 }, { "epoch": 2.1583613573830633, "grad_norm": 0.6780310869216919, "learning_rate": 2.6962170424149792e-05, "loss": 0.2217, "step": 7060 }, { "epoch": 2.158667074289208, "grad_norm": 0.5611264109611511, "learning_rate": 2.696599159342759e-05, "loss": 0.1947, "step": 7061 }, { "epoch": 2.158972791195353, "grad_norm": 0.7378785014152527, "learning_rate": 2.696981276270539e-05, "loss": 0.1843, "step": 7062 }, { "epoch": 2.1592785081014982, "grad_norm": 0.9105340242385864, "learning_rate": 2.697363393198319e-05, "loss": 0.2189, "step": 7063 }, { "epoch": 2.159584225007643, "grad_norm": 0.7971762418746948, "learning_rate": 2.6977455101260987e-05, "loss": 0.2068, "step": 7064 }, { "epoch": 2.159889941913788, "grad_norm": 0.7546399235725403, "learning_rate": 2.6981276270538786e-05, "loss": 0.2577, "step": 7065 }, { "epoch": 2.1601956588199327, "grad_norm": 0.9492889642715454, "learning_rate": 2.6985097439816585e-05, "loss": 0.2471, "step": 7066 }, { "epoch": 2.160501375726078, "grad_norm": 1.4225578308105469, "learning_rate": 2.6988918609094383e-05, "loss": 0.2987, "step": 7067 }, { "epoch": 2.1608070926322225, "grad_norm": 0.36814621090888977, "learning_rate": 2.6992739778372182e-05, "loss": 0.1517, "step": 7068 }, { "epoch": 2.1611128095383676, "grad_norm": 0.5136745572090149, "learning_rate": 2.699656094764998e-05, "loss": 0.1269, "step": 7069 }, { "epoch": 2.1614185264445123, "grad_norm": 0.36110639572143555, "learning_rate": 2.700038211692778e-05, "loss": 0.0906, "step": 7070 }, { "epoch": 2.1617242433506574, "grad_norm": 0.29407405853271484, "learning_rate": 2.7004203286205578e-05, "loss": 0.0532, "step": 7071 }, { "epoch": 2.162029960256802, "grad_norm": 0.3129805326461792, "learning_rate": 2.700802445548338e-05, "loss": 0.0783, "step": 7072 }, { "epoch": 2.162335677162947, "grad_norm": 0.5611547231674194, "learning_rate": 2.701184562476118e-05, "loss": 0.1097, "step": 7073 }, { "epoch": 2.162641394069092, "grad_norm": 0.2826860249042511, "learning_rate": 2.7015666794038978e-05, "loss": 0.0866, "step": 7074 }, { "epoch": 2.162947110975237, "grad_norm": 0.2917850911617279, "learning_rate": 2.7019487963316777e-05, "loss": 0.0716, "step": 7075 }, { "epoch": 2.163252827881382, "grad_norm": 0.2822268307209015, "learning_rate": 2.7023309132594575e-05, "loss": 0.1052, "step": 7076 }, { "epoch": 2.1635585447875267, "grad_norm": 0.3741103410720825, "learning_rate": 2.7027130301872374e-05, "loss": 0.0883, "step": 7077 }, { "epoch": 2.163864261693672, "grad_norm": 0.27484238147735596, "learning_rate": 2.7030951471150173e-05, "loss": 0.1074, "step": 7078 }, { "epoch": 2.1641699785998165, "grad_norm": 0.31758996844291687, "learning_rate": 2.703477264042797e-05, "loss": 0.0943, "step": 7079 }, { "epoch": 2.1644756955059616, "grad_norm": 0.3655761480331421, "learning_rate": 2.703859380970577e-05, "loss": 0.1036, "step": 7080 }, { "epoch": 2.1647814124121063, "grad_norm": 0.728262722492218, "learning_rate": 2.704241497898357e-05, "loss": 0.1664, "step": 7081 }, { "epoch": 2.1650871293182514, "grad_norm": 0.3521135747432709, "learning_rate": 2.7046236148261368e-05, "loss": 0.1465, "step": 7082 }, { "epoch": 2.165392846224396, "grad_norm": 0.5440397262573242, "learning_rate": 2.7050057317539167e-05, "loss": 0.176, "step": 7083 }, { "epoch": 2.165698563130541, "grad_norm": 0.6137725710868835, "learning_rate": 2.7053878486816965e-05, "loss": 0.2074, "step": 7084 }, { "epoch": 2.166004280036686, "grad_norm": 0.558089017868042, "learning_rate": 2.7057699656094764e-05, "loss": 0.1563, "step": 7085 }, { "epoch": 2.166309996942831, "grad_norm": 0.7315589189529419, "learning_rate": 2.7061520825372563e-05, "loss": 0.221, "step": 7086 }, { "epoch": 2.1666157138489757, "grad_norm": 0.796095073223114, "learning_rate": 2.7065341994650365e-05, "loss": 0.2231, "step": 7087 }, { "epoch": 2.166921430755121, "grad_norm": 0.8372097611427307, "learning_rate": 2.7069163163928164e-05, "loss": 0.2075, "step": 7088 }, { "epoch": 2.167227147661266, "grad_norm": 0.8248023390769958, "learning_rate": 2.7072984333205962e-05, "loss": 0.215, "step": 7089 }, { "epoch": 2.1675328645674106, "grad_norm": 1.1498732566833496, "learning_rate": 2.707680550248376e-05, "loss": 0.2286, "step": 7090 }, { "epoch": 2.1678385814735557, "grad_norm": 1.818839192390442, "learning_rate": 2.708062667176156e-05, "loss": 0.2542, "step": 7091 }, { "epoch": 2.1681442983797004, "grad_norm": 1.3838227987289429, "learning_rate": 2.708444784103936e-05, "loss": 0.287, "step": 7092 }, { "epoch": 2.1684500152858455, "grad_norm": 0.37427401542663574, "learning_rate": 2.7088269010317157e-05, "loss": 0.1973, "step": 7093 }, { "epoch": 2.16875573219199, "grad_norm": 0.33843696117401123, "learning_rate": 2.7092090179594956e-05, "loss": 0.0928, "step": 7094 }, { "epoch": 2.1690614490981353, "grad_norm": 0.30115818977355957, "learning_rate": 2.7095911348872755e-05, "loss": 0.0766, "step": 7095 }, { "epoch": 2.16936716600428, "grad_norm": 0.3240291178226471, "learning_rate": 2.7099732518150554e-05, "loss": 0.0787, "step": 7096 }, { "epoch": 2.169672882910425, "grad_norm": 0.26563382148742676, "learning_rate": 2.7103553687428352e-05, "loss": 0.0766, "step": 7097 }, { "epoch": 2.1699785998165697, "grad_norm": 0.2918142080307007, "learning_rate": 2.710737485670615e-05, "loss": 0.0726, "step": 7098 }, { "epoch": 2.170284316722715, "grad_norm": 0.3788161873817444, "learning_rate": 2.711119602598395e-05, "loss": 0.0638, "step": 7099 }, { "epoch": 2.1705900336288595, "grad_norm": 0.6464394927024841, "learning_rate": 2.711501719526175e-05, "loss": 0.0905, "step": 7100 }, { "epoch": 2.1708957505350046, "grad_norm": 0.3275757133960724, "learning_rate": 2.7118838364539547e-05, "loss": 0.1082, "step": 7101 }, { "epoch": 2.1712014674411497, "grad_norm": 0.4186345040798187, "learning_rate": 2.7122659533817346e-05, "loss": 0.0917, "step": 7102 }, { "epoch": 2.1715071843472944, "grad_norm": 0.26277974247932434, "learning_rate": 2.7126480703095148e-05, "loss": 0.1052, "step": 7103 }, { "epoch": 2.1718129012534395, "grad_norm": 0.336950421333313, "learning_rate": 2.7130301872372947e-05, "loss": 0.0738, "step": 7104 }, { "epoch": 2.172118618159584, "grad_norm": 0.4453859031200409, "learning_rate": 2.7134123041650746e-05, "loss": 0.0774, "step": 7105 }, { "epoch": 2.1724243350657293, "grad_norm": 0.6827691793441772, "learning_rate": 2.7137944210928545e-05, "loss": 0.1564, "step": 7106 }, { "epoch": 2.172730051971874, "grad_norm": 0.5470419526100159, "learning_rate": 2.7141765380206343e-05, "loss": 0.1298, "step": 7107 }, { "epoch": 2.173035768878019, "grad_norm": 0.502107560634613, "learning_rate": 2.7145586549484142e-05, "loss": 0.1475, "step": 7108 }, { "epoch": 2.1733414857841638, "grad_norm": 0.7546409368515015, "learning_rate": 2.714940771876194e-05, "loss": 0.1553, "step": 7109 }, { "epoch": 2.173647202690309, "grad_norm": 0.6997967958450317, "learning_rate": 2.715322888803974e-05, "loss": 0.2236, "step": 7110 }, { "epoch": 2.1739529195964535, "grad_norm": 0.7194198966026306, "learning_rate": 2.7157050057317538e-05, "loss": 0.1939, "step": 7111 }, { "epoch": 2.1742586365025987, "grad_norm": 0.7931292057037354, "learning_rate": 2.7160871226595337e-05, "loss": 0.2072, "step": 7112 }, { "epoch": 2.1745643534087433, "grad_norm": 0.6083042025566101, "learning_rate": 2.7164692395873136e-05, "loss": 0.241, "step": 7113 }, { "epoch": 2.1748700703148884, "grad_norm": 0.9708653092384338, "learning_rate": 2.7168513565150934e-05, "loss": 0.2086, "step": 7114 }, { "epoch": 2.1751757872210336, "grad_norm": 0.8778173923492432, "learning_rate": 2.7172334734428733e-05, "loss": 0.226, "step": 7115 }, { "epoch": 2.1754815041271782, "grad_norm": 1.0697139501571655, "learning_rate": 2.7176155903706532e-05, "loss": 0.2761, "step": 7116 }, { "epoch": 2.1757872210333233, "grad_norm": 3.068713665008545, "learning_rate": 2.7179977072984334e-05, "loss": 0.3062, "step": 7117 }, { "epoch": 2.176092937939468, "grad_norm": 0.4680692255496979, "learning_rate": 2.7183798242262133e-05, "loss": 0.1733, "step": 7118 }, { "epoch": 2.176398654845613, "grad_norm": 0.23984061181545258, "learning_rate": 2.7187619411539935e-05, "loss": 0.0777, "step": 7119 }, { "epoch": 2.176704371751758, "grad_norm": 0.270263135433197, "learning_rate": 2.7191440580817734e-05, "loss": 0.0749, "step": 7120 }, { "epoch": 2.177010088657903, "grad_norm": 0.39594438672065735, "learning_rate": 2.7195261750095532e-05, "loss": 0.0693, "step": 7121 }, { "epoch": 2.1773158055640476, "grad_norm": 0.25702208280563354, "learning_rate": 2.719908291937333e-05, "loss": 0.059, "step": 7122 }, { "epoch": 2.1776215224701927, "grad_norm": 0.33883151412010193, "learning_rate": 2.720290408865113e-05, "loss": 0.0535, "step": 7123 }, { "epoch": 2.1779272393763374, "grad_norm": 0.504114031791687, "learning_rate": 2.720672525792893e-05, "loss": 0.0768, "step": 7124 }, { "epoch": 2.1782329562824825, "grad_norm": 0.2953636944293976, "learning_rate": 2.7210546427206727e-05, "loss": 0.0621, "step": 7125 }, { "epoch": 2.178538673188627, "grad_norm": 0.39059978723526, "learning_rate": 2.7214367596484526e-05, "loss": 0.1047, "step": 7126 }, { "epoch": 2.1788443900947723, "grad_norm": 1.181030035018921, "learning_rate": 2.7218188765762325e-05, "loss": 0.0778, "step": 7127 }, { "epoch": 2.1791501070009174, "grad_norm": 0.3940613567829132, "learning_rate": 2.7222009935040124e-05, "loss": 0.0979, "step": 7128 }, { "epoch": 2.179455823907062, "grad_norm": 0.3199200928211212, "learning_rate": 2.7225831104317922e-05, "loss": 0.0882, "step": 7129 }, { "epoch": 2.179761540813207, "grad_norm": 0.3347514271736145, "learning_rate": 2.722965227359572e-05, "loss": 0.0852, "step": 7130 }, { "epoch": 2.180067257719352, "grad_norm": 0.6200264692306519, "learning_rate": 2.723347344287352e-05, "loss": 0.146, "step": 7131 }, { "epoch": 2.180372974625497, "grad_norm": 0.5418228507041931, "learning_rate": 2.723729461215132e-05, "loss": 0.1376, "step": 7132 }, { "epoch": 2.1806786915316416, "grad_norm": 0.5005942583084106, "learning_rate": 2.7241115781429117e-05, "loss": 0.145, "step": 7133 }, { "epoch": 2.1809844084377867, "grad_norm": 0.6777404546737671, "learning_rate": 2.724493695070692e-05, "loss": 0.1583, "step": 7134 }, { "epoch": 2.1812901253439314, "grad_norm": 0.4932810962200165, "learning_rate": 2.7248758119984718e-05, "loss": 0.1707, "step": 7135 }, { "epoch": 2.1815958422500765, "grad_norm": 0.5882030725479126, "learning_rate": 2.7252579289262517e-05, "loss": 0.2027, "step": 7136 }, { "epoch": 2.181901559156221, "grad_norm": 0.8856511116027832, "learning_rate": 2.7256400458540316e-05, "loss": 0.2267, "step": 7137 }, { "epoch": 2.1822072760623663, "grad_norm": 1.2178502082824707, "learning_rate": 2.7260221627818115e-05, "loss": 0.2354, "step": 7138 }, { "epoch": 2.182512992968511, "grad_norm": 0.8637135028839111, "learning_rate": 2.7264042797095913e-05, "loss": 0.2418, "step": 7139 }, { "epoch": 2.182818709874656, "grad_norm": 1.0317474603652954, "learning_rate": 2.7267863966373712e-05, "loss": 0.2434, "step": 7140 }, { "epoch": 2.183124426780801, "grad_norm": 0.7719634175300598, "learning_rate": 2.727168513565151e-05, "loss": 0.2273, "step": 7141 }, { "epoch": 2.183430143686946, "grad_norm": 1.6543186902999878, "learning_rate": 2.727550630492931e-05, "loss": 0.3426, "step": 7142 }, { "epoch": 2.183735860593091, "grad_norm": 0.4322933554649353, "learning_rate": 2.7279327474207108e-05, "loss": 0.1978, "step": 7143 }, { "epoch": 2.1840415774992357, "grad_norm": 0.2708527743816376, "learning_rate": 2.7283148643484907e-05, "loss": 0.0893, "step": 7144 }, { "epoch": 2.184347294405381, "grad_norm": 0.44872280955314636, "learning_rate": 2.7286969812762706e-05, "loss": 0.066, "step": 7145 }, { "epoch": 2.1846530113115255, "grad_norm": 0.3324790596961975, "learning_rate": 2.7290790982040504e-05, "loss": 0.1, "step": 7146 }, { "epoch": 2.1849587282176706, "grad_norm": 0.2322012186050415, "learning_rate": 2.7294612151318303e-05, "loss": 0.0653, "step": 7147 }, { "epoch": 2.1852644451238152, "grad_norm": 0.25371795892715454, "learning_rate": 2.7298433320596102e-05, "loss": 0.0685, "step": 7148 }, { "epoch": 2.1855701620299604, "grad_norm": 0.22222928702831268, "learning_rate": 2.73022544898739e-05, "loss": 0.0786, "step": 7149 }, { "epoch": 2.185875878936105, "grad_norm": 0.5356463193893433, "learning_rate": 2.7306075659151703e-05, "loss": 0.0838, "step": 7150 }, { "epoch": 2.18618159584225, "grad_norm": 0.32189616560935974, "learning_rate": 2.73098968284295e-05, "loss": 0.0603, "step": 7151 }, { "epoch": 2.186487312748395, "grad_norm": 0.3270339071750641, "learning_rate": 2.73137179977073e-05, "loss": 0.0891, "step": 7152 }, { "epoch": 2.18679302965454, "grad_norm": 0.333137184381485, "learning_rate": 2.73175391669851e-05, "loss": 0.0902, "step": 7153 }, { "epoch": 2.187098746560685, "grad_norm": 0.3730262517929077, "learning_rate": 2.7321360336262898e-05, "loss": 0.085, "step": 7154 }, { "epoch": 2.1874044634668297, "grad_norm": 0.4018065929412842, "learning_rate": 2.7325181505540697e-05, "loss": 0.089, "step": 7155 }, { "epoch": 2.187710180372975, "grad_norm": 0.43772003054618835, "learning_rate": 2.7329002674818495e-05, "loss": 0.1558, "step": 7156 }, { "epoch": 2.1880158972791195, "grad_norm": 0.5017663836479187, "learning_rate": 2.7332823844096294e-05, "loss": 0.1248, "step": 7157 }, { "epoch": 2.1883216141852646, "grad_norm": 0.600753128528595, "learning_rate": 2.7336645013374093e-05, "loss": 0.1515, "step": 7158 }, { "epoch": 2.1886273310914093, "grad_norm": 0.5117200016975403, "learning_rate": 2.734046618265189e-05, "loss": 0.1668, "step": 7159 }, { "epoch": 2.1889330479975544, "grad_norm": 1.0638618469238281, "learning_rate": 2.734428735192969e-05, "loss": 0.1712, "step": 7160 }, { "epoch": 2.189238764903699, "grad_norm": 0.8487361669540405, "learning_rate": 2.734810852120749e-05, "loss": 0.2001, "step": 7161 }, { "epoch": 2.189544481809844, "grad_norm": 0.6261155605316162, "learning_rate": 2.7351929690485288e-05, "loss": 0.228, "step": 7162 }, { "epoch": 2.189850198715989, "grad_norm": 1.9991259574890137, "learning_rate": 2.7355750859763087e-05, "loss": 0.2038, "step": 7163 }, { "epoch": 2.190155915622134, "grad_norm": 0.8972893357276917, "learning_rate": 2.7359572029040885e-05, "loss": 0.2432, "step": 7164 }, { "epoch": 2.1904616325282786, "grad_norm": 0.8978043794631958, "learning_rate": 2.7363393198318687e-05, "loss": 0.2262, "step": 7165 }, { "epoch": 2.1907673494344237, "grad_norm": 0.7389625310897827, "learning_rate": 2.7367214367596486e-05, "loss": 0.2027, "step": 7166 }, { "epoch": 2.191073066340569, "grad_norm": 1.1283628940582275, "learning_rate": 2.7371035536874285e-05, "loss": 0.2285, "step": 7167 }, { "epoch": 2.1913787832467135, "grad_norm": 0.3722768723964691, "learning_rate": 2.7374856706152084e-05, "loss": 0.1612, "step": 7168 }, { "epoch": 2.1916845001528587, "grad_norm": 0.32076406478881836, "learning_rate": 2.7378677875429882e-05, "loss": 0.0838, "step": 7169 }, { "epoch": 2.1919902170590033, "grad_norm": 0.33558160066604614, "learning_rate": 2.738249904470768e-05, "loss": 0.0762, "step": 7170 }, { "epoch": 2.1922959339651484, "grad_norm": 0.2979694902896881, "learning_rate": 2.738632021398548e-05, "loss": 0.0671, "step": 7171 }, { "epoch": 2.192601650871293, "grad_norm": 0.971225917339325, "learning_rate": 2.739014138326328e-05, "loss": 0.0588, "step": 7172 }, { "epoch": 2.192907367777438, "grad_norm": 0.27886027097702026, "learning_rate": 2.7393962552541077e-05, "loss": 0.0717, "step": 7173 }, { "epoch": 2.193213084683583, "grad_norm": 0.7179349064826965, "learning_rate": 2.7397783721818876e-05, "loss": 0.0688, "step": 7174 }, { "epoch": 2.193518801589728, "grad_norm": 0.21194778382778168, "learning_rate": 2.7401604891096675e-05, "loss": 0.0563, "step": 7175 }, { "epoch": 2.1938245184958727, "grad_norm": 0.480765163898468, "learning_rate": 2.7405426060374474e-05, "loss": 0.0994, "step": 7176 }, { "epoch": 2.194130235402018, "grad_norm": 1.2773317098617554, "learning_rate": 2.7409247229652272e-05, "loss": 0.0941, "step": 7177 }, { "epoch": 2.1944359523081625, "grad_norm": 0.4376058876514435, "learning_rate": 2.741306839893007e-05, "loss": 0.1015, "step": 7178 }, { "epoch": 2.1947416692143076, "grad_norm": 0.401043564081192, "learning_rate": 2.741688956820787e-05, "loss": 0.1, "step": 7179 }, { "epoch": 2.1950473861204527, "grad_norm": 0.45896342396736145, "learning_rate": 2.7420710737485672e-05, "loss": 0.109, "step": 7180 }, { "epoch": 2.1953531030265974, "grad_norm": 0.6167517304420471, "learning_rate": 2.742453190676347e-05, "loss": 0.1603, "step": 7181 }, { "epoch": 2.1956588199327425, "grad_norm": 0.8066892027854919, "learning_rate": 2.742835307604127e-05, "loss": 0.1106, "step": 7182 }, { "epoch": 2.195964536838887, "grad_norm": 0.5835094451904297, "learning_rate": 2.7432174245319068e-05, "loss": 0.1596, "step": 7183 }, { "epoch": 2.1962702537450323, "grad_norm": 3.2929399013519287, "learning_rate": 2.7435995414596867e-05, "loss": 0.1588, "step": 7184 }, { "epoch": 2.196575970651177, "grad_norm": 0.7887065410614014, "learning_rate": 2.7439816583874666e-05, "loss": 0.1834, "step": 7185 }, { "epoch": 2.196881687557322, "grad_norm": 1.203173279762268, "learning_rate": 2.7443637753152464e-05, "loss": 0.1837, "step": 7186 }, { "epoch": 2.1971874044634667, "grad_norm": 0.6757314205169678, "learning_rate": 2.7447458922430263e-05, "loss": 0.2421, "step": 7187 }, { "epoch": 2.197493121369612, "grad_norm": 0.7898409366607666, "learning_rate": 2.7451280091708062e-05, "loss": 0.2002, "step": 7188 }, { "epoch": 2.1977988382757565, "grad_norm": 0.8433424234390259, "learning_rate": 2.745510126098586e-05, "loss": 0.1915, "step": 7189 }, { "epoch": 2.1981045551819016, "grad_norm": 1.820265293121338, "learning_rate": 2.745892243026366e-05, "loss": 0.2341, "step": 7190 }, { "epoch": 2.1984102720880463, "grad_norm": 0.9465093016624451, "learning_rate": 2.7462743599541458e-05, "loss": 0.2227, "step": 7191 }, { "epoch": 2.1987159889941914, "grad_norm": 1.2522106170654297, "learning_rate": 2.7466564768819257e-05, "loss": 0.317, "step": 7192 }, { "epoch": 2.1990217059003365, "grad_norm": 0.5698123574256897, "learning_rate": 2.7470385938097056e-05, "loss": 0.1723, "step": 7193 }, { "epoch": 2.199327422806481, "grad_norm": 0.3204188346862793, "learning_rate": 2.7474207107374854e-05, "loss": 0.09, "step": 7194 }, { "epoch": 2.1996331397126263, "grad_norm": 0.3096972405910492, "learning_rate": 2.7478028276652657e-05, "loss": 0.0805, "step": 7195 }, { "epoch": 2.199938856618771, "grad_norm": 0.2844715118408203, "learning_rate": 2.748184944593046e-05, "loss": 0.0812, "step": 7196 }, { "epoch": 2.200244573524916, "grad_norm": 0.2824420928955078, "learning_rate": 2.7485670615208257e-05, "loss": 0.0583, "step": 7197 }, { "epoch": 2.2005502904310608, "grad_norm": 0.26306891441345215, "learning_rate": 2.7489491784486056e-05, "loss": 0.0668, "step": 7198 }, { "epoch": 2.200856007337206, "grad_norm": 0.5690627694129944, "learning_rate": 2.7493312953763855e-05, "loss": 0.1119, "step": 7199 }, { "epoch": 2.2011617242433505, "grad_norm": 0.48546329140663147, "learning_rate": 2.7497134123041654e-05, "loss": 0.0889, "step": 7200 }, { "epoch": 2.2014674411494957, "grad_norm": 0.5214344263076782, "learning_rate": 2.7500955292319452e-05, "loss": 0.0813, "step": 7201 }, { "epoch": 2.2017731580556403, "grad_norm": 0.2966497838497162, "learning_rate": 2.750477646159725e-05, "loss": 0.0789, "step": 7202 }, { "epoch": 2.2020788749617854, "grad_norm": 0.42909011244773865, "learning_rate": 2.750859763087505e-05, "loss": 0.1106, "step": 7203 }, { "epoch": 2.20238459186793, "grad_norm": 0.3530571460723877, "learning_rate": 2.751241880015285e-05, "loss": 0.0806, "step": 7204 }, { "epoch": 2.2026903087740752, "grad_norm": 0.4273819923400879, "learning_rate": 2.7516239969430647e-05, "loss": 0.0826, "step": 7205 }, { "epoch": 2.2029960256802203, "grad_norm": 0.5833699703216553, "learning_rate": 2.7520061138708446e-05, "loss": 0.1627, "step": 7206 }, { "epoch": 2.203301742586365, "grad_norm": 0.5877130031585693, "learning_rate": 2.7523882307986245e-05, "loss": 0.1387, "step": 7207 }, { "epoch": 2.20360745949251, "grad_norm": 0.8927109837532043, "learning_rate": 2.7527703477264044e-05, "loss": 0.1609, "step": 7208 }, { "epoch": 2.203913176398655, "grad_norm": 0.6019277572631836, "learning_rate": 2.7531524646541842e-05, "loss": 0.1439, "step": 7209 }, { "epoch": 2.2042188933048, "grad_norm": 0.9537140727043152, "learning_rate": 2.753534581581964e-05, "loss": 0.1817, "step": 7210 }, { "epoch": 2.2045246102109446, "grad_norm": 1.4663453102111816, "learning_rate": 2.753916698509744e-05, "loss": 0.2496, "step": 7211 }, { "epoch": 2.2048303271170897, "grad_norm": 0.5973847508430481, "learning_rate": 2.7542988154375242e-05, "loss": 0.1851, "step": 7212 }, { "epoch": 2.2051360440232344, "grad_norm": 1.290939450263977, "learning_rate": 2.754680932365304e-05, "loss": 0.1907, "step": 7213 }, { "epoch": 2.2054417609293795, "grad_norm": 1.0195119380950928, "learning_rate": 2.755063049293084e-05, "loss": 0.2199, "step": 7214 }, { "epoch": 2.205747477835524, "grad_norm": 1.3616552352905273, "learning_rate": 2.7554451662208638e-05, "loss": 0.2159, "step": 7215 }, { "epoch": 2.2060531947416693, "grad_norm": 2.08681058883667, "learning_rate": 2.7558272831486437e-05, "loss": 0.239, "step": 7216 }, { "epoch": 2.206358911647814, "grad_norm": 1.8815938234329224, "learning_rate": 2.7562094000764236e-05, "loss": 0.2929, "step": 7217 }, { "epoch": 2.206664628553959, "grad_norm": 0.6801667213439941, "learning_rate": 2.7565915170042034e-05, "loss": 0.1955, "step": 7218 }, { "epoch": 2.206970345460104, "grad_norm": 0.3896467983722687, "learning_rate": 2.7569736339319833e-05, "loss": 0.1085, "step": 7219 }, { "epoch": 2.207276062366249, "grad_norm": 0.3604697585105896, "learning_rate": 2.7573557508597632e-05, "loss": 0.0597, "step": 7220 }, { "epoch": 2.207581779272394, "grad_norm": 0.3056943714618683, "learning_rate": 2.757737867787543e-05, "loss": 0.0738, "step": 7221 }, { "epoch": 2.2078874961785386, "grad_norm": 0.20985716581344604, "learning_rate": 2.758119984715323e-05, "loss": 0.0439, "step": 7222 }, { "epoch": 2.2081932130846837, "grad_norm": 0.30876147747039795, "learning_rate": 2.7585021016431028e-05, "loss": 0.079, "step": 7223 }, { "epoch": 2.2084989299908284, "grad_norm": 0.3124255836009979, "learning_rate": 2.7588842185708827e-05, "loss": 0.0528, "step": 7224 }, { "epoch": 2.2088046468969735, "grad_norm": 0.5344427227973938, "learning_rate": 2.7592663354986626e-05, "loss": 0.0857, "step": 7225 }, { "epoch": 2.209110363803118, "grad_norm": 0.9410603046417236, "learning_rate": 2.7596484524264424e-05, "loss": 0.0672, "step": 7226 }, { "epoch": 2.2094160807092633, "grad_norm": 0.2507101893424988, "learning_rate": 2.7600305693542226e-05, "loss": 0.0714, "step": 7227 }, { "epoch": 2.209721797615408, "grad_norm": 0.2991536855697632, "learning_rate": 2.7604126862820025e-05, "loss": 0.0799, "step": 7228 }, { "epoch": 2.210027514521553, "grad_norm": 0.6823958158493042, "learning_rate": 2.7607948032097824e-05, "loss": 0.1278, "step": 7229 }, { "epoch": 2.2103332314276978, "grad_norm": 0.5366721749305725, "learning_rate": 2.7611769201375623e-05, "loss": 0.1278, "step": 7230 }, { "epoch": 2.210638948333843, "grad_norm": 0.5016875267028809, "learning_rate": 2.761559037065342e-05, "loss": 0.1262, "step": 7231 }, { "epoch": 2.210944665239988, "grad_norm": 0.5555603504180908, "learning_rate": 2.761941153993122e-05, "loss": 0.1289, "step": 7232 }, { "epoch": 2.2112503821461327, "grad_norm": 0.611578643321991, "learning_rate": 2.762323270920902e-05, "loss": 0.174, "step": 7233 }, { "epoch": 2.211556099052278, "grad_norm": 0.7220152020454407, "learning_rate": 2.7627053878486818e-05, "loss": 0.1814, "step": 7234 }, { "epoch": 2.2118618159584225, "grad_norm": 0.9132217764854431, "learning_rate": 2.7630875047764616e-05, "loss": 0.2205, "step": 7235 }, { "epoch": 2.2121675328645676, "grad_norm": 0.6147518157958984, "learning_rate": 2.7634696217042415e-05, "loss": 0.199, "step": 7236 }, { "epoch": 2.2124732497707122, "grad_norm": 2.346395492553711, "learning_rate": 2.7638517386320214e-05, "loss": 0.1831, "step": 7237 }, { "epoch": 2.2127789666768574, "grad_norm": 0.940519392490387, "learning_rate": 2.7642338555598013e-05, "loss": 0.2328, "step": 7238 }, { "epoch": 2.213084683583002, "grad_norm": 1.078989028930664, "learning_rate": 2.764615972487581e-05, "loss": 0.2244, "step": 7239 }, { "epoch": 2.213390400489147, "grad_norm": 0.9202976822853088, "learning_rate": 2.764998089415361e-05, "loss": 0.2352, "step": 7240 }, { "epoch": 2.213696117395292, "grad_norm": 1.3466460704803467, "learning_rate": 2.765380206343141e-05, "loss": 0.2444, "step": 7241 }, { "epoch": 2.214001834301437, "grad_norm": 1.7804616689682007, "learning_rate": 2.7657623232709208e-05, "loss": 0.332, "step": 7242 }, { "epoch": 2.2143075512075816, "grad_norm": 0.4395348131656647, "learning_rate": 2.766144440198701e-05, "loss": 0.1562, "step": 7243 }, { "epoch": 2.2146132681137267, "grad_norm": 0.3272618353366852, "learning_rate": 2.766526557126481e-05, "loss": 0.098, "step": 7244 }, { "epoch": 2.214918985019872, "grad_norm": 0.36494237184524536, "learning_rate": 2.7669086740542607e-05, "loss": 0.0912, "step": 7245 }, { "epoch": 2.2152247019260165, "grad_norm": 0.3233100473880768, "learning_rate": 2.7672907909820406e-05, "loss": 0.0822, "step": 7246 }, { "epoch": 2.2155304188321616, "grad_norm": 0.296383798122406, "learning_rate": 2.7676729079098205e-05, "loss": 0.075, "step": 7247 }, { "epoch": 2.2158361357383063, "grad_norm": 0.38721176981925964, "learning_rate": 2.7680550248376004e-05, "loss": 0.0581, "step": 7248 }, { "epoch": 2.2161418526444514, "grad_norm": 0.3549882769584656, "learning_rate": 2.7684371417653802e-05, "loss": 0.0721, "step": 7249 }, { "epoch": 2.216447569550596, "grad_norm": 0.3549184501171112, "learning_rate": 2.76881925869316e-05, "loss": 0.0613, "step": 7250 }, { "epoch": 2.216753286456741, "grad_norm": 0.4067099094390869, "learning_rate": 2.76920137562094e-05, "loss": 0.0935, "step": 7251 }, { "epoch": 2.217059003362886, "grad_norm": 0.427254319190979, "learning_rate": 2.76958349254872e-05, "loss": 0.1067, "step": 7252 }, { "epoch": 2.217364720269031, "grad_norm": 0.3354641795158386, "learning_rate": 2.7699656094764997e-05, "loss": 0.0785, "step": 7253 }, { "epoch": 2.2176704371751756, "grad_norm": 0.3503603935241699, "learning_rate": 2.7703477264042796e-05, "loss": 0.0999, "step": 7254 }, { "epoch": 2.2179761540813208, "grad_norm": 0.42070409655570984, "learning_rate": 2.7707298433320595e-05, "loss": 0.1379, "step": 7255 }, { "epoch": 2.2182818709874654, "grad_norm": 1.2508056163787842, "learning_rate": 2.7711119602598393e-05, "loss": 0.1254, "step": 7256 }, { "epoch": 2.2185875878936105, "grad_norm": 0.8744251728057861, "learning_rate": 2.7714940771876192e-05, "loss": 0.178, "step": 7257 }, { "epoch": 2.2188933047997557, "grad_norm": 0.6742433905601501, "learning_rate": 2.7718761941153994e-05, "loss": 0.151, "step": 7258 }, { "epoch": 2.2191990217059003, "grad_norm": 0.6521055102348328, "learning_rate": 2.7722583110431793e-05, "loss": 0.1862, "step": 7259 }, { "epoch": 2.2195047386120454, "grad_norm": 0.7308959364891052, "learning_rate": 2.7726404279709592e-05, "loss": 0.2033, "step": 7260 }, { "epoch": 2.21981045551819, "grad_norm": 0.9065965414047241, "learning_rate": 2.773022544898739e-05, "loss": 0.1897, "step": 7261 }, { "epoch": 2.2201161724243352, "grad_norm": 0.873730480670929, "learning_rate": 2.773404661826519e-05, "loss": 0.2059, "step": 7262 }, { "epoch": 2.22042188933048, "grad_norm": 1.3591368198394775, "learning_rate": 2.7737867787542988e-05, "loss": 0.1872, "step": 7263 }, { "epoch": 2.220727606236625, "grad_norm": 0.8860329985618591, "learning_rate": 2.7741688956820787e-05, "loss": 0.1965, "step": 7264 }, { "epoch": 2.2210333231427697, "grad_norm": 0.7686523795127869, "learning_rate": 2.7745510126098586e-05, "loss": 0.2575, "step": 7265 }, { "epoch": 2.221339040048915, "grad_norm": 1.1418157815933228, "learning_rate": 2.7749331295376384e-05, "loss": 0.2317, "step": 7266 }, { "epoch": 2.2216447569550595, "grad_norm": 1.4913251399993896, "learning_rate": 2.7753152464654183e-05, "loss": 0.3183, "step": 7267 }, { "epoch": 2.2219504738612046, "grad_norm": 0.394113153219223, "learning_rate": 2.7756973633931982e-05, "loss": 0.1562, "step": 7268 }, { "epoch": 2.2222561907673493, "grad_norm": 0.33067479729652405, "learning_rate": 2.776079480320978e-05, "loss": 0.0868, "step": 7269 }, { "epoch": 2.2225619076734944, "grad_norm": 0.43152526021003723, "learning_rate": 2.776461597248758e-05, "loss": 0.0808, "step": 7270 }, { "epoch": 2.2228676245796395, "grad_norm": 0.3086299002170563, "learning_rate": 2.7768437141765378e-05, "loss": 0.0657, "step": 7271 }, { "epoch": 2.223173341485784, "grad_norm": 0.3798903524875641, "learning_rate": 2.7772258311043177e-05, "loss": 0.0671, "step": 7272 }, { "epoch": 2.2234790583919293, "grad_norm": 0.6566299796104431, "learning_rate": 2.777607948032098e-05, "loss": 0.0658, "step": 7273 }, { "epoch": 2.223784775298074, "grad_norm": 0.22796593606472015, "learning_rate": 2.777990064959878e-05, "loss": 0.0744, "step": 7274 }, { "epoch": 2.224090492204219, "grad_norm": 0.3341822922229767, "learning_rate": 2.778372181887658e-05, "loss": 0.0836, "step": 7275 }, { "epoch": 2.2243962091103637, "grad_norm": 0.4848545789718628, "learning_rate": 2.778754298815438e-05, "loss": 0.0789, "step": 7276 }, { "epoch": 2.224701926016509, "grad_norm": 0.4350726008415222, "learning_rate": 2.7791364157432177e-05, "loss": 0.0758, "step": 7277 }, { "epoch": 2.2250076429226535, "grad_norm": 0.4039871096611023, "learning_rate": 2.7795185326709976e-05, "loss": 0.1406, "step": 7278 }, { "epoch": 2.2253133598287986, "grad_norm": 0.3838861286640167, "learning_rate": 2.7799006495987775e-05, "loss": 0.0945, "step": 7279 }, { "epoch": 2.2256190767349433, "grad_norm": 1.1088417768478394, "learning_rate": 2.7802827665265574e-05, "loss": 0.1267, "step": 7280 }, { "epoch": 2.2259247936410884, "grad_norm": 0.8603148460388184, "learning_rate": 2.7806648834543372e-05, "loss": 0.1483, "step": 7281 }, { "epoch": 2.226230510547233, "grad_norm": 0.7709636688232422, "learning_rate": 2.781047000382117e-05, "loss": 0.1259, "step": 7282 }, { "epoch": 2.226536227453378, "grad_norm": 0.5182821750640869, "learning_rate": 2.781429117309897e-05, "loss": 0.1568, "step": 7283 }, { "epoch": 2.226841944359523, "grad_norm": 0.8145864605903625, "learning_rate": 2.781811234237677e-05, "loss": 0.178, "step": 7284 }, { "epoch": 2.227147661265668, "grad_norm": 0.8536222577095032, "learning_rate": 2.7821933511654567e-05, "loss": 0.1661, "step": 7285 }, { "epoch": 2.227453378171813, "grad_norm": 0.8335719108581543, "learning_rate": 2.7825754680932366e-05, "loss": 0.2009, "step": 7286 }, { "epoch": 2.2277590950779578, "grad_norm": 0.7979024648666382, "learning_rate": 2.7829575850210165e-05, "loss": 0.2112, "step": 7287 }, { "epoch": 2.228064811984103, "grad_norm": 0.8626941442489624, "learning_rate": 2.7833397019487963e-05, "loss": 0.1846, "step": 7288 }, { "epoch": 2.2283705288902476, "grad_norm": 1.3038076162338257, "learning_rate": 2.7837218188765766e-05, "loss": 0.2548, "step": 7289 }, { "epoch": 2.2286762457963927, "grad_norm": 2.8833515644073486, "learning_rate": 2.7841039358043564e-05, "loss": 0.251, "step": 7290 }, { "epoch": 2.2289819627025373, "grad_norm": 1.24587881565094, "learning_rate": 2.7844860527321363e-05, "loss": 0.2351, "step": 7291 }, { "epoch": 2.2292876796086825, "grad_norm": 2.0764312744140625, "learning_rate": 2.7848681696599162e-05, "loss": 0.3034, "step": 7292 }, { "epoch": 2.229593396514827, "grad_norm": 0.608111560344696, "learning_rate": 2.785250286587696e-05, "loss": 0.1486, "step": 7293 }, { "epoch": 2.2298991134209722, "grad_norm": 0.4340076446533203, "learning_rate": 2.785632403515476e-05, "loss": 0.1344, "step": 7294 }, { "epoch": 2.230204830327117, "grad_norm": 0.2051907479763031, "learning_rate": 2.7860145204432558e-05, "loss": 0.0759, "step": 7295 }, { "epoch": 2.230510547233262, "grad_norm": 0.31655970215797424, "learning_rate": 2.7863966373710357e-05, "loss": 0.0654, "step": 7296 }, { "epoch": 2.2308162641394067, "grad_norm": 0.4026654362678528, "learning_rate": 2.7867787542988156e-05, "loss": 0.0679, "step": 7297 }, { "epoch": 2.231121981045552, "grad_norm": 0.3816775679588318, "learning_rate": 2.7871608712265954e-05, "loss": 0.0383, "step": 7298 }, { "epoch": 2.231427697951697, "grad_norm": 0.3902985155582428, "learning_rate": 2.7875429881543753e-05, "loss": 0.0995, "step": 7299 }, { "epoch": 2.2317334148578416, "grad_norm": 0.29150256514549255, "learning_rate": 2.7879251050821552e-05, "loss": 0.0695, "step": 7300 }, { "epoch": 2.2320391317639867, "grad_norm": 0.2835788130760193, "learning_rate": 2.788307222009935e-05, "loss": 0.0666, "step": 7301 }, { "epoch": 2.2323448486701314, "grad_norm": 0.3449878394603729, "learning_rate": 2.788689338937715e-05, "loss": 0.0823, "step": 7302 }, { "epoch": 2.2326505655762765, "grad_norm": 0.47967448830604553, "learning_rate": 2.7890714558654948e-05, "loss": 0.1191, "step": 7303 }, { "epoch": 2.232956282482421, "grad_norm": 0.7661290764808655, "learning_rate": 2.7894535727932747e-05, "loss": 0.0799, "step": 7304 }, { "epoch": 2.2332619993885663, "grad_norm": 0.47738632559776306, "learning_rate": 2.789835689721055e-05, "loss": 0.1136, "step": 7305 }, { "epoch": 2.233567716294711, "grad_norm": 0.4540547728538513, "learning_rate": 2.7902178066488348e-05, "loss": 0.1361, "step": 7306 }, { "epoch": 2.233873433200856, "grad_norm": 0.42834874987602234, "learning_rate": 2.7905999235766146e-05, "loss": 0.1331, "step": 7307 }, { "epoch": 2.2341791501070007, "grad_norm": 0.5389931201934814, "learning_rate": 2.7909820405043945e-05, "loss": 0.1625, "step": 7308 }, { "epoch": 2.234484867013146, "grad_norm": 0.584682047367096, "learning_rate": 2.7913641574321744e-05, "loss": 0.1832, "step": 7309 }, { "epoch": 2.2347905839192905, "grad_norm": 0.6902965307235718, "learning_rate": 2.7917462743599543e-05, "loss": 0.1409, "step": 7310 }, { "epoch": 2.2350963008254356, "grad_norm": 1.4165046215057373, "learning_rate": 2.792128391287734e-05, "loss": 0.2175, "step": 7311 }, { "epoch": 2.2354020177315808, "grad_norm": 0.6423725485801697, "learning_rate": 2.792510508215514e-05, "loss": 0.1977, "step": 7312 }, { "epoch": 2.2357077346377254, "grad_norm": 1.2646232843399048, "learning_rate": 2.792892625143294e-05, "loss": 0.1903, "step": 7313 }, { "epoch": 2.2360134515438705, "grad_norm": 1.3004519939422607, "learning_rate": 2.7932747420710738e-05, "loss": 0.2462, "step": 7314 }, { "epoch": 2.236319168450015, "grad_norm": 1.2344284057617188, "learning_rate": 2.7936568589988536e-05, "loss": 0.2652, "step": 7315 }, { "epoch": 2.2366248853561603, "grad_norm": 1.1997939348220825, "learning_rate": 2.7940389759266335e-05, "loss": 0.2536, "step": 7316 }, { "epoch": 2.236930602262305, "grad_norm": 1.4748644828796387, "learning_rate": 2.7944210928544134e-05, "loss": 0.3225, "step": 7317 }, { "epoch": 2.23723631916845, "grad_norm": 0.8794047236442566, "learning_rate": 2.7948032097821933e-05, "loss": 0.1489, "step": 7318 }, { "epoch": 2.237542036074595, "grad_norm": 0.3056824505329132, "learning_rate": 2.795185326709973e-05, "loss": 0.0809, "step": 7319 }, { "epoch": 2.23784775298074, "grad_norm": 0.42853379249572754, "learning_rate": 2.7955674436377533e-05, "loss": 0.1037, "step": 7320 }, { "epoch": 2.2381534698868846, "grad_norm": 0.3359771966934204, "learning_rate": 2.7959495605655332e-05, "loss": 0.0896, "step": 7321 }, { "epoch": 2.2384591867930297, "grad_norm": 0.36654597520828247, "learning_rate": 2.796331677493313e-05, "loss": 0.0552, "step": 7322 }, { "epoch": 2.2387649036991744, "grad_norm": 0.3300630748271942, "learning_rate": 2.796713794421093e-05, "loss": 0.0565, "step": 7323 }, { "epoch": 2.2390706206053195, "grad_norm": 0.39741256833076477, "learning_rate": 2.797095911348873e-05, "loss": 0.0925, "step": 7324 }, { "epoch": 2.2393763375114646, "grad_norm": 0.3888700306415558, "learning_rate": 2.7974780282766527e-05, "loss": 0.0735, "step": 7325 }, { "epoch": 2.2396820544176093, "grad_norm": 0.4464672803878784, "learning_rate": 2.7978601452044326e-05, "loss": 0.0845, "step": 7326 }, { "epoch": 2.2399877713237544, "grad_norm": 0.4521157741546631, "learning_rate": 2.7982422621322125e-05, "loss": 0.0734, "step": 7327 }, { "epoch": 2.240293488229899, "grad_norm": 0.6377056241035461, "learning_rate": 2.7986243790599923e-05, "loss": 0.1368, "step": 7328 }, { "epoch": 2.240599205136044, "grad_norm": 0.885776937007904, "learning_rate": 2.7990064959877722e-05, "loss": 0.0745, "step": 7329 }, { "epoch": 2.240904922042189, "grad_norm": 0.8266817331314087, "learning_rate": 2.799388612915552e-05, "loss": 0.1098, "step": 7330 }, { "epoch": 2.241210638948334, "grad_norm": 0.45338305830955505, "learning_rate": 2.799770729843332e-05, "loss": 0.1202, "step": 7331 }, { "epoch": 2.2415163558544786, "grad_norm": 0.606958270072937, "learning_rate": 2.800152846771112e-05, "loss": 0.135, "step": 7332 }, { "epoch": 2.2418220727606237, "grad_norm": 1.4173721075057983, "learning_rate": 2.8005349636988917e-05, "loss": 0.1696, "step": 7333 }, { "epoch": 2.2421277896667684, "grad_norm": 0.9756584167480469, "learning_rate": 2.8009170806266716e-05, "loss": 0.1575, "step": 7334 }, { "epoch": 2.2424335065729135, "grad_norm": 0.541686475276947, "learning_rate": 2.8012991975544515e-05, "loss": 0.1617, "step": 7335 }, { "epoch": 2.242739223479058, "grad_norm": 1.1004400253295898, "learning_rate": 2.8016813144822317e-05, "loss": 0.1857, "step": 7336 }, { "epoch": 2.2430449403852033, "grad_norm": 0.9703087210655212, "learning_rate": 2.8020634314100116e-05, "loss": 0.1782, "step": 7337 }, { "epoch": 2.2433506572913484, "grad_norm": 1.412490963935852, "learning_rate": 2.8024455483377914e-05, "loss": 0.1891, "step": 7338 }, { "epoch": 2.243656374197493, "grad_norm": 1.4687159061431885, "learning_rate": 2.8028276652655713e-05, "loss": 0.2203, "step": 7339 }, { "epoch": 2.243962091103638, "grad_norm": 1.4687895774841309, "learning_rate": 2.8032097821933512e-05, "loss": 0.2448, "step": 7340 }, { "epoch": 2.244267808009783, "grad_norm": 2.2020580768585205, "learning_rate": 2.803591899121131e-05, "loss": 0.2719, "step": 7341 }, { "epoch": 2.244573524915928, "grad_norm": 2.5708508491516113, "learning_rate": 2.803974016048911e-05, "loss": 0.2715, "step": 7342 }, { "epoch": 2.2448792418220727, "grad_norm": 0.48642420768737793, "learning_rate": 2.8043561329766908e-05, "loss": 0.1657, "step": 7343 }, { "epoch": 2.2451849587282178, "grad_norm": 0.4323895573616028, "learning_rate": 2.8047382499044707e-05, "loss": 0.1059, "step": 7344 }, { "epoch": 2.2454906756343624, "grad_norm": 0.36122190952301025, "learning_rate": 2.8051203668322505e-05, "loss": 0.0672, "step": 7345 }, { "epoch": 2.2457963925405076, "grad_norm": 0.3057980239391327, "learning_rate": 2.8055024837600304e-05, "loss": 0.0735, "step": 7346 }, { "epoch": 2.2461021094466522, "grad_norm": 0.30336466431617737, "learning_rate": 2.8058846006878103e-05, "loss": 0.0644, "step": 7347 }, { "epoch": 2.2464078263527973, "grad_norm": 0.33443936705589294, "learning_rate": 2.8062667176155902e-05, "loss": 0.0596, "step": 7348 }, { "epoch": 2.246713543258942, "grad_norm": 0.43693938851356506, "learning_rate": 2.80664883454337e-05, "loss": 0.0865, "step": 7349 }, { "epoch": 2.247019260165087, "grad_norm": 0.4564915597438812, "learning_rate": 2.8070309514711503e-05, "loss": 0.0695, "step": 7350 }, { "epoch": 2.2473249770712322, "grad_norm": 0.43111562728881836, "learning_rate": 2.8074130683989305e-05, "loss": 0.1011, "step": 7351 }, { "epoch": 2.247630693977377, "grad_norm": 0.46266400814056396, "learning_rate": 2.8077951853267103e-05, "loss": 0.0753, "step": 7352 }, { "epoch": 2.247936410883522, "grad_norm": 0.41317474842071533, "learning_rate": 2.8081773022544902e-05, "loss": 0.0876, "step": 7353 }, { "epoch": 2.2482421277896667, "grad_norm": 0.6488450765609741, "learning_rate": 2.80855941918227e-05, "loss": 0.1131, "step": 7354 }, { "epoch": 2.248547844695812, "grad_norm": 0.7675851583480835, "learning_rate": 2.80894153611005e-05, "loss": 0.1008, "step": 7355 }, { "epoch": 2.2488535616019565, "grad_norm": 0.4870242476463318, "learning_rate": 2.80932365303783e-05, "loss": 0.1197, "step": 7356 }, { "epoch": 2.2491592785081016, "grad_norm": 0.46612492203712463, "learning_rate": 2.8097057699656097e-05, "loss": 0.1725, "step": 7357 }, { "epoch": 2.2494649954142463, "grad_norm": 0.8338161706924438, "learning_rate": 2.8100878868933896e-05, "loss": 0.1704, "step": 7358 }, { "epoch": 2.2497707123203914, "grad_norm": 1.0238538980484009, "learning_rate": 2.8104700038211695e-05, "loss": 0.1812, "step": 7359 }, { "epoch": 2.250076429226536, "grad_norm": 0.6558175683021545, "learning_rate": 2.8108521207489493e-05, "loss": 0.1959, "step": 7360 }, { "epoch": 2.250382146132681, "grad_norm": 0.8441532850265503, "learning_rate": 2.8112342376767292e-05, "loss": 0.2061, "step": 7361 }, { "epoch": 2.2506878630388263, "grad_norm": 0.6366831064224243, "learning_rate": 2.811616354604509e-05, "loss": 0.2038, "step": 7362 }, { "epoch": 2.250993579944971, "grad_norm": 0.8597474694252014, "learning_rate": 2.811998471532289e-05, "loss": 0.2062, "step": 7363 }, { "epoch": 2.2512992968511156, "grad_norm": 0.9908716082572937, "learning_rate": 2.812380588460069e-05, "loss": 0.2361, "step": 7364 }, { "epoch": 2.2516050137572607, "grad_norm": 1.1974499225616455, "learning_rate": 2.8127627053878487e-05, "loss": 0.2146, "step": 7365 }, { "epoch": 2.251910730663406, "grad_norm": 1.0907732248306274, "learning_rate": 2.8131448223156286e-05, "loss": 0.2594, "step": 7366 }, { "epoch": 2.2522164475695505, "grad_norm": 1.4752464294433594, "learning_rate": 2.8135269392434088e-05, "loss": 0.2966, "step": 7367 }, { "epoch": 2.2525221644756956, "grad_norm": 0.44088220596313477, "learning_rate": 2.8139090561711887e-05, "loss": 0.1817, "step": 7368 }, { "epoch": 2.2528278813818403, "grad_norm": 0.3711060881614685, "learning_rate": 2.8142911730989686e-05, "loss": 0.0904, "step": 7369 }, { "epoch": 2.2531335982879854, "grad_norm": 0.5027047395706177, "learning_rate": 2.8146732900267484e-05, "loss": 0.1158, "step": 7370 }, { "epoch": 2.25343931519413, "grad_norm": 0.34329280257225037, "learning_rate": 2.8150554069545283e-05, "loss": 0.0739, "step": 7371 }, { "epoch": 2.253745032100275, "grad_norm": 0.5952829122543335, "learning_rate": 2.8154375238823082e-05, "loss": 0.1009, "step": 7372 }, { "epoch": 2.25405074900642, "grad_norm": 0.253520131111145, "learning_rate": 2.815819640810088e-05, "loss": 0.0623, "step": 7373 }, { "epoch": 2.254356465912565, "grad_norm": 0.3138020634651184, "learning_rate": 2.816201757737868e-05, "loss": 0.0657, "step": 7374 }, { "epoch": 2.25466218281871, "grad_norm": 0.25575369596481323, "learning_rate": 2.8165838746656478e-05, "loss": 0.0712, "step": 7375 }, { "epoch": 2.254967899724855, "grad_norm": 0.27907392382621765, "learning_rate": 2.8169659915934277e-05, "loss": 0.0845, "step": 7376 }, { "epoch": 2.2552736166309995, "grad_norm": 0.470973402261734, "learning_rate": 2.8173481085212075e-05, "loss": 0.0871, "step": 7377 }, { "epoch": 2.2555793335371446, "grad_norm": 0.49163827300071716, "learning_rate": 2.8177302254489874e-05, "loss": 0.1216, "step": 7378 }, { "epoch": 2.2558850504432897, "grad_norm": 0.3077130913734436, "learning_rate": 2.8181123423767673e-05, "loss": 0.0872, "step": 7379 }, { "epoch": 2.2561907673494344, "grad_norm": 0.8408883810043335, "learning_rate": 2.8184944593045472e-05, "loss": 0.1307, "step": 7380 }, { "epoch": 2.2564964842555795, "grad_norm": 0.717801034450531, "learning_rate": 2.818876576232327e-05, "loss": 0.1057, "step": 7381 }, { "epoch": 2.256802201161724, "grad_norm": 0.8257322311401367, "learning_rate": 2.8192586931601073e-05, "loss": 0.1407, "step": 7382 }, { "epoch": 2.2571079180678693, "grad_norm": 0.6437109708786011, "learning_rate": 2.819640810087887e-05, "loss": 0.1626, "step": 7383 }, { "epoch": 2.257413634974014, "grad_norm": 0.7860509157180786, "learning_rate": 2.820022927015667e-05, "loss": 0.1993, "step": 7384 }, { "epoch": 2.257719351880159, "grad_norm": 0.8053140640258789, "learning_rate": 2.820405043943447e-05, "loss": 0.1863, "step": 7385 }, { "epoch": 2.2580250687863037, "grad_norm": 0.8977100253105164, "learning_rate": 2.8207871608712268e-05, "loss": 0.2171, "step": 7386 }, { "epoch": 2.258330785692449, "grad_norm": 0.7595359086990356, "learning_rate": 2.8211692777990066e-05, "loss": 0.204, "step": 7387 }, { "epoch": 2.258636502598594, "grad_norm": 0.7271461486816406, "learning_rate": 2.8215513947267865e-05, "loss": 0.2094, "step": 7388 }, { "epoch": 2.2589422195047386, "grad_norm": 0.7832782864570618, "learning_rate": 2.8219335116545664e-05, "loss": 0.209, "step": 7389 }, { "epoch": 2.2592479364108833, "grad_norm": 0.8457315564155579, "learning_rate": 2.8223156285823463e-05, "loss": 0.2319, "step": 7390 }, { "epoch": 2.2595536533170284, "grad_norm": 1.8064745664596558, "learning_rate": 2.822697745510126e-05, "loss": 0.2841, "step": 7391 }, { "epoch": 2.2598593702231735, "grad_norm": 1.7350587844848633, "learning_rate": 2.823079862437906e-05, "loss": 0.3073, "step": 7392 }, { "epoch": 2.260165087129318, "grad_norm": 0.43555137515068054, "learning_rate": 2.823461979365686e-05, "loss": 0.1674, "step": 7393 }, { "epoch": 2.2604708040354633, "grad_norm": 0.31527405977249146, "learning_rate": 2.8238440962934658e-05, "loss": 0.1063, "step": 7394 }, { "epoch": 2.260776520941608, "grad_norm": 0.32739681005477905, "learning_rate": 2.8242262132212456e-05, "loss": 0.1014, "step": 7395 }, { "epoch": 2.261082237847753, "grad_norm": 0.645633339881897, "learning_rate": 2.8246083301490255e-05, "loss": 0.0614, "step": 7396 }, { "epoch": 2.2613879547538978, "grad_norm": 1.0098249912261963, "learning_rate": 2.8249904470768054e-05, "loss": 0.071, "step": 7397 }, { "epoch": 2.261693671660043, "grad_norm": 0.38260024785995483, "learning_rate": 2.8253725640045856e-05, "loss": 0.0539, "step": 7398 }, { "epoch": 2.2619993885661875, "grad_norm": 0.3118835985660553, "learning_rate": 2.8257546809323655e-05, "loss": 0.0582, "step": 7399 }, { "epoch": 2.2623051054723327, "grad_norm": 0.22657616436481476, "learning_rate": 2.8261367978601453e-05, "loss": 0.0702, "step": 7400 }, { "epoch": 2.2626108223784778, "grad_norm": 0.3477165400981903, "learning_rate": 2.8265189147879252e-05, "loss": 0.0732, "step": 7401 }, { "epoch": 2.2629165392846224, "grad_norm": 0.5072835087776184, "learning_rate": 2.826901031715705e-05, "loss": 0.0835, "step": 7402 }, { "epoch": 2.263222256190767, "grad_norm": 0.4893054962158203, "learning_rate": 2.827283148643485e-05, "loss": 0.1196, "step": 7403 }, { "epoch": 2.2635279730969122, "grad_norm": 1.06656014919281, "learning_rate": 2.827665265571265e-05, "loss": 0.1055, "step": 7404 }, { "epoch": 2.2638336900030573, "grad_norm": 0.381242960691452, "learning_rate": 2.8280473824990447e-05, "loss": 0.1027, "step": 7405 }, { "epoch": 2.264139406909202, "grad_norm": 0.5036057829856873, "learning_rate": 2.8284294994268246e-05, "loss": 0.1245, "step": 7406 }, { "epoch": 2.264445123815347, "grad_norm": 2.9906630516052246, "learning_rate": 2.8288116163546045e-05, "loss": 0.1488, "step": 7407 }, { "epoch": 2.264750840721492, "grad_norm": 0.6812462210655212, "learning_rate": 2.8291937332823843e-05, "loss": 0.1772, "step": 7408 }, { "epoch": 2.265056557627637, "grad_norm": 1.031022071838379, "learning_rate": 2.8295758502101642e-05, "loss": 0.1795, "step": 7409 }, { "epoch": 2.2653622745337816, "grad_norm": 0.734355628490448, "learning_rate": 2.829957967137944e-05, "loss": 0.19, "step": 7410 }, { "epoch": 2.2656679914399267, "grad_norm": 0.9828577041625977, "learning_rate": 2.830340084065724e-05, "loss": 0.195, "step": 7411 }, { "epoch": 2.2659737083460714, "grad_norm": 0.8663661479949951, "learning_rate": 2.8307222009935038e-05, "loss": 0.2237, "step": 7412 }, { "epoch": 2.2662794252522165, "grad_norm": 0.7535672187805176, "learning_rate": 2.831104317921284e-05, "loss": 0.2047, "step": 7413 }, { "epoch": 2.2665851421583616, "grad_norm": 0.8889128565788269, "learning_rate": 2.831486434849064e-05, "loss": 0.1919, "step": 7414 }, { "epoch": 2.2668908590645063, "grad_norm": 0.8428142666816711, "learning_rate": 2.8318685517768438e-05, "loss": 0.1849, "step": 7415 }, { "epoch": 2.267196575970651, "grad_norm": 1.5160380601882935, "learning_rate": 2.8322506687046237e-05, "loss": 0.2357, "step": 7416 }, { "epoch": 2.267502292876796, "grad_norm": 2.317176580429077, "learning_rate": 2.8326327856324035e-05, "loss": 0.3044, "step": 7417 }, { "epoch": 2.267808009782941, "grad_norm": 2.525517225265503, "learning_rate": 2.8330149025601834e-05, "loss": 0.1695, "step": 7418 }, { "epoch": 2.268113726689086, "grad_norm": 0.34109076857566833, "learning_rate": 2.8333970194879633e-05, "loss": 0.0999, "step": 7419 }, { "epoch": 2.268419443595231, "grad_norm": 0.42694664001464844, "learning_rate": 2.833779136415743e-05, "loss": 0.0824, "step": 7420 }, { "epoch": 2.2687251605013756, "grad_norm": 0.29348838329315186, "learning_rate": 2.834161253343523e-05, "loss": 0.0586, "step": 7421 }, { "epoch": 2.2690308774075207, "grad_norm": 0.267353355884552, "learning_rate": 2.834543370271303e-05, "loss": 0.0723, "step": 7422 }, { "epoch": 2.2693365943136654, "grad_norm": 0.4615834951400757, "learning_rate": 2.8349254871990828e-05, "loss": 0.0515, "step": 7423 }, { "epoch": 2.2696423112198105, "grad_norm": 0.6454854011535645, "learning_rate": 2.8353076041268627e-05, "loss": 0.0757, "step": 7424 }, { "epoch": 2.269948028125955, "grad_norm": 0.3614984452724457, "learning_rate": 2.8356897210546425e-05, "loss": 0.0839, "step": 7425 }, { "epoch": 2.2702537450321003, "grad_norm": 0.32900500297546387, "learning_rate": 2.8360718379824224e-05, "loss": 0.0717, "step": 7426 }, { "epoch": 2.2705594619382454, "grad_norm": 0.3135313093662262, "learning_rate": 2.8364539549102023e-05, "loss": 0.081, "step": 7427 }, { "epoch": 2.27086517884439, "grad_norm": 0.8695619106292725, "learning_rate": 2.8368360718379825e-05, "loss": 0.1231, "step": 7428 }, { "epoch": 2.2711708957505348, "grad_norm": 0.44079065322875977, "learning_rate": 2.8372181887657627e-05, "loss": 0.1157, "step": 7429 }, { "epoch": 2.27147661265668, "grad_norm": 0.39970526099205017, "learning_rate": 2.8376003056935426e-05, "loss": 0.1288, "step": 7430 }, { "epoch": 2.271782329562825, "grad_norm": 1.0098403692245483, "learning_rate": 2.8379824226213225e-05, "loss": 0.0962, "step": 7431 }, { "epoch": 2.2720880464689697, "grad_norm": 1.7636184692382812, "learning_rate": 2.8383645395491023e-05, "loss": 0.1943, "step": 7432 }, { "epoch": 2.272393763375115, "grad_norm": 1.1189097166061401, "learning_rate": 2.8387466564768822e-05, "loss": 0.1512, "step": 7433 }, { "epoch": 2.2726994802812595, "grad_norm": 0.6449055075645447, "learning_rate": 2.839128773404662e-05, "loss": 0.1837, "step": 7434 }, { "epoch": 2.2730051971874046, "grad_norm": 1.6566085815429688, "learning_rate": 2.839510890332442e-05, "loss": 0.1775, "step": 7435 }, { "epoch": 2.2733109140935492, "grad_norm": 1.315734624862671, "learning_rate": 2.839893007260222e-05, "loss": 0.2045, "step": 7436 }, { "epoch": 2.2736166309996944, "grad_norm": 1.1309235095977783, "learning_rate": 2.8402751241880017e-05, "loss": 0.2484, "step": 7437 }, { "epoch": 2.273922347905839, "grad_norm": 0.6738730669021606, "learning_rate": 2.8406572411157816e-05, "loss": 0.2083, "step": 7438 }, { "epoch": 2.274228064811984, "grad_norm": 0.9136133193969727, "learning_rate": 2.8410393580435615e-05, "loss": 0.2289, "step": 7439 }, { "epoch": 2.2745337817181293, "grad_norm": 1.242142915725708, "learning_rate": 2.8414214749713413e-05, "loss": 0.2592, "step": 7440 }, { "epoch": 2.274839498624274, "grad_norm": 1.1244701147079468, "learning_rate": 2.8418035918991212e-05, "loss": 0.2377, "step": 7441 }, { "epoch": 2.2751452155304186, "grad_norm": 1.570652961730957, "learning_rate": 2.842185708826901e-05, "loss": 0.2732, "step": 7442 }, { "epoch": 2.2754509324365637, "grad_norm": 0.6195098161697388, "learning_rate": 2.842567825754681e-05, "loss": 0.1745, "step": 7443 }, { "epoch": 2.275756649342709, "grad_norm": 0.3875739872455597, "learning_rate": 2.842949942682461e-05, "loss": 0.0988, "step": 7444 }, { "epoch": 2.2760623662488535, "grad_norm": 0.3012505769729614, "learning_rate": 2.843332059610241e-05, "loss": 0.0827, "step": 7445 }, { "epoch": 2.2763680831549986, "grad_norm": 0.49913525581359863, "learning_rate": 2.843714176538021e-05, "loss": 0.0873, "step": 7446 }, { "epoch": 2.2766738000611433, "grad_norm": 0.3412223756313324, "learning_rate": 2.8440962934658008e-05, "loss": 0.0701, "step": 7447 }, { "epoch": 2.2769795169672884, "grad_norm": 0.3033045530319214, "learning_rate": 2.8444784103935807e-05, "loss": 0.0665, "step": 7448 }, { "epoch": 2.277285233873433, "grad_norm": 0.46778303384780884, "learning_rate": 2.8448605273213605e-05, "loss": 0.0578, "step": 7449 }, { "epoch": 2.277590950779578, "grad_norm": 0.32937702536582947, "learning_rate": 2.8452426442491404e-05, "loss": 0.0826, "step": 7450 }, { "epoch": 2.277896667685723, "grad_norm": 0.237934872508049, "learning_rate": 2.8456247611769203e-05, "loss": 0.0646, "step": 7451 }, { "epoch": 2.278202384591868, "grad_norm": 0.46363818645477295, "learning_rate": 2.8460068781047e-05, "loss": 0.0631, "step": 7452 }, { "epoch": 2.278508101498013, "grad_norm": 0.37032270431518555, "learning_rate": 2.84638899503248e-05, "loss": 0.1196, "step": 7453 }, { "epoch": 2.2788138184041578, "grad_norm": 0.38839012384414673, "learning_rate": 2.84677111196026e-05, "loss": 0.1041, "step": 7454 }, { "epoch": 2.2791195353103024, "grad_norm": 0.4413239061832428, "learning_rate": 2.8471532288880398e-05, "loss": 0.0872, "step": 7455 }, { "epoch": 2.2794252522164475, "grad_norm": 0.48668286204338074, "learning_rate": 2.8475353458158197e-05, "loss": 0.1085, "step": 7456 }, { "epoch": 2.2797309691225927, "grad_norm": 0.7056439518928528, "learning_rate": 2.8479174627435995e-05, "loss": 0.169, "step": 7457 }, { "epoch": 2.2800366860287373, "grad_norm": 0.6685630679130554, "learning_rate": 2.8482995796713794e-05, "loss": 0.1807, "step": 7458 }, { "epoch": 2.2803424029348824, "grad_norm": 0.7288064360618591, "learning_rate": 2.8486816965991593e-05, "loss": 0.2016, "step": 7459 }, { "epoch": 2.280648119841027, "grad_norm": 0.6542525291442871, "learning_rate": 2.8490638135269395e-05, "loss": 0.1826, "step": 7460 }, { "epoch": 2.2809538367471722, "grad_norm": 1.0395879745483398, "learning_rate": 2.8494459304547194e-05, "loss": 0.2051, "step": 7461 }, { "epoch": 2.281259553653317, "grad_norm": 0.6941827535629272, "learning_rate": 2.8498280473824992e-05, "loss": 0.2087, "step": 7462 }, { "epoch": 2.281565270559462, "grad_norm": 0.6712960004806519, "learning_rate": 2.850210164310279e-05, "loss": 0.2018, "step": 7463 }, { "epoch": 2.2818709874656067, "grad_norm": 0.8865637183189392, "learning_rate": 2.850592281238059e-05, "loss": 0.2166, "step": 7464 }, { "epoch": 2.282176704371752, "grad_norm": 0.8303442597389221, "learning_rate": 2.850974398165839e-05, "loss": 0.2137, "step": 7465 }, { "epoch": 2.282482421277897, "grad_norm": 1.1301701068878174, "learning_rate": 2.8513565150936187e-05, "loss": 0.2547, "step": 7466 }, { "epoch": 2.2827881381840416, "grad_norm": 2.704989433288574, "learning_rate": 2.8517386320213986e-05, "loss": 0.3195, "step": 7467 }, { "epoch": 2.2830938550901863, "grad_norm": 0.38451674580574036, "learning_rate": 2.8521207489491785e-05, "loss": 0.1521, "step": 7468 }, { "epoch": 2.2833995719963314, "grad_norm": 0.3128558099269867, "learning_rate": 2.8525028658769584e-05, "loss": 0.0927, "step": 7469 }, { "epoch": 2.2837052889024765, "grad_norm": 0.2775086760520935, "learning_rate": 2.8528849828047382e-05, "loss": 0.0748, "step": 7470 }, { "epoch": 2.284011005808621, "grad_norm": 0.28951963782310486, "learning_rate": 2.853267099732518e-05, "loss": 0.105, "step": 7471 }, { "epoch": 2.2843167227147663, "grad_norm": 0.4539552628993988, "learning_rate": 2.853649216660298e-05, "loss": 0.0752, "step": 7472 }, { "epoch": 2.284622439620911, "grad_norm": 0.3506033718585968, "learning_rate": 2.854031333588078e-05, "loss": 0.0818, "step": 7473 }, { "epoch": 2.284928156527056, "grad_norm": 0.4658331274986267, "learning_rate": 2.8544134505158577e-05, "loss": 0.0739, "step": 7474 }, { "epoch": 2.2852338734332007, "grad_norm": 0.7144290208816528, "learning_rate": 2.854795567443638e-05, "loss": 0.0722, "step": 7475 }, { "epoch": 2.285539590339346, "grad_norm": 0.6622812151908875, "learning_rate": 2.8551776843714178e-05, "loss": 0.0684, "step": 7476 }, { "epoch": 2.2858453072454905, "grad_norm": 0.7743318676948547, "learning_rate": 2.8555598012991977e-05, "loss": 0.0892, "step": 7477 }, { "epoch": 2.2861510241516356, "grad_norm": 0.34522339701652527, "learning_rate": 2.8559419182269776e-05, "loss": 0.1063, "step": 7478 }, { "epoch": 2.2864567410577807, "grad_norm": 0.40159496665000916, "learning_rate": 2.8563240351547575e-05, "loss": 0.0958, "step": 7479 }, { "epoch": 2.2867624579639254, "grad_norm": 0.7242107391357422, "learning_rate": 2.8567061520825373e-05, "loss": 0.1224, "step": 7480 }, { "epoch": 2.28706817487007, "grad_norm": 0.6342408061027527, "learning_rate": 2.8570882690103172e-05, "loss": 0.125, "step": 7481 }, { "epoch": 2.287373891776215, "grad_norm": 0.48364225029945374, "learning_rate": 2.857470385938097e-05, "loss": 0.178, "step": 7482 }, { "epoch": 2.2876796086823603, "grad_norm": 0.6203823685646057, "learning_rate": 2.857852502865877e-05, "loss": 0.1911, "step": 7483 }, { "epoch": 2.287985325588505, "grad_norm": 1.1699111461639404, "learning_rate": 2.8582346197936568e-05, "loss": 0.2033, "step": 7484 }, { "epoch": 2.28829104249465, "grad_norm": 0.9143407940864563, "learning_rate": 2.8586167367214367e-05, "loss": 0.1951, "step": 7485 }, { "epoch": 2.2885967594007948, "grad_norm": 0.7229843139648438, "learning_rate": 2.8589988536492166e-05, "loss": 0.1936, "step": 7486 }, { "epoch": 2.28890247630694, "grad_norm": 0.8904842734336853, "learning_rate": 2.8593809705769964e-05, "loss": 0.1995, "step": 7487 }, { "epoch": 2.2892081932130846, "grad_norm": 1.5514459609985352, "learning_rate": 2.8597630875047763e-05, "loss": 0.1882, "step": 7488 }, { "epoch": 2.2895139101192297, "grad_norm": 1.3278262615203857, "learning_rate": 2.8601452044325562e-05, "loss": 0.2193, "step": 7489 }, { "epoch": 2.2898196270253743, "grad_norm": 1.2864344120025635, "learning_rate": 2.860527321360336e-05, "loss": 0.2552, "step": 7490 }, { "epoch": 2.2901253439315195, "grad_norm": 1.2618060111999512, "learning_rate": 2.8609094382881163e-05, "loss": 0.2765, "step": 7491 }, { "epoch": 2.2904310608376646, "grad_norm": 1.2384216785430908, "learning_rate": 2.861291555215896e-05, "loss": 0.2644, "step": 7492 }, { "epoch": 2.2907367777438092, "grad_norm": 0.4335736632347107, "learning_rate": 2.861673672143676e-05, "loss": 0.1804, "step": 7493 }, { "epoch": 2.291042494649954, "grad_norm": 0.3741190731525421, "learning_rate": 2.862055789071456e-05, "loss": 0.0903, "step": 7494 }, { "epoch": 2.291348211556099, "grad_norm": 0.43373048305511475, "learning_rate": 2.8624379059992358e-05, "loss": 0.0767, "step": 7495 }, { "epoch": 2.291653928462244, "grad_norm": 0.4016561210155487, "learning_rate": 2.8628200229270157e-05, "loss": 0.0805, "step": 7496 }, { "epoch": 2.291959645368389, "grad_norm": 0.20446951687335968, "learning_rate": 2.8632021398547955e-05, "loss": 0.0594, "step": 7497 }, { "epoch": 2.292265362274534, "grad_norm": 0.33842378854751587, "learning_rate": 2.8635842567825754e-05, "loss": 0.0779, "step": 7498 }, { "epoch": 2.2925710791806786, "grad_norm": 0.3602825105190277, "learning_rate": 2.8639663737103553e-05, "loss": 0.0588, "step": 7499 }, { "epoch": 2.2928767960868237, "grad_norm": 0.3505137264728546, "learning_rate": 2.864348490638135e-05, "loss": 0.0845, "step": 7500 }, { "epoch": 2.2931825129929684, "grad_norm": 0.3203282654285431, "learning_rate": 2.864730607565915e-05, "loss": 0.0658, "step": 7501 }, { "epoch": 2.2934882298991135, "grad_norm": 0.3169296085834503, "learning_rate": 2.865112724493695e-05, "loss": 0.0691, "step": 7502 }, { "epoch": 2.293793946805258, "grad_norm": 0.5143694877624512, "learning_rate": 2.8654948414214748e-05, "loss": 0.1046, "step": 7503 }, { "epoch": 2.2940996637114033, "grad_norm": 0.31111663579940796, "learning_rate": 2.8658769583492547e-05, "loss": 0.0849, "step": 7504 }, { "epoch": 2.2944053806175484, "grad_norm": 0.415971040725708, "learning_rate": 2.8662590752770345e-05, "loss": 0.1438, "step": 7505 }, { "epoch": 2.294711097523693, "grad_norm": 0.3807222843170166, "learning_rate": 2.8666411922048147e-05, "loss": 0.1493, "step": 7506 }, { "epoch": 2.2950168144298377, "grad_norm": 0.6740612983703613, "learning_rate": 2.867023309132595e-05, "loss": 0.1414, "step": 7507 }, { "epoch": 2.295322531335983, "grad_norm": 0.608700692653656, "learning_rate": 2.8674054260603748e-05, "loss": 0.1544, "step": 7508 }, { "epoch": 2.295628248242128, "grad_norm": 1.6839219331741333, "learning_rate": 2.8677875429881547e-05, "loss": 0.2106, "step": 7509 }, { "epoch": 2.2959339651482726, "grad_norm": 0.7479556202888489, "learning_rate": 2.8681696599159346e-05, "loss": 0.1754, "step": 7510 }, { "epoch": 2.2962396820544178, "grad_norm": 0.5974370837211609, "learning_rate": 2.8685517768437145e-05, "loss": 0.1684, "step": 7511 }, { "epoch": 2.2965453989605624, "grad_norm": 0.8048266768455505, "learning_rate": 2.8689338937714943e-05, "loss": 0.1883, "step": 7512 }, { "epoch": 2.2968511158667075, "grad_norm": 0.6918771862983704, "learning_rate": 2.8693160106992742e-05, "loss": 0.2076, "step": 7513 }, { "epoch": 2.297156832772852, "grad_norm": 1.0532617568969727, "learning_rate": 2.869698127627054e-05, "loss": 0.1891, "step": 7514 }, { "epoch": 2.2974625496789973, "grad_norm": 0.8386517763137817, "learning_rate": 2.870080244554834e-05, "loss": 0.2314, "step": 7515 }, { "epoch": 2.297768266585142, "grad_norm": 1.1486172676086426, "learning_rate": 2.8704623614826138e-05, "loss": 0.2179, "step": 7516 }, { "epoch": 2.298073983491287, "grad_norm": 1.40446937084198, "learning_rate": 2.8708444784103937e-05, "loss": 0.2875, "step": 7517 }, { "epoch": 2.2983797003974322, "grad_norm": 0.4683662950992584, "learning_rate": 2.8712265953381736e-05, "loss": 0.1948, "step": 7518 }, { "epoch": 2.298685417303577, "grad_norm": 0.6495504379272461, "learning_rate": 2.8716087122659534e-05, "loss": 0.1151, "step": 7519 }, { "epoch": 2.2989911342097216, "grad_norm": 0.3304412364959717, "learning_rate": 2.8719908291937333e-05, "loss": 0.0561, "step": 7520 }, { "epoch": 2.2992968511158667, "grad_norm": 0.3002147674560547, "learning_rate": 2.8723729461215132e-05, "loss": 0.0769, "step": 7521 }, { "epoch": 2.299602568022012, "grad_norm": 0.3756667375564575, "learning_rate": 2.8727550630492934e-05, "loss": 0.0485, "step": 7522 }, { "epoch": 2.2999082849281565, "grad_norm": 0.9102890491485596, "learning_rate": 2.8731371799770733e-05, "loss": 0.1089, "step": 7523 }, { "epoch": 2.3002140018343016, "grad_norm": 0.30093035101890564, "learning_rate": 2.873519296904853e-05, "loss": 0.0597, "step": 7524 }, { "epoch": 2.3005197187404463, "grad_norm": 0.35749807953834534, "learning_rate": 2.873901413832633e-05, "loss": 0.0688, "step": 7525 }, { "epoch": 2.3008254356465914, "grad_norm": 0.3462176024913788, "learning_rate": 2.874283530760413e-05, "loss": 0.0796, "step": 7526 }, { "epoch": 2.301131152552736, "grad_norm": 1.06118905544281, "learning_rate": 2.8746656476881928e-05, "loss": 0.0918, "step": 7527 }, { "epoch": 2.301436869458881, "grad_norm": 1.3516439199447632, "learning_rate": 2.8750477646159727e-05, "loss": 0.1123, "step": 7528 }, { "epoch": 2.301742586365026, "grad_norm": 0.24292103946208954, "learning_rate": 2.8754298815437525e-05, "loss": 0.0699, "step": 7529 }, { "epoch": 2.302048303271171, "grad_norm": 0.32467472553253174, "learning_rate": 2.8758119984715324e-05, "loss": 0.1184, "step": 7530 }, { "epoch": 2.302354020177316, "grad_norm": 0.5084934234619141, "learning_rate": 2.8761941153993123e-05, "loss": 0.132, "step": 7531 }, { "epoch": 2.3026597370834607, "grad_norm": 1.9089670181274414, "learning_rate": 2.876576232327092e-05, "loss": 0.1624, "step": 7532 }, { "epoch": 2.3029654539896054, "grad_norm": 0.7061022520065308, "learning_rate": 2.876958349254872e-05, "loss": 0.1656, "step": 7533 }, { "epoch": 2.3032711708957505, "grad_norm": 0.9862715601921082, "learning_rate": 2.877340466182652e-05, "loss": 0.1629, "step": 7534 }, { "epoch": 2.3035768878018956, "grad_norm": 0.6939222812652588, "learning_rate": 2.8777225831104318e-05, "loss": 0.19, "step": 7535 }, { "epoch": 2.3038826047080403, "grad_norm": 0.7063977122306824, "learning_rate": 2.8781047000382117e-05, "loss": 0.1891, "step": 7536 }, { "epoch": 2.3041883216141854, "grad_norm": 0.7416645288467407, "learning_rate": 2.878486816965992e-05, "loss": 0.1799, "step": 7537 }, { "epoch": 2.30449403852033, "grad_norm": 0.741371214389801, "learning_rate": 2.8788689338937717e-05, "loss": 0.2279, "step": 7538 }, { "epoch": 2.304799755426475, "grad_norm": 0.8118292689323425, "learning_rate": 2.8792510508215516e-05, "loss": 0.2058, "step": 7539 }, { "epoch": 2.30510547233262, "grad_norm": 0.9217849969863892, "learning_rate": 2.8796331677493315e-05, "loss": 0.2088, "step": 7540 }, { "epoch": 2.305411189238765, "grad_norm": 2.3600547313690186, "learning_rate": 2.8800152846771114e-05, "loss": 0.2928, "step": 7541 }, { "epoch": 2.3057169061449097, "grad_norm": 3.453552722930908, "learning_rate": 2.8803974016048912e-05, "loss": 0.3184, "step": 7542 }, { "epoch": 2.3060226230510548, "grad_norm": 0.43162065744400024, "learning_rate": 2.880779518532671e-05, "loss": 0.15, "step": 7543 }, { "epoch": 2.3063283399572, "grad_norm": 0.29982873797416687, "learning_rate": 2.881161635460451e-05, "loss": 0.1078, "step": 7544 }, { "epoch": 2.3066340568633446, "grad_norm": 0.2559639811515808, "learning_rate": 2.881543752388231e-05, "loss": 0.0924, "step": 7545 }, { "epoch": 2.306939773769489, "grad_norm": 0.2346080243587494, "learning_rate": 2.8819258693160107e-05, "loss": 0.071, "step": 7546 }, { "epoch": 2.3072454906756343, "grad_norm": 0.48012247681617737, "learning_rate": 2.8823079862437906e-05, "loss": 0.0732, "step": 7547 }, { "epoch": 2.3075512075817795, "grad_norm": 0.20457255840301514, "learning_rate": 2.8826901031715705e-05, "loss": 0.053, "step": 7548 }, { "epoch": 2.307856924487924, "grad_norm": 0.2660687267780304, "learning_rate": 2.8830722200993504e-05, "loss": 0.0946, "step": 7549 }, { "epoch": 2.3081626413940692, "grad_norm": 0.44019076228141785, "learning_rate": 2.8834543370271302e-05, "loss": 0.0788, "step": 7550 }, { "epoch": 2.308468358300214, "grad_norm": 0.49759408831596375, "learning_rate": 2.88383645395491e-05, "loss": 0.0662, "step": 7551 }, { "epoch": 2.308774075206359, "grad_norm": 0.31251224875450134, "learning_rate": 2.88421857088269e-05, "loss": 0.0746, "step": 7552 }, { "epoch": 2.3090797921125037, "grad_norm": 0.5106304883956909, "learning_rate": 2.8846006878104702e-05, "loss": 0.1169, "step": 7553 }, { "epoch": 2.309385509018649, "grad_norm": 0.3610357344150543, "learning_rate": 2.88498280473825e-05, "loss": 0.0803, "step": 7554 }, { "epoch": 2.3096912259247935, "grad_norm": 0.33001258969306946, "learning_rate": 2.88536492166603e-05, "loss": 0.1275, "step": 7555 }, { "epoch": 2.3099969428309386, "grad_norm": 0.4176312983036041, "learning_rate": 2.8857470385938098e-05, "loss": 0.126, "step": 7556 }, { "epoch": 2.3103026597370837, "grad_norm": 0.43016451597213745, "learning_rate": 2.8861291555215897e-05, "loss": 0.1372, "step": 7557 }, { "epoch": 2.3106083766432284, "grad_norm": 0.4722355306148529, "learning_rate": 2.8865112724493696e-05, "loss": 0.165, "step": 7558 }, { "epoch": 2.310914093549373, "grad_norm": 0.6107311844825745, "learning_rate": 2.8868933893771494e-05, "loss": 0.1601, "step": 7559 }, { "epoch": 2.311219810455518, "grad_norm": 0.602062463760376, "learning_rate": 2.8872755063049293e-05, "loss": 0.1935, "step": 7560 }, { "epoch": 2.3115255273616633, "grad_norm": 0.9276050925254822, "learning_rate": 2.8876576232327092e-05, "loss": 0.2338, "step": 7561 }, { "epoch": 2.311831244267808, "grad_norm": 0.8573559522628784, "learning_rate": 2.888039740160489e-05, "loss": 0.2147, "step": 7562 }, { "epoch": 2.312136961173953, "grad_norm": 0.5816456079483032, "learning_rate": 2.888421857088269e-05, "loss": 0.2194, "step": 7563 }, { "epoch": 2.3124426780800977, "grad_norm": 0.9281688928604126, "learning_rate": 2.8888039740160488e-05, "loss": 0.2183, "step": 7564 }, { "epoch": 2.312748394986243, "grad_norm": 0.7616890072822571, "learning_rate": 2.8891860909438287e-05, "loss": 0.2421, "step": 7565 }, { "epoch": 2.3130541118923875, "grad_norm": 1.1041845083236694, "learning_rate": 2.8895682078716086e-05, "loss": 0.2631, "step": 7566 }, { "epoch": 2.3133598287985326, "grad_norm": 1.31110680103302, "learning_rate": 2.8899503247993884e-05, "loss": 0.3061, "step": 7567 }, { "epoch": 2.3136655457046773, "grad_norm": 0.3263636827468872, "learning_rate": 2.8903324417271687e-05, "loss": 0.1524, "step": 7568 }, { "epoch": 2.3139712626108224, "grad_norm": 0.35512107610702515, "learning_rate": 2.8907145586549485e-05, "loss": 0.109, "step": 7569 }, { "epoch": 2.3142769795169675, "grad_norm": 0.2748121917247772, "learning_rate": 2.8910966755827284e-05, "loss": 0.0726, "step": 7570 }, { "epoch": 2.314582696423112, "grad_norm": 0.33601704239845276, "learning_rate": 2.8914787925105083e-05, "loss": 0.0677, "step": 7571 }, { "epoch": 2.314888413329257, "grad_norm": 0.30430862307548523, "learning_rate": 2.891860909438288e-05, "loss": 0.0595, "step": 7572 }, { "epoch": 2.315194130235402, "grad_norm": 0.5909938216209412, "learning_rate": 2.892243026366068e-05, "loss": 0.1152, "step": 7573 }, { "epoch": 2.315499847141547, "grad_norm": 0.2715126574039459, "learning_rate": 2.892625143293848e-05, "loss": 0.0668, "step": 7574 }, { "epoch": 2.315805564047692, "grad_norm": 0.3075518012046814, "learning_rate": 2.8930072602216278e-05, "loss": 0.0672, "step": 7575 }, { "epoch": 2.316111280953837, "grad_norm": 0.3838854134082794, "learning_rate": 2.8933893771494076e-05, "loss": 0.07, "step": 7576 }, { "epoch": 2.3164169978599816, "grad_norm": 0.28490495681762695, "learning_rate": 2.8937714940771875e-05, "loss": 0.08, "step": 7577 }, { "epoch": 2.3167227147661267, "grad_norm": 0.4596186578273773, "learning_rate": 2.8941536110049674e-05, "loss": 0.1228, "step": 7578 }, { "epoch": 2.3170284316722713, "grad_norm": 0.335965096950531, "learning_rate": 2.8945357279327473e-05, "loss": 0.0931, "step": 7579 }, { "epoch": 2.3173341485784165, "grad_norm": 0.530803918838501, "learning_rate": 2.894917844860527e-05, "loss": 0.1009, "step": 7580 }, { "epoch": 2.317639865484561, "grad_norm": 0.36344432830810547, "learning_rate": 2.895299961788307e-05, "loss": 0.1276, "step": 7581 }, { "epoch": 2.3179455823907062, "grad_norm": 0.665578305721283, "learning_rate": 2.895682078716087e-05, "loss": 0.1447, "step": 7582 }, { "epoch": 2.3182512992968514, "grad_norm": 0.7144237756729126, "learning_rate": 2.896064195643867e-05, "loss": 0.1385, "step": 7583 }, { "epoch": 2.318557016202996, "grad_norm": 0.44742846488952637, "learning_rate": 2.8964463125716473e-05, "loss": 0.1894, "step": 7584 }, { "epoch": 2.3188627331091407, "grad_norm": 0.9239465594291687, "learning_rate": 2.8968284294994272e-05, "loss": 0.1851, "step": 7585 }, { "epoch": 2.319168450015286, "grad_norm": 0.55808025598526, "learning_rate": 2.897210546427207e-05, "loss": 0.2068, "step": 7586 }, { "epoch": 2.319474166921431, "grad_norm": 1.659022569656372, "learning_rate": 2.897592663354987e-05, "loss": 0.2066, "step": 7587 }, { "epoch": 2.3197798838275756, "grad_norm": 1.297344446182251, "learning_rate": 2.8979747802827668e-05, "loss": 0.2311, "step": 7588 }, { "epoch": 2.3200856007337207, "grad_norm": 1.164866328239441, "learning_rate": 2.8983568972105467e-05, "loss": 0.2136, "step": 7589 }, { "epoch": 2.3203913176398654, "grad_norm": 1.3622578382492065, "learning_rate": 2.8987390141383266e-05, "loss": 0.2612, "step": 7590 }, { "epoch": 2.3206970345460105, "grad_norm": 1.0978846549987793, "learning_rate": 2.8991211310661064e-05, "loss": 0.2726, "step": 7591 }, { "epoch": 2.321002751452155, "grad_norm": 1.1262507438659668, "learning_rate": 2.8995032479938863e-05, "loss": 0.2685, "step": 7592 }, { "epoch": 2.3213084683583003, "grad_norm": 0.44354450702667236, "learning_rate": 2.8998853649216662e-05, "loss": 0.1395, "step": 7593 }, { "epoch": 2.321614185264445, "grad_norm": 0.3856768310070038, "learning_rate": 2.900267481849446e-05, "loss": 0.0973, "step": 7594 }, { "epoch": 2.32191990217059, "grad_norm": 0.3897543251514435, "learning_rate": 2.900649598777226e-05, "loss": 0.0832, "step": 7595 }, { "epoch": 2.322225619076735, "grad_norm": 0.2754332721233368, "learning_rate": 2.9010317157050058e-05, "loss": 0.0726, "step": 7596 }, { "epoch": 2.32253133598288, "grad_norm": 0.35950592160224915, "learning_rate": 2.9014138326327857e-05, "loss": 0.0692, "step": 7597 }, { "epoch": 2.3228370528890245, "grad_norm": 0.3064972162246704, "learning_rate": 2.9017959495605656e-05, "loss": 0.0572, "step": 7598 }, { "epoch": 2.3231427697951696, "grad_norm": 0.307540625333786, "learning_rate": 2.9021780664883454e-05, "loss": 0.0601, "step": 7599 }, { "epoch": 2.3234484867013148, "grad_norm": 0.3557620942592621, "learning_rate": 2.9025601834161257e-05, "loss": 0.0785, "step": 7600 }, { "epoch": 2.3237542036074594, "grad_norm": 0.7676001191139221, "learning_rate": 2.9029423003439055e-05, "loss": 0.1329, "step": 7601 }, { "epoch": 2.3240599205136045, "grad_norm": 0.29560819268226624, "learning_rate": 2.9033244172716854e-05, "loss": 0.0684, "step": 7602 }, { "epoch": 2.324365637419749, "grad_norm": 2.508864164352417, "learning_rate": 2.9037065341994653e-05, "loss": 0.1059, "step": 7603 }, { "epoch": 2.3246713543258943, "grad_norm": 0.45418208837509155, "learning_rate": 2.904088651127245e-05, "loss": 0.1117, "step": 7604 }, { "epoch": 2.324977071232039, "grad_norm": 0.40900641679763794, "learning_rate": 2.904470768055025e-05, "loss": 0.1095, "step": 7605 }, { "epoch": 2.325282788138184, "grad_norm": 0.5292288661003113, "learning_rate": 2.904852884982805e-05, "loss": 0.0904, "step": 7606 }, { "epoch": 2.325588505044329, "grad_norm": 0.5699718594551086, "learning_rate": 2.9052350019105848e-05, "loss": 0.1311, "step": 7607 }, { "epoch": 2.325894221950474, "grad_norm": 1.9440544843673706, "learning_rate": 2.9056171188383646e-05, "loss": 0.1951, "step": 7608 }, { "epoch": 2.326199938856619, "grad_norm": 1.041907548904419, "learning_rate": 2.9059992357661445e-05, "loss": 0.2177, "step": 7609 }, { "epoch": 2.3265056557627637, "grad_norm": 0.669350266456604, "learning_rate": 2.9063813526939244e-05, "loss": 0.2114, "step": 7610 }, { "epoch": 2.3268113726689084, "grad_norm": 1.3728891611099243, "learning_rate": 2.9067634696217043e-05, "loss": 0.2035, "step": 7611 }, { "epoch": 2.3271170895750535, "grad_norm": 1.1725305318832397, "learning_rate": 2.907145586549484e-05, "loss": 0.1896, "step": 7612 }, { "epoch": 2.3274228064811986, "grad_norm": 0.9091787338256836, "learning_rate": 2.907527703477264e-05, "loss": 0.2062, "step": 7613 }, { "epoch": 2.3277285233873433, "grad_norm": 1.211126685142517, "learning_rate": 2.907909820405044e-05, "loss": 0.2436, "step": 7614 }, { "epoch": 2.3280342402934884, "grad_norm": 1.1527800559997559, "learning_rate": 2.908291937332824e-05, "loss": 0.1975, "step": 7615 }, { "epoch": 2.328339957199633, "grad_norm": 2.19952392578125, "learning_rate": 2.908674054260604e-05, "loss": 0.2382, "step": 7616 }, { "epoch": 2.328645674105778, "grad_norm": NaN, "learning_rate": 2.908674054260604e-05, "loss": 0.2841, "step": 7617 }, { "epoch": 2.328951391011923, "grad_norm": 0.7988470196723938, "learning_rate": 2.909056171188384e-05, "loss": 0.1905, "step": 7618 }, { "epoch": 2.329257107918068, "grad_norm": 0.548987865447998, "learning_rate": 2.9094382881161637e-05, "loss": 0.1102, "step": 7619 }, { "epoch": 2.3295628248242126, "grad_norm": 0.2923658490180969, "learning_rate": 2.9098204050439436e-05, "loss": 0.0526, "step": 7620 }, { "epoch": 2.3298685417303577, "grad_norm": 0.30206209421157837, "learning_rate": 2.9102025219717235e-05, "loss": 0.0893, "step": 7621 }, { "epoch": 2.3301742586365024, "grad_norm": 0.5604680776596069, "learning_rate": 2.9105846388995034e-05, "loss": 0.0663, "step": 7622 }, { "epoch": 2.3304799755426475, "grad_norm": 0.38376012444496155, "learning_rate": 2.9109667558272832e-05, "loss": 0.0732, "step": 7623 }, { "epoch": 2.330785692448792, "grad_norm": 0.996726393699646, "learning_rate": 2.911348872755063e-05, "loss": 0.0669, "step": 7624 }, { "epoch": 2.3310914093549373, "grad_norm": 0.330494225025177, "learning_rate": 2.911730989682843e-05, "loss": 0.0899, "step": 7625 }, { "epoch": 2.3313971262610824, "grad_norm": 0.5935619473457336, "learning_rate": 2.912113106610623e-05, "loss": 0.1463, "step": 7626 }, { "epoch": 2.331702843167227, "grad_norm": 0.7546497583389282, "learning_rate": 2.9124952235384027e-05, "loss": 0.0895, "step": 7627 }, { "epoch": 2.332008560073372, "grad_norm": 0.6337897181510925, "learning_rate": 2.9128773404661826e-05, "loss": 0.0999, "step": 7628 }, { "epoch": 2.332314276979517, "grad_norm": 0.7573128938674927, "learning_rate": 2.9132594573939625e-05, "loss": 0.084, "step": 7629 }, { "epoch": 2.332619993885662, "grad_norm": 1.3545829057693481, "learning_rate": 2.9136415743217423e-05, "loss": 0.1085, "step": 7630 }, { "epoch": 2.3329257107918067, "grad_norm": 0.8736164569854736, "learning_rate": 2.9140236912495226e-05, "loss": 0.1768, "step": 7631 }, { "epoch": 2.3332314276979518, "grad_norm": 0.36486339569091797, "learning_rate": 2.9144058081773024e-05, "loss": 0.1397, "step": 7632 }, { "epoch": 2.3335371446040964, "grad_norm": 0.9396327137947083, "learning_rate": 2.9147879251050823e-05, "loss": 0.1264, "step": 7633 }, { "epoch": 2.3338428615102416, "grad_norm": 0.47829100489616394, "learning_rate": 2.9151700420328622e-05, "loss": 0.1611, "step": 7634 }, { "epoch": 2.3341485784163862, "grad_norm": 0.5329495668411255, "learning_rate": 2.915552158960642e-05, "loss": 0.1827, "step": 7635 }, { "epoch": 2.3344542953225313, "grad_norm": 1.0021324157714844, "learning_rate": 2.915934275888422e-05, "loss": 0.2187, "step": 7636 }, { "epoch": 2.334760012228676, "grad_norm": 0.6222850680351257, "learning_rate": 2.9163163928162018e-05, "loss": 0.235, "step": 7637 }, { "epoch": 2.335065729134821, "grad_norm": 0.7986207008361816, "learning_rate": 2.9166985097439817e-05, "loss": 0.2594, "step": 7638 }, { "epoch": 2.3353714460409662, "grad_norm": 0.6901640892028809, "learning_rate": 2.9170806266717616e-05, "loss": 0.1827, "step": 7639 }, { "epoch": 2.335677162947111, "grad_norm": 1.1342500448226929, "learning_rate": 2.9174627435995414e-05, "loss": 0.252, "step": 7640 }, { "epoch": 2.335982879853256, "grad_norm": 0.759447455406189, "learning_rate": 2.9178448605273213e-05, "loss": 0.2258, "step": 7641 }, { "epoch": 2.3362885967594007, "grad_norm": 1.4814119338989258, "learning_rate": 2.9182269774551012e-05, "loss": 0.2891, "step": 7642 }, { "epoch": 2.336594313665546, "grad_norm": 0.43667858839035034, "learning_rate": 2.918609094382881e-05, "loss": 0.1377, "step": 7643 }, { "epoch": 2.3369000305716905, "grad_norm": 0.2786879241466522, "learning_rate": 2.918991211310661e-05, "loss": 0.0763, "step": 7644 }, { "epoch": 2.3372057474778356, "grad_norm": 0.3271520435810089, "learning_rate": 2.9193733282384408e-05, "loss": 0.0752, "step": 7645 }, { "epoch": 2.3375114643839803, "grad_norm": 0.35701486468315125, "learning_rate": 2.9197554451662207e-05, "loss": 0.0817, "step": 7646 }, { "epoch": 2.3378171812901254, "grad_norm": 0.4370735287666321, "learning_rate": 2.920137562094001e-05, "loss": 0.0714, "step": 7647 }, { "epoch": 2.33812289819627, "grad_norm": 0.33304786682128906, "learning_rate": 2.9205196790217808e-05, "loss": 0.0772, "step": 7648 }, { "epoch": 2.338428615102415, "grad_norm": 0.3668128252029419, "learning_rate": 2.9209017959495606e-05, "loss": 0.058, "step": 7649 }, { "epoch": 2.33873433200856, "grad_norm": 0.6817300915718079, "learning_rate": 2.9212839128773405e-05, "loss": 0.0847, "step": 7650 }, { "epoch": 2.339040048914705, "grad_norm": 0.33846142888069153, "learning_rate": 2.9216660298051204e-05, "loss": 0.0746, "step": 7651 }, { "epoch": 2.33934576582085, "grad_norm": 0.460321307182312, "learning_rate": 2.9220481467329003e-05, "loss": 0.0995, "step": 7652 }, { "epoch": 2.3396514827269947, "grad_norm": 0.3290937840938568, "learning_rate": 2.92243026366068e-05, "loss": 0.1172, "step": 7653 }, { "epoch": 2.33995719963314, "grad_norm": 0.41584959626197815, "learning_rate": 2.92281238058846e-05, "loss": 0.0651, "step": 7654 }, { "epoch": 2.3402629165392845, "grad_norm": 3.0645461082458496, "learning_rate": 2.92319449751624e-05, "loss": 0.1246, "step": 7655 }, { "epoch": 2.3405686334454296, "grad_norm": 0.570110023021698, "learning_rate": 2.9235766144440198e-05, "loss": 0.1659, "step": 7656 }, { "epoch": 2.3408743503515743, "grad_norm": 0.650628924369812, "learning_rate": 2.9239587313717996e-05, "loss": 0.1426, "step": 7657 }, { "epoch": 2.3411800672577194, "grad_norm": 0.4802393913269043, "learning_rate": 2.9243408482995795e-05, "loss": 0.1281, "step": 7658 }, { "epoch": 2.341485784163864, "grad_norm": 0.7199332118034363, "learning_rate": 2.9247229652273594e-05, "loss": 0.1826, "step": 7659 }, { "epoch": 2.341791501070009, "grad_norm": 0.6097378730773926, "learning_rate": 2.9251050821551393e-05, "loss": 0.1852, "step": 7660 }, { "epoch": 2.342097217976154, "grad_norm": 0.9344286322593689, "learning_rate": 2.925487199082919e-05, "loss": 0.1705, "step": 7661 }, { "epoch": 2.342402934882299, "grad_norm": 0.6265787482261658, "learning_rate": 2.9258693160106993e-05, "loss": 0.1871, "step": 7662 }, { "epoch": 2.3427086517884437, "grad_norm": 0.7288218140602112, "learning_rate": 2.9262514329384796e-05, "loss": 0.1793, "step": 7663 }, { "epoch": 2.343014368694589, "grad_norm": 1.1766618490219116, "learning_rate": 2.9266335498662594e-05, "loss": 0.255, "step": 7664 }, { "epoch": 2.343320085600734, "grad_norm": 1.0318334102630615, "learning_rate": 2.9270156667940393e-05, "loss": 0.2188, "step": 7665 }, { "epoch": 2.3436258025068786, "grad_norm": 1.2757728099822998, "learning_rate": 2.9273977837218192e-05, "loss": 0.2255, "step": 7666 }, { "epoch": 2.3439315194130237, "grad_norm": 1.2280958890914917, "learning_rate": 2.927779900649599e-05, "loss": 0.3028, "step": 7667 }, { "epoch": 2.3442372363191684, "grad_norm": 0.569523274898529, "learning_rate": 2.928162017577379e-05, "loss": 0.148, "step": 7668 }, { "epoch": 2.3445429532253135, "grad_norm": 0.8800312280654907, "learning_rate": 2.9285441345051588e-05, "loss": 0.099, "step": 7669 }, { "epoch": 2.344848670131458, "grad_norm": 0.25210142135620117, "learning_rate": 2.9289262514329387e-05, "loss": 0.0544, "step": 7670 }, { "epoch": 2.3451543870376033, "grad_norm": 0.3553265631198883, "learning_rate": 2.9293083683607186e-05, "loss": 0.0676, "step": 7671 }, { "epoch": 2.345460103943748, "grad_norm": 0.4561507999897003, "learning_rate": 2.9296904852884984e-05, "loss": 0.0674, "step": 7672 }, { "epoch": 2.345765820849893, "grad_norm": 0.4139935076236725, "learning_rate": 2.9300726022162783e-05, "loss": 0.0741, "step": 7673 }, { "epoch": 2.3460715377560377, "grad_norm": 0.395748496055603, "learning_rate": 2.9304547191440582e-05, "loss": 0.0907, "step": 7674 }, { "epoch": 2.346377254662183, "grad_norm": 0.298657089471817, "learning_rate": 2.930836836071838e-05, "loss": 0.0603, "step": 7675 }, { "epoch": 2.3466829715683275, "grad_norm": 0.3687571883201599, "learning_rate": 2.931218952999618e-05, "loss": 0.0971, "step": 7676 }, { "epoch": 2.3469886884744726, "grad_norm": 0.37362805008888245, "learning_rate": 2.9316010699273978e-05, "loss": 0.0925, "step": 7677 }, { "epoch": 2.3472944053806177, "grad_norm": 0.37502607703208923, "learning_rate": 2.931983186855178e-05, "loss": 0.1191, "step": 7678 }, { "epoch": 2.3476001222867624, "grad_norm": 1.1296063661575317, "learning_rate": 2.932365303782958e-05, "loss": 0.087, "step": 7679 }, { "epoch": 2.3479058391929075, "grad_norm": 0.4415367841720581, "learning_rate": 2.9327474207107378e-05, "loss": 0.1245, "step": 7680 }, { "epoch": 2.348211556099052, "grad_norm": 0.8839843273162842, "learning_rate": 2.9331295376385176e-05, "loss": 0.1786, "step": 7681 }, { "epoch": 2.3485172730051973, "grad_norm": 0.6249372363090515, "learning_rate": 2.9335116545662975e-05, "loss": 0.1461, "step": 7682 }, { "epoch": 2.348822989911342, "grad_norm": 0.5088854432106018, "learning_rate": 2.9338937714940774e-05, "loss": 0.1375, "step": 7683 }, { "epoch": 2.349128706817487, "grad_norm": 0.655959963798523, "learning_rate": 2.9342758884218573e-05, "loss": 0.1878, "step": 7684 }, { "epoch": 2.3494344237236318, "grad_norm": 0.8086658120155334, "learning_rate": 2.934658005349637e-05, "loss": 0.2219, "step": 7685 }, { "epoch": 2.349740140629777, "grad_norm": 1.1417441368103027, "learning_rate": 2.935040122277417e-05, "loss": 0.2012, "step": 7686 }, { "epoch": 2.3500458575359215, "grad_norm": 0.9045966863632202, "learning_rate": 2.935422239205197e-05, "loss": 0.1793, "step": 7687 }, { "epoch": 2.3503515744420667, "grad_norm": 0.9589837193489075, "learning_rate": 2.9358043561329768e-05, "loss": 0.1962, "step": 7688 }, { "epoch": 2.3506572913482113, "grad_norm": 0.9353507161140442, "learning_rate": 2.9361864730607566e-05, "loss": 0.2065, "step": 7689 }, { "epoch": 2.3509630082543564, "grad_norm": 1.2107058763504028, "learning_rate": 2.9365685899885365e-05, "loss": 0.2238, "step": 7690 }, { "epoch": 2.3512687251605016, "grad_norm": 1.209251880645752, "learning_rate": 2.9369507069163164e-05, "loss": 0.2606, "step": 7691 }, { "epoch": 2.3515744420666462, "grad_norm": 2.5798301696777344, "learning_rate": 2.9373328238440963e-05, "loss": 0.3087, "step": 7692 }, { "epoch": 2.3518801589727913, "grad_norm": 0.6411860585212708, "learning_rate": 2.937714940771876e-05, "loss": 0.158, "step": 7693 }, { "epoch": 2.352185875878936, "grad_norm": 0.33852314949035645, "learning_rate": 2.9380970576996563e-05, "loss": 0.0982, "step": 7694 }, { "epoch": 2.352491592785081, "grad_norm": 0.4160299599170685, "learning_rate": 2.9384791746274362e-05, "loss": 0.0952, "step": 7695 }, { "epoch": 2.352797309691226, "grad_norm": 0.26883184909820557, "learning_rate": 2.938861291555216e-05, "loss": 0.0553, "step": 7696 }, { "epoch": 2.353103026597371, "grad_norm": 0.30984142422676086, "learning_rate": 2.939243408482996e-05, "loss": 0.0783, "step": 7697 }, { "epoch": 2.3534087435035156, "grad_norm": 0.38092926144599915, "learning_rate": 2.939625525410776e-05, "loss": 0.093, "step": 7698 }, { "epoch": 2.3537144604096607, "grad_norm": 0.3628116250038147, "learning_rate": 2.9400076423385557e-05, "loss": 0.0565, "step": 7699 }, { "epoch": 2.3540201773158054, "grad_norm": 0.5320684909820557, "learning_rate": 2.9403897592663356e-05, "loss": 0.1019, "step": 7700 }, { "epoch": 2.3543258942219505, "grad_norm": 0.3479795753955841, "learning_rate": 2.9407718761941155e-05, "loss": 0.0635, "step": 7701 }, { "epoch": 2.354631611128095, "grad_norm": 0.3635393977165222, "learning_rate": 2.9411539931218953e-05, "loss": 0.0902, "step": 7702 }, { "epoch": 2.3549373280342403, "grad_norm": 0.579884946346283, "learning_rate": 2.9415361100496752e-05, "loss": 0.0905, "step": 7703 }, { "epoch": 2.3552430449403854, "grad_norm": 0.7475872039794922, "learning_rate": 2.941918226977455e-05, "loss": 0.1052, "step": 7704 }, { "epoch": 2.35554876184653, "grad_norm": 0.43899768590927124, "learning_rate": 2.942300343905235e-05, "loss": 0.1447, "step": 7705 }, { "epoch": 2.355854478752675, "grad_norm": 0.4680424928665161, "learning_rate": 2.942682460833015e-05, "loss": 0.1207, "step": 7706 }, { "epoch": 2.35616019565882, "grad_norm": 0.5722905397415161, "learning_rate": 2.9430645777607947e-05, "loss": 0.141, "step": 7707 }, { "epoch": 2.356465912564965, "grad_norm": 0.6314733028411865, "learning_rate": 2.9434466946885746e-05, "loss": 0.1434, "step": 7708 }, { "epoch": 2.3567716294711096, "grad_norm": 1.1416436433792114, "learning_rate": 2.9438288116163548e-05, "loss": 0.2127, "step": 7709 }, { "epoch": 2.3570773463772547, "grad_norm": 1.0240765810012817, "learning_rate": 2.9442109285441347e-05, "loss": 0.1751, "step": 7710 }, { "epoch": 2.3573830632833994, "grad_norm": 1.2444096803665161, "learning_rate": 2.9445930454719146e-05, "loss": 0.2187, "step": 7711 }, { "epoch": 2.3576887801895445, "grad_norm": 0.9570043683052063, "learning_rate": 2.9449751623996944e-05, "loss": 0.219, "step": 7712 }, { "epoch": 2.357994497095689, "grad_norm": 1.1054764986038208, "learning_rate": 2.9453572793274743e-05, "loss": 0.2147, "step": 7713 }, { "epoch": 2.3583002140018343, "grad_norm": 2.2403810024261475, "learning_rate": 2.9457393962552542e-05, "loss": 0.1752, "step": 7714 }, { "epoch": 2.358605930907979, "grad_norm": 0.8386846780776978, "learning_rate": 2.946121513183034e-05, "loss": 0.2131, "step": 7715 }, { "epoch": 2.358911647814124, "grad_norm": 1.6433937549591064, "learning_rate": 2.946503630110814e-05, "loss": 0.2734, "step": 7716 }, { "epoch": 2.359217364720269, "grad_norm": 1.1497381925582886, "learning_rate": 2.9468857470385938e-05, "loss": 0.2862, "step": 7717 }, { "epoch": 2.359523081626414, "grad_norm": 0.499171644449234, "learning_rate": 2.9472678639663737e-05, "loss": 0.1901, "step": 7718 }, { "epoch": 2.359828798532559, "grad_norm": 0.2904789447784424, "learning_rate": 2.9476499808941535e-05, "loss": 0.1016, "step": 7719 }, { "epoch": 2.3601345154387037, "grad_norm": 0.5535513162612915, "learning_rate": 2.9480320978219334e-05, "loss": 0.0762, "step": 7720 }, { "epoch": 2.360440232344849, "grad_norm": 0.24161186814308167, "learning_rate": 2.9484142147497133e-05, "loss": 0.0697, "step": 7721 }, { "epoch": 2.3607459492509935, "grad_norm": 0.518621563911438, "learning_rate": 2.9487963316774932e-05, "loss": 0.0762, "step": 7722 }, { "epoch": 2.3610516661571386, "grad_norm": 0.2682506740093231, "learning_rate": 2.949178448605273e-05, "loss": 0.0528, "step": 7723 }, { "epoch": 2.3613573830632832, "grad_norm": 0.3813106119632721, "learning_rate": 2.949560565533053e-05, "loss": 0.0591, "step": 7724 }, { "epoch": 2.3616630999694284, "grad_norm": 0.42876315116882324, "learning_rate": 2.949942682460833e-05, "loss": 0.0607, "step": 7725 }, { "epoch": 2.361968816875573, "grad_norm": 0.4104273319244385, "learning_rate": 2.950324799388613e-05, "loss": 0.0781, "step": 7726 }, { "epoch": 2.362274533781718, "grad_norm": 0.624413251876831, "learning_rate": 2.950706916316393e-05, "loss": 0.0852, "step": 7727 }, { "epoch": 2.362580250687863, "grad_norm": 0.3929961025714874, "learning_rate": 2.9510890332441728e-05, "loss": 0.1126, "step": 7728 }, { "epoch": 2.362885967594008, "grad_norm": 0.5849949717521667, "learning_rate": 2.9514711501719526e-05, "loss": 0.098, "step": 7729 }, { "epoch": 2.363191684500153, "grad_norm": 0.42458581924438477, "learning_rate": 2.9518532670997325e-05, "loss": 0.091, "step": 7730 }, { "epoch": 2.3634974014062977, "grad_norm": 1.1757256984710693, "learning_rate": 2.9522353840275124e-05, "loss": 0.1557, "step": 7731 }, { "epoch": 2.363803118312443, "grad_norm": 0.7519523501396179, "learning_rate": 2.9526175009552923e-05, "loss": 0.136, "step": 7732 }, { "epoch": 2.3641088352185875, "grad_norm": 0.46436306834220886, "learning_rate": 2.952999617883072e-05, "loss": 0.1554, "step": 7733 }, { "epoch": 2.3644145521247326, "grad_norm": 0.6997377872467041, "learning_rate": 2.953381734810852e-05, "loss": 0.1803, "step": 7734 }, { "epoch": 2.3647202690308773, "grad_norm": 0.7020920515060425, "learning_rate": 2.953763851738632e-05, "loss": 0.1871, "step": 7735 }, { "epoch": 2.3650259859370224, "grad_norm": 0.7478744983673096, "learning_rate": 2.9541459686664118e-05, "loss": 0.1844, "step": 7736 }, { "epoch": 2.365331702843167, "grad_norm": 0.92518550157547, "learning_rate": 2.9545280855941916e-05, "loss": 0.2188, "step": 7737 }, { "epoch": 2.365637419749312, "grad_norm": 1.0732873678207397, "learning_rate": 2.9549102025219715e-05, "loss": 0.1884, "step": 7738 }, { "epoch": 2.365943136655457, "grad_norm": 0.7965973615646362, "learning_rate": 2.9552923194497514e-05, "loss": 0.2079, "step": 7739 }, { "epoch": 2.366248853561602, "grad_norm": 1.1395421028137207, "learning_rate": 2.955674436377532e-05, "loss": 0.2245, "step": 7740 }, { "epoch": 2.3665545704677466, "grad_norm": 8.838541984558105, "learning_rate": 2.9560565533053118e-05, "loss": 0.2181, "step": 7741 }, { "epoch": 2.3668602873738918, "grad_norm": 2.9786856174468994, "learning_rate": 2.9564386702330917e-05, "loss": 0.3072, "step": 7742 }, { "epoch": 2.367166004280037, "grad_norm": 0.4444866180419922, "learning_rate": 2.9568207871608716e-05, "loss": 0.1605, "step": 7743 }, { "epoch": 2.3674717211861815, "grad_norm": 0.4697217345237732, "learning_rate": 2.9572029040886514e-05, "loss": 0.0815, "step": 7744 }, { "epoch": 2.3677774380923267, "grad_norm": 0.36243292689323425, "learning_rate": 2.9575850210164313e-05, "loss": 0.0706, "step": 7745 }, { "epoch": 2.3680831549984713, "grad_norm": 0.356790155172348, "learning_rate": 2.9579671379442112e-05, "loss": 0.0649, "step": 7746 }, { "epoch": 2.3683888719046164, "grad_norm": 0.25284257531166077, "learning_rate": 2.958349254871991e-05, "loss": 0.0776, "step": 7747 }, { "epoch": 2.368694588810761, "grad_norm": 0.3323809504508972, "learning_rate": 2.958731371799771e-05, "loss": 0.0625, "step": 7748 }, { "epoch": 2.3690003057169062, "grad_norm": 0.2998465895652771, "learning_rate": 2.9591134887275508e-05, "loss": 0.0576, "step": 7749 }, { "epoch": 2.369306022623051, "grad_norm": 0.7803493142127991, "learning_rate": 2.9594956056553307e-05, "loss": 0.1079, "step": 7750 }, { "epoch": 2.369611739529196, "grad_norm": 0.45762747526168823, "learning_rate": 2.9598777225831105e-05, "loss": 0.0757, "step": 7751 }, { "epoch": 2.3699174564353407, "grad_norm": 0.2939193546772003, "learning_rate": 2.9602598395108904e-05, "loss": 0.0756, "step": 7752 }, { "epoch": 2.370223173341486, "grad_norm": 0.38261866569519043, "learning_rate": 2.9606419564386703e-05, "loss": 0.1051, "step": 7753 }, { "epoch": 2.3705288902476305, "grad_norm": 0.421844482421875, "learning_rate": 2.9610240733664502e-05, "loss": 0.1178, "step": 7754 }, { "epoch": 2.3708346071537756, "grad_norm": 0.4811236262321472, "learning_rate": 2.96140619029423e-05, "loss": 0.113, "step": 7755 }, { "epoch": 2.3711403240599207, "grad_norm": 0.49473366141319275, "learning_rate": 2.9617883072220103e-05, "loss": 0.1429, "step": 7756 }, { "epoch": 2.3714460409660654, "grad_norm": 0.3837795555591583, "learning_rate": 2.96217042414979e-05, "loss": 0.1065, "step": 7757 }, { "epoch": 2.3717517578722105, "grad_norm": 0.579876720905304, "learning_rate": 2.96255254107757e-05, "loss": 0.1484, "step": 7758 }, { "epoch": 2.372057474778355, "grad_norm": 0.7959766387939453, "learning_rate": 2.96293465800535e-05, "loss": 0.1443, "step": 7759 }, { "epoch": 2.3723631916845003, "grad_norm": 0.657534658908844, "learning_rate": 2.9633167749331298e-05, "loss": 0.1863, "step": 7760 }, { "epoch": 2.372668908590645, "grad_norm": 0.5560991764068604, "learning_rate": 2.9636988918609096e-05, "loss": 0.1642, "step": 7761 }, { "epoch": 2.37297462549679, "grad_norm": 0.9682074189186096, "learning_rate": 2.9640810087886895e-05, "loss": 0.1832, "step": 7762 }, { "epoch": 2.3732803424029347, "grad_norm": 0.9462303519248962, "learning_rate": 2.9644631257164694e-05, "loss": 0.2134, "step": 7763 }, { "epoch": 2.37358605930908, "grad_norm": 0.7914586663246155, "learning_rate": 2.9648452426442493e-05, "loss": 0.1766, "step": 7764 }, { "epoch": 2.3738917762152245, "grad_norm": 0.7595401406288147, "learning_rate": 2.965227359572029e-05, "loss": 0.1907, "step": 7765 }, { "epoch": 2.3741974931213696, "grad_norm": 1.4057261943817139, "learning_rate": 2.965609476499809e-05, "loss": 0.2104, "step": 7766 }, { "epoch": 2.3745032100275143, "grad_norm": 1.7788912057876587, "learning_rate": 2.965991593427589e-05, "loss": 0.4024, "step": 7767 }, { "epoch": 2.3748089269336594, "grad_norm": 1.635724663734436, "learning_rate": 2.9663737103553688e-05, "loss": 0.1513, "step": 7768 }, { "epoch": 2.3751146438398045, "grad_norm": 0.26561611890792847, "learning_rate": 2.9667558272831486e-05, "loss": 0.0949, "step": 7769 }, { "epoch": 2.375420360745949, "grad_norm": 0.4580318331718445, "learning_rate": 2.9671379442109285e-05, "loss": 0.0701, "step": 7770 }, { "epoch": 2.3757260776520943, "grad_norm": 0.24309439957141876, "learning_rate": 2.9675200611387087e-05, "loss": 0.061, "step": 7771 }, { "epoch": 2.376031794558239, "grad_norm": 0.20880085229873657, "learning_rate": 2.9679021780664886e-05, "loss": 0.0812, "step": 7772 }, { "epoch": 2.376337511464384, "grad_norm": 0.3289095461368561, "learning_rate": 2.9682842949942685e-05, "loss": 0.0501, "step": 7773 }, { "epoch": 2.3766432283705288, "grad_norm": 0.30967921018600464, "learning_rate": 2.9686664119220483e-05, "loss": 0.0626, "step": 7774 }, { "epoch": 2.376948945276674, "grad_norm": 0.3211393356323242, "learning_rate": 2.9690485288498282e-05, "loss": 0.0695, "step": 7775 }, { "epoch": 2.3772546621828186, "grad_norm": 0.6908048391342163, "learning_rate": 2.969430645777608e-05, "loss": 0.1024, "step": 7776 }, { "epoch": 2.3775603790889637, "grad_norm": 0.6520815491676331, "learning_rate": 2.969812762705388e-05, "loss": 0.0853, "step": 7777 }, { "epoch": 2.3778660959951083, "grad_norm": 0.5455451607704163, "learning_rate": 2.970194879633168e-05, "loss": 0.0929, "step": 7778 }, { "epoch": 2.3781718129012535, "grad_norm": 0.37201622128486633, "learning_rate": 2.9705769965609477e-05, "loss": 0.0923, "step": 7779 }, { "epoch": 2.378477529807398, "grad_norm": 0.42992374300956726, "learning_rate": 2.9709591134887276e-05, "loss": 0.1136, "step": 7780 }, { "epoch": 2.3787832467135432, "grad_norm": 0.3715883791446686, "learning_rate": 2.9713412304165075e-05, "loss": 0.1142, "step": 7781 }, { "epoch": 2.3790889636196884, "grad_norm": 0.8579406142234802, "learning_rate": 2.9717233473442873e-05, "loss": 0.1395, "step": 7782 }, { "epoch": 2.379394680525833, "grad_norm": 0.815402626991272, "learning_rate": 2.9721054642720672e-05, "loss": 0.1589, "step": 7783 }, { "epoch": 2.379700397431978, "grad_norm": 0.711955189704895, "learning_rate": 2.972487581199847e-05, "loss": 0.2114, "step": 7784 }, { "epoch": 2.380006114338123, "grad_norm": 0.44608908891677856, "learning_rate": 2.972869698127627e-05, "loss": 0.1684, "step": 7785 }, { "epoch": 2.380311831244268, "grad_norm": 0.9092902541160583, "learning_rate": 2.973251815055407e-05, "loss": 0.1971, "step": 7786 }, { "epoch": 2.3806175481504126, "grad_norm": 0.9152440428733826, "learning_rate": 2.973633931983187e-05, "loss": 0.2108, "step": 7787 }, { "epoch": 2.3809232650565577, "grad_norm": 0.5806522965431213, "learning_rate": 2.974016048910967e-05, "loss": 0.1905, "step": 7788 }, { "epoch": 2.3812289819627024, "grad_norm": 0.9436842799186707, "learning_rate": 2.9743981658387468e-05, "loss": 0.212, "step": 7789 }, { "epoch": 2.3815346988688475, "grad_norm": 3.1907615661621094, "learning_rate": 2.9747802827665267e-05, "loss": 0.2271, "step": 7790 }, { "epoch": 2.381840415774992, "grad_norm": 1.1838804483413696, "learning_rate": 2.9751623996943065e-05, "loss": 0.2192, "step": 7791 }, { "epoch": 2.3821461326811373, "grad_norm": 1.249245524406433, "learning_rate": 2.9755445166220864e-05, "loss": 0.2826, "step": 7792 }, { "epoch": 2.382451849587282, "grad_norm": 0.35376352071762085, "learning_rate": 2.9759266335498663e-05, "loss": 0.1586, "step": 7793 }, { "epoch": 2.382757566493427, "grad_norm": 0.6331285834312439, "learning_rate": 2.976308750477646e-05, "loss": 0.0991, "step": 7794 }, { "epoch": 2.383063283399572, "grad_norm": 0.36481234431266785, "learning_rate": 2.976690867405426e-05, "loss": 0.0865, "step": 7795 }, { "epoch": 2.383369000305717, "grad_norm": 0.2747879922389984, "learning_rate": 2.977072984333206e-05, "loss": 0.0758, "step": 7796 }, { "epoch": 2.383674717211862, "grad_norm": 0.2637726366519928, "learning_rate": 2.9774551012609858e-05, "loss": 0.0804, "step": 7797 }, { "epoch": 2.3839804341180066, "grad_norm": 0.24759918451309204, "learning_rate": 2.9778372181887657e-05, "loss": 0.0581, "step": 7798 }, { "epoch": 2.3842861510241518, "grad_norm": 0.27418452501296997, "learning_rate": 2.9782193351165455e-05, "loss": 0.0591, "step": 7799 }, { "epoch": 2.3845918679302964, "grad_norm": 0.3645658493041992, "learning_rate": 2.9786014520443254e-05, "loss": 0.0699, "step": 7800 }, { "epoch": 2.3848975848364415, "grad_norm": 0.3534906208515167, "learning_rate": 2.9789835689721053e-05, "loss": 0.0686, "step": 7801 }, { "epoch": 2.385203301742586, "grad_norm": 0.25927022099494934, "learning_rate": 2.9793656858998855e-05, "loss": 0.0947, "step": 7802 }, { "epoch": 2.3855090186487313, "grad_norm": 0.3313908874988556, "learning_rate": 2.9797478028276654e-05, "loss": 0.103, "step": 7803 }, { "epoch": 2.385814735554876, "grad_norm": 0.2760538160800934, "learning_rate": 2.9801299197554452e-05, "loss": 0.093, "step": 7804 }, { "epoch": 2.386120452461021, "grad_norm": 0.6388120055198669, "learning_rate": 2.980512036683225e-05, "loss": 0.0852, "step": 7805 }, { "epoch": 2.386426169367166, "grad_norm": 0.8348307013511658, "learning_rate": 2.980894153611005e-05, "loss": 0.1694, "step": 7806 }, { "epoch": 2.386731886273311, "grad_norm": 1.0009160041809082, "learning_rate": 2.981276270538785e-05, "loss": 0.1136, "step": 7807 }, { "epoch": 2.387037603179456, "grad_norm": 0.7642815709114075, "learning_rate": 2.9816583874665647e-05, "loss": 0.1591, "step": 7808 }, { "epoch": 2.3873433200856007, "grad_norm": 0.7109202742576599, "learning_rate": 2.9820405043943446e-05, "loss": 0.1566, "step": 7809 }, { "epoch": 2.387649036991746, "grad_norm": 0.9207500219345093, "learning_rate": 2.9824226213221245e-05, "loss": 0.1824, "step": 7810 }, { "epoch": 2.3879547538978905, "grad_norm": 0.9279300570487976, "learning_rate": 2.9828047382499044e-05, "loss": 0.1959, "step": 7811 }, { "epoch": 2.3882604708040356, "grad_norm": 1.2232590913772583, "learning_rate": 2.9831868551776842e-05, "loss": 0.1783, "step": 7812 }, { "epoch": 2.3885661877101803, "grad_norm": 0.5695309042930603, "learning_rate": 2.983568972105464e-05, "loss": 0.1821, "step": 7813 }, { "epoch": 2.3888719046163254, "grad_norm": 0.7785462141036987, "learning_rate": 2.983951089033244e-05, "loss": 0.2059, "step": 7814 }, { "epoch": 2.38917762152247, "grad_norm": 1.188051700592041, "learning_rate": 2.984333205961024e-05, "loss": 0.2031, "step": 7815 }, { "epoch": 2.389483338428615, "grad_norm": 1.1732114553451538, "learning_rate": 2.9847153228888037e-05, "loss": 0.2405, "step": 7816 }, { "epoch": 2.38978905533476, "grad_norm": 1.0725369453430176, "learning_rate": 2.9850974398165836e-05, "loss": 0.2892, "step": 7817 }, { "epoch": 2.390094772240905, "grad_norm": 0.9145435094833374, "learning_rate": 2.985479556744364e-05, "loss": 0.1847, "step": 7818 }, { "epoch": 2.3904004891470496, "grad_norm": 0.5097176432609558, "learning_rate": 2.985861673672144e-05, "loss": 0.0736, "step": 7819 }, { "epoch": 2.3907062060531947, "grad_norm": 0.4430336654186249, "learning_rate": 2.986243790599924e-05, "loss": 0.1125, "step": 7820 }, { "epoch": 2.39101192295934, "grad_norm": 0.2690974771976471, "learning_rate": 2.9866259075277038e-05, "loss": 0.0781, "step": 7821 }, { "epoch": 2.3913176398654845, "grad_norm": 0.28055012226104736, "learning_rate": 2.9870080244554837e-05, "loss": 0.0679, "step": 7822 }, { "epoch": 2.3916233567716296, "grad_norm": 0.33513012528419495, "learning_rate": 2.9873901413832635e-05, "loss": 0.0654, "step": 7823 }, { "epoch": 2.3919290736777743, "grad_norm": 0.36571359634399414, "learning_rate": 2.9877722583110434e-05, "loss": 0.0698, "step": 7824 }, { "epoch": 2.3922347905839194, "grad_norm": 0.4127984941005707, "learning_rate": 2.9881543752388233e-05, "loss": 0.0892, "step": 7825 }, { "epoch": 2.392540507490064, "grad_norm": 0.56235671043396, "learning_rate": 2.988536492166603e-05, "loss": 0.1342, "step": 7826 }, { "epoch": 2.392846224396209, "grad_norm": 0.2840481102466583, "learning_rate": 2.988918609094383e-05, "loss": 0.0696, "step": 7827 }, { "epoch": 2.393151941302354, "grad_norm": 0.2692835330963135, "learning_rate": 2.989300726022163e-05, "loss": 0.0972, "step": 7828 }, { "epoch": 2.393457658208499, "grad_norm": 0.6091710329055786, "learning_rate": 2.9896828429499428e-05, "loss": 0.1354, "step": 7829 }, { "epoch": 2.3937633751146437, "grad_norm": 0.3428962528705597, "learning_rate": 2.9900649598777227e-05, "loss": 0.119, "step": 7830 }, { "epoch": 2.3940690920207888, "grad_norm": 0.3623840808868408, "learning_rate": 2.9904470768055025e-05, "loss": 0.1734, "step": 7831 }, { "epoch": 2.3943748089269334, "grad_norm": 0.5049846172332764, "learning_rate": 2.9908291937332824e-05, "loss": 0.1337, "step": 7832 }, { "epoch": 2.3946805258330786, "grad_norm": 0.725912868976593, "learning_rate": 2.9912113106610626e-05, "loss": 0.1834, "step": 7833 }, { "epoch": 2.3949862427392237, "grad_norm": 0.7919042706489563, "learning_rate": 2.9915934275888425e-05, "loss": 0.1546, "step": 7834 }, { "epoch": 2.3952919596453683, "grad_norm": 0.5677649974822998, "learning_rate": 2.9919755445166224e-05, "loss": 0.2146, "step": 7835 }, { "epoch": 2.3955976765515135, "grad_norm": 1.821698784828186, "learning_rate": 2.9923576614444022e-05, "loss": 0.2038, "step": 7836 }, { "epoch": 2.395903393457658, "grad_norm": 1.3037558794021606, "learning_rate": 2.992739778372182e-05, "loss": 0.2166, "step": 7837 }, { "epoch": 2.3962091103638032, "grad_norm": 0.7371826171875, "learning_rate": 2.993121895299962e-05, "loss": 0.2286, "step": 7838 }, { "epoch": 2.396514827269948, "grad_norm": 0.9738297462463379, "learning_rate": 2.993504012227742e-05, "loss": 0.2367, "step": 7839 }, { "epoch": 2.396820544176093, "grad_norm": 2.567676544189453, "learning_rate": 2.9938861291555217e-05, "loss": 0.1848, "step": 7840 }, { "epoch": 2.3971262610822377, "grad_norm": 0.9593561291694641, "learning_rate": 2.9942682460833016e-05, "loss": 0.2119, "step": 7841 }, { "epoch": 2.397431977988383, "grad_norm": 3.26448655128479, "learning_rate": 2.9946503630110815e-05, "loss": 0.2784, "step": 7842 }, { "epoch": 2.3977376948945275, "grad_norm": 0.3315775692462921, "learning_rate": 2.9950324799388614e-05, "loss": 0.1456, "step": 7843 }, { "epoch": 2.3980434118006726, "grad_norm": 0.3008967638015747, "learning_rate": 2.9954145968666412e-05, "loss": 0.0875, "step": 7844 }, { "epoch": 2.3983491287068173, "grad_norm": 0.30707597732543945, "learning_rate": 2.995796713794421e-05, "loss": 0.0958, "step": 7845 }, { "epoch": 2.3986548456129624, "grad_norm": 0.25790348649024963, "learning_rate": 2.996178830722201e-05, "loss": 0.0663, "step": 7846 }, { "epoch": 2.3989605625191075, "grad_norm": 0.36088502407073975, "learning_rate": 2.996560947649981e-05, "loss": 0.0657, "step": 7847 }, { "epoch": 2.399266279425252, "grad_norm": 0.44335392117500305, "learning_rate": 2.9969430645777607e-05, "loss": 0.0756, "step": 7848 }, { "epoch": 2.3995719963313973, "grad_norm": 0.4116150140762329, "learning_rate": 2.997325181505541e-05, "loss": 0.0489, "step": 7849 }, { "epoch": 2.399877713237542, "grad_norm": 0.3575882613658905, "learning_rate": 2.9977072984333208e-05, "loss": 0.0929, "step": 7850 }, { "epoch": 2.400183430143687, "grad_norm": 0.3627317547798157, "learning_rate": 2.9980894153611007e-05, "loss": 0.1087, "step": 7851 }, { "epoch": 2.4004891470498317, "grad_norm": 0.4984242022037506, "learning_rate": 2.9984715322888806e-05, "loss": 0.0772, "step": 7852 }, { "epoch": 2.400794863955977, "grad_norm": 0.44130387902259827, "learning_rate": 2.9988536492166605e-05, "loss": 0.1072, "step": 7853 }, { "epoch": 2.4011005808621215, "grad_norm": 0.42436203360557556, "learning_rate": 2.9992357661444403e-05, "loss": 0.1035, "step": 7854 }, { "epoch": 2.4014062977682666, "grad_norm": 0.4140557646751404, "learning_rate": 2.9996178830722202e-05, "loss": 0.0869, "step": 7855 }, { "epoch": 2.4017120146744113, "grad_norm": 0.4730995297431946, "learning_rate": 3e-05, "loss": 0.1349, "step": 7856 }, { "epoch": 2.4020177315805564, "grad_norm": 0.6871447563171387, "learning_rate": 2.999957538958006e-05, "loss": 0.1648, "step": 7857 }, { "epoch": 2.402323448486701, "grad_norm": 0.7143360376358032, "learning_rate": 2.999915077916012e-05, "loss": 0.2027, "step": 7858 }, { "epoch": 2.402629165392846, "grad_norm": 0.6204958558082581, "learning_rate": 2.999872616874018e-05, "loss": 0.1423, "step": 7859 }, { "epoch": 2.4029348822989913, "grad_norm": 0.5967832207679749, "learning_rate": 2.9998301558320242e-05, "loss": 0.1858, "step": 7860 }, { "epoch": 2.403240599205136, "grad_norm": 2.0377302169799805, "learning_rate": 2.99978769479003e-05, "loss": 0.1989, "step": 7861 }, { "epoch": 2.403546316111281, "grad_norm": 0.8623858094215393, "learning_rate": 2.9997452337480363e-05, "loss": 0.2254, "step": 7862 }, { "epoch": 2.403852033017426, "grad_norm": 0.8867431282997131, "learning_rate": 2.9997027727060422e-05, "loss": 0.2226, "step": 7863 }, { "epoch": 2.404157749923571, "grad_norm": 0.8506391644477844, "learning_rate": 2.9996603116640484e-05, "loss": 0.2214, "step": 7864 }, { "epoch": 2.4044634668297156, "grad_norm": 0.7078743577003479, "learning_rate": 2.9996178506220543e-05, "loss": 0.1803, "step": 7865 }, { "epoch": 2.4047691837358607, "grad_norm": 0.7849482893943787, "learning_rate": 2.9995753895800605e-05, "loss": 0.2816, "step": 7866 }, { "epoch": 2.4050749006420054, "grad_norm": 1.4631831645965576, "learning_rate": 2.9995329285380663e-05, "loss": 0.262, "step": 7867 }, { "epoch": 2.4053806175481505, "grad_norm": 1.1153644323349, "learning_rate": 2.9994904674960725e-05, "loss": 0.1647, "step": 7868 }, { "epoch": 2.405686334454295, "grad_norm": 0.26503491401672363, "learning_rate": 2.9994480064540784e-05, "loss": 0.0983, "step": 7869 }, { "epoch": 2.4059920513604403, "grad_norm": 0.2993394732475281, "learning_rate": 2.9994055454120843e-05, "loss": 0.0835, "step": 7870 }, { "epoch": 2.406297768266585, "grad_norm": 0.4350195825099945, "learning_rate": 2.9993630843700905e-05, "loss": 0.0891, "step": 7871 }, { "epoch": 2.40660348517273, "grad_norm": 0.2647486627101898, "learning_rate": 2.9993206233280964e-05, "loss": 0.0714, "step": 7872 }, { "epoch": 2.406909202078875, "grad_norm": 0.3368365168571472, "learning_rate": 2.9992781622861026e-05, "loss": 0.0662, "step": 7873 }, { "epoch": 2.40721491898502, "grad_norm": 0.4832082986831665, "learning_rate": 2.9992357012441084e-05, "loss": 0.0912, "step": 7874 }, { "epoch": 2.407520635891165, "grad_norm": 0.24253304302692413, "learning_rate": 2.9991932402021147e-05, "loss": 0.0619, "step": 7875 }, { "epoch": 2.4078263527973096, "grad_norm": 0.30167150497436523, "learning_rate": 2.9991507791601205e-05, "loss": 0.0947, "step": 7876 }, { "epoch": 2.4081320697034547, "grad_norm": 0.39894336462020874, "learning_rate": 2.9991083181181267e-05, "loss": 0.0965, "step": 7877 }, { "epoch": 2.4084377866095994, "grad_norm": 0.3300742506980896, "learning_rate": 2.9990658570761326e-05, "loss": 0.1078, "step": 7878 }, { "epoch": 2.4087435035157445, "grad_norm": 0.7468714714050293, "learning_rate": 2.9990233960341388e-05, "loss": 0.103, "step": 7879 }, { "epoch": 2.409049220421889, "grad_norm": 0.5341529846191406, "learning_rate": 2.9989809349921447e-05, "loss": 0.1035, "step": 7880 }, { "epoch": 2.4093549373280343, "grad_norm": 0.4732903838157654, "learning_rate": 2.998938473950151e-05, "loss": 0.1218, "step": 7881 }, { "epoch": 2.409660654234179, "grad_norm": 0.41536378860473633, "learning_rate": 2.9988960129081568e-05, "loss": 0.1647, "step": 7882 }, { "epoch": 2.409966371140324, "grad_norm": 0.5272341966629028, "learning_rate": 2.9988535518661626e-05, "loss": 0.1248, "step": 7883 }, { "epoch": 2.4102720880464688, "grad_norm": 0.6460676193237305, "learning_rate": 2.998811090824169e-05, "loss": 0.1497, "step": 7884 }, { "epoch": 2.410577804952614, "grad_norm": 0.589039146900177, "learning_rate": 2.9987686297821747e-05, "loss": 0.1941, "step": 7885 }, { "epoch": 2.410883521858759, "grad_norm": 0.5012587904930115, "learning_rate": 2.998726168740181e-05, "loss": 0.1608, "step": 7886 }, { "epoch": 2.4111892387649037, "grad_norm": 1.0568959712982178, "learning_rate": 2.9986837076981868e-05, "loss": 0.2017, "step": 7887 }, { "epoch": 2.4114949556710488, "grad_norm": 0.804967999458313, "learning_rate": 2.998641246656193e-05, "loss": 0.1833, "step": 7888 }, { "epoch": 2.4118006725771934, "grad_norm": 0.904463529586792, "learning_rate": 2.998598785614199e-05, "loss": 0.2397, "step": 7889 }, { "epoch": 2.4121063894833386, "grad_norm": 0.809052050113678, "learning_rate": 2.998556324572205e-05, "loss": 0.2047, "step": 7890 }, { "epoch": 2.4124121063894832, "grad_norm": 0.6560147404670715, "learning_rate": 2.998513863530211e-05, "loss": 0.2461, "step": 7891 }, { "epoch": 2.4127178232956283, "grad_norm": 0.9436032176017761, "learning_rate": 2.998471402488217e-05, "loss": 0.2758, "step": 7892 }, { "epoch": 2.413023540201773, "grad_norm": 0.4606597125530243, "learning_rate": 2.998428941446223e-05, "loss": 0.1475, "step": 7893 }, { "epoch": 2.413329257107918, "grad_norm": 0.28019753098487854, "learning_rate": 2.9983864804042292e-05, "loss": 0.1004, "step": 7894 }, { "epoch": 2.413634974014063, "grad_norm": 0.4064193069934845, "learning_rate": 2.998344019362235e-05, "loss": 0.0692, "step": 7895 }, { "epoch": 2.413940690920208, "grad_norm": 2.58994197845459, "learning_rate": 2.998301558320241e-05, "loss": 0.0738, "step": 7896 }, { "epoch": 2.4142464078263526, "grad_norm": 0.3530253767967224, "learning_rate": 2.9982590972782472e-05, "loss": 0.0783, "step": 7897 }, { "epoch": 2.4145521247324977, "grad_norm": 0.23260053992271423, "learning_rate": 2.998216636236253e-05, "loss": 0.0492, "step": 7898 }, { "epoch": 2.414857841638643, "grad_norm": 0.35369816422462463, "learning_rate": 2.9981741751942593e-05, "loss": 0.0873, "step": 7899 }, { "epoch": 2.4151635585447875, "grad_norm": 0.5599589347839355, "learning_rate": 2.998131714152265e-05, "loss": 0.0845, "step": 7900 }, { "epoch": 2.4154692754509326, "grad_norm": 0.5741417407989502, "learning_rate": 2.9980892531102713e-05, "loss": 0.0881, "step": 7901 }, { "epoch": 2.4157749923570773, "grad_norm": 0.323655903339386, "learning_rate": 2.9980467920682772e-05, "loss": 0.0822, "step": 7902 }, { "epoch": 2.4160807092632224, "grad_norm": 0.5502365231513977, "learning_rate": 2.9980043310262834e-05, "loss": 0.1293, "step": 7903 }, { "epoch": 2.416386426169367, "grad_norm": 0.33435115218162537, "learning_rate": 2.9979618699842893e-05, "loss": 0.096, "step": 7904 }, { "epoch": 2.416692143075512, "grad_norm": 0.373270183801651, "learning_rate": 2.9979194089422955e-05, "loss": 0.137, "step": 7905 }, { "epoch": 2.416997859981657, "grad_norm": 0.5377856492996216, "learning_rate": 2.9978769479003014e-05, "loss": 0.1347, "step": 7906 }, { "epoch": 2.417303576887802, "grad_norm": 1.0153404474258423, "learning_rate": 2.9978344868583076e-05, "loss": 0.1318, "step": 7907 }, { "epoch": 2.4176092937939466, "grad_norm": 0.6053585410118103, "learning_rate": 2.9977920258163138e-05, "loss": 0.1436, "step": 7908 }, { "epoch": 2.4179150107000917, "grad_norm": 0.822456955909729, "learning_rate": 2.9977495647743197e-05, "loss": 0.14, "step": 7909 }, { "epoch": 2.4182207276062364, "grad_norm": 0.7452265024185181, "learning_rate": 2.997707103732326e-05, "loss": 0.1913, "step": 7910 }, { "epoch": 2.4185264445123815, "grad_norm": 0.6740055084228516, "learning_rate": 2.9976646426903317e-05, "loss": 0.2366, "step": 7911 }, { "epoch": 2.4188321614185266, "grad_norm": 0.6186607480049133, "learning_rate": 2.997622181648338e-05, "loss": 0.1804, "step": 7912 }, { "epoch": 2.4191378783246713, "grad_norm": 0.825633704662323, "learning_rate": 2.9975797206063438e-05, "loss": 0.1878, "step": 7913 }, { "epoch": 2.4194435952308164, "grad_norm": 0.9150055050849915, "learning_rate": 2.99753725956435e-05, "loss": 0.2148, "step": 7914 }, { "epoch": 2.419749312136961, "grad_norm": 1.8920718431472778, "learning_rate": 2.997494798522356e-05, "loss": 0.234, "step": 7915 }, { "epoch": 2.420055029043106, "grad_norm": 5.506700038909912, "learning_rate": 2.997452337480362e-05, "loss": 0.2528, "step": 7916 }, { "epoch": 2.420360745949251, "grad_norm": 2.105818748474121, "learning_rate": 2.997409876438368e-05, "loss": 0.2631, "step": 7917 }, { "epoch": 2.420666462855396, "grad_norm": 0.6389627456665039, "learning_rate": 2.9973674153963742e-05, "loss": 0.1616, "step": 7918 }, { "epoch": 2.4209721797615407, "grad_norm": 0.43795570731163025, "learning_rate": 2.99732495435438e-05, "loss": 0.0819, "step": 7919 }, { "epoch": 2.421277896667686, "grad_norm": 0.6183004379272461, "learning_rate": 2.997282493312386e-05, "loss": 0.0874, "step": 7920 }, { "epoch": 2.4215836135738305, "grad_norm": 0.3649000823497772, "learning_rate": 2.997240032270392e-05, "loss": 0.072, "step": 7921 }, { "epoch": 2.4218893304799756, "grad_norm": 0.3845837116241455, "learning_rate": 2.997197571228398e-05, "loss": 0.0722, "step": 7922 }, { "epoch": 2.4221950473861202, "grad_norm": 0.26755014061927795, "learning_rate": 2.9971551101864042e-05, "loss": 0.0875, "step": 7923 }, { "epoch": 2.4225007642922654, "grad_norm": 0.2732967734336853, "learning_rate": 2.99711264914441e-05, "loss": 0.0673, "step": 7924 }, { "epoch": 2.4228064811984105, "grad_norm": 0.3199889361858368, "learning_rate": 2.9970701881024163e-05, "loss": 0.0517, "step": 7925 }, { "epoch": 2.423112198104555, "grad_norm": 0.3969211280345917, "learning_rate": 2.997027727060422e-05, "loss": 0.1083, "step": 7926 }, { "epoch": 2.4234179150107003, "grad_norm": 0.3826121687889099, "learning_rate": 2.9969852660184284e-05, "loss": 0.0771, "step": 7927 }, { "epoch": 2.423723631916845, "grad_norm": 0.29978135228157043, "learning_rate": 2.9969428049764342e-05, "loss": 0.0886, "step": 7928 }, { "epoch": 2.42402934882299, "grad_norm": 0.46011078357696533, "learning_rate": 2.9969003439344404e-05, "loss": 0.0791, "step": 7929 }, { "epoch": 2.4243350657291347, "grad_norm": 0.47561559081077576, "learning_rate": 2.9968578828924463e-05, "loss": 0.1241, "step": 7930 }, { "epoch": 2.42464078263528, "grad_norm": 0.567965030670166, "learning_rate": 2.9968154218504525e-05, "loss": 0.1048, "step": 7931 }, { "epoch": 2.4249464995414245, "grad_norm": 0.6881142854690552, "learning_rate": 2.9967729608084584e-05, "loss": 0.1669, "step": 7932 }, { "epoch": 2.4252522164475696, "grad_norm": 2.293203830718994, "learning_rate": 2.9967304997664643e-05, "loss": 0.1444, "step": 7933 }, { "epoch": 2.4255579333537143, "grad_norm": 0.7049921154975891, "learning_rate": 2.9966880387244705e-05, "loss": 0.1718, "step": 7934 }, { "epoch": 2.4258636502598594, "grad_norm": 0.6322556734085083, "learning_rate": 2.9966455776824763e-05, "loss": 0.1941, "step": 7935 }, { "epoch": 2.426169367166004, "grad_norm": 0.7659299373626709, "learning_rate": 2.9966031166404826e-05, "loss": 0.2269, "step": 7936 }, { "epoch": 2.426475084072149, "grad_norm": 0.8418258428573608, "learning_rate": 2.9965606555984884e-05, "loss": 0.1937, "step": 7937 }, { "epoch": 2.4267808009782943, "grad_norm": 0.6930940747261047, "learning_rate": 2.9965181945564946e-05, "loss": 0.2242, "step": 7938 }, { "epoch": 2.427086517884439, "grad_norm": 0.8226381540298462, "learning_rate": 2.9964757335145005e-05, "loss": 0.1936, "step": 7939 }, { "epoch": 2.427392234790584, "grad_norm": 1.3652799129486084, "learning_rate": 2.9964332724725067e-05, "loss": 0.2088, "step": 7940 }, { "epoch": 2.4276979516967288, "grad_norm": 1.1806894540786743, "learning_rate": 2.9963908114305126e-05, "loss": 0.2487, "step": 7941 }, { "epoch": 2.428003668602874, "grad_norm": 1.1964823007583618, "learning_rate": 2.9963483503885188e-05, "loss": 0.2783, "step": 7942 }, { "epoch": 2.4283093855090185, "grad_norm": 0.7758735418319702, "learning_rate": 2.9963058893465247e-05, "loss": 0.1626, "step": 7943 }, { "epoch": 2.4286151024151637, "grad_norm": 0.6949077844619751, "learning_rate": 2.996263428304531e-05, "loss": 0.0712, "step": 7944 }, { "epoch": 2.4289208193213083, "grad_norm": 0.36215680837631226, "learning_rate": 2.9962209672625367e-05, "loss": 0.0926, "step": 7945 }, { "epoch": 2.4292265362274534, "grad_norm": 0.23642532527446747, "learning_rate": 2.9961785062205426e-05, "loss": 0.0832, "step": 7946 }, { "epoch": 2.429532253133598, "grad_norm": 0.5168200731277466, "learning_rate": 2.9961360451785488e-05, "loss": 0.0865, "step": 7947 }, { "epoch": 2.4298379700397432, "grad_norm": 0.2375514954328537, "learning_rate": 2.9960935841365547e-05, "loss": 0.0739, "step": 7948 }, { "epoch": 2.430143686945888, "grad_norm": 0.2064887285232544, "learning_rate": 2.996051123094561e-05, "loss": 0.0504, "step": 7949 }, { "epoch": 2.430449403852033, "grad_norm": 0.26339274644851685, "learning_rate": 2.9960086620525668e-05, "loss": 0.0488, "step": 7950 }, { "epoch": 2.430755120758178, "grad_norm": 0.31709298491477966, "learning_rate": 2.995966201010573e-05, "loss": 0.0812, "step": 7951 }, { "epoch": 2.431060837664323, "grad_norm": 0.2175816148519516, "learning_rate": 2.995923739968579e-05, "loss": 0.0649, "step": 7952 }, { "epoch": 2.431366554570468, "grad_norm": 0.6387092471122742, "learning_rate": 2.995881278926585e-05, "loss": 0.0857, "step": 7953 }, { "epoch": 2.4316722714766126, "grad_norm": 0.3984636962413788, "learning_rate": 2.995838817884591e-05, "loss": 0.0789, "step": 7954 }, { "epoch": 2.4319779883827577, "grad_norm": 0.4509336054325104, "learning_rate": 2.995796356842597e-05, "loss": 0.1102, "step": 7955 }, { "epoch": 2.4322837052889024, "grad_norm": 0.41080576181411743, "learning_rate": 2.995753895800603e-05, "loss": 0.1095, "step": 7956 }, { "epoch": 2.4325894221950475, "grad_norm": 1.7031620740890503, "learning_rate": 2.9957114347586092e-05, "loss": 0.1608, "step": 7957 }, { "epoch": 2.432895139101192, "grad_norm": 0.5393343567848206, "learning_rate": 2.995668973716615e-05, "loss": 0.148, "step": 7958 }, { "epoch": 2.4332008560073373, "grad_norm": 0.781764566898346, "learning_rate": 2.995626512674621e-05, "loss": 0.1927, "step": 7959 }, { "epoch": 2.433506572913482, "grad_norm": 0.6912841200828552, "learning_rate": 2.995584051632627e-05, "loss": 0.1784, "step": 7960 }, { "epoch": 2.433812289819627, "grad_norm": 0.7694900631904602, "learning_rate": 2.995541590590633e-05, "loss": 0.1706, "step": 7961 }, { "epoch": 2.4341180067257717, "grad_norm": 0.9341328740119934, "learning_rate": 2.9954991295486392e-05, "loss": 0.1926, "step": 7962 }, { "epoch": 2.434423723631917, "grad_norm": 1.1374765634536743, "learning_rate": 2.995456668506645e-05, "loss": 0.2346, "step": 7963 }, { "epoch": 2.434729440538062, "grad_norm": 0.9107559323310852, "learning_rate": 2.9954142074646513e-05, "loss": 0.2322, "step": 7964 }, { "epoch": 2.4350351574442066, "grad_norm": 1.9712393283843994, "learning_rate": 2.9953717464226572e-05, "loss": 0.2004, "step": 7965 }, { "epoch": 2.4353408743503517, "grad_norm": 1.1866203546524048, "learning_rate": 2.9953292853806634e-05, "loss": 0.2201, "step": 7966 }, { "epoch": 2.4356465912564964, "grad_norm": 1.6260290145874023, "learning_rate": 2.9952868243386693e-05, "loss": 0.2966, "step": 7967 }, { "epoch": 2.4359523081626415, "grad_norm": 0.8199893236160278, "learning_rate": 2.9952443632966755e-05, "loss": 0.1876, "step": 7968 }, { "epoch": 2.436258025068786, "grad_norm": 0.4886553883552551, "learning_rate": 2.9952019022546813e-05, "loss": 0.0975, "step": 7969 }, { "epoch": 2.4365637419749313, "grad_norm": 0.5316845774650574, "learning_rate": 2.9951594412126876e-05, "loss": 0.0967, "step": 7970 }, { "epoch": 2.436869458881076, "grad_norm": 0.39579665660858154, "learning_rate": 2.9951169801706934e-05, "loss": 0.0634, "step": 7971 }, { "epoch": 2.437175175787221, "grad_norm": 2.8089475631713867, "learning_rate": 2.9950745191286993e-05, "loss": 0.06, "step": 7972 }, { "epoch": 2.4374808926933658, "grad_norm": 0.27410802245140076, "learning_rate": 2.9950320580867055e-05, "loss": 0.0476, "step": 7973 }, { "epoch": 2.437786609599511, "grad_norm": 0.5204256772994995, "learning_rate": 2.9949895970447114e-05, "loss": 0.061, "step": 7974 }, { "epoch": 2.4380923265056555, "grad_norm": 0.4446132183074951, "learning_rate": 2.9949471360027176e-05, "loss": 0.0941, "step": 7975 }, { "epoch": 2.4383980434118007, "grad_norm": 0.38393083214759827, "learning_rate": 2.9949046749607234e-05, "loss": 0.0684, "step": 7976 }, { "epoch": 2.438703760317946, "grad_norm": 0.5330730080604553, "learning_rate": 2.9948622139187297e-05, "loss": 0.064, "step": 7977 }, { "epoch": 2.4390094772240904, "grad_norm": 0.4874364137649536, "learning_rate": 2.9948197528767355e-05, "loss": 0.1216, "step": 7978 }, { "epoch": 2.4393151941302356, "grad_norm": 0.6190516352653503, "learning_rate": 2.9947772918347417e-05, "loss": 0.091, "step": 7979 }, { "epoch": 2.4396209110363802, "grad_norm": 0.6935859322547913, "learning_rate": 2.9947348307927476e-05, "loss": 0.1364, "step": 7980 }, { "epoch": 2.4399266279425254, "grad_norm": 0.6017069220542908, "learning_rate": 2.9946923697507538e-05, "loss": 0.1345, "step": 7981 }, { "epoch": 2.44023234484867, "grad_norm": 0.5299845337867737, "learning_rate": 2.9946499087087597e-05, "loss": 0.1772, "step": 7982 }, { "epoch": 2.440538061754815, "grad_norm": 0.5342752933502197, "learning_rate": 2.994607447666766e-05, "loss": 0.1513, "step": 7983 }, { "epoch": 2.44084377866096, "grad_norm": 1.6347206830978394, "learning_rate": 2.9945649866247718e-05, "loss": 0.1973, "step": 7984 }, { "epoch": 2.441149495567105, "grad_norm": 1.4335873126983643, "learning_rate": 2.9945225255827776e-05, "loss": 0.2074, "step": 7985 }, { "epoch": 2.4414552124732496, "grad_norm": 0.7595264911651611, "learning_rate": 2.994480064540784e-05, "loss": 0.2239, "step": 7986 }, { "epoch": 2.4417609293793947, "grad_norm": 0.7178658843040466, "learning_rate": 2.9944376034987897e-05, "loss": 0.2135, "step": 7987 }, { "epoch": 2.4420666462855394, "grad_norm": 0.8540993332862854, "learning_rate": 2.994395142456796e-05, "loss": 0.2083, "step": 7988 }, { "epoch": 2.4423723631916845, "grad_norm": 0.7493446469306946, "learning_rate": 2.9943526814148018e-05, "loss": 0.2006, "step": 7989 }, { "epoch": 2.4426780800978296, "grad_norm": 2.12689208984375, "learning_rate": 2.994310220372808e-05, "loss": 0.2837, "step": 7990 }, { "epoch": 2.4429837970039743, "grad_norm": 1.3828213214874268, "learning_rate": 2.994267759330814e-05, "loss": 0.232, "step": 7991 }, { "epoch": 2.4432895139101194, "grad_norm": 1.5804284811019897, "learning_rate": 2.99422529828882e-05, "loss": 0.2718, "step": 7992 }, { "epoch": 2.443595230816264, "grad_norm": 0.5417662858963013, "learning_rate": 2.994182837246826e-05, "loss": 0.1461, "step": 7993 }, { "epoch": 2.443900947722409, "grad_norm": 0.297852486371994, "learning_rate": 2.994140376204832e-05, "loss": 0.078, "step": 7994 }, { "epoch": 2.444206664628554, "grad_norm": 0.29042428731918335, "learning_rate": 2.994097915162838e-05, "loss": 0.0995, "step": 7995 }, { "epoch": 2.444512381534699, "grad_norm": 0.49607595801353455, "learning_rate": 2.9940554541208442e-05, "loss": 0.0774, "step": 7996 }, { "epoch": 2.4448180984408436, "grad_norm": 0.3515443205833435, "learning_rate": 2.99401299307885e-05, "loss": 0.0811, "step": 7997 }, { "epoch": 2.4451238153469887, "grad_norm": 0.4120791554450989, "learning_rate": 2.993970532036856e-05, "loss": 0.0577, "step": 7998 }, { "epoch": 2.4454295322531334, "grad_norm": 0.2784789800643921, "learning_rate": 2.9939280709948622e-05, "loss": 0.0559, "step": 7999 }, { "epoch": 2.4457352491592785, "grad_norm": 0.4124196767807007, "learning_rate": 2.993885609952868e-05, "loss": 0.0828, "step": 8000 }, { "epoch": 2.4457352491592785, "eval_cer": 0.19486541629842996, "eval_loss": 0.2634495496749878, "eval_runtime": 18.8539, "eval_samples_per_second": 240.693, "eval_steps_per_second": 0.796, "eval_wer": 0.34730873439289406, "step": 8000 }, { "epoch": 2.446040966065423, "grad_norm": 0.4801124036312103, "learning_rate": 2.9938431489108743e-05, "loss": 0.0765, "step": 8001 }, { "epoch": 2.4463466829715683, "grad_norm": 0.2977868318557739, "learning_rate": 2.99380068786888e-05, "loss": 0.0839, "step": 8002 }, { "epoch": 2.4466523998777134, "grad_norm": 0.2926197946071625, "learning_rate": 2.9937582268268863e-05, "loss": 0.1051, "step": 8003 }, { "epoch": 2.446958116783858, "grad_norm": 0.40979140996932983, "learning_rate": 2.9937157657848922e-05, "loss": 0.1098, "step": 8004 }, { "epoch": 2.447263833690003, "grad_norm": 0.30364710092544556, "learning_rate": 2.9936733047428984e-05, "loss": 0.0879, "step": 8005 }, { "epoch": 2.447569550596148, "grad_norm": 0.3705176115036011, "learning_rate": 2.9936308437009043e-05, "loss": 0.1059, "step": 8006 }, { "epoch": 2.447875267502293, "grad_norm": 0.6467152833938599, "learning_rate": 2.9935883826589105e-05, "loss": 0.155, "step": 8007 }, { "epoch": 2.4481809844084377, "grad_norm": 0.7022431492805481, "learning_rate": 2.9935459216169164e-05, "loss": 0.1861, "step": 8008 }, { "epoch": 2.448486701314583, "grad_norm": 0.5121182799339294, "learning_rate": 2.9935034605749226e-05, "loss": 0.1544, "step": 8009 }, { "epoch": 2.4487924182207275, "grad_norm": 1.095749020576477, "learning_rate": 2.9934609995329288e-05, "loss": 0.1675, "step": 8010 }, { "epoch": 2.4490981351268726, "grad_norm": 1.7698172330856323, "learning_rate": 2.9934185384909347e-05, "loss": 0.1885, "step": 8011 }, { "epoch": 2.4494038520330172, "grad_norm": 0.8751893639564514, "learning_rate": 2.993376077448941e-05, "loss": 0.2042, "step": 8012 }, { "epoch": 2.4497095689391624, "grad_norm": 1.0434867143630981, "learning_rate": 2.9933336164069467e-05, "loss": 0.236, "step": 8013 }, { "epoch": 2.450015285845307, "grad_norm": 0.9864802360534668, "learning_rate": 2.993291155364953e-05, "loss": 0.236, "step": 8014 }, { "epoch": 2.450321002751452, "grad_norm": 1.355552077293396, "learning_rate": 2.9932486943229588e-05, "loss": 0.2281, "step": 8015 }, { "epoch": 2.4506267196575973, "grad_norm": 2.312650203704834, "learning_rate": 2.993206233280965e-05, "loss": 0.244, "step": 8016 }, { "epoch": 2.450932436563742, "grad_norm": 1.6670249700546265, "learning_rate": 2.993163772238971e-05, "loss": 0.3019, "step": 8017 }, { "epoch": 2.451238153469887, "grad_norm": 0.5474679470062256, "learning_rate": 2.993121311196977e-05, "loss": 0.1741, "step": 8018 }, { "epoch": 2.4515438703760317, "grad_norm": 0.42796480655670166, "learning_rate": 2.993078850154983e-05, "loss": 0.0992, "step": 8019 }, { "epoch": 2.451849587282177, "grad_norm": 0.30831700563430786, "learning_rate": 2.9930363891129892e-05, "loss": 0.0833, "step": 8020 }, { "epoch": 2.4521553041883215, "grad_norm": 0.3181995153427124, "learning_rate": 2.992993928070995e-05, "loss": 0.0713, "step": 8021 }, { "epoch": 2.4524610210944666, "grad_norm": 0.423006534576416, "learning_rate": 2.9929514670290013e-05, "loss": 0.0926, "step": 8022 }, { "epoch": 2.4527667380006113, "grad_norm": 0.27026137709617615, "learning_rate": 2.992909005987007e-05, "loss": 0.0604, "step": 8023 }, { "epoch": 2.4530724549067564, "grad_norm": 0.34934931993484497, "learning_rate": 2.992866544945013e-05, "loss": 0.0742, "step": 8024 }, { "epoch": 2.453378171812901, "grad_norm": 0.25592413544654846, "learning_rate": 2.9928240839030192e-05, "loss": 0.0711, "step": 8025 }, { "epoch": 2.453683888719046, "grad_norm": 0.49087315797805786, "learning_rate": 2.992781622861025e-05, "loss": 0.117, "step": 8026 }, { "epoch": 2.453989605625191, "grad_norm": 0.36434975266456604, "learning_rate": 2.9927391618190313e-05, "loss": 0.1172, "step": 8027 }, { "epoch": 2.454295322531336, "grad_norm": 0.42360666394233704, "learning_rate": 2.992696700777037e-05, "loss": 0.1051, "step": 8028 }, { "epoch": 2.454601039437481, "grad_norm": 0.2884521484375, "learning_rate": 2.9926542397350434e-05, "loss": 0.0721, "step": 8029 }, { "epoch": 2.4549067563436258, "grad_norm": 0.42677128314971924, "learning_rate": 2.9926117786930492e-05, "loss": 0.1217, "step": 8030 }, { "epoch": 2.455212473249771, "grad_norm": 0.49454888701438904, "learning_rate": 2.9925693176510554e-05, "loss": 0.1235, "step": 8031 }, { "epoch": 2.4555181901559155, "grad_norm": 0.49173295497894287, "learning_rate": 2.9925268566090613e-05, "loss": 0.1472, "step": 8032 }, { "epoch": 2.4558239070620607, "grad_norm": 0.6854293346405029, "learning_rate": 2.9924843955670675e-05, "loss": 0.1654, "step": 8033 }, { "epoch": 2.4561296239682053, "grad_norm": 0.8106250762939453, "learning_rate": 2.9924419345250734e-05, "loss": 0.1763, "step": 8034 }, { "epoch": 2.4564353408743504, "grad_norm": 0.5504964590072632, "learning_rate": 2.9923994734830793e-05, "loss": 0.1814, "step": 8035 }, { "epoch": 2.456741057780495, "grad_norm": 0.9781885743141174, "learning_rate": 2.9923570124410855e-05, "loss": 0.181, "step": 8036 }, { "epoch": 2.4570467746866402, "grad_norm": 1.3313504457473755, "learning_rate": 2.9923145513990913e-05, "loss": 0.1691, "step": 8037 }, { "epoch": 2.457352491592785, "grad_norm": 1.1838202476501465, "learning_rate": 2.9922720903570976e-05, "loss": 0.2195, "step": 8038 }, { "epoch": 2.45765820849893, "grad_norm": 1.1167725324630737, "learning_rate": 2.9922296293151034e-05, "loss": 0.2265, "step": 8039 }, { "epoch": 2.4579639254050747, "grad_norm": 0.9777399897575378, "learning_rate": 2.9921871682731096e-05, "loss": 0.279, "step": 8040 }, { "epoch": 2.45826964231122, "grad_norm": 1.9793951511383057, "learning_rate": 2.9921447072311155e-05, "loss": 0.2429, "step": 8041 }, { "epoch": 2.458575359217365, "grad_norm": 1.2426371574401855, "learning_rate": 2.9921022461891217e-05, "loss": 0.3324, "step": 8042 }, { "epoch": 2.4588810761235096, "grad_norm": 0.8595341444015503, "learning_rate": 2.9920597851471276e-05, "loss": 0.1684, "step": 8043 }, { "epoch": 2.4591867930296547, "grad_norm": 0.31563714146614075, "learning_rate": 2.9920173241051338e-05, "loss": 0.0854, "step": 8044 }, { "epoch": 2.4594925099357994, "grad_norm": 0.2818988561630249, "learning_rate": 2.9919748630631397e-05, "loss": 0.0752, "step": 8045 }, { "epoch": 2.4597982268419445, "grad_norm": 0.6245784759521484, "learning_rate": 2.991932402021146e-05, "loss": 0.0823, "step": 8046 }, { "epoch": 2.460103943748089, "grad_norm": 0.46191713213920593, "learning_rate": 2.9918899409791517e-05, "loss": 0.0589, "step": 8047 }, { "epoch": 2.4604096606542343, "grad_norm": 0.3702324330806732, "learning_rate": 2.9918474799371576e-05, "loss": 0.0597, "step": 8048 }, { "epoch": 2.460715377560379, "grad_norm": 0.42403385043144226, "learning_rate": 2.9918050188951638e-05, "loss": 0.058, "step": 8049 }, { "epoch": 2.461021094466524, "grad_norm": 0.28788161277770996, "learning_rate": 2.9917625578531697e-05, "loss": 0.0788, "step": 8050 }, { "epoch": 2.4613268113726687, "grad_norm": 0.3770267069339752, "learning_rate": 2.991720096811176e-05, "loss": 0.0608, "step": 8051 }, { "epoch": 2.461632528278814, "grad_norm": 0.3677668571472168, "learning_rate": 2.9916776357691818e-05, "loss": 0.1151, "step": 8052 }, { "epoch": 2.4619382451849585, "grad_norm": 0.45261481404304504, "learning_rate": 2.991635174727188e-05, "loss": 0.1209, "step": 8053 }, { "epoch": 2.4622439620911036, "grad_norm": 0.2829691767692566, "learning_rate": 2.991592713685194e-05, "loss": 0.0671, "step": 8054 }, { "epoch": 2.4625496789972487, "grad_norm": 0.3819495737552643, "learning_rate": 2.9915502526432e-05, "loss": 0.0875, "step": 8055 }, { "epoch": 2.4628553959033934, "grad_norm": 0.48958489298820496, "learning_rate": 2.991507791601206e-05, "loss": 0.1608, "step": 8056 }, { "epoch": 2.4631611128095385, "grad_norm": 2.0266237258911133, "learning_rate": 2.991465330559212e-05, "loss": 0.1354, "step": 8057 }, { "epoch": 2.463466829715683, "grad_norm": 0.4574490487575531, "learning_rate": 2.991422869517218e-05, "loss": 0.1607, "step": 8058 }, { "epoch": 2.4637725466218283, "grad_norm": 0.5634899735450745, "learning_rate": 2.9913804084752242e-05, "loss": 0.1736, "step": 8059 }, { "epoch": 2.464078263527973, "grad_norm": 1.4209845066070557, "learning_rate": 2.99133794743323e-05, "loss": 0.2203, "step": 8060 }, { "epoch": 2.464383980434118, "grad_norm": 0.9145939350128174, "learning_rate": 2.991295486391236e-05, "loss": 0.1991, "step": 8061 }, { "epoch": 2.4646896973402628, "grad_norm": 0.7416648268699646, "learning_rate": 2.991253025349242e-05, "loss": 0.2083, "step": 8062 }, { "epoch": 2.464995414246408, "grad_norm": 0.9203619956970215, "learning_rate": 2.991210564307248e-05, "loss": 0.2357, "step": 8063 }, { "epoch": 2.4653011311525526, "grad_norm": 0.7641226649284363, "learning_rate": 2.9911681032652542e-05, "loss": 0.2203, "step": 8064 }, { "epoch": 2.4656068480586977, "grad_norm": 1.3474775552749634, "learning_rate": 2.99112564222326e-05, "loss": 0.2712, "step": 8065 }, { "epoch": 2.4659125649648423, "grad_norm": 1.0098296403884888, "learning_rate": 2.9910831811812663e-05, "loss": 0.1904, "step": 8066 }, { "epoch": 2.4662182818709875, "grad_norm": 3.683748960494995, "learning_rate": 2.9910407201392722e-05, "loss": 0.3026, "step": 8067 }, { "epoch": 2.4665239987771326, "grad_norm": 0.5322412848472595, "learning_rate": 2.9909982590972784e-05, "loss": 0.1634, "step": 8068 }, { "epoch": 2.4668297156832772, "grad_norm": 0.6744182109832764, "learning_rate": 2.9909557980552843e-05, "loss": 0.094, "step": 8069 }, { "epoch": 2.4671354325894224, "grad_norm": 0.33919399976730347, "learning_rate": 2.9909133370132905e-05, "loss": 0.091, "step": 8070 }, { "epoch": 2.467441149495567, "grad_norm": 0.2118224948644638, "learning_rate": 2.9908708759712963e-05, "loss": 0.0693, "step": 8071 }, { "epoch": 2.467746866401712, "grad_norm": 0.4306458532810211, "learning_rate": 2.9908284149293026e-05, "loss": 0.0545, "step": 8072 }, { "epoch": 2.468052583307857, "grad_norm": 0.6232031583786011, "learning_rate": 2.9907859538873084e-05, "loss": 0.0547, "step": 8073 }, { "epoch": 2.468358300214002, "grad_norm": 0.851924479007721, "learning_rate": 2.9907434928453143e-05, "loss": 0.1022, "step": 8074 }, { "epoch": 2.4686640171201466, "grad_norm": 0.3052920997142792, "learning_rate": 2.9907010318033205e-05, "loss": 0.0497, "step": 8075 }, { "epoch": 2.4689697340262917, "grad_norm": 0.3377465009689331, "learning_rate": 2.9906585707613264e-05, "loss": 0.1002, "step": 8076 }, { "epoch": 2.4692754509324364, "grad_norm": 0.26802337169647217, "learning_rate": 2.9906161097193326e-05, "loss": 0.0733, "step": 8077 }, { "epoch": 2.4695811678385815, "grad_norm": 0.47520357370376587, "learning_rate": 2.9905736486773385e-05, "loss": 0.1382, "step": 8078 }, { "epoch": 2.469886884744726, "grad_norm": 0.305033415555954, "learning_rate": 2.9905311876353447e-05, "loss": 0.0937, "step": 8079 }, { "epoch": 2.4701926016508713, "grad_norm": 0.6167755126953125, "learning_rate": 2.9904887265933505e-05, "loss": 0.111, "step": 8080 }, { "epoch": 2.4704983185570164, "grad_norm": 0.4931790828704834, "learning_rate": 2.9904462655513567e-05, "loss": 0.1267, "step": 8081 }, { "epoch": 2.470804035463161, "grad_norm": 0.7526784539222717, "learning_rate": 2.9904038045093626e-05, "loss": 0.1411, "step": 8082 }, { "epoch": 2.471109752369306, "grad_norm": 0.8409481048583984, "learning_rate": 2.9903613434673688e-05, "loss": 0.1368, "step": 8083 }, { "epoch": 2.471415469275451, "grad_norm": 0.7733150124549866, "learning_rate": 2.9903188824253747e-05, "loss": 0.1765, "step": 8084 }, { "epoch": 2.471721186181596, "grad_norm": 1.2044223546981812, "learning_rate": 2.990276421383381e-05, "loss": 0.195, "step": 8085 }, { "epoch": 2.4720269030877406, "grad_norm": 1.1435072422027588, "learning_rate": 2.9902339603413868e-05, "loss": 0.1637, "step": 8086 }, { "epoch": 2.4723326199938858, "grad_norm": 1.3542557954788208, "learning_rate": 2.9901914992993926e-05, "loss": 0.1994, "step": 8087 }, { "epoch": 2.4726383369000304, "grad_norm": 1.305058479309082, "learning_rate": 2.990149038257399e-05, "loss": 0.205, "step": 8088 }, { "epoch": 2.4729440538061755, "grad_norm": 1.373839020729065, "learning_rate": 2.9901065772154047e-05, "loss": 0.2211, "step": 8089 }, { "epoch": 2.47324977071232, "grad_norm": 0.7773467302322388, "learning_rate": 2.990064116173411e-05, "loss": 0.2173, "step": 8090 }, { "epoch": 2.4735554876184653, "grad_norm": 1.261860728263855, "learning_rate": 2.9900216551314168e-05, "loss": 0.2261, "step": 8091 }, { "epoch": 2.47386120452461, "grad_norm": 3.6025936603546143, "learning_rate": 2.989979194089423e-05, "loss": 0.3032, "step": 8092 }, { "epoch": 2.474166921430755, "grad_norm": 0.370749831199646, "learning_rate": 2.989936733047429e-05, "loss": 0.1622, "step": 8093 }, { "epoch": 2.4744726383369002, "grad_norm": 0.5054711699485779, "learning_rate": 2.989894272005435e-05, "loss": 0.1113, "step": 8094 }, { "epoch": 2.474778355243045, "grad_norm": 0.3044017553329468, "learning_rate": 2.989851810963441e-05, "loss": 0.0996, "step": 8095 }, { "epoch": 2.47508407214919, "grad_norm": 0.25899308919906616, "learning_rate": 2.989809349921447e-05, "loss": 0.0637, "step": 8096 }, { "epoch": 2.4753897890553347, "grad_norm": 0.39063388109207153, "learning_rate": 2.989766888879453e-05, "loss": 0.0596, "step": 8097 }, { "epoch": 2.47569550596148, "grad_norm": 0.2960585653781891, "learning_rate": 2.9897244278374592e-05, "loss": 0.0701, "step": 8098 }, { "epoch": 2.4760012228676245, "grad_norm": 0.3414613902568817, "learning_rate": 2.989681966795465e-05, "loss": 0.0852, "step": 8099 }, { "epoch": 2.4763069397737696, "grad_norm": 0.29585352540016174, "learning_rate": 2.989639505753471e-05, "loss": 0.0966, "step": 8100 }, { "epoch": 2.4766126566799143, "grad_norm": 0.5023000836372375, "learning_rate": 2.9895970447114772e-05, "loss": 0.0753, "step": 8101 }, { "epoch": 2.4769183735860594, "grad_norm": 0.2643120288848877, "learning_rate": 2.989554583669483e-05, "loss": 0.066, "step": 8102 }, { "epoch": 2.477224090492204, "grad_norm": 0.4079147279262543, "learning_rate": 2.9895121226274893e-05, "loss": 0.1365, "step": 8103 }, { "epoch": 2.477529807398349, "grad_norm": 0.9086787104606628, "learning_rate": 2.989469661585495e-05, "loss": 0.083, "step": 8104 }, { "epoch": 2.477835524304494, "grad_norm": 0.5128028392791748, "learning_rate": 2.9894272005435013e-05, "loss": 0.1082, "step": 8105 }, { "epoch": 2.478141241210639, "grad_norm": 0.4218543469905853, "learning_rate": 2.9893847395015072e-05, "loss": 0.1117, "step": 8106 }, { "epoch": 2.478446958116784, "grad_norm": 0.8942366242408752, "learning_rate": 2.9893422784595134e-05, "loss": 0.1706, "step": 8107 }, { "epoch": 2.4787526750229287, "grad_norm": 1.1899642944335938, "learning_rate": 2.9892998174175193e-05, "loss": 0.1738, "step": 8108 }, { "epoch": 2.479058391929074, "grad_norm": 0.7334678769111633, "learning_rate": 2.9892573563755255e-05, "loss": 0.1812, "step": 8109 }, { "epoch": 2.4793641088352185, "grad_norm": 0.6443423628807068, "learning_rate": 2.9892148953335314e-05, "loss": 0.1866, "step": 8110 }, { "epoch": 2.4796698257413636, "grad_norm": 1.3256182670593262, "learning_rate": 2.9891724342915376e-05, "loss": 0.1634, "step": 8111 }, { "epoch": 2.4799755426475083, "grad_norm": 0.8974039554595947, "learning_rate": 2.9891299732495438e-05, "loss": 0.189, "step": 8112 }, { "epoch": 2.4802812595536534, "grad_norm": 0.854267954826355, "learning_rate": 2.9890875122075497e-05, "loss": 0.1829, "step": 8113 }, { "epoch": 2.480586976459798, "grad_norm": 0.6669566035270691, "learning_rate": 2.989045051165556e-05, "loss": 0.2222, "step": 8114 }, { "epoch": 2.480892693365943, "grad_norm": 1.186930537223816, "learning_rate": 2.9890025901235617e-05, "loss": 0.2132, "step": 8115 }, { "epoch": 2.481198410272088, "grad_norm": 1.0336806774139404, "learning_rate": 2.988960129081568e-05, "loss": 0.281, "step": 8116 }, { "epoch": 2.481504127178233, "grad_norm": 1.9817763566970825, "learning_rate": 2.9889176680395738e-05, "loss": 0.3374, "step": 8117 }, { "epoch": 2.4818098440843777, "grad_norm": 0.43890684843063354, "learning_rate": 2.98887520699758e-05, "loss": 0.1619, "step": 8118 }, { "epoch": 2.4821155609905228, "grad_norm": 0.41629424691200256, "learning_rate": 2.988832745955586e-05, "loss": 0.0818, "step": 8119 }, { "epoch": 2.482421277896668, "grad_norm": 0.31545135378837585, "learning_rate": 2.988790284913592e-05, "loss": 0.1025, "step": 8120 }, { "epoch": 2.4827269948028126, "grad_norm": 0.40616732835769653, "learning_rate": 2.988747823871598e-05, "loss": 0.0897, "step": 8121 }, { "epoch": 2.4830327117089577, "grad_norm": 0.2437174767255783, "learning_rate": 2.9887053628296042e-05, "loss": 0.0681, "step": 8122 }, { "epoch": 2.4833384286151023, "grad_norm": 0.6808393597602844, "learning_rate": 2.98866290178761e-05, "loss": 0.0815, "step": 8123 }, { "epoch": 2.4836441455212475, "grad_norm": 0.1995892971754074, "learning_rate": 2.9886204407456163e-05, "loss": 0.0502, "step": 8124 }, { "epoch": 2.483949862427392, "grad_norm": 0.2971232235431671, "learning_rate": 2.988577979703622e-05, "loss": 0.0856, "step": 8125 }, { "epoch": 2.4842555793335372, "grad_norm": 0.23098136484622955, "learning_rate": 2.988535518661628e-05, "loss": 0.0826, "step": 8126 }, { "epoch": 2.484561296239682, "grad_norm": 0.28700312972068787, "learning_rate": 2.9884930576196342e-05, "loss": 0.0596, "step": 8127 }, { "epoch": 2.484867013145827, "grad_norm": 0.5390397310256958, "learning_rate": 2.98845059657764e-05, "loss": 0.1076, "step": 8128 }, { "epoch": 2.4851727300519717, "grad_norm": 0.28998181223869324, "learning_rate": 2.9884081355356463e-05, "loss": 0.0731, "step": 8129 }, { "epoch": 2.485478446958117, "grad_norm": 0.3749183118343353, "learning_rate": 2.988365674493652e-05, "loss": 0.1019, "step": 8130 }, { "epoch": 2.4857841638642615, "grad_norm": 0.39201775193214417, "learning_rate": 2.9883232134516584e-05, "loss": 0.1439, "step": 8131 }, { "epoch": 2.4860898807704066, "grad_norm": 0.6529495120048523, "learning_rate": 2.9882807524096642e-05, "loss": 0.1664, "step": 8132 }, { "epoch": 2.4863955976765517, "grad_norm": 0.43685534596443176, "learning_rate": 2.9882382913676705e-05, "loss": 0.1385, "step": 8133 }, { "epoch": 2.4867013145826964, "grad_norm": 0.8120418190956116, "learning_rate": 2.9881958303256763e-05, "loss": 0.1824, "step": 8134 }, { "epoch": 2.4870070314888415, "grad_norm": 1.0740480422973633, "learning_rate": 2.9881533692836825e-05, "loss": 0.1994, "step": 8135 }, { "epoch": 2.487312748394986, "grad_norm": 0.720079243183136, "learning_rate": 2.9881109082416884e-05, "loss": 0.1883, "step": 8136 }, { "epoch": 2.4876184653011313, "grad_norm": 0.9188902974128723, "learning_rate": 2.9880684471996946e-05, "loss": 0.2119, "step": 8137 }, { "epoch": 2.487924182207276, "grad_norm": 1.1299188137054443, "learning_rate": 2.9880259861577005e-05, "loss": 0.2072, "step": 8138 }, { "epoch": 2.488229899113421, "grad_norm": 1.0614205598831177, "learning_rate": 2.9879835251157063e-05, "loss": 0.202, "step": 8139 }, { "epoch": 2.4885356160195657, "grad_norm": 1.0254994630813599, "learning_rate": 2.9879410640737126e-05, "loss": 0.2368, "step": 8140 }, { "epoch": 2.488841332925711, "grad_norm": 1.403084635734558, "learning_rate": 2.9878986030317184e-05, "loss": 0.224, "step": 8141 }, { "epoch": 2.4891470498318555, "grad_norm": 1.6502310037612915, "learning_rate": 2.9878561419897246e-05, "loss": 0.3376, "step": 8142 }, { "epoch": 2.4894527667380006, "grad_norm": 0.4400444030761719, "learning_rate": 2.9878136809477305e-05, "loss": 0.1706, "step": 8143 }, { "epoch": 2.4897584836441453, "grad_norm": 0.3651316165924072, "learning_rate": 2.9877712199057367e-05, "loss": 0.133, "step": 8144 }, { "epoch": 2.4900642005502904, "grad_norm": 0.5037561058998108, "learning_rate": 2.9877287588637426e-05, "loss": 0.0666, "step": 8145 }, { "epoch": 2.4903699174564355, "grad_norm": 0.30046266317367554, "learning_rate": 2.9876862978217488e-05, "loss": 0.0916, "step": 8146 }, { "epoch": 2.49067563436258, "grad_norm": 0.23663875460624695, "learning_rate": 2.9876438367797547e-05, "loss": 0.054, "step": 8147 }, { "epoch": 2.4909813512687253, "grad_norm": 0.38444381952285767, "learning_rate": 2.987601375737761e-05, "loss": 0.0898, "step": 8148 }, { "epoch": 2.49128706817487, "grad_norm": 0.25683626532554626, "learning_rate": 2.9875589146957667e-05, "loss": 0.0534, "step": 8149 }, { "epoch": 2.491592785081015, "grad_norm": 0.3372156620025635, "learning_rate": 2.987516453653773e-05, "loss": 0.0915, "step": 8150 }, { "epoch": 2.49189850198716, "grad_norm": 0.4763500690460205, "learning_rate": 2.9874739926117788e-05, "loss": 0.0639, "step": 8151 }, { "epoch": 2.492204218893305, "grad_norm": 0.4594251811504364, "learning_rate": 2.9874315315697847e-05, "loss": 0.0757, "step": 8152 }, { "epoch": 2.4925099357994496, "grad_norm": 0.48200979828834534, "learning_rate": 2.987389070527791e-05, "loss": 0.1168, "step": 8153 }, { "epoch": 2.4928156527055947, "grad_norm": 0.4657181203365326, "learning_rate": 2.9873466094857968e-05, "loss": 0.0915, "step": 8154 }, { "epoch": 2.4931213696117394, "grad_norm": 0.7003732919692993, "learning_rate": 2.987304148443803e-05, "loss": 0.132, "step": 8155 }, { "epoch": 2.4934270865178845, "grad_norm": 0.44913017749786377, "learning_rate": 2.987261687401809e-05, "loss": 0.0987, "step": 8156 }, { "epoch": 2.493732803424029, "grad_norm": 1.2006720304489136, "learning_rate": 2.987219226359815e-05, "loss": 0.1704, "step": 8157 }, { "epoch": 2.4940385203301743, "grad_norm": 0.8639605641365051, "learning_rate": 2.987176765317821e-05, "loss": 0.1835, "step": 8158 }, { "epoch": 2.4943442372363194, "grad_norm": 0.5814498066902161, "learning_rate": 2.987134304275827e-05, "loss": 0.1824, "step": 8159 }, { "epoch": 2.494649954142464, "grad_norm": 0.7426087856292725, "learning_rate": 2.987091843233833e-05, "loss": 0.1883, "step": 8160 }, { "epoch": 2.494955671048609, "grad_norm": 3.1375324726104736, "learning_rate": 2.9870493821918392e-05, "loss": 0.1914, "step": 8161 }, { "epoch": 2.495261387954754, "grad_norm": 0.8503732085227966, "learning_rate": 2.987006921149845e-05, "loss": 0.2125, "step": 8162 }, { "epoch": 2.495567104860899, "grad_norm": 1.084751844406128, "learning_rate": 2.986964460107851e-05, "loss": 0.1936, "step": 8163 }, { "epoch": 2.4958728217670436, "grad_norm": 0.8584094047546387, "learning_rate": 2.986921999065857e-05, "loss": 0.2224, "step": 8164 }, { "epoch": 2.4961785386731887, "grad_norm": 0.9529433250427246, "learning_rate": 2.986879538023863e-05, "loss": 0.2386, "step": 8165 }, { "epoch": 2.4964842555793334, "grad_norm": 1.5423064231872559, "learning_rate": 2.9868370769818692e-05, "loss": 0.233, "step": 8166 }, { "epoch": 2.4967899724854785, "grad_norm": 1.4095007181167603, "learning_rate": 2.986794615939875e-05, "loss": 0.2821, "step": 8167 }, { "epoch": 2.497095689391623, "grad_norm": 0.4204067885875702, "learning_rate": 2.9867521548978813e-05, "loss": 0.1469, "step": 8168 }, { "epoch": 2.4974014062977683, "grad_norm": 0.3563932478427887, "learning_rate": 2.9867096938558872e-05, "loss": 0.0908, "step": 8169 }, { "epoch": 2.497707123203913, "grad_norm": 0.4515950381755829, "learning_rate": 2.9866672328138934e-05, "loss": 0.0618, "step": 8170 }, { "epoch": 2.498012840110058, "grad_norm": 0.17495939135551453, "learning_rate": 2.9866247717718993e-05, "loss": 0.0573, "step": 8171 }, { "epoch": 2.498318557016203, "grad_norm": 0.3572370409965515, "learning_rate": 2.9865823107299055e-05, "loss": 0.0682, "step": 8172 }, { "epoch": 2.498624273922348, "grad_norm": 0.3583231270313263, "learning_rate": 2.9865398496879113e-05, "loss": 0.0645, "step": 8173 }, { "epoch": 2.498929990828493, "grad_norm": 0.34016773104667664, "learning_rate": 2.9864973886459176e-05, "loss": 0.0496, "step": 8174 }, { "epoch": 2.4992357077346377, "grad_norm": 1.0242594480514526, "learning_rate": 2.9864549276039234e-05, "loss": 0.0844, "step": 8175 }, { "epoch": 2.4995414246407828, "grad_norm": 0.2601062059402466, "learning_rate": 2.9864124665619293e-05, "loss": 0.0792, "step": 8176 }, { "epoch": 2.4998471415469274, "grad_norm": 0.3201124966144562, "learning_rate": 2.9863700055199355e-05, "loss": 0.0733, "step": 8177 }, { "epoch": 2.5001528584530726, "grad_norm": 0.28455668687820435, "learning_rate": 2.9863275444779414e-05, "loss": 0.0732, "step": 8178 }, { "epoch": 2.5004585753592172, "grad_norm": 0.3280475437641144, "learning_rate": 2.9862850834359476e-05, "loss": 0.0935, "step": 8179 }, { "epoch": 2.5007642922653623, "grad_norm": 0.355910062789917, "learning_rate": 2.9862426223939535e-05, "loss": 0.117, "step": 8180 }, { "epoch": 2.5010700091715075, "grad_norm": 0.5658973455429077, "learning_rate": 2.9862001613519597e-05, "loss": 0.1483, "step": 8181 }, { "epoch": 2.501375726077652, "grad_norm": 0.5414937138557434, "learning_rate": 2.9861577003099655e-05, "loss": 0.1659, "step": 8182 }, { "epoch": 2.501681442983797, "grad_norm": 1.120459794998169, "learning_rate": 2.9861152392679717e-05, "loss": 0.1349, "step": 8183 }, { "epoch": 2.501987159889942, "grad_norm": 0.49005326628685, "learning_rate": 2.9860727782259776e-05, "loss": 0.1994, "step": 8184 }, { "epoch": 2.502292876796087, "grad_norm": 0.5188384056091309, "learning_rate": 2.9860303171839838e-05, "loss": 0.192, "step": 8185 }, { "epoch": 2.5025985937022317, "grad_norm": 0.5886049270629883, "learning_rate": 2.9859878561419897e-05, "loss": 0.1939, "step": 8186 }, { "epoch": 2.5029043106083764, "grad_norm": 1.052263855934143, "learning_rate": 2.985945395099996e-05, "loss": 0.2098, "step": 8187 }, { "epoch": 2.5032100275145215, "grad_norm": 0.757007896900177, "learning_rate": 2.9859029340580018e-05, "loss": 0.2368, "step": 8188 }, { "epoch": 2.5035157444206666, "grad_norm": 1.0162047147750854, "learning_rate": 2.9858604730160076e-05, "loss": 0.1884, "step": 8189 }, { "epoch": 2.5038214613268113, "grad_norm": 0.9662662744522095, "learning_rate": 2.985818011974014e-05, "loss": 0.2247, "step": 8190 }, { "epoch": 2.5041271782329564, "grad_norm": 2.116555690765381, "learning_rate": 2.9857755509320197e-05, "loss": 0.242, "step": 8191 }, { "epoch": 2.504432895139101, "grad_norm": 3.0055603981018066, "learning_rate": 2.985733089890026e-05, "loss": 0.2611, "step": 8192 }, { "epoch": 2.504738612045246, "grad_norm": 0.3485099673271179, "learning_rate": 2.9856906288480318e-05, "loss": 0.129, "step": 8193 }, { "epoch": 2.5050443289513913, "grad_norm": 0.2692350149154663, "learning_rate": 2.985648167806038e-05, "loss": 0.1097, "step": 8194 }, { "epoch": 2.505350045857536, "grad_norm": 0.2597716152667999, "learning_rate": 2.985605706764044e-05, "loss": 0.0833, "step": 8195 }, { "epoch": 2.5056557627636806, "grad_norm": 0.5010417103767395, "learning_rate": 2.98556324572205e-05, "loss": 0.0626, "step": 8196 }, { "epoch": 2.5059614796698257, "grad_norm": 0.281341016292572, "learning_rate": 2.985520784680056e-05, "loss": 0.0542, "step": 8197 }, { "epoch": 2.506267196575971, "grad_norm": 0.21585173904895782, "learning_rate": 2.985478323638062e-05, "loss": 0.056, "step": 8198 }, { "epoch": 2.5065729134821155, "grad_norm": 0.26932182908058167, "learning_rate": 2.985435862596068e-05, "loss": 0.0591, "step": 8199 }, { "epoch": 2.50687863038826, "grad_norm": 0.5158606171607971, "learning_rate": 2.9853934015540742e-05, "loss": 0.0912, "step": 8200 }, { "epoch": 2.5071843472944053, "grad_norm": 0.3262871503829956, "learning_rate": 2.98535094051208e-05, "loss": 0.1137, "step": 8201 }, { "epoch": 2.5074900642005504, "grad_norm": 0.35113435983657837, "learning_rate": 2.985308479470086e-05, "loss": 0.0749, "step": 8202 }, { "epoch": 2.507795781106695, "grad_norm": 0.29659560322761536, "learning_rate": 2.9852660184280922e-05, "loss": 0.1002, "step": 8203 }, { "epoch": 2.50810149801284, "grad_norm": 0.36187613010406494, "learning_rate": 2.985223557386098e-05, "loss": 0.1071, "step": 8204 }, { "epoch": 2.508407214918985, "grad_norm": 0.47664764523506165, "learning_rate": 2.9851810963441043e-05, "loss": 0.1024, "step": 8205 }, { "epoch": 2.50871293182513, "grad_norm": 0.9715594053268433, "learning_rate": 2.98513863530211e-05, "loss": 0.1443, "step": 8206 }, { "epoch": 2.509018648731275, "grad_norm": 0.505963146686554, "learning_rate": 2.9850961742601163e-05, "loss": 0.1287, "step": 8207 }, { "epoch": 2.50932436563742, "grad_norm": 0.7940978407859802, "learning_rate": 2.9850537132181222e-05, "loss": 0.163, "step": 8208 }, { "epoch": 2.5096300825435645, "grad_norm": 0.6197808980941772, "learning_rate": 2.9850112521761284e-05, "loss": 0.1457, "step": 8209 }, { "epoch": 2.5099357994497096, "grad_norm": 0.5839415788650513, "learning_rate": 2.9849687911341343e-05, "loss": 0.2409, "step": 8210 }, { "epoch": 2.5102415163558547, "grad_norm": 0.9139887690544128, "learning_rate": 2.9849263300921405e-05, "loss": 0.1731, "step": 8211 }, { "epoch": 2.5105472332619994, "grad_norm": 0.860094428062439, "learning_rate": 2.9848838690501464e-05, "loss": 0.2048, "step": 8212 }, { "epoch": 2.510852950168144, "grad_norm": 1.169443130493164, "learning_rate": 2.9848414080081526e-05, "loss": 0.2053, "step": 8213 }, { "epoch": 2.511158667074289, "grad_norm": 0.7011336088180542, "learning_rate": 2.9847989469661588e-05, "loss": 0.2273, "step": 8214 }, { "epoch": 2.5114643839804343, "grad_norm": 0.5953391790390015, "learning_rate": 2.9847564859241647e-05, "loss": 0.197, "step": 8215 }, { "epoch": 2.511770100886579, "grad_norm": 1.177841305732727, "learning_rate": 2.984714024882171e-05, "loss": 0.205, "step": 8216 }, { "epoch": 2.512075817792724, "grad_norm": 1.6545377969741821, "learning_rate": 2.9846715638401767e-05, "loss": 0.3035, "step": 8217 }, { "epoch": 2.5123815346988687, "grad_norm": 0.3953278064727783, "learning_rate": 2.984629102798183e-05, "loss": 0.1769, "step": 8218 }, { "epoch": 2.512687251605014, "grad_norm": 0.2814268469810486, "learning_rate": 2.9845866417561888e-05, "loss": 0.0959, "step": 8219 }, { "epoch": 2.512992968511159, "grad_norm": 0.2814735174179077, "learning_rate": 2.984544180714195e-05, "loss": 0.0965, "step": 8220 }, { "epoch": 2.5132986854173036, "grad_norm": 0.2844310998916626, "learning_rate": 2.984501719672201e-05, "loss": 0.0909, "step": 8221 }, { "epoch": 2.5136044023234483, "grad_norm": 0.23908013105392456, "learning_rate": 2.984459258630207e-05, "loss": 0.0706, "step": 8222 }, { "epoch": 2.5139101192295934, "grad_norm": 0.24714802205562592, "learning_rate": 2.984416797588213e-05, "loss": 0.0518, "step": 8223 }, { "epoch": 2.5142158361357385, "grad_norm": 0.29408925771713257, "learning_rate": 2.9843743365462192e-05, "loss": 0.0658, "step": 8224 }, { "epoch": 2.514521553041883, "grad_norm": 0.20400071144104004, "learning_rate": 2.984331875504225e-05, "loss": 0.0626, "step": 8225 }, { "epoch": 2.514827269948028, "grad_norm": 0.29126647114753723, "learning_rate": 2.9842894144622313e-05, "loss": 0.0842, "step": 8226 }, { "epoch": 2.515132986854173, "grad_norm": 0.33118027448654175, "learning_rate": 2.984246953420237e-05, "loss": 0.0969, "step": 8227 }, { "epoch": 2.515438703760318, "grad_norm": 0.4015538990497589, "learning_rate": 2.984204492378243e-05, "loss": 0.0924, "step": 8228 }, { "epoch": 2.5157444206664628, "grad_norm": 0.5148810744285583, "learning_rate": 2.9841620313362492e-05, "loss": 0.08, "step": 8229 }, { "epoch": 2.516050137572608, "grad_norm": 0.2989797592163086, "learning_rate": 2.984119570294255e-05, "loss": 0.1304, "step": 8230 }, { "epoch": 2.5163558544787525, "grad_norm": 0.4284929931163788, "learning_rate": 2.9840771092522613e-05, "loss": 0.1368, "step": 8231 }, { "epoch": 2.5166615713848977, "grad_norm": 1.0821263790130615, "learning_rate": 2.984034648210267e-05, "loss": 0.1371, "step": 8232 }, { "epoch": 2.5169672882910428, "grad_norm": 0.7700421214103699, "learning_rate": 2.9839921871682734e-05, "loss": 0.1353, "step": 8233 }, { "epoch": 2.5172730051971874, "grad_norm": 0.4994412660598755, "learning_rate": 2.9839497261262792e-05, "loss": 0.177, "step": 8234 }, { "epoch": 2.517578722103332, "grad_norm": 0.5895884037017822, "learning_rate": 2.9839072650842855e-05, "loss": 0.2035, "step": 8235 }, { "epoch": 2.5178844390094772, "grad_norm": 0.7206140756607056, "learning_rate": 2.9838648040422913e-05, "loss": 0.2005, "step": 8236 }, { "epoch": 2.5181901559156223, "grad_norm": 1.0324969291687012, "learning_rate": 2.9838223430002975e-05, "loss": 0.1984, "step": 8237 }, { "epoch": 2.518495872821767, "grad_norm": 1.0590180158615112, "learning_rate": 2.9837798819583034e-05, "loss": 0.2149, "step": 8238 }, { "epoch": 2.5188015897279117, "grad_norm": 1.1967159509658813, "learning_rate": 2.9837374209163096e-05, "loss": 0.2277, "step": 8239 }, { "epoch": 2.519107306634057, "grad_norm": 1.0025639533996582, "learning_rate": 2.9836949598743155e-05, "loss": 0.2046, "step": 8240 }, { "epoch": 2.519413023540202, "grad_norm": 1.696621060371399, "learning_rate": 2.9836524988323214e-05, "loss": 0.2465, "step": 8241 }, { "epoch": 2.5197187404463466, "grad_norm": 2.018003225326538, "learning_rate": 2.9836100377903276e-05, "loss": 0.3398, "step": 8242 }, { "epoch": 2.5200244573524917, "grad_norm": 0.4338679313659668, "learning_rate": 2.9835675767483334e-05, "loss": 0.1472, "step": 8243 }, { "epoch": 2.5203301742586364, "grad_norm": 0.3432902693748474, "learning_rate": 2.9835251157063396e-05, "loss": 0.0754, "step": 8244 }, { "epoch": 2.5206358911647815, "grad_norm": 0.26066339015960693, "learning_rate": 2.9834826546643455e-05, "loss": 0.0751, "step": 8245 }, { "epoch": 2.5209416080709266, "grad_norm": 0.2564464509487152, "learning_rate": 2.9834401936223517e-05, "loss": 0.0668, "step": 8246 }, { "epoch": 2.5212473249770713, "grad_norm": 0.717294454574585, "learning_rate": 2.9833977325803576e-05, "loss": 0.0943, "step": 8247 }, { "epoch": 2.521553041883216, "grad_norm": 0.2764582931995392, "learning_rate": 2.9833552715383638e-05, "loss": 0.0636, "step": 8248 }, { "epoch": 2.521858758789361, "grad_norm": 1.7376078367233276, "learning_rate": 2.9833128104963697e-05, "loss": 0.0543, "step": 8249 }, { "epoch": 2.522164475695506, "grad_norm": 0.409488320350647, "learning_rate": 2.983270349454376e-05, "loss": 0.0748, "step": 8250 }, { "epoch": 2.522470192601651, "grad_norm": 0.4496593177318573, "learning_rate": 2.9832278884123817e-05, "loss": 0.0856, "step": 8251 }, { "epoch": 2.5227759095077955, "grad_norm": 0.7741529941558838, "learning_rate": 2.983185427370388e-05, "loss": 0.089, "step": 8252 }, { "epoch": 2.5230816264139406, "grad_norm": 0.3832903504371643, "learning_rate": 2.9831429663283938e-05, "loss": 0.1112, "step": 8253 }, { "epoch": 2.5233873433200857, "grad_norm": 0.5615965127944946, "learning_rate": 2.9831005052863997e-05, "loss": 0.103, "step": 8254 }, { "epoch": 2.5236930602262304, "grad_norm": 0.5559606552124023, "learning_rate": 2.983058044244406e-05, "loss": 0.102, "step": 8255 }, { "epoch": 2.5239987771323755, "grad_norm": 0.3977636396884918, "learning_rate": 2.9830155832024118e-05, "loss": 0.1588, "step": 8256 }, { "epoch": 2.52430449403852, "grad_norm": 0.6968163251876831, "learning_rate": 2.982973122160418e-05, "loss": 0.1418, "step": 8257 }, { "epoch": 2.5246102109446653, "grad_norm": 0.4829663932323456, "learning_rate": 2.982930661118424e-05, "loss": 0.156, "step": 8258 }, { "epoch": 2.5249159278508104, "grad_norm": 0.6877796053886414, "learning_rate": 2.98288820007643e-05, "loss": 0.1725, "step": 8259 }, { "epoch": 2.525221644756955, "grad_norm": 0.553942859172821, "learning_rate": 2.982845739034436e-05, "loss": 0.1949, "step": 8260 }, { "epoch": 2.5255273616630998, "grad_norm": 1.058703064918518, "learning_rate": 2.982803277992442e-05, "loss": 0.2025, "step": 8261 }, { "epoch": 2.525833078569245, "grad_norm": 1.2359654903411865, "learning_rate": 2.982760816950448e-05, "loss": 0.1939, "step": 8262 }, { "epoch": 2.52613879547539, "grad_norm": 0.6291717290878296, "learning_rate": 2.9827183559084542e-05, "loss": 0.1839, "step": 8263 }, { "epoch": 2.5264445123815347, "grad_norm": 0.7424638867378235, "learning_rate": 2.98267589486646e-05, "loss": 0.2117, "step": 8264 }, { "epoch": 2.5267502292876793, "grad_norm": 0.9145022034645081, "learning_rate": 2.9826334338244663e-05, "loss": 0.2412, "step": 8265 }, { "epoch": 2.5270559461938245, "grad_norm": 0.8807359337806702, "learning_rate": 2.982590972782472e-05, "loss": 0.2038, "step": 8266 }, { "epoch": 2.5273616630999696, "grad_norm": 1.3597893714904785, "learning_rate": 2.982548511740478e-05, "loss": 0.283, "step": 8267 }, { "epoch": 2.5276673800061142, "grad_norm": 0.35436996817588806, "learning_rate": 2.9825060506984842e-05, "loss": 0.1464, "step": 8268 }, { "epoch": 2.5279730969122594, "grad_norm": 0.2724387049674988, "learning_rate": 2.98246358965649e-05, "loss": 0.0922, "step": 8269 }, { "epoch": 2.528278813818404, "grad_norm": 0.4919244945049286, "learning_rate": 2.9824211286144963e-05, "loss": 0.0968, "step": 8270 }, { "epoch": 2.528584530724549, "grad_norm": 0.3246879279613495, "learning_rate": 2.9823786675725022e-05, "loss": 0.0685, "step": 8271 }, { "epoch": 2.5288902476306943, "grad_norm": 0.2692434787750244, "learning_rate": 2.9823362065305084e-05, "loss": 0.0619, "step": 8272 }, { "epoch": 2.529195964536839, "grad_norm": 0.42069604992866516, "learning_rate": 2.9822937454885143e-05, "loss": 0.0625, "step": 8273 }, { "epoch": 2.5295016814429836, "grad_norm": 0.45744431018829346, "learning_rate": 2.9822512844465205e-05, "loss": 0.0787, "step": 8274 }, { "epoch": 2.5298073983491287, "grad_norm": 0.2663906514644623, "learning_rate": 2.9822088234045264e-05, "loss": 0.0644, "step": 8275 }, { "epoch": 2.530113115255274, "grad_norm": 0.36570125818252563, "learning_rate": 2.9821663623625326e-05, "loss": 0.0781, "step": 8276 }, { "epoch": 2.5304188321614185, "grad_norm": 0.5177819728851318, "learning_rate": 2.9821239013205384e-05, "loss": 0.0638, "step": 8277 }, { "epoch": 2.530724549067563, "grad_norm": 0.4565875828266144, "learning_rate": 2.9820814402785443e-05, "loss": 0.116, "step": 8278 }, { "epoch": 2.5310302659737083, "grad_norm": 0.3411373496055603, "learning_rate": 2.9820389792365505e-05, "loss": 0.0716, "step": 8279 }, { "epoch": 2.5313359828798534, "grad_norm": 0.3001583516597748, "learning_rate": 2.9819965181945564e-05, "loss": 0.0969, "step": 8280 }, { "epoch": 2.531641699785998, "grad_norm": 0.4873327314853668, "learning_rate": 2.9819540571525626e-05, "loss": 0.1275, "step": 8281 }, { "epoch": 2.531947416692143, "grad_norm": 0.40379413962364197, "learning_rate": 2.9819115961105685e-05, "loss": 0.1185, "step": 8282 }, { "epoch": 2.532253133598288, "grad_norm": 0.6715131998062134, "learning_rate": 2.9818691350685747e-05, "loss": 0.1543, "step": 8283 }, { "epoch": 2.532558850504433, "grad_norm": 0.6229904294013977, "learning_rate": 2.9818266740265805e-05, "loss": 0.1561, "step": 8284 }, { "epoch": 2.5328645674105776, "grad_norm": 0.6395179629325867, "learning_rate": 2.9817842129845867e-05, "loss": 0.1705, "step": 8285 }, { "epoch": 2.5331702843167228, "grad_norm": 0.606717586517334, "learning_rate": 2.9817417519425926e-05, "loss": 0.2054, "step": 8286 }, { "epoch": 2.5334760012228674, "grad_norm": 0.7289931774139404, "learning_rate": 2.9816992909005988e-05, "loss": 0.2256, "step": 8287 }, { "epoch": 2.5337817181290125, "grad_norm": 0.842157781124115, "learning_rate": 2.9816568298586047e-05, "loss": 0.194, "step": 8288 }, { "epoch": 2.5340874350351577, "grad_norm": 0.8633816242218018, "learning_rate": 2.981614368816611e-05, "loss": 0.2193, "step": 8289 }, { "epoch": 2.5343931519413023, "grad_norm": 1.0040735006332397, "learning_rate": 2.9815719077746168e-05, "loss": 0.2585, "step": 8290 }, { "epoch": 2.534698868847447, "grad_norm": 1.171146035194397, "learning_rate": 2.9815294467326226e-05, "loss": 0.2162, "step": 8291 }, { "epoch": 2.535004585753592, "grad_norm": 1.4983220100402832, "learning_rate": 2.981486985690629e-05, "loss": 0.3284, "step": 8292 }, { "epoch": 2.5353103026597372, "grad_norm": 0.3469662070274353, "learning_rate": 2.9814445246486347e-05, "loss": 0.1357, "step": 8293 }, { "epoch": 2.535616019565882, "grad_norm": 0.2670593559741974, "learning_rate": 2.981402063606641e-05, "loss": 0.0755, "step": 8294 }, { "epoch": 2.535921736472027, "grad_norm": 0.3231266438961029, "learning_rate": 2.9813596025646468e-05, "loss": 0.0637, "step": 8295 }, { "epoch": 2.5362274533781717, "grad_norm": 0.5060835480690002, "learning_rate": 2.981317141522653e-05, "loss": 0.0768, "step": 8296 }, { "epoch": 2.536533170284317, "grad_norm": 0.4575967788696289, "learning_rate": 2.981274680480659e-05, "loss": 0.0817, "step": 8297 }, { "epoch": 2.5368388871904615, "grad_norm": 0.7774275541305542, "learning_rate": 2.981232219438665e-05, "loss": 0.0874, "step": 8298 }, { "epoch": 2.5371446040966066, "grad_norm": 0.3194688558578491, "learning_rate": 2.981189758396671e-05, "loss": 0.0721, "step": 8299 }, { "epoch": 2.5374503210027513, "grad_norm": 0.27617400884628296, "learning_rate": 2.981147297354677e-05, "loss": 0.0533, "step": 8300 }, { "epoch": 2.5377560379088964, "grad_norm": 0.7751598954200745, "learning_rate": 2.981104836312683e-05, "loss": 0.118, "step": 8301 }, { "epoch": 2.5380617548150415, "grad_norm": 0.4135259687900543, "learning_rate": 2.9810623752706892e-05, "loss": 0.0866, "step": 8302 }, { "epoch": 2.538367471721186, "grad_norm": 0.5565584897994995, "learning_rate": 2.981019914228695e-05, "loss": 0.1027, "step": 8303 }, { "epoch": 2.538673188627331, "grad_norm": 0.43647587299346924, "learning_rate": 2.980977453186701e-05, "loss": 0.0775, "step": 8304 }, { "epoch": 2.538978905533476, "grad_norm": 0.5247365236282349, "learning_rate": 2.9809349921447072e-05, "loss": 0.1578, "step": 8305 }, { "epoch": 2.539284622439621, "grad_norm": 0.7619134783744812, "learning_rate": 2.980892531102713e-05, "loss": 0.1266, "step": 8306 }, { "epoch": 2.5395903393457657, "grad_norm": 0.9530573487281799, "learning_rate": 2.9808500700607193e-05, "loss": 0.1452, "step": 8307 }, { "epoch": 2.539896056251911, "grad_norm": 9.588468551635742, "learning_rate": 2.980807609018725e-05, "loss": 0.1733, "step": 8308 }, { "epoch": 2.5402017731580555, "grad_norm": 0.5728113055229187, "learning_rate": 2.9807651479767314e-05, "loss": 0.1846, "step": 8309 }, { "epoch": 2.5405074900642006, "grad_norm": 0.6969230771064758, "learning_rate": 2.9807226869347372e-05, "loss": 0.1795, "step": 8310 }, { "epoch": 2.5408132069703453, "grad_norm": 1.2693716287612915, "learning_rate": 2.9806802258927434e-05, "loss": 0.2091, "step": 8311 }, { "epoch": 2.5411189238764904, "grad_norm": 0.8405194282531738, "learning_rate": 2.9806377648507493e-05, "loss": 0.2115, "step": 8312 }, { "epoch": 2.541424640782635, "grad_norm": 1.0496915578842163, "learning_rate": 2.9805953038087555e-05, "loss": 0.2491, "step": 8313 }, { "epoch": 2.54173035768878, "grad_norm": 1.1696054935455322, "learning_rate": 2.9805528427667614e-05, "loss": 0.208, "step": 8314 }, { "epoch": 2.5420360745949253, "grad_norm": 1.437209963798523, "learning_rate": 2.9805103817247676e-05, "loss": 0.2285, "step": 8315 }, { "epoch": 2.54234179150107, "grad_norm": 0.8782382011413574, "learning_rate": 2.9804679206827735e-05, "loss": 0.2246, "step": 8316 }, { "epoch": 2.5426475084072147, "grad_norm": 1.3908655643463135, "learning_rate": 2.9804254596407797e-05, "loss": 0.2501, "step": 8317 }, { "epoch": 2.5429532253133598, "grad_norm": 0.4094943106174469, "learning_rate": 2.980382998598786e-05, "loss": 0.1618, "step": 8318 }, { "epoch": 2.543258942219505, "grad_norm": 0.29834458231925964, "learning_rate": 2.9803405375567917e-05, "loss": 0.0835, "step": 8319 }, { "epoch": 2.5435646591256496, "grad_norm": 0.33155274391174316, "learning_rate": 2.980298076514798e-05, "loss": 0.0981, "step": 8320 }, { "epoch": 2.5438703760317947, "grad_norm": 0.6615211367607117, "learning_rate": 2.9802556154728038e-05, "loss": 0.1003, "step": 8321 }, { "epoch": 2.5441760929379393, "grad_norm": 0.18501843512058258, "learning_rate": 2.98021315443081e-05, "loss": 0.077, "step": 8322 }, { "epoch": 2.5444818098440845, "grad_norm": 0.27717822790145874, "learning_rate": 2.980170693388816e-05, "loss": 0.0592, "step": 8323 }, { "epoch": 2.544787526750229, "grad_norm": 0.33657944202423096, "learning_rate": 2.980128232346822e-05, "loss": 0.0632, "step": 8324 }, { "epoch": 2.5450932436563742, "grad_norm": 0.21840722858905792, "learning_rate": 2.980085771304828e-05, "loss": 0.0646, "step": 8325 }, { "epoch": 2.545398960562519, "grad_norm": 0.4799385666847229, "learning_rate": 2.9800433102628342e-05, "loss": 0.0849, "step": 8326 }, { "epoch": 2.545704677468664, "grad_norm": 0.5146706700325012, "learning_rate": 2.98000084922084e-05, "loss": 0.1097, "step": 8327 }, { "epoch": 2.546010394374809, "grad_norm": 0.4751254618167877, "learning_rate": 2.9799583881788463e-05, "loss": 0.0723, "step": 8328 }, { "epoch": 2.546316111280954, "grad_norm": 0.3808456063270569, "learning_rate": 2.979915927136852e-05, "loss": 0.1282, "step": 8329 }, { "epoch": 2.5466218281870985, "grad_norm": 0.5662547945976257, "learning_rate": 2.979873466094858e-05, "loss": 0.119, "step": 8330 }, { "epoch": 2.5469275450932436, "grad_norm": 0.5493873357772827, "learning_rate": 2.9798310050528642e-05, "loss": 0.1303, "step": 8331 }, { "epoch": 2.5472332619993887, "grad_norm": 0.49144700169563293, "learning_rate": 2.97978854401087e-05, "loss": 0.1188, "step": 8332 }, { "epoch": 2.5475389789055334, "grad_norm": 0.5319558382034302, "learning_rate": 2.9797460829688763e-05, "loss": 0.1323, "step": 8333 }, { "epoch": 2.5478446958116785, "grad_norm": 0.5926159024238586, "learning_rate": 2.979703621926882e-05, "loss": 0.2163, "step": 8334 }, { "epoch": 2.548150412717823, "grad_norm": 0.5070227980613708, "learning_rate": 2.9796611608848884e-05, "loss": 0.185, "step": 8335 }, { "epoch": 2.5484561296239683, "grad_norm": 0.8654133081436157, "learning_rate": 2.9796186998428942e-05, "loss": 0.2072, "step": 8336 }, { "epoch": 2.548761846530113, "grad_norm": 0.7669133543968201, "learning_rate": 2.9795762388009005e-05, "loss": 0.1936, "step": 8337 }, { "epoch": 2.549067563436258, "grad_norm": 1.138536810874939, "learning_rate": 2.9795337777589063e-05, "loss": 0.2076, "step": 8338 }, { "epoch": 2.5493732803424027, "grad_norm": 0.8293712735176086, "learning_rate": 2.9794913167169125e-05, "loss": 0.2387, "step": 8339 }, { "epoch": 2.549678997248548, "grad_norm": 0.8053146600723267, "learning_rate": 2.9794488556749184e-05, "loss": 0.2086, "step": 8340 }, { "epoch": 2.549984714154693, "grad_norm": 1.8234024047851562, "learning_rate": 2.9794063946329246e-05, "loss": 0.2367, "step": 8341 }, { "epoch": 2.5502904310608376, "grad_norm": 1.3816311359405518, "learning_rate": 2.9793639335909305e-05, "loss": 0.2434, "step": 8342 }, { "epoch": 2.5505961479669823, "grad_norm": 0.4278380870819092, "learning_rate": 2.9793214725489364e-05, "loss": 0.1639, "step": 8343 }, { "epoch": 2.5509018648731274, "grad_norm": 0.4531414806842804, "learning_rate": 2.9792790115069426e-05, "loss": 0.1039, "step": 8344 }, { "epoch": 2.5512075817792725, "grad_norm": 0.30998945236206055, "learning_rate": 2.9792365504649484e-05, "loss": 0.078, "step": 8345 }, { "epoch": 2.551513298685417, "grad_norm": 0.28420960903167725, "learning_rate": 2.9791940894229546e-05, "loss": 0.0512, "step": 8346 }, { "epoch": 2.5518190155915623, "grad_norm": 0.33427852392196655, "learning_rate": 2.9791516283809605e-05, "loss": 0.0855, "step": 8347 }, { "epoch": 2.552124732497707, "grad_norm": 0.36205968260765076, "learning_rate": 2.9791091673389667e-05, "loss": 0.0769, "step": 8348 }, { "epoch": 2.552430449403852, "grad_norm": 0.21905508637428284, "learning_rate": 2.9790667062969726e-05, "loss": 0.0652, "step": 8349 }, { "epoch": 2.552736166309997, "grad_norm": 0.40163519978523254, "learning_rate": 2.9790242452549788e-05, "loss": 0.0829, "step": 8350 }, { "epoch": 2.553041883216142, "grad_norm": 0.2783992290496826, "learning_rate": 2.9789817842129847e-05, "loss": 0.1066, "step": 8351 }, { "epoch": 2.5533476001222866, "grad_norm": 0.2616511285305023, "learning_rate": 2.978939323170991e-05, "loss": 0.0635, "step": 8352 }, { "epoch": 2.5536533170284317, "grad_norm": 0.4878736734390259, "learning_rate": 2.9788968621289967e-05, "loss": 0.0728, "step": 8353 }, { "epoch": 2.553959033934577, "grad_norm": 0.3684738278388977, "learning_rate": 2.978854401087003e-05, "loss": 0.0987, "step": 8354 }, { "epoch": 2.5542647508407215, "grad_norm": 0.39749687910079956, "learning_rate": 2.9788119400450088e-05, "loss": 0.1203, "step": 8355 }, { "epoch": 2.554570467746866, "grad_norm": 0.33960145711898804, "learning_rate": 2.9787694790030147e-05, "loss": 0.1371, "step": 8356 }, { "epoch": 2.5548761846530113, "grad_norm": 0.7304810285568237, "learning_rate": 2.978727017961021e-05, "loss": 0.1384, "step": 8357 }, { "epoch": 2.5551819015591564, "grad_norm": 0.5567790865898132, "learning_rate": 2.9786845569190268e-05, "loss": 0.1892, "step": 8358 }, { "epoch": 2.555487618465301, "grad_norm": 0.5488064289093018, "learning_rate": 2.978642095877033e-05, "loss": 0.1641, "step": 8359 }, { "epoch": 2.555793335371446, "grad_norm": 0.7058230638504028, "learning_rate": 2.978599634835039e-05, "loss": 0.1753, "step": 8360 }, { "epoch": 2.556099052277591, "grad_norm": 1.0510905981063843, "learning_rate": 2.978557173793045e-05, "loss": 0.1938, "step": 8361 }, { "epoch": 2.556404769183736, "grad_norm": 0.8958444595336914, "learning_rate": 2.978514712751051e-05, "loss": 0.209, "step": 8362 }, { "epoch": 2.5567104860898806, "grad_norm": 0.7192044854164124, "learning_rate": 2.978472251709057e-05, "loss": 0.206, "step": 8363 }, { "epoch": 2.5570162029960257, "grad_norm": 0.8918411135673523, "learning_rate": 2.978429790667063e-05, "loss": 0.216, "step": 8364 }, { "epoch": 2.5573219199021704, "grad_norm": 1.4857861995697021, "learning_rate": 2.9783873296250692e-05, "loss": 0.2179, "step": 8365 }, { "epoch": 2.5576276368083155, "grad_norm": 1.1071372032165527, "learning_rate": 2.978344868583075e-05, "loss": 0.224, "step": 8366 }, { "epoch": 2.5579333537144606, "grad_norm": 1.326225996017456, "learning_rate": 2.9783024075410813e-05, "loss": 0.3075, "step": 8367 }, { "epoch": 2.5582390706206053, "grad_norm": 0.45578625798225403, "learning_rate": 2.978259946499087e-05, "loss": 0.1414, "step": 8368 }, { "epoch": 2.55854478752675, "grad_norm": 0.4510904550552368, "learning_rate": 2.978217485457093e-05, "loss": 0.0779, "step": 8369 }, { "epoch": 2.558850504432895, "grad_norm": 0.43607500195503235, "learning_rate": 2.9781750244150992e-05, "loss": 0.0751, "step": 8370 }, { "epoch": 2.55915622133904, "grad_norm": 0.37792545557022095, "learning_rate": 2.978132563373105e-05, "loss": 0.1118, "step": 8371 }, { "epoch": 2.559461938245185, "grad_norm": 0.36639291048049927, "learning_rate": 2.9780901023311113e-05, "loss": 0.069, "step": 8372 }, { "epoch": 2.55976765515133, "grad_norm": 0.40852612257003784, "learning_rate": 2.9780476412891172e-05, "loss": 0.0476, "step": 8373 }, { "epoch": 2.5600733720574746, "grad_norm": 0.2316218465566635, "learning_rate": 2.9780051802471234e-05, "loss": 0.064, "step": 8374 }, { "epoch": 2.5603790889636198, "grad_norm": 0.32115593552589417, "learning_rate": 2.9779627192051293e-05, "loss": 0.0754, "step": 8375 }, { "epoch": 2.5606848058697644, "grad_norm": 0.22614265978336334, "learning_rate": 2.9779202581631355e-05, "loss": 0.0516, "step": 8376 }, { "epoch": 2.5609905227759096, "grad_norm": 0.38804614543914795, "learning_rate": 2.9778777971211414e-05, "loss": 0.0899, "step": 8377 }, { "epoch": 2.561296239682054, "grad_norm": 0.5522410869598389, "learning_rate": 2.9778353360791476e-05, "loss": 0.109, "step": 8378 }, { "epoch": 2.5616019565881993, "grad_norm": 0.4360395669937134, "learning_rate": 2.9777928750371534e-05, "loss": 0.1125, "step": 8379 }, { "epoch": 2.5619076734943445, "grad_norm": 0.378873735666275, "learning_rate": 2.9777504139951596e-05, "loss": 0.0883, "step": 8380 }, { "epoch": 2.562213390400489, "grad_norm": 1.0197346210479736, "learning_rate": 2.9777079529531655e-05, "loss": 0.1413, "step": 8381 }, { "epoch": 2.562519107306634, "grad_norm": 0.5122418999671936, "learning_rate": 2.9776654919111714e-05, "loss": 0.1464, "step": 8382 }, { "epoch": 2.562824824212779, "grad_norm": 0.8548694849014282, "learning_rate": 2.9776230308691776e-05, "loss": 0.1808, "step": 8383 }, { "epoch": 2.563130541118924, "grad_norm": 0.7412512898445129, "learning_rate": 2.9775805698271835e-05, "loss": 0.1657, "step": 8384 }, { "epoch": 2.5634362580250687, "grad_norm": 0.5393138527870178, "learning_rate": 2.9775381087851897e-05, "loss": 0.1764, "step": 8385 }, { "epoch": 2.563741974931214, "grad_norm": 0.9220337271690369, "learning_rate": 2.9774956477431955e-05, "loss": 0.1934, "step": 8386 }, { "epoch": 2.5640476918373585, "grad_norm": 0.6774190664291382, "learning_rate": 2.9774531867012017e-05, "loss": 0.1856, "step": 8387 }, { "epoch": 2.5643534087435036, "grad_norm": 1.7669955492019653, "learning_rate": 2.9774107256592076e-05, "loss": 0.2104, "step": 8388 }, { "epoch": 2.5646591256496483, "grad_norm": 3.855278968811035, "learning_rate": 2.9773682646172138e-05, "loss": 0.25, "step": 8389 }, { "epoch": 2.5649648425557934, "grad_norm": 0.8770455121994019, "learning_rate": 2.9773258035752197e-05, "loss": 0.2139, "step": 8390 }, { "epoch": 2.565270559461938, "grad_norm": 1.9386475086212158, "learning_rate": 2.977283342533226e-05, "loss": 0.2934, "step": 8391 }, { "epoch": 2.565576276368083, "grad_norm": 5.064486503601074, "learning_rate": 2.9772408814912318e-05, "loss": 0.3019, "step": 8392 }, { "epoch": 2.5658819932742283, "grad_norm": 0.3207942843437195, "learning_rate": 2.9771984204492376e-05, "loss": 0.1614, "step": 8393 }, { "epoch": 2.566187710180373, "grad_norm": 0.40152978897094727, "learning_rate": 2.977155959407244e-05, "loss": 0.0813, "step": 8394 }, { "epoch": 2.5664934270865176, "grad_norm": 0.4153764843940735, "learning_rate": 2.9771134983652497e-05, "loss": 0.1002, "step": 8395 }, { "epoch": 2.5667991439926627, "grad_norm": 0.26214075088500977, "learning_rate": 2.977071037323256e-05, "loss": 0.0763, "step": 8396 }, { "epoch": 2.567104860898808, "grad_norm": 0.4014420509338379, "learning_rate": 2.9770285762812618e-05, "loss": 0.0732, "step": 8397 }, { "epoch": 2.5674105778049525, "grad_norm": 0.2207091897726059, "learning_rate": 2.976986115239268e-05, "loss": 0.055, "step": 8398 }, { "epoch": 2.5677162947110976, "grad_norm": 0.3176036477088928, "learning_rate": 2.976943654197274e-05, "loss": 0.0782, "step": 8399 }, { "epoch": 2.5680220116172423, "grad_norm": 0.23526357114315033, "learning_rate": 2.97690119315528e-05, "loss": 0.0466, "step": 8400 }, { "epoch": 2.5683277285233874, "grad_norm": 0.32100674510002136, "learning_rate": 2.976858732113286e-05, "loss": 0.0813, "step": 8401 }, { "epoch": 2.568633445429532, "grad_norm": 0.2992071807384491, "learning_rate": 2.9768162710712922e-05, "loss": 0.0588, "step": 8402 }, { "epoch": 2.568939162335677, "grad_norm": 0.40845388174057007, "learning_rate": 2.976773810029298e-05, "loss": 0.116, "step": 8403 }, { "epoch": 2.569244879241822, "grad_norm": 0.41498035192489624, "learning_rate": 2.9767313489873042e-05, "loss": 0.0932, "step": 8404 }, { "epoch": 2.569550596147967, "grad_norm": 0.3922388553619385, "learning_rate": 2.97668888794531e-05, "loss": 0.1077, "step": 8405 }, { "epoch": 2.569856313054112, "grad_norm": 0.5820786952972412, "learning_rate": 2.976646426903316e-05, "loss": 0.1612, "step": 8406 }, { "epoch": 2.5701620299602568, "grad_norm": 0.46122246980667114, "learning_rate": 2.9766039658613222e-05, "loss": 0.1173, "step": 8407 }, { "epoch": 2.5704677468664014, "grad_norm": 0.8741685152053833, "learning_rate": 2.976561504819328e-05, "loss": 0.202, "step": 8408 }, { "epoch": 2.5707734637725466, "grad_norm": 0.8338782787322998, "learning_rate": 2.9765190437773343e-05, "loss": 0.1713, "step": 8409 }, { "epoch": 2.5710791806786917, "grad_norm": 1.753347635269165, "learning_rate": 2.97647658273534e-05, "loss": 0.1493, "step": 8410 }, { "epoch": 2.5713848975848363, "grad_norm": 0.8196293115615845, "learning_rate": 2.9764341216933464e-05, "loss": 0.1951, "step": 8411 }, { "epoch": 2.5716906144909815, "grad_norm": 0.978943407535553, "learning_rate": 2.9763916606513522e-05, "loss": 0.2067, "step": 8412 }, { "epoch": 2.571996331397126, "grad_norm": 0.6682287454605103, "learning_rate": 2.9763491996093584e-05, "loss": 0.2307, "step": 8413 }, { "epoch": 2.5723020483032712, "grad_norm": 0.8417865633964539, "learning_rate": 2.9763067385673643e-05, "loss": 0.2442, "step": 8414 }, { "epoch": 2.572607765209416, "grad_norm": 1.095544695854187, "learning_rate": 2.9762642775253705e-05, "loss": 0.232, "step": 8415 }, { "epoch": 2.572913482115561, "grad_norm": 1.9070682525634766, "learning_rate": 2.9762218164833764e-05, "loss": 0.1884, "step": 8416 }, { "epoch": 2.5732191990217057, "grad_norm": 1.4511562585830688, "learning_rate": 2.9761793554413826e-05, "loss": 0.2947, "step": 8417 }, { "epoch": 2.573524915927851, "grad_norm": 0.6764416694641113, "learning_rate": 2.9761368943993885e-05, "loss": 0.1367, "step": 8418 }, { "epoch": 2.573830632833996, "grad_norm": 0.4172336757183075, "learning_rate": 2.9760944333573947e-05, "loss": 0.0966, "step": 8419 }, { "epoch": 2.5741363497401406, "grad_norm": 0.36573344469070435, "learning_rate": 2.976051972315401e-05, "loss": 0.0931, "step": 8420 }, { "epoch": 2.5744420666462853, "grad_norm": 0.30138587951660156, "learning_rate": 2.9760095112734067e-05, "loss": 0.0819, "step": 8421 }, { "epoch": 2.5747477835524304, "grad_norm": 0.35846513509750366, "learning_rate": 2.975967050231413e-05, "loss": 0.0735, "step": 8422 }, { "epoch": 2.5750535004585755, "grad_norm": 0.24846430122852325, "learning_rate": 2.9759245891894188e-05, "loss": 0.0756, "step": 8423 }, { "epoch": 2.57535921736472, "grad_norm": 0.29507747292518616, "learning_rate": 2.975882128147425e-05, "loss": 0.0645, "step": 8424 }, { "epoch": 2.5756649342708653, "grad_norm": 0.47675418853759766, "learning_rate": 2.975839667105431e-05, "loss": 0.0837, "step": 8425 }, { "epoch": 2.57597065117701, "grad_norm": 0.29754170775413513, "learning_rate": 2.975797206063437e-05, "loss": 0.0676, "step": 8426 }, { "epoch": 2.576276368083155, "grad_norm": 0.30687016248703003, "learning_rate": 2.975754745021443e-05, "loss": 0.0802, "step": 8427 }, { "epoch": 2.5765820849892997, "grad_norm": 0.6567064523696899, "learning_rate": 2.9757122839794492e-05, "loss": 0.1243, "step": 8428 }, { "epoch": 2.576887801895445, "grad_norm": 0.45685499906539917, "learning_rate": 2.975669822937455e-05, "loss": 0.0948, "step": 8429 }, { "epoch": 2.5771935188015895, "grad_norm": 0.32021093368530273, "learning_rate": 2.9756273618954613e-05, "loss": 0.1062, "step": 8430 }, { "epoch": 2.5774992357077346, "grad_norm": 0.3298240602016449, "learning_rate": 2.975584900853467e-05, "loss": 0.1117, "step": 8431 }, { "epoch": 2.5778049526138798, "grad_norm": 2.9173026084899902, "learning_rate": 2.975542439811473e-05, "loss": 0.1673, "step": 8432 }, { "epoch": 2.5781106695200244, "grad_norm": 0.6476272344589233, "learning_rate": 2.9754999787694792e-05, "loss": 0.2001, "step": 8433 }, { "epoch": 2.578416386426169, "grad_norm": 0.6420387029647827, "learning_rate": 2.975457517727485e-05, "loss": 0.1561, "step": 8434 }, { "epoch": 2.578722103332314, "grad_norm": 1.0298134088516235, "learning_rate": 2.9754150566854913e-05, "loss": 0.1696, "step": 8435 }, { "epoch": 2.5790278202384593, "grad_norm": 0.673408567905426, "learning_rate": 2.9753725956434972e-05, "loss": 0.2103, "step": 8436 }, { "epoch": 2.579333537144604, "grad_norm": 1.4648151397705078, "learning_rate": 2.9753301346015034e-05, "loss": 0.2006, "step": 8437 }, { "epoch": 2.579639254050749, "grad_norm": 0.7139231562614441, "learning_rate": 2.9752876735595093e-05, "loss": 0.1916, "step": 8438 }, { "epoch": 2.579944970956894, "grad_norm": 0.8316459059715271, "learning_rate": 2.9752452125175155e-05, "loss": 0.2209, "step": 8439 }, { "epoch": 2.580250687863039, "grad_norm": 0.7570959329605103, "learning_rate": 2.9752027514755213e-05, "loss": 0.2293, "step": 8440 }, { "epoch": 2.5805564047691836, "grad_norm": 1.0263348817825317, "learning_rate": 2.9751602904335275e-05, "loss": 0.2017, "step": 8441 }, { "epoch": 2.5808621216753287, "grad_norm": 1.347184658050537, "learning_rate": 2.9751178293915334e-05, "loss": 0.291, "step": 8442 }, { "epoch": 2.5811678385814734, "grad_norm": 0.8507481217384338, "learning_rate": 2.9750753683495396e-05, "loss": 0.1784, "step": 8443 }, { "epoch": 2.5814735554876185, "grad_norm": 0.34178033471107483, "learning_rate": 2.9750329073075455e-05, "loss": 0.136, "step": 8444 }, { "epoch": 2.5817792723937636, "grad_norm": 0.345165878534317, "learning_rate": 2.9749904462655514e-05, "loss": 0.0909, "step": 8445 }, { "epoch": 2.5820849892999083, "grad_norm": 0.25209009647369385, "learning_rate": 2.9749479852235576e-05, "loss": 0.0667, "step": 8446 }, { "epoch": 2.582390706206053, "grad_norm": 0.33090993762016296, "learning_rate": 2.9749055241815634e-05, "loss": 0.0715, "step": 8447 }, { "epoch": 2.582696423112198, "grad_norm": 0.22804392874240875, "learning_rate": 2.9748630631395696e-05, "loss": 0.0637, "step": 8448 }, { "epoch": 2.583002140018343, "grad_norm": 0.30860376358032227, "learning_rate": 2.9748206020975755e-05, "loss": 0.0775, "step": 8449 }, { "epoch": 2.583307856924488, "grad_norm": 0.43181705474853516, "learning_rate": 2.9747781410555817e-05, "loss": 0.0767, "step": 8450 }, { "epoch": 2.583613573830633, "grad_norm": 0.3905654549598694, "learning_rate": 2.9747356800135876e-05, "loss": 0.0837, "step": 8451 }, { "epoch": 2.5839192907367776, "grad_norm": 1.1268526315689087, "learning_rate": 2.9746932189715938e-05, "loss": 0.0895, "step": 8452 }, { "epoch": 2.5842250076429227, "grad_norm": 0.38513848185539246, "learning_rate": 2.9746507579295997e-05, "loss": 0.0685, "step": 8453 }, { "epoch": 2.5845307245490674, "grad_norm": 0.35875675082206726, "learning_rate": 2.974608296887606e-05, "loss": 0.1317, "step": 8454 }, { "epoch": 2.5848364414552125, "grad_norm": 0.4370126724243164, "learning_rate": 2.9745658358456118e-05, "loss": 0.1022, "step": 8455 }, { "epoch": 2.585142158361357, "grad_norm": 0.6696600914001465, "learning_rate": 2.974523374803618e-05, "loss": 0.1139, "step": 8456 }, { "epoch": 2.5854478752675023, "grad_norm": 0.5439867377281189, "learning_rate": 2.9744809137616238e-05, "loss": 0.1208, "step": 8457 }, { "epoch": 2.5857535921736474, "grad_norm": 0.515100359916687, "learning_rate": 2.9744384527196297e-05, "loss": 0.1546, "step": 8458 }, { "epoch": 2.586059309079792, "grad_norm": 0.5268304944038391, "learning_rate": 2.974395991677636e-05, "loss": 0.1908, "step": 8459 }, { "epoch": 2.5863650259859368, "grad_norm": 1.186877965927124, "learning_rate": 2.9743535306356418e-05, "loss": 0.2144, "step": 8460 }, { "epoch": 2.586670742892082, "grad_norm": 1.508428931236267, "learning_rate": 2.974311069593648e-05, "loss": 0.1835, "step": 8461 }, { "epoch": 2.586976459798227, "grad_norm": 1.5133506059646606, "learning_rate": 2.974268608551654e-05, "loss": 0.2175, "step": 8462 }, { "epoch": 2.5872821767043717, "grad_norm": 0.6428373456001282, "learning_rate": 2.97422614750966e-05, "loss": 0.1903, "step": 8463 }, { "epoch": 2.5875878936105168, "grad_norm": 0.7333836555480957, "learning_rate": 2.974183686467666e-05, "loss": 0.2157, "step": 8464 }, { "epoch": 2.5878936105166614, "grad_norm": 1.2339098453521729, "learning_rate": 2.974141225425672e-05, "loss": 0.2048, "step": 8465 }, { "epoch": 2.5881993274228066, "grad_norm": 1.6550978422164917, "learning_rate": 2.974098764383678e-05, "loss": 0.2454, "step": 8466 }, { "epoch": 2.5885050443289512, "grad_norm": 8.434856414794922, "learning_rate": 2.9740563033416842e-05, "loss": 0.2519, "step": 8467 }, { "epoch": 2.5888107612350963, "grad_norm": 0.4033075273036957, "learning_rate": 2.97401384229969e-05, "loss": 0.1661, "step": 8468 }, { "epoch": 2.589116478141241, "grad_norm": 0.41242608428001404, "learning_rate": 2.9739713812576963e-05, "loss": 0.0927, "step": 8469 }, { "epoch": 2.589422195047386, "grad_norm": 0.25879716873168945, "learning_rate": 2.9739289202157022e-05, "loss": 0.0865, "step": 8470 }, { "epoch": 2.5897279119535312, "grad_norm": 0.24608129262924194, "learning_rate": 2.973886459173708e-05, "loss": 0.07, "step": 8471 }, { "epoch": 2.590033628859676, "grad_norm": 0.3211709260940552, "learning_rate": 2.9738439981317143e-05, "loss": 0.0596, "step": 8472 }, { "epoch": 2.5903393457658206, "grad_norm": 0.32334068417549133, "learning_rate": 2.97380153708972e-05, "loss": 0.0806, "step": 8473 }, { "epoch": 2.5906450626719657, "grad_norm": 0.3146856427192688, "learning_rate": 2.9737590760477263e-05, "loss": 0.0786, "step": 8474 }, { "epoch": 2.590950779578111, "grad_norm": 0.35783594846725464, "learning_rate": 2.9737166150057322e-05, "loss": 0.0769, "step": 8475 }, { "epoch": 2.5912564964842555, "grad_norm": 0.41958364844322205, "learning_rate": 2.9736741539637384e-05, "loss": 0.1141, "step": 8476 }, { "epoch": 2.5915622133904006, "grad_norm": 0.38858047127723694, "learning_rate": 2.9736316929217443e-05, "loss": 0.1027, "step": 8477 }, { "epoch": 2.5918679302965453, "grad_norm": 0.29130494594573975, "learning_rate": 2.9735892318797505e-05, "loss": 0.0964, "step": 8478 }, { "epoch": 2.5921736472026904, "grad_norm": 0.7420637607574463, "learning_rate": 2.9735467708377564e-05, "loss": 0.1243, "step": 8479 }, { "epoch": 2.592479364108835, "grad_norm": 0.38296282291412354, "learning_rate": 2.9735043097957626e-05, "loss": 0.1176, "step": 8480 }, { "epoch": 2.59278508101498, "grad_norm": 0.43452051281929016, "learning_rate": 2.9734618487537684e-05, "loss": 0.1104, "step": 8481 }, { "epoch": 2.593090797921125, "grad_norm": 0.988491952419281, "learning_rate": 2.9734193877117746e-05, "loss": 0.1802, "step": 8482 }, { "epoch": 2.59339651482727, "grad_norm": 0.41661128401756287, "learning_rate": 2.9733769266697805e-05, "loss": 0.1533, "step": 8483 }, { "epoch": 2.593702231733415, "grad_norm": 0.5945215821266174, "learning_rate": 2.9733344656277864e-05, "loss": 0.1956, "step": 8484 }, { "epoch": 2.5940079486395597, "grad_norm": 0.6246954798698425, "learning_rate": 2.9732920045857926e-05, "loss": 0.2117, "step": 8485 }, { "epoch": 2.5943136655457044, "grad_norm": 0.7516672015190125, "learning_rate": 2.9732495435437985e-05, "loss": 0.2089, "step": 8486 }, { "epoch": 2.5946193824518495, "grad_norm": 0.5626375079154968, "learning_rate": 2.9732070825018047e-05, "loss": 0.1972, "step": 8487 }, { "epoch": 2.5949250993579946, "grad_norm": 0.9884100556373596, "learning_rate": 2.9731646214598105e-05, "loss": 0.1742, "step": 8488 }, { "epoch": 2.5952308162641393, "grad_norm": 0.8531062006950378, "learning_rate": 2.9731221604178168e-05, "loss": 0.2276, "step": 8489 }, { "epoch": 2.5955365331702844, "grad_norm": 1.5098134279251099, "learning_rate": 2.9730796993758226e-05, "loss": 0.2242, "step": 8490 }, { "epoch": 2.595842250076429, "grad_norm": 1.3657336235046387, "learning_rate": 2.9730372383338288e-05, "loss": 0.2734, "step": 8491 }, { "epoch": 2.596147966982574, "grad_norm": 2.1050004959106445, "learning_rate": 2.9729947772918347e-05, "loss": 0.2793, "step": 8492 }, { "epoch": 2.596453683888719, "grad_norm": 1.2834982872009277, "learning_rate": 2.972952316249841e-05, "loss": 0.1356, "step": 8493 }, { "epoch": 2.596759400794864, "grad_norm": 0.26599714159965515, "learning_rate": 2.9729098552078468e-05, "loss": 0.0808, "step": 8494 }, { "epoch": 2.5970651177010087, "grad_norm": 0.29217350482940674, "learning_rate": 2.972867394165853e-05, "loss": 0.0954, "step": 8495 }, { "epoch": 2.597370834607154, "grad_norm": 0.1835613250732422, "learning_rate": 2.972824933123859e-05, "loss": 0.0636, "step": 8496 }, { "epoch": 2.597676551513299, "grad_norm": 0.24072949588298798, "learning_rate": 2.9727824720818647e-05, "loss": 0.0612, "step": 8497 }, { "epoch": 2.5979822684194436, "grad_norm": 0.24964024126529694, "learning_rate": 2.972740011039871e-05, "loss": 0.0503, "step": 8498 }, { "epoch": 2.5982879853255882, "grad_norm": 0.23911963403224945, "learning_rate": 2.9726975499978768e-05, "loss": 0.0752, "step": 8499 }, { "epoch": 2.5985937022317334, "grad_norm": 0.26345688104629517, "learning_rate": 2.972655088955883e-05, "loss": 0.0615, "step": 8500 }, { "epoch": 2.5988994191378785, "grad_norm": 0.4567795991897583, "learning_rate": 2.972612627913889e-05, "loss": 0.1022, "step": 8501 }, { "epoch": 2.599205136044023, "grad_norm": 0.36205148696899414, "learning_rate": 2.972570166871895e-05, "loss": 0.0583, "step": 8502 }, { "epoch": 2.5995108529501683, "grad_norm": 0.32154348492622375, "learning_rate": 2.972527705829901e-05, "loss": 0.0942, "step": 8503 }, { "epoch": 2.599816569856313, "grad_norm": 0.4154849946498871, "learning_rate": 2.9724852447879072e-05, "loss": 0.0771, "step": 8504 }, { "epoch": 2.600122286762458, "grad_norm": 0.3088824152946472, "learning_rate": 2.972442783745913e-05, "loss": 0.116, "step": 8505 }, { "epoch": 2.6004280036686027, "grad_norm": 0.6277368068695068, "learning_rate": 2.9724003227039193e-05, "loss": 0.1664, "step": 8506 }, { "epoch": 2.600733720574748, "grad_norm": 0.49403685331344604, "learning_rate": 2.972357861661925e-05, "loss": 0.1628, "step": 8507 }, { "epoch": 2.6010394374808925, "grad_norm": 0.6313951015472412, "learning_rate": 2.972315400619931e-05, "loss": 0.1374, "step": 8508 }, { "epoch": 2.6013451543870376, "grad_norm": 0.6936460733413696, "learning_rate": 2.9722729395779372e-05, "loss": 0.1706, "step": 8509 }, { "epoch": 2.6016508712931827, "grad_norm": 0.8249213695526123, "learning_rate": 2.972230478535943e-05, "loss": 0.1774, "step": 8510 }, { "epoch": 2.6019565881993274, "grad_norm": 0.6251870393753052, "learning_rate": 2.9721880174939493e-05, "loss": 0.2048, "step": 8511 }, { "epoch": 2.602262305105472, "grad_norm": 0.7805861234664917, "learning_rate": 2.972145556451955e-05, "loss": 0.2204, "step": 8512 }, { "epoch": 2.602568022011617, "grad_norm": 1.1009514331817627, "learning_rate": 2.9721030954099614e-05, "loss": 0.2146, "step": 8513 }, { "epoch": 2.6028737389177623, "grad_norm": 0.6578028798103333, "learning_rate": 2.9720606343679672e-05, "loss": 0.2121, "step": 8514 }, { "epoch": 2.603179455823907, "grad_norm": 1.2539424896240234, "learning_rate": 2.9720181733259734e-05, "loss": 0.2095, "step": 8515 }, { "epoch": 2.603485172730052, "grad_norm": 1.3532443046569824, "learning_rate": 2.9719757122839793e-05, "loss": 0.2853, "step": 8516 }, { "epoch": 2.6037908896361968, "grad_norm": 1.5525014400482178, "learning_rate": 2.9719332512419855e-05, "loss": 0.285, "step": 8517 }, { "epoch": 2.604096606542342, "grad_norm": 0.42061835527420044, "learning_rate": 2.9718907901999914e-05, "loss": 0.1331, "step": 8518 }, { "epoch": 2.6044023234484865, "grad_norm": 0.4974418878555298, "learning_rate": 2.9718483291579976e-05, "loss": 0.0948, "step": 8519 }, { "epoch": 2.6047080403546317, "grad_norm": 0.31529608368873596, "learning_rate": 2.9718058681160035e-05, "loss": 0.0738, "step": 8520 }, { "epoch": 2.6050137572607763, "grad_norm": 0.36245498061180115, "learning_rate": 2.9717634070740097e-05, "loss": 0.066, "step": 8521 }, { "epoch": 2.6053194741669214, "grad_norm": 0.23762482404708862, "learning_rate": 2.971720946032016e-05, "loss": 0.0656, "step": 8522 }, { "epoch": 2.6056251910730666, "grad_norm": 0.4538615047931671, "learning_rate": 2.9716784849900218e-05, "loss": 0.0725, "step": 8523 }, { "epoch": 2.6059309079792112, "grad_norm": 0.22536928951740265, "learning_rate": 2.971636023948028e-05, "loss": 0.0642, "step": 8524 }, { "epoch": 2.606236624885356, "grad_norm": 0.4438013434410095, "learning_rate": 2.971593562906034e-05, "loss": 0.0891, "step": 8525 }, { "epoch": 2.606542341791501, "grad_norm": 0.4771289527416229, "learning_rate": 2.97155110186404e-05, "loss": 0.0931, "step": 8526 }, { "epoch": 2.606848058697646, "grad_norm": 0.6300597190856934, "learning_rate": 2.971508640822046e-05, "loss": 0.078, "step": 8527 }, { "epoch": 2.607153775603791, "grad_norm": 0.8395363092422485, "learning_rate": 2.971466179780052e-05, "loss": 0.085, "step": 8528 }, { "epoch": 2.607459492509936, "grad_norm": 0.40986567735671997, "learning_rate": 2.971423718738058e-05, "loss": 0.0821, "step": 8529 }, { "epoch": 2.6077652094160806, "grad_norm": 0.2969958186149597, "learning_rate": 2.9713812576960642e-05, "loss": 0.0999, "step": 8530 }, { "epoch": 2.6080709263222257, "grad_norm": 0.8749004006385803, "learning_rate": 2.97133879665407e-05, "loss": 0.1855, "step": 8531 }, { "epoch": 2.6083766432283704, "grad_norm": 0.6670631170272827, "learning_rate": 2.9712963356120763e-05, "loss": 0.1274, "step": 8532 }, { "epoch": 2.6086823601345155, "grad_norm": 0.49505311250686646, "learning_rate": 2.971253874570082e-05, "loss": 0.1775, "step": 8533 }, { "epoch": 2.60898807704066, "grad_norm": 2.2334020137786865, "learning_rate": 2.971211413528088e-05, "loss": 0.1609, "step": 8534 }, { "epoch": 2.6092937939468053, "grad_norm": 0.9813356995582581, "learning_rate": 2.9711689524860942e-05, "loss": 0.1823, "step": 8535 }, { "epoch": 2.6095995108529504, "grad_norm": 0.6206424236297607, "learning_rate": 2.9711264914441e-05, "loss": 0.1872, "step": 8536 }, { "epoch": 2.609905227759095, "grad_norm": 0.8848998546600342, "learning_rate": 2.9710840304021063e-05, "loss": 0.2216, "step": 8537 }, { "epoch": 2.6102109446652397, "grad_norm": 0.8287000060081482, "learning_rate": 2.9710415693601122e-05, "loss": 0.2063, "step": 8538 }, { "epoch": 2.610516661571385, "grad_norm": 1.1269668340682983, "learning_rate": 2.9709991083181184e-05, "loss": 0.2437, "step": 8539 }, { "epoch": 2.61082237847753, "grad_norm": 0.893991231918335, "learning_rate": 2.9709566472761243e-05, "loss": 0.2255, "step": 8540 }, { "epoch": 2.6111280953836746, "grad_norm": 1.3230037689208984, "learning_rate": 2.9709141862341305e-05, "loss": 0.2123, "step": 8541 }, { "epoch": 2.6114338122898197, "grad_norm": 2.1103501319885254, "learning_rate": 2.9708717251921363e-05, "loss": 0.2989, "step": 8542 }, { "epoch": 2.6117395291959644, "grad_norm": 0.6613329648971558, "learning_rate": 2.9708292641501425e-05, "loss": 0.1512, "step": 8543 }, { "epoch": 2.6120452461021095, "grad_norm": 0.3463899493217468, "learning_rate": 2.9707868031081484e-05, "loss": 0.1154, "step": 8544 }, { "epoch": 2.612350963008254, "grad_norm": 0.2667384743690491, "learning_rate": 2.9707443420661546e-05, "loss": 0.0866, "step": 8545 }, { "epoch": 2.6126566799143993, "grad_norm": 0.3839867413043976, "learning_rate": 2.9707018810241605e-05, "loss": 0.0758, "step": 8546 }, { "epoch": 2.612962396820544, "grad_norm": 0.5980815291404724, "learning_rate": 2.9706594199821664e-05, "loss": 0.0509, "step": 8547 }, { "epoch": 2.613268113726689, "grad_norm": 0.3227832317352295, "learning_rate": 2.9706169589401726e-05, "loss": 0.0605, "step": 8548 }, { "epoch": 2.613573830632834, "grad_norm": 0.501562237739563, "learning_rate": 2.9705744978981784e-05, "loss": 0.0814, "step": 8549 }, { "epoch": 2.613879547538979, "grad_norm": 0.6239759922027588, "learning_rate": 2.9705320368561846e-05, "loss": 0.0945, "step": 8550 }, { "epoch": 2.6141852644451236, "grad_norm": 0.3278788924217224, "learning_rate": 2.9704895758141905e-05, "loss": 0.0631, "step": 8551 }, { "epoch": 2.6144909813512687, "grad_norm": 0.46961814165115356, "learning_rate": 2.9704471147721967e-05, "loss": 0.0972, "step": 8552 }, { "epoch": 2.614796698257414, "grad_norm": 0.5853075981140137, "learning_rate": 2.9704046537302026e-05, "loss": 0.1071, "step": 8553 }, { "epoch": 2.6151024151635585, "grad_norm": 0.4029936194419861, "learning_rate": 2.9703621926882088e-05, "loss": 0.0911, "step": 8554 }, { "epoch": 2.6154081320697036, "grad_norm": 1.046021580696106, "learning_rate": 2.9703197316462147e-05, "loss": 0.1015, "step": 8555 }, { "epoch": 2.6157138489758482, "grad_norm": 0.7036014199256897, "learning_rate": 2.970277270604221e-05, "loss": 0.1608, "step": 8556 }, { "epoch": 2.6160195658819934, "grad_norm": 0.49654754996299744, "learning_rate": 2.9702348095622268e-05, "loss": 0.1171, "step": 8557 }, { "epoch": 2.616325282788138, "grad_norm": 0.5514814853668213, "learning_rate": 2.970192348520233e-05, "loss": 0.1608, "step": 8558 }, { "epoch": 2.616630999694283, "grad_norm": 0.9212284684181213, "learning_rate": 2.970149887478239e-05, "loss": 0.1486, "step": 8559 }, { "epoch": 2.616936716600428, "grad_norm": 0.7529193758964539, "learning_rate": 2.9701074264362447e-05, "loss": 0.1489, "step": 8560 }, { "epoch": 2.617242433506573, "grad_norm": 1.215707778930664, "learning_rate": 2.970064965394251e-05, "loss": 0.1716, "step": 8561 }, { "epoch": 2.617548150412718, "grad_norm": 3.098513603210449, "learning_rate": 2.9700225043522568e-05, "loss": 0.1871, "step": 8562 }, { "epoch": 2.6178538673188627, "grad_norm": 0.6928406953811646, "learning_rate": 2.969980043310263e-05, "loss": 0.2402, "step": 8563 }, { "epoch": 2.6181595842250074, "grad_norm": 0.7188504338264465, "learning_rate": 2.969937582268269e-05, "loss": 0.1976, "step": 8564 }, { "epoch": 2.6184653011311525, "grad_norm": 1.6386858224868774, "learning_rate": 2.969895121226275e-05, "loss": 0.2522, "step": 8565 }, { "epoch": 2.6187710180372976, "grad_norm": 0.8024870157241821, "learning_rate": 2.969852660184281e-05, "loss": 0.2664, "step": 8566 }, { "epoch": 2.6190767349434423, "grad_norm": 1.8278521299362183, "learning_rate": 2.969810199142287e-05, "loss": 0.3503, "step": 8567 }, { "epoch": 2.6193824518495874, "grad_norm": 0.3977953791618347, "learning_rate": 2.969767738100293e-05, "loss": 0.1562, "step": 8568 }, { "epoch": 2.619688168755732, "grad_norm": 0.47853052616119385, "learning_rate": 2.9697252770582992e-05, "loss": 0.0951, "step": 8569 }, { "epoch": 2.619993885661877, "grad_norm": 0.24232834577560425, "learning_rate": 2.969682816016305e-05, "loss": 0.0688, "step": 8570 }, { "epoch": 2.620299602568022, "grad_norm": 0.30544281005859375, "learning_rate": 2.9696403549743113e-05, "loss": 0.0901, "step": 8571 }, { "epoch": 2.620605319474167, "grad_norm": 0.21605946123600006, "learning_rate": 2.9695978939323172e-05, "loss": 0.0601, "step": 8572 }, { "epoch": 2.6209110363803116, "grad_norm": 0.23732371628284454, "learning_rate": 2.969555432890323e-05, "loss": 0.056, "step": 8573 }, { "epoch": 2.6212167532864568, "grad_norm": 0.2833145260810852, "learning_rate": 2.9695129718483293e-05, "loss": 0.0732, "step": 8574 }, { "epoch": 2.621522470192602, "grad_norm": 0.32865288853645325, "learning_rate": 2.969470510806335e-05, "loss": 0.0853, "step": 8575 }, { "epoch": 2.6218281870987465, "grad_norm": 0.6622116565704346, "learning_rate": 2.9694280497643413e-05, "loss": 0.0844, "step": 8576 }, { "epoch": 2.622133904004891, "grad_norm": 0.30271613597869873, "learning_rate": 2.9693855887223472e-05, "loss": 0.0941, "step": 8577 }, { "epoch": 2.6224396209110363, "grad_norm": 0.43935641646385193, "learning_rate": 2.9693431276803534e-05, "loss": 0.1169, "step": 8578 }, { "epoch": 2.6227453378171814, "grad_norm": 0.45595598220825195, "learning_rate": 2.9693006666383593e-05, "loss": 0.0805, "step": 8579 }, { "epoch": 2.623051054723326, "grad_norm": 0.3661307394504547, "learning_rate": 2.9692582055963655e-05, "loss": 0.0991, "step": 8580 }, { "epoch": 2.6233567716294712, "grad_norm": 0.48602449893951416, "learning_rate": 2.9692157445543714e-05, "loss": 0.139, "step": 8581 }, { "epoch": 2.623662488535616, "grad_norm": 0.41564351320266724, "learning_rate": 2.9691732835123776e-05, "loss": 0.1515, "step": 8582 }, { "epoch": 2.623968205441761, "grad_norm": 0.7646878957748413, "learning_rate": 2.9691308224703834e-05, "loss": 0.2022, "step": 8583 }, { "epoch": 2.6242739223479057, "grad_norm": 0.48512572050094604, "learning_rate": 2.9690883614283896e-05, "loss": 0.1636, "step": 8584 }, { "epoch": 2.624579639254051, "grad_norm": 0.6463525891304016, "learning_rate": 2.9690459003863955e-05, "loss": 0.1962, "step": 8585 }, { "epoch": 2.6248853561601955, "grad_norm": 0.6615551710128784, "learning_rate": 2.9690034393444014e-05, "loss": 0.2166, "step": 8586 }, { "epoch": 2.6251910730663406, "grad_norm": 1.1649295091629028, "learning_rate": 2.9689609783024076e-05, "loss": 0.1951, "step": 8587 }, { "epoch": 2.6254967899724857, "grad_norm": 0.653557538986206, "learning_rate": 2.9689185172604135e-05, "loss": 0.2127, "step": 8588 }, { "epoch": 2.6258025068786304, "grad_norm": 0.9054247736930847, "learning_rate": 2.9688760562184197e-05, "loss": 0.2178, "step": 8589 }, { "epoch": 2.626108223784775, "grad_norm": 3.1743268966674805, "learning_rate": 2.9688335951764255e-05, "loss": 0.2171, "step": 8590 }, { "epoch": 2.62641394069092, "grad_norm": 1.5842610597610474, "learning_rate": 2.9687911341344318e-05, "loss": 0.2577, "step": 8591 }, { "epoch": 2.6267196575970653, "grad_norm": 1.8628991842269897, "learning_rate": 2.9687486730924376e-05, "loss": 0.2511, "step": 8592 }, { "epoch": 2.62702537450321, "grad_norm": 0.35634610056877136, "learning_rate": 2.968706212050444e-05, "loss": 0.1468, "step": 8593 }, { "epoch": 2.627331091409355, "grad_norm": 0.42580997943878174, "learning_rate": 2.9686637510084497e-05, "loss": 0.0891, "step": 8594 }, { "epoch": 2.6276368083154997, "grad_norm": 0.2237730473279953, "learning_rate": 2.968621289966456e-05, "loss": 0.0637, "step": 8595 }, { "epoch": 2.627942525221645, "grad_norm": 0.3148725926876068, "learning_rate": 2.9685788289244618e-05, "loss": 0.0703, "step": 8596 }, { "epoch": 2.6282482421277895, "grad_norm": 0.44228431582450867, "learning_rate": 2.968536367882468e-05, "loss": 0.0726, "step": 8597 }, { "epoch": 2.6285539590339346, "grad_norm": 0.3575983941555023, "learning_rate": 2.968493906840474e-05, "loss": 0.057, "step": 8598 }, { "epoch": 2.6288596759400793, "grad_norm": 0.5221765637397766, "learning_rate": 2.9684514457984797e-05, "loss": 0.0663, "step": 8599 }, { "epoch": 2.6291653928462244, "grad_norm": 0.2508810758590698, "learning_rate": 2.968408984756486e-05, "loss": 0.0622, "step": 8600 }, { "epoch": 2.6294711097523695, "grad_norm": 0.5007632374763489, "learning_rate": 2.9683665237144918e-05, "loss": 0.0895, "step": 8601 }, { "epoch": 2.629776826658514, "grad_norm": 0.40118762850761414, "learning_rate": 2.968324062672498e-05, "loss": 0.089, "step": 8602 }, { "epoch": 2.630082543564659, "grad_norm": 0.4074676036834717, "learning_rate": 2.968281601630504e-05, "loss": 0.125, "step": 8603 }, { "epoch": 2.630388260470804, "grad_norm": 0.3921993672847748, "learning_rate": 2.96823914058851e-05, "loss": 0.0868, "step": 8604 }, { "epoch": 2.630693977376949, "grad_norm": 0.4544733464717865, "learning_rate": 2.968196679546516e-05, "loss": 0.144, "step": 8605 }, { "epoch": 2.6309996942830938, "grad_norm": 1.362690806388855, "learning_rate": 2.9681542185045222e-05, "loss": 0.1266, "step": 8606 }, { "epoch": 2.631305411189239, "grad_norm": 0.432939738035202, "learning_rate": 2.968111757462528e-05, "loss": 0.1511, "step": 8607 }, { "epoch": 2.6316111280953836, "grad_norm": 0.5383768081665039, "learning_rate": 2.9680692964205343e-05, "loss": 0.1491, "step": 8608 }, { "epoch": 2.6319168450015287, "grad_norm": 1.9243006706237793, "learning_rate": 2.96802683537854e-05, "loss": 0.1643, "step": 8609 }, { "epoch": 2.6322225619076733, "grad_norm": 0.6111827492713928, "learning_rate": 2.9679843743365463e-05, "loss": 0.1905, "step": 8610 }, { "epoch": 2.6325282788138185, "grad_norm": 1.1429818868637085, "learning_rate": 2.9679419132945522e-05, "loss": 0.2063, "step": 8611 }, { "epoch": 2.632833995719963, "grad_norm": 1.4672199487686157, "learning_rate": 2.967899452252558e-05, "loss": 0.2104, "step": 8612 }, { "epoch": 2.6331397126261082, "grad_norm": 0.8789923787117004, "learning_rate": 2.9678569912105643e-05, "loss": 0.2011, "step": 8613 }, { "epoch": 2.6334454295322534, "grad_norm": 0.926796019077301, "learning_rate": 2.96781453016857e-05, "loss": 0.1977, "step": 8614 }, { "epoch": 2.633751146438398, "grad_norm": 1.185886263847351, "learning_rate": 2.9677720691265764e-05, "loss": 0.2412, "step": 8615 }, { "epoch": 2.6340568633445427, "grad_norm": 3.4306282997131348, "learning_rate": 2.9677296080845822e-05, "loss": 0.2749, "step": 8616 }, { "epoch": 2.634362580250688, "grad_norm": 1.8130711317062378, "learning_rate": 2.9676871470425884e-05, "loss": 0.2887, "step": 8617 }, { "epoch": 2.634668297156833, "grad_norm": 1.1386090517044067, "learning_rate": 2.9676446860005943e-05, "loss": 0.1718, "step": 8618 }, { "epoch": 2.6349740140629776, "grad_norm": 0.3043757975101471, "learning_rate": 2.9676022249586005e-05, "loss": 0.1124, "step": 8619 }, { "epoch": 2.6352797309691227, "grad_norm": 0.29354891180992126, "learning_rate": 2.9675597639166064e-05, "loss": 0.1044, "step": 8620 }, { "epoch": 2.6355854478752674, "grad_norm": 0.2799992859363556, "learning_rate": 2.9675173028746126e-05, "loss": 0.0504, "step": 8621 }, { "epoch": 2.6358911647814125, "grad_norm": 0.3703312575817108, "learning_rate": 2.9674748418326185e-05, "loss": 0.0561, "step": 8622 }, { "epoch": 2.636196881687557, "grad_norm": 0.2539896070957184, "learning_rate": 2.9674323807906247e-05, "loss": 0.0741, "step": 8623 }, { "epoch": 2.6365025985937023, "grad_norm": 0.34051960706710815, "learning_rate": 2.967389919748631e-05, "loss": 0.0637, "step": 8624 }, { "epoch": 2.636808315499847, "grad_norm": 0.24028125405311584, "learning_rate": 2.9673474587066368e-05, "loss": 0.0717, "step": 8625 }, { "epoch": 2.637114032405992, "grad_norm": 0.3603823781013489, "learning_rate": 2.967304997664643e-05, "loss": 0.0734, "step": 8626 }, { "epoch": 2.637419749312137, "grad_norm": 0.43803825974464417, "learning_rate": 2.967262536622649e-05, "loss": 0.0935, "step": 8627 }, { "epoch": 2.637725466218282, "grad_norm": 0.40671107172966003, "learning_rate": 2.967220075580655e-05, "loss": 0.1051, "step": 8628 }, { "epoch": 2.6380311831244265, "grad_norm": 0.9123678207397461, "learning_rate": 2.967177614538661e-05, "loss": 0.0919, "step": 8629 }, { "epoch": 2.6383369000305716, "grad_norm": 0.4045836925506592, "learning_rate": 2.967135153496667e-05, "loss": 0.0982, "step": 8630 }, { "epoch": 2.6386426169367168, "grad_norm": 0.4297780692577362, "learning_rate": 2.967092692454673e-05, "loss": 0.1566, "step": 8631 }, { "epoch": 2.6389483338428614, "grad_norm": 3.049506425857544, "learning_rate": 2.9670502314126792e-05, "loss": 0.1238, "step": 8632 }, { "epoch": 2.6392540507490065, "grad_norm": 0.5087599754333496, "learning_rate": 2.967007770370685e-05, "loss": 0.1378, "step": 8633 }, { "epoch": 2.639559767655151, "grad_norm": 0.5534583926200867, "learning_rate": 2.9669653093286913e-05, "loss": 0.1798, "step": 8634 }, { "epoch": 2.6398654845612963, "grad_norm": 0.7822840213775635, "learning_rate": 2.966922848286697e-05, "loss": 0.1794, "step": 8635 }, { "epoch": 2.640171201467441, "grad_norm": 0.7019453048706055, "learning_rate": 2.966880387244703e-05, "loss": 0.193, "step": 8636 }, { "epoch": 2.640476918373586, "grad_norm": 1.2457667589187622, "learning_rate": 2.9668379262027092e-05, "loss": 0.2399, "step": 8637 }, { "epoch": 2.640782635279731, "grad_norm": 3.342904567718506, "learning_rate": 2.966795465160715e-05, "loss": 0.2247, "step": 8638 }, { "epoch": 2.641088352185876, "grad_norm": 1.483022689819336, "learning_rate": 2.9667530041187213e-05, "loss": 0.2427, "step": 8639 }, { "epoch": 2.641394069092021, "grad_norm": 1.2786924839019775, "learning_rate": 2.9667105430767272e-05, "loss": 0.2249, "step": 8640 }, { "epoch": 2.6416997859981657, "grad_norm": 1.5413908958435059, "learning_rate": 2.9666680820347334e-05, "loss": 0.2831, "step": 8641 }, { "epoch": 2.6420055029043104, "grad_norm": 1.578609824180603, "learning_rate": 2.9666256209927393e-05, "loss": 0.2645, "step": 8642 }, { "epoch": 2.6423112198104555, "grad_norm": 0.5322411060333252, "learning_rate": 2.9665831599507455e-05, "loss": 0.1613, "step": 8643 }, { "epoch": 2.6426169367166006, "grad_norm": 0.32171595096588135, "learning_rate": 2.9665406989087513e-05, "loss": 0.0937, "step": 8644 }, { "epoch": 2.6429226536227453, "grad_norm": 0.2899972200393677, "learning_rate": 2.9664982378667575e-05, "loss": 0.067, "step": 8645 }, { "epoch": 2.6432283705288904, "grad_norm": 0.2625688910484314, "learning_rate": 2.9664557768247634e-05, "loss": 0.0767, "step": 8646 }, { "epoch": 2.643534087435035, "grad_norm": 0.2987097203731537, "learning_rate": 2.9664133157827696e-05, "loss": 0.0566, "step": 8647 }, { "epoch": 2.64383980434118, "grad_norm": 0.2989177405834198, "learning_rate": 2.9663708547407755e-05, "loss": 0.0642, "step": 8648 }, { "epoch": 2.644145521247325, "grad_norm": 0.2329387068748474, "learning_rate": 2.9663283936987814e-05, "loss": 0.0609, "step": 8649 }, { "epoch": 2.64445123815347, "grad_norm": 0.3325488269329071, "learning_rate": 2.9662859326567876e-05, "loss": 0.1012, "step": 8650 }, { "epoch": 2.6447569550596146, "grad_norm": 0.25173285603523254, "learning_rate": 2.9662434716147934e-05, "loss": 0.0614, "step": 8651 }, { "epoch": 2.6450626719657597, "grad_norm": 0.358030766248703, "learning_rate": 2.9662010105727997e-05, "loss": 0.0832, "step": 8652 }, { "epoch": 2.645368388871905, "grad_norm": 0.6611920595169067, "learning_rate": 2.9661585495308055e-05, "loss": 0.1199, "step": 8653 }, { "epoch": 2.6456741057780495, "grad_norm": 0.3945160210132599, "learning_rate": 2.9661160884888117e-05, "loss": 0.1222, "step": 8654 }, { "epoch": 2.645979822684194, "grad_norm": 0.3250751495361328, "learning_rate": 2.9660736274468176e-05, "loss": 0.0968, "step": 8655 }, { "epoch": 2.6462855395903393, "grad_norm": 0.4507169723510742, "learning_rate": 2.9660311664048238e-05, "loss": 0.1071, "step": 8656 }, { "epoch": 2.6465912564964844, "grad_norm": 0.5498884320259094, "learning_rate": 2.9659887053628297e-05, "loss": 0.1522, "step": 8657 }, { "epoch": 2.646896973402629, "grad_norm": 0.5433641076087952, "learning_rate": 2.965946244320836e-05, "loss": 0.1595, "step": 8658 }, { "epoch": 2.647202690308774, "grad_norm": 0.7312982678413391, "learning_rate": 2.9659037832788418e-05, "loss": 0.1593, "step": 8659 }, { "epoch": 2.647508407214919, "grad_norm": 0.8162384033203125, "learning_rate": 2.965861322236848e-05, "loss": 0.1632, "step": 8660 }, { "epoch": 2.647814124121064, "grad_norm": 1.0408000946044922, "learning_rate": 2.965818861194854e-05, "loss": 0.1701, "step": 8661 }, { "epoch": 2.6481198410272087, "grad_norm": 6.761633396148682, "learning_rate": 2.9657764001528597e-05, "loss": 0.1668, "step": 8662 }, { "epoch": 2.6484255579333538, "grad_norm": 0.6771640777587891, "learning_rate": 2.965733939110866e-05, "loss": 0.1968, "step": 8663 }, { "epoch": 2.6487312748394984, "grad_norm": 0.7572241425514221, "learning_rate": 2.9656914780688718e-05, "loss": 0.2139, "step": 8664 }, { "epoch": 2.6490369917456436, "grad_norm": 0.7600659728050232, "learning_rate": 2.965649017026878e-05, "loss": 0.2236, "step": 8665 }, { "epoch": 2.6493427086517887, "grad_norm": 1.7104971408843994, "learning_rate": 2.965606555984884e-05, "loss": 0.2591, "step": 8666 }, { "epoch": 2.6496484255579333, "grad_norm": 1.9577242136001587, "learning_rate": 2.96556409494289e-05, "loss": 0.3298, "step": 8667 }, { "epoch": 2.649954142464078, "grad_norm": 0.406866192817688, "learning_rate": 2.965521633900896e-05, "loss": 0.1655, "step": 8668 }, { "epoch": 2.650259859370223, "grad_norm": 0.33536919951438904, "learning_rate": 2.965479172858902e-05, "loss": 0.1078, "step": 8669 }, { "epoch": 2.6505655762763682, "grad_norm": 0.30185258388519287, "learning_rate": 2.965436711816908e-05, "loss": 0.0734, "step": 8670 }, { "epoch": 2.650871293182513, "grad_norm": 0.2917187213897705, "learning_rate": 2.9653942507749142e-05, "loss": 0.0674, "step": 8671 }, { "epoch": 2.651177010088658, "grad_norm": 0.2556661367416382, "learning_rate": 2.96535178973292e-05, "loss": 0.0664, "step": 8672 }, { "epoch": 2.6514827269948027, "grad_norm": 0.27571621537208557, "learning_rate": 2.9653093286909263e-05, "loss": 0.0729, "step": 8673 }, { "epoch": 2.651788443900948, "grad_norm": 0.3383842408657074, "learning_rate": 2.9652668676489322e-05, "loss": 0.0856, "step": 8674 }, { "epoch": 2.6520941608070925, "grad_norm": 0.36779066920280457, "learning_rate": 2.965224406606938e-05, "loss": 0.076, "step": 8675 }, { "epoch": 2.6523998777132376, "grad_norm": 0.2316930592060089, "learning_rate": 2.9651819455649443e-05, "loss": 0.0782, "step": 8676 }, { "epoch": 2.6527055946193823, "grad_norm": 0.2973344326019287, "learning_rate": 2.96513948452295e-05, "loss": 0.0881, "step": 8677 }, { "epoch": 2.6530113115255274, "grad_norm": 0.2571130394935608, "learning_rate": 2.9650970234809563e-05, "loss": 0.0694, "step": 8678 }, { "epoch": 2.6533170284316725, "grad_norm": 0.28444766998291016, "learning_rate": 2.9650545624389622e-05, "loss": 0.0866, "step": 8679 }, { "epoch": 2.653622745337817, "grad_norm": 0.467174768447876, "learning_rate": 2.9650121013969684e-05, "loss": 0.1272, "step": 8680 }, { "epoch": 2.653928462243962, "grad_norm": 0.47461938858032227, "learning_rate": 2.9649696403549743e-05, "loss": 0.1457, "step": 8681 }, { "epoch": 2.654234179150107, "grad_norm": 0.3564741015434265, "learning_rate": 2.9649271793129805e-05, "loss": 0.154, "step": 8682 }, { "epoch": 2.654539896056252, "grad_norm": 0.4470134973526001, "learning_rate": 2.9648847182709864e-05, "loss": 0.1534, "step": 8683 }, { "epoch": 2.6548456129623967, "grad_norm": 0.538486123085022, "learning_rate": 2.9648422572289926e-05, "loss": 0.1718, "step": 8684 }, { "epoch": 2.655151329868542, "grad_norm": 0.6164297461509705, "learning_rate": 2.9647997961869984e-05, "loss": 0.1922, "step": 8685 }, { "epoch": 2.6554570467746865, "grad_norm": 0.6920440793037415, "learning_rate": 2.9647573351450047e-05, "loss": 0.1681, "step": 8686 }, { "epoch": 2.6557627636808316, "grad_norm": 0.6287400126457214, "learning_rate": 2.9647148741030105e-05, "loss": 0.208, "step": 8687 }, { "epoch": 2.6560684805869763, "grad_norm": 0.7870118618011475, "learning_rate": 2.9646724130610164e-05, "loss": 0.1967, "step": 8688 }, { "epoch": 2.6563741974931214, "grad_norm": 0.8157984614372253, "learning_rate": 2.9646299520190226e-05, "loss": 0.2176, "step": 8689 }, { "epoch": 2.656679914399266, "grad_norm": 1.04789137840271, "learning_rate": 2.9645874909770285e-05, "loss": 0.231, "step": 8690 }, { "epoch": 2.656985631305411, "grad_norm": 0.9876081943511963, "learning_rate": 2.9645450299350347e-05, "loss": 0.2284, "step": 8691 }, { "epoch": 2.6572913482115563, "grad_norm": 0.8478752970695496, "learning_rate": 2.9645025688930405e-05, "loss": 0.268, "step": 8692 }, { "epoch": 2.657597065117701, "grad_norm": 0.3035908639431, "learning_rate": 2.9644601078510468e-05, "loss": 0.1541, "step": 8693 }, { "epoch": 2.6579027820238457, "grad_norm": 0.6852359175682068, "learning_rate": 2.9644176468090526e-05, "loss": 0.0826, "step": 8694 }, { "epoch": 2.658208498929991, "grad_norm": 0.4001828134059906, "learning_rate": 2.964375185767059e-05, "loss": 0.0983, "step": 8695 }, { "epoch": 2.658514215836136, "grad_norm": 0.35365843772888184, "learning_rate": 2.9643327247250647e-05, "loss": 0.0639, "step": 8696 }, { "epoch": 2.6588199327422806, "grad_norm": 0.3101315498352051, "learning_rate": 2.964290263683071e-05, "loss": 0.0563, "step": 8697 }, { "epoch": 2.6591256496484257, "grad_norm": 0.34632694721221924, "learning_rate": 2.9642478026410768e-05, "loss": 0.0574, "step": 8698 }, { "epoch": 2.6594313665545704, "grad_norm": 0.36748966574668884, "learning_rate": 2.964205341599083e-05, "loss": 0.0969, "step": 8699 }, { "epoch": 2.6597370834607155, "grad_norm": 0.2971348166465759, "learning_rate": 2.964162880557089e-05, "loss": 0.0774, "step": 8700 }, { "epoch": 2.66004280036686, "grad_norm": 0.32078561186790466, "learning_rate": 2.9641204195150947e-05, "loss": 0.0938, "step": 8701 }, { "epoch": 2.6603485172730053, "grad_norm": 0.41197019815444946, "learning_rate": 2.964077958473101e-05, "loss": 0.0905, "step": 8702 }, { "epoch": 2.66065423417915, "grad_norm": 0.5784210562705994, "learning_rate": 2.9640354974311068e-05, "loss": 0.1113, "step": 8703 }, { "epoch": 2.660959951085295, "grad_norm": 0.5140787363052368, "learning_rate": 2.963993036389113e-05, "loss": 0.1038, "step": 8704 }, { "epoch": 2.66126566799144, "grad_norm": 0.4545873999595642, "learning_rate": 2.963950575347119e-05, "loss": 0.0823, "step": 8705 }, { "epoch": 2.661571384897585, "grad_norm": 0.8718081116676331, "learning_rate": 2.963908114305125e-05, "loss": 0.1409, "step": 8706 }, { "epoch": 2.6618771018037295, "grad_norm": 0.42968958616256714, "learning_rate": 2.963865653263131e-05, "loss": 0.1044, "step": 8707 }, { "epoch": 2.6621828187098746, "grad_norm": 0.5840100646018982, "learning_rate": 2.9638231922211372e-05, "loss": 0.1512, "step": 8708 }, { "epoch": 2.6624885356160197, "grad_norm": 0.4727324843406677, "learning_rate": 2.963780731179143e-05, "loss": 0.1646, "step": 8709 }, { "epoch": 2.6627942525221644, "grad_norm": 0.5152813196182251, "learning_rate": 2.9637382701371493e-05, "loss": 0.1882, "step": 8710 }, { "epoch": 2.6630999694283095, "grad_norm": 1.0978182554244995, "learning_rate": 2.963695809095155e-05, "loss": 0.1882, "step": 8711 }, { "epoch": 2.663405686334454, "grad_norm": 0.6089798808097839, "learning_rate": 2.9636533480531613e-05, "loss": 0.2008, "step": 8712 }, { "epoch": 2.6637114032405993, "grad_norm": 0.9035851955413818, "learning_rate": 2.9636108870111672e-05, "loss": 0.2374, "step": 8713 }, { "epoch": 2.664017120146744, "grad_norm": 0.9974417090415955, "learning_rate": 2.963568425969173e-05, "loss": 0.2168, "step": 8714 }, { "epoch": 2.664322837052889, "grad_norm": 1.064261794090271, "learning_rate": 2.9635259649271793e-05, "loss": 0.1957, "step": 8715 }, { "epoch": 2.6646285539590338, "grad_norm": 1.3394023180007935, "learning_rate": 2.963483503885185e-05, "loss": 0.2384, "step": 8716 }, { "epoch": 2.664934270865179, "grad_norm": 2.0445258617401123, "learning_rate": 2.9634410428431914e-05, "loss": 0.2582, "step": 8717 }, { "epoch": 2.665239987771324, "grad_norm": 0.3464461863040924, "learning_rate": 2.9633985818011972e-05, "loss": 0.1612, "step": 8718 }, { "epoch": 2.6655457046774687, "grad_norm": 0.3175399601459503, "learning_rate": 2.9633561207592034e-05, "loss": 0.1101, "step": 8719 }, { "epoch": 2.6658514215836133, "grad_norm": 0.2845843434333801, "learning_rate": 2.9633136597172093e-05, "loss": 0.0788, "step": 8720 }, { "epoch": 2.6661571384897584, "grad_norm": 0.20462743937969208, "learning_rate": 2.9632711986752155e-05, "loss": 0.0649, "step": 8721 }, { "epoch": 2.6664628553959036, "grad_norm": 0.28023403882980347, "learning_rate": 2.9632287376332214e-05, "loss": 0.0842, "step": 8722 }, { "epoch": 2.6667685723020482, "grad_norm": 0.2974439561367035, "learning_rate": 2.9631862765912276e-05, "loss": 0.0489, "step": 8723 }, { "epoch": 2.6670742892081933, "grad_norm": 0.3944622874259949, "learning_rate": 2.9631438155492335e-05, "loss": 0.0659, "step": 8724 }, { "epoch": 2.667380006114338, "grad_norm": 0.2649383544921875, "learning_rate": 2.9631013545072397e-05, "loss": 0.0796, "step": 8725 }, { "epoch": 2.667685723020483, "grad_norm": 0.25376400351524353, "learning_rate": 2.963058893465246e-05, "loss": 0.0879, "step": 8726 }, { "epoch": 2.667991439926628, "grad_norm": 0.6196922063827515, "learning_rate": 2.9630164324232518e-05, "loss": 0.0553, "step": 8727 }, { "epoch": 2.668297156832773, "grad_norm": 0.38557982444763184, "learning_rate": 2.962973971381258e-05, "loss": 0.1092, "step": 8728 }, { "epoch": 2.6686028737389176, "grad_norm": 0.36017584800720215, "learning_rate": 2.962931510339264e-05, "loss": 0.094, "step": 8729 }, { "epoch": 2.6689085906450627, "grad_norm": 0.3923839032649994, "learning_rate": 2.96288904929727e-05, "loss": 0.1055, "step": 8730 }, { "epoch": 2.669214307551208, "grad_norm": 0.5420852899551392, "learning_rate": 2.962846588255276e-05, "loss": 0.1413, "step": 8731 }, { "epoch": 2.6695200244573525, "grad_norm": 0.6210389733314514, "learning_rate": 2.962804127213282e-05, "loss": 0.1188, "step": 8732 }, { "epoch": 2.669825741363497, "grad_norm": 0.49754586815834045, "learning_rate": 2.962761666171288e-05, "loss": 0.1597, "step": 8733 }, { "epoch": 2.6701314582696423, "grad_norm": 1.1040213108062744, "learning_rate": 2.9627192051292942e-05, "loss": 0.1907, "step": 8734 }, { "epoch": 2.6704371751757874, "grad_norm": 0.5792959928512573, "learning_rate": 2.9626767440873e-05, "loss": 0.1822, "step": 8735 }, { "epoch": 2.670742892081932, "grad_norm": 0.8374557495117188, "learning_rate": 2.9626342830453063e-05, "loss": 0.1893, "step": 8736 }, { "epoch": 2.671048608988077, "grad_norm": 0.6214209794998169, "learning_rate": 2.962591822003312e-05, "loss": 0.2026, "step": 8737 }, { "epoch": 2.671354325894222, "grad_norm": 1.197262167930603, "learning_rate": 2.9625493609613184e-05, "loss": 0.2474, "step": 8738 }, { "epoch": 2.671660042800367, "grad_norm": 1.3350852727890015, "learning_rate": 2.9625068999193242e-05, "loss": 0.1922, "step": 8739 }, { "epoch": 2.6719657597065116, "grad_norm": 1.2598234415054321, "learning_rate": 2.96246443887733e-05, "loss": 0.2063, "step": 8740 }, { "epoch": 2.6722714766126567, "grad_norm": 1.3052809238433838, "learning_rate": 2.9624219778353363e-05, "loss": 0.2545, "step": 8741 }, { "epoch": 2.6725771935188014, "grad_norm": Infinity, "learning_rate": 2.9624219778353363e-05, "loss": 0.2805, "step": 8742 }, { "epoch": 2.6728829104249465, "grad_norm": 0.6236213445663452, "learning_rate": 2.9623795167933422e-05, "loss": 0.1686, "step": 8743 }, { "epoch": 2.6731886273310916, "grad_norm": 0.3056718111038208, "learning_rate": 2.9623370557513484e-05, "loss": 0.0886, "step": 8744 }, { "epoch": 2.6734943442372363, "grad_norm": 0.32516196370124817, "learning_rate": 2.9622945947093543e-05, "loss": 0.0684, "step": 8745 }, { "epoch": 2.673800061143381, "grad_norm": 0.2560531198978424, "learning_rate": 2.9622521336673605e-05, "loss": 0.0542, "step": 8746 }, { "epoch": 2.674105778049526, "grad_norm": 0.30375370383262634, "learning_rate": 2.9622096726253663e-05, "loss": 0.0603, "step": 8747 }, { "epoch": 2.674411494955671, "grad_norm": 0.2839861810207367, "learning_rate": 2.9621672115833725e-05, "loss": 0.0714, "step": 8748 }, { "epoch": 2.674717211861816, "grad_norm": 0.36806926131248474, "learning_rate": 2.9621247505413784e-05, "loss": 0.0745, "step": 8749 }, { "epoch": 2.675022928767961, "grad_norm": 0.42150363326072693, "learning_rate": 2.9620822894993846e-05, "loss": 0.0758, "step": 8750 }, { "epoch": 2.6753286456741057, "grad_norm": 0.36842623353004456, "learning_rate": 2.9620398284573905e-05, "loss": 0.0947, "step": 8751 }, { "epoch": 2.675634362580251, "grad_norm": 0.5829852819442749, "learning_rate": 2.9619973674153964e-05, "loss": 0.0657, "step": 8752 }, { "epoch": 2.6759400794863955, "grad_norm": 1.0651925802230835, "learning_rate": 2.9619549063734026e-05, "loss": 0.1186, "step": 8753 }, { "epoch": 2.6762457963925406, "grad_norm": 0.46841371059417725, "learning_rate": 2.9619124453314084e-05, "loss": 0.1106, "step": 8754 }, { "epoch": 2.6765515132986852, "grad_norm": 0.35390594601631165, "learning_rate": 2.9618699842894147e-05, "loss": 0.0899, "step": 8755 }, { "epoch": 2.6768572302048304, "grad_norm": 0.4478570222854614, "learning_rate": 2.9618275232474205e-05, "loss": 0.128, "step": 8756 }, { "epoch": 2.6771629471109755, "grad_norm": 0.42261365056037903, "learning_rate": 2.9617850622054267e-05, "loss": 0.1234, "step": 8757 }, { "epoch": 2.67746866401712, "grad_norm": 0.44504716992378235, "learning_rate": 2.9617426011634326e-05, "loss": 0.1493, "step": 8758 }, { "epoch": 2.677774380923265, "grad_norm": 0.8467711806297302, "learning_rate": 2.9617001401214388e-05, "loss": 0.1741, "step": 8759 }, { "epoch": 2.67808009782941, "grad_norm": 0.5870293378829956, "learning_rate": 2.9616576790794447e-05, "loss": 0.15, "step": 8760 }, { "epoch": 2.678385814735555, "grad_norm": 0.9207030534744263, "learning_rate": 2.961615218037451e-05, "loss": 0.2174, "step": 8761 }, { "epoch": 2.6786915316416997, "grad_norm": 1.0931932926177979, "learning_rate": 2.9615727569954568e-05, "loss": 0.1911, "step": 8762 }, { "epoch": 2.678997248547845, "grad_norm": 0.8416117429733276, "learning_rate": 2.961530295953463e-05, "loss": 0.2028, "step": 8763 }, { "epoch": 2.6793029654539895, "grad_norm": 0.8003458380699158, "learning_rate": 2.961487834911469e-05, "loss": 0.1997, "step": 8764 }, { "epoch": 2.6796086823601346, "grad_norm": 1.6099694967269897, "learning_rate": 2.9614453738694747e-05, "loss": 0.2423, "step": 8765 }, { "epoch": 2.6799143992662793, "grad_norm": 1.1497273445129395, "learning_rate": 2.961402912827481e-05, "loss": 0.2343, "step": 8766 }, { "epoch": 2.6802201161724244, "grad_norm": 1.4872350692749023, "learning_rate": 2.9613604517854868e-05, "loss": 0.297, "step": 8767 }, { "epoch": 2.680525833078569, "grad_norm": 0.5393873453140259, "learning_rate": 2.961317990743493e-05, "loss": 0.1661, "step": 8768 }, { "epoch": 2.680831549984714, "grad_norm": 0.22769683599472046, "learning_rate": 2.961275529701499e-05, "loss": 0.0816, "step": 8769 }, { "epoch": 2.6811372668908593, "grad_norm": 0.37363749742507935, "learning_rate": 2.961233068659505e-05, "loss": 0.1294, "step": 8770 }, { "epoch": 2.681442983797004, "grad_norm": 0.25159430503845215, "learning_rate": 2.961190607617511e-05, "loss": 0.0665, "step": 8771 }, { "epoch": 2.6817487007031486, "grad_norm": 0.23555095493793488, "learning_rate": 2.961148146575517e-05, "loss": 0.0542, "step": 8772 }, { "epoch": 2.6820544176092938, "grad_norm": 0.419158935546875, "learning_rate": 2.961105685533523e-05, "loss": 0.0939, "step": 8773 }, { "epoch": 2.682360134515439, "grad_norm": 0.5319674015045166, "learning_rate": 2.9610632244915292e-05, "loss": 0.0616, "step": 8774 }, { "epoch": 2.6826658514215835, "grad_norm": 0.33960604667663574, "learning_rate": 2.961020763449535e-05, "loss": 0.0736, "step": 8775 }, { "epoch": 2.6829715683277287, "grad_norm": 0.5030962824821472, "learning_rate": 2.9609783024075413e-05, "loss": 0.0987, "step": 8776 }, { "epoch": 2.6832772852338733, "grad_norm": 0.2494470626115799, "learning_rate": 2.9609358413655472e-05, "loss": 0.0717, "step": 8777 }, { "epoch": 2.6835830021400184, "grad_norm": 0.32995739579200745, "learning_rate": 2.960893380323553e-05, "loss": 0.1066, "step": 8778 }, { "epoch": 2.683888719046163, "grad_norm": 0.3553001880645752, "learning_rate": 2.9608509192815593e-05, "loss": 0.1062, "step": 8779 }, { "epoch": 2.684194435952308, "grad_norm": 0.6593840718269348, "learning_rate": 2.960808458239565e-05, "loss": 0.0947, "step": 8780 }, { "epoch": 2.684500152858453, "grad_norm": 0.43652668595314026, "learning_rate": 2.9607659971975713e-05, "loss": 0.151, "step": 8781 }, { "epoch": 2.684805869764598, "grad_norm": 1.1349679231643677, "learning_rate": 2.9607235361555772e-05, "loss": 0.1039, "step": 8782 }, { "epoch": 2.685111586670743, "grad_norm": 0.6505615711212158, "learning_rate": 2.9606810751135834e-05, "loss": 0.1592, "step": 8783 }, { "epoch": 2.685417303576888, "grad_norm": 0.7366223335266113, "learning_rate": 2.9606386140715893e-05, "loss": 0.1754, "step": 8784 }, { "epoch": 2.6857230204830325, "grad_norm": 0.6934099197387695, "learning_rate": 2.9605961530295955e-05, "loss": 0.1671, "step": 8785 }, { "epoch": 2.6860287373891776, "grad_norm": 0.737497091293335, "learning_rate": 2.9605536919876014e-05, "loss": 0.1955, "step": 8786 }, { "epoch": 2.6863344542953227, "grad_norm": 0.7408062815666199, "learning_rate": 2.9605112309456076e-05, "loss": 0.1899, "step": 8787 }, { "epoch": 2.6866401712014674, "grad_norm": 0.7858190536499023, "learning_rate": 2.9604687699036134e-05, "loss": 0.1917, "step": 8788 }, { "epoch": 2.6869458881076125, "grad_norm": 1.2000923156738281, "learning_rate": 2.9604263088616197e-05, "loss": 0.2086, "step": 8789 }, { "epoch": 2.687251605013757, "grad_norm": 0.7699568867683411, "learning_rate": 2.9603838478196255e-05, "loss": 0.2023, "step": 8790 }, { "epoch": 2.6875573219199023, "grad_norm": 3.005228281021118, "learning_rate": 2.9603413867776314e-05, "loss": 0.2086, "step": 8791 }, { "epoch": 2.687863038826047, "grad_norm": 6.151845455169678, "learning_rate": 2.9602989257356376e-05, "loss": 0.3019, "step": 8792 }, { "epoch": 2.688168755732192, "grad_norm": 0.5889263153076172, "learning_rate": 2.9602564646936435e-05, "loss": 0.1665, "step": 8793 }, { "epoch": 2.6884744726383367, "grad_norm": 0.6188045740127563, "learning_rate": 2.9602140036516497e-05, "loss": 0.12, "step": 8794 }, { "epoch": 2.688780189544482, "grad_norm": 1.2032710313796997, "learning_rate": 2.9601715426096556e-05, "loss": 0.067, "step": 8795 }, { "epoch": 2.689085906450627, "grad_norm": 0.35831645131111145, "learning_rate": 2.9601290815676618e-05, "loss": 0.0772, "step": 8796 }, { "epoch": 2.6893916233567716, "grad_norm": 0.2701244056224823, "learning_rate": 2.9600866205256676e-05, "loss": 0.0511, "step": 8797 }, { "epoch": 2.6896973402629163, "grad_norm": 0.48703956604003906, "learning_rate": 2.960044159483674e-05, "loss": 0.0778, "step": 8798 }, { "epoch": 2.6900030571690614, "grad_norm": 0.3510128855705261, "learning_rate": 2.9600016984416797e-05, "loss": 0.0818, "step": 8799 }, { "epoch": 2.6903087740752065, "grad_norm": 0.6919806003570557, "learning_rate": 2.959959237399686e-05, "loss": 0.0758, "step": 8800 }, { "epoch": 2.690614490981351, "grad_norm": 0.4305296540260315, "learning_rate": 2.9599167763576918e-05, "loss": 0.0689, "step": 8801 }, { "epoch": 2.6909202078874963, "grad_norm": 0.32082653045654297, "learning_rate": 2.959874315315698e-05, "loss": 0.0853, "step": 8802 }, { "epoch": 2.691225924793641, "grad_norm": 0.4591390788555145, "learning_rate": 2.959831854273704e-05, "loss": 0.106, "step": 8803 }, { "epoch": 2.691531641699786, "grad_norm": 0.27082228660583496, "learning_rate": 2.9597893932317097e-05, "loss": 0.0818, "step": 8804 }, { "epoch": 2.6918373586059308, "grad_norm": 0.22924673557281494, "learning_rate": 2.959746932189716e-05, "loss": 0.0756, "step": 8805 }, { "epoch": 2.692143075512076, "grad_norm": 0.5057762265205383, "learning_rate": 2.9597044711477218e-05, "loss": 0.1347, "step": 8806 }, { "epoch": 2.6924487924182205, "grad_norm": 0.44534680247306824, "learning_rate": 2.959662010105728e-05, "loss": 0.1515, "step": 8807 }, { "epoch": 2.6927545093243657, "grad_norm": 0.6412291526794434, "learning_rate": 2.959619549063734e-05, "loss": 0.1625, "step": 8808 }, { "epoch": 2.693060226230511, "grad_norm": 0.6175499558448792, "learning_rate": 2.95957708802174e-05, "loss": 0.1709, "step": 8809 }, { "epoch": 2.6933659431366554, "grad_norm": 0.7595847845077515, "learning_rate": 2.959534626979746e-05, "loss": 0.2251, "step": 8810 }, { "epoch": 2.6936716600428, "grad_norm": 1.1453533172607422, "learning_rate": 2.9594921659377522e-05, "loss": 0.2079, "step": 8811 }, { "epoch": 2.6939773769489452, "grad_norm": 0.7410081028938293, "learning_rate": 2.959449704895758e-05, "loss": 0.2315, "step": 8812 }, { "epoch": 2.6942830938550903, "grad_norm": 0.8825443387031555, "learning_rate": 2.9594072438537643e-05, "loss": 0.2351, "step": 8813 }, { "epoch": 2.694588810761235, "grad_norm": 0.7469035387039185, "learning_rate": 2.95936478281177e-05, "loss": 0.1932, "step": 8814 }, { "epoch": 2.69489452766738, "grad_norm": 1.0586669445037842, "learning_rate": 2.9593223217697763e-05, "loss": 0.2187, "step": 8815 }, { "epoch": 2.695200244573525, "grad_norm": 1.5875682830810547, "learning_rate": 2.9592798607277822e-05, "loss": 0.2725, "step": 8816 }, { "epoch": 2.69550596147967, "grad_norm": 4.964749813079834, "learning_rate": 2.959237399685788e-05, "loss": 0.3323, "step": 8817 }, { "epoch": 2.6958116783858146, "grad_norm": 0.3892293870449066, "learning_rate": 2.9591949386437943e-05, "loss": 0.1453, "step": 8818 }, { "epoch": 2.6961173952919597, "grad_norm": 0.3777379095554352, "learning_rate": 2.9591524776018e-05, "loss": 0.105, "step": 8819 }, { "epoch": 2.6964231121981044, "grad_norm": 0.37327396869659424, "learning_rate": 2.9591100165598064e-05, "loss": 0.0809, "step": 8820 }, { "epoch": 2.6967288291042495, "grad_norm": 0.35781872272491455, "learning_rate": 2.9590675555178122e-05, "loss": 0.0719, "step": 8821 }, { "epoch": 2.6970345460103946, "grad_norm": 0.29866716265678406, "learning_rate": 2.9590250944758184e-05, "loss": 0.0674, "step": 8822 }, { "epoch": 2.6973402629165393, "grad_norm": 0.1973126381635666, "learning_rate": 2.9589826334338243e-05, "loss": 0.0605, "step": 8823 }, { "epoch": 2.697645979822684, "grad_norm": 0.47214964032173157, "learning_rate": 2.9589401723918305e-05, "loss": 0.0661, "step": 8824 }, { "epoch": 2.697951696728829, "grad_norm": 0.32050275802612305, "learning_rate": 2.9588977113498364e-05, "loss": 0.0836, "step": 8825 }, { "epoch": 2.698257413634974, "grad_norm": 0.39584362506866455, "learning_rate": 2.9588552503078426e-05, "loss": 0.0762, "step": 8826 }, { "epoch": 2.698563130541119, "grad_norm": 0.4037152826786041, "learning_rate": 2.9588127892658485e-05, "loss": 0.0848, "step": 8827 }, { "epoch": 2.698868847447264, "grad_norm": 0.2855997681617737, "learning_rate": 2.9587703282238547e-05, "loss": 0.118, "step": 8828 }, { "epoch": 2.6991745643534086, "grad_norm": 0.4274437725543976, "learning_rate": 2.958727867181861e-05, "loss": 0.0865, "step": 8829 }, { "epoch": 2.6994802812595537, "grad_norm": 0.3429509103298187, "learning_rate": 2.9586854061398668e-05, "loss": 0.0972, "step": 8830 }, { "epoch": 2.6997859981656984, "grad_norm": 0.5152809023857117, "learning_rate": 2.958642945097873e-05, "loss": 0.1541, "step": 8831 }, { "epoch": 2.7000917150718435, "grad_norm": 0.7246895432472229, "learning_rate": 2.958600484055879e-05, "loss": 0.1594, "step": 8832 }, { "epoch": 2.700397431977988, "grad_norm": 0.4222801625728607, "learning_rate": 2.958558023013885e-05, "loss": 0.1503, "step": 8833 }, { "epoch": 2.7007031488841333, "grad_norm": 0.7025434970855713, "learning_rate": 2.958515561971891e-05, "loss": 0.1922, "step": 8834 }, { "epoch": 2.7010088657902784, "grad_norm": 0.5024028420448303, "learning_rate": 2.958473100929897e-05, "loss": 0.1757, "step": 8835 }, { "epoch": 2.701314582696423, "grad_norm": 0.6324949860572815, "learning_rate": 2.958430639887903e-05, "loss": 0.2275, "step": 8836 }, { "epoch": 2.7016202996025678, "grad_norm": 0.8538098335266113, "learning_rate": 2.9583881788459092e-05, "loss": 0.2405, "step": 8837 }, { "epoch": 2.701926016508713, "grad_norm": 0.5674062371253967, "learning_rate": 2.958345717803915e-05, "loss": 0.1909, "step": 8838 }, { "epoch": 2.702231733414858, "grad_norm": 1.2629491090774536, "learning_rate": 2.9583032567619213e-05, "loss": 0.2, "step": 8839 }, { "epoch": 2.7025374503210027, "grad_norm": 1.5076920986175537, "learning_rate": 2.958260795719927e-05, "loss": 0.2075, "step": 8840 }, { "epoch": 2.702843167227148, "grad_norm": 0.922971248626709, "learning_rate": 2.9582183346779334e-05, "loss": 0.2652, "step": 8841 }, { "epoch": 2.7031488841332925, "grad_norm": 1.2445710897445679, "learning_rate": 2.9581758736359392e-05, "loss": 0.295, "step": 8842 }, { "epoch": 2.7034546010394376, "grad_norm": 0.8428782224655151, "learning_rate": 2.958133412593945e-05, "loss": 0.1513, "step": 8843 }, { "epoch": 2.7037603179455822, "grad_norm": 0.33668839931488037, "learning_rate": 2.9580909515519513e-05, "loss": 0.1218, "step": 8844 }, { "epoch": 2.7040660348517274, "grad_norm": 0.38433313369750977, "learning_rate": 2.9580484905099572e-05, "loss": 0.0773, "step": 8845 }, { "epoch": 2.704371751757872, "grad_norm": 0.4841252565383911, "learning_rate": 2.9580060294679634e-05, "loss": 0.084, "step": 8846 }, { "epoch": 2.704677468664017, "grad_norm": 0.2929609715938568, "learning_rate": 2.9579635684259693e-05, "loss": 0.0551, "step": 8847 }, { "epoch": 2.7049831855701623, "grad_norm": 0.3374837338924408, "learning_rate": 2.9579211073839755e-05, "loss": 0.063, "step": 8848 }, { "epoch": 2.705288902476307, "grad_norm": 0.8478454351425171, "learning_rate": 2.9578786463419813e-05, "loss": 0.1073, "step": 8849 }, { "epoch": 2.7055946193824516, "grad_norm": 0.2535572946071625, "learning_rate": 2.9578361852999875e-05, "loss": 0.0645, "step": 8850 }, { "epoch": 2.7059003362885967, "grad_norm": 0.28957599401474, "learning_rate": 2.9577937242579934e-05, "loss": 0.0786, "step": 8851 }, { "epoch": 2.706206053194742, "grad_norm": 0.6146442294120789, "learning_rate": 2.9577512632159996e-05, "loss": 0.0836, "step": 8852 }, { "epoch": 2.7065117701008865, "grad_norm": 0.5600945353507996, "learning_rate": 2.9577088021740055e-05, "loss": 0.0972, "step": 8853 }, { "epoch": 2.7068174870070316, "grad_norm": 0.7215248942375183, "learning_rate": 2.9576663411320117e-05, "loss": 0.1059, "step": 8854 }, { "epoch": 2.7071232039131763, "grad_norm": 0.32065945863723755, "learning_rate": 2.9576238800900176e-05, "loss": 0.107, "step": 8855 }, { "epoch": 2.7074289208193214, "grad_norm": 0.3924027681350708, "learning_rate": 2.9575814190480234e-05, "loss": 0.1366, "step": 8856 }, { "epoch": 2.707734637725466, "grad_norm": 0.4357604384422302, "learning_rate": 2.9575389580060297e-05, "loss": 0.1437, "step": 8857 }, { "epoch": 2.708040354631611, "grad_norm": 0.5384348034858704, "learning_rate": 2.9574964969640355e-05, "loss": 0.1522, "step": 8858 }, { "epoch": 2.708346071537756, "grad_norm": 0.5754145383834839, "learning_rate": 2.9574540359220417e-05, "loss": 0.1701, "step": 8859 }, { "epoch": 2.708651788443901, "grad_norm": 0.5559586882591248, "learning_rate": 2.9574115748800476e-05, "loss": 0.2152, "step": 8860 }, { "epoch": 2.708957505350046, "grad_norm": 0.8741341829299927, "learning_rate": 2.9573691138380538e-05, "loss": 0.1708, "step": 8861 }, { "epoch": 2.7092632222561908, "grad_norm": 0.6337645053863525, "learning_rate": 2.9573266527960597e-05, "loss": 0.1915, "step": 8862 }, { "epoch": 2.7095689391623354, "grad_norm": 1.1169265508651733, "learning_rate": 2.957284191754066e-05, "loss": 0.1896, "step": 8863 }, { "epoch": 2.7098746560684805, "grad_norm": 0.890170156955719, "learning_rate": 2.9572417307120718e-05, "loss": 0.2319, "step": 8864 }, { "epoch": 2.7101803729746257, "grad_norm": 0.8638143539428711, "learning_rate": 2.957199269670078e-05, "loss": 0.2092, "step": 8865 }, { "epoch": 2.7104860898807703, "grad_norm": 0.9246222376823425, "learning_rate": 2.957156808628084e-05, "loss": 0.2347, "step": 8866 }, { "epoch": 2.7107918067869154, "grad_norm": 1.2546696662902832, "learning_rate": 2.9571143475860897e-05, "loss": 0.2889, "step": 8867 }, { "epoch": 2.71109752369306, "grad_norm": 0.38382813334465027, "learning_rate": 2.957071886544096e-05, "loss": 0.1705, "step": 8868 }, { "epoch": 2.7114032405992052, "grad_norm": 0.28760892152786255, "learning_rate": 2.9570294255021018e-05, "loss": 0.0718, "step": 8869 }, { "epoch": 2.71170895750535, "grad_norm": 0.25527772307395935, "learning_rate": 2.956986964460108e-05, "loss": 0.0756, "step": 8870 }, { "epoch": 2.712014674411495, "grad_norm": 0.34373828768730164, "learning_rate": 2.956944503418114e-05, "loss": 0.0655, "step": 8871 }, { "epoch": 2.7123203913176397, "grad_norm": 0.29131895303726196, "learning_rate": 2.95690204237612e-05, "loss": 0.0516, "step": 8872 }, { "epoch": 2.712626108223785, "grad_norm": 0.24687030911445618, "learning_rate": 2.956859581334126e-05, "loss": 0.0643, "step": 8873 }, { "epoch": 2.71293182512993, "grad_norm": 0.3280605375766754, "learning_rate": 2.956817120292132e-05, "loss": 0.07, "step": 8874 }, { "epoch": 2.7132375420360746, "grad_norm": 0.4379046857357025, "learning_rate": 2.956774659250138e-05, "loss": 0.0537, "step": 8875 }, { "epoch": 2.7135432589422193, "grad_norm": 0.2661430239677429, "learning_rate": 2.9567321982081442e-05, "loss": 0.0826, "step": 8876 }, { "epoch": 2.7138489758483644, "grad_norm": 0.33354848623275757, "learning_rate": 2.95668973716615e-05, "loss": 0.0635, "step": 8877 }, { "epoch": 2.7141546927545095, "grad_norm": 0.35732489824295044, "learning_rate": 2.9566472761241563e-05, "loss": 0.0967, "step": 8878 }, { "epoch": 2.714460409660654, "grad_norm": 0.38735705614089966, "learning_rate": 2.9566048150821622e-05, "loss": 0.109, "step": 8879 }, { "epoch": 2.7147661265667993, "grad_norm": 0.41104158759117126, "learning_rate": 2.956562354040168e-05, "loss": 0.1056, "step": 8880 }, { "epoch": 2.715071843472944, "grad_norm": 0.5232689380645752, "learning_rate": 2.9565198929981743e-05, "loss": 0.1336, "step": 8881 }, { "epoch": 2.715377560379089, "grad_norm": 0.4056643545627594, "learning_rate": 2.95647743195618e-05, "loss": 0.1239, "step": 8882 }, { "epoch": 2.7156832772852337, "grad_norm": 0.5337862372398376, "learning_rate": 2.9564349709141863e-05, "loss": 0.1682, "step": 8883 }, { "epoch": 2.715988994191379, "grad_norm": 0.554460883140564, "learning_rate": 2.9563925098721922e-05, "loss": 0.1548, "step": 8884 }, { "epoch": 2.7162947110975235, "grad_norm": 0.5066083669662476, "learning_rate": 2.9563500488301984e-05, "loss": 0.1922, "step": 8885 }, { "epoch": 2.7166004280036686, "grad_norm": 3.4592275619506836, "learning_rate": 2.9563075877882043e-05, "loss": 0.2001, "step": 8886 }, { "epoch": 2.7169061449098137, "grad_norm": 0.7689265608787537, "learning_rate": 2.9562651267462105e-05, "loss": 0.1917, "step": 8887 }, { "epoch": 2.7172118618159584, "grad_norm": 0.7712385058403015, "learning_rate": 2.9562226657042164e-05, "loss": 0.206, "step": 8888 }, { "epoch": 2.717517578722103, "grad_norm": 0.9354832172393799, "learning_rate": 2.9561802046622226e-05, "loss": 0.1762, "step": 8889 }, { "epoch": 2.717823295628248, "grad_norm": 1.329920768737793, "learning_rate": 2.9561377436202284e-05, "loss": 0.2385, "step": 8890 }, { "epoch": 2.7181290125343933, "grad_norm": 1.35783052444458, "learning_rate": 2.9560952825782347e-05, "loss": 0.211, "step": 8891 }, { "epoch": 2.718434729440538, "grad_norm": 1.3804950714111328, "learning_rate": 2.9560528215362405e-05, "loss": 0.2917, "step": 8892 }, { "epoch": 2.718740446346683, "grad_norm": 0.3331294357776642, "learning_rate": 2.9560103604942464e-05, "loss": 0.1699, "step": 8893 }, { "epoch": 2.7190461632528278, "grad_norm": 0.2714889943599701, "learning_rate": 2.9559678994522526e-05, "loss": 0.0985, "step": 8894 }, { "epoch": 2.719351880158973, "grad_norm": 0.382799357175827, "learning_rate": 2.9559254384102585e-05, "loss": 0.0909, "step": 8895 }, { "epoch": 2.7196575970651176, "grad_norm": 0.40621551871299744, "learning_rate": 2.9558829773682647e-05, "loss": 0.0659, "step": 8896 }, { "epoch": 2.7199633139712627, "grad_norm": 0.3989184498786926, "learning_rate": 2.9558405163262706e-05, "loss": 0.085, "step": 8897 }, { "epoch": 2.7202690308774073, "grad_norm": 0.2284633219242096, "learning_rate": 2.9557980552842768e-05, "loss": 0.0564, "step": 8898 }, { "epoch": 2.7205747477835525, "grad_norm": 0.361341655254364, "learning_rate": 2.9557555942422826e-05, "loss": 0.0943, "step": 8899 }, { "epoch": 2.7208804646896976, "grad_norm": 0.2702975273132324, "learning_rate": 2.955713133200289e-05, "loss": 0.0574, "step": 8900 }, { "epoch": 2.7211861815958422, "grad_norm": 0.3364415168762207, "learning_rate": 2.9556706721582947e-05, "loss": 0.09, "step": 8901 }, { "epoch": 2.721491898501987, "grad_norm": 0.3383548855781555, "learning_rate": 2.955628211116301e-05, "loss": 0.0623, "step": 8902 }, { "epoch": 2.721797615408132, "grad_norm": 0.4209989607334137, "learning_rate": 2.9555857500743068e-05, "loss": 0.1208, "step": 8903 }, { "epoch": 2.722103332314277, "grad_norm": 0.2763955891132355, "learning_rate": 2.955543289032313e-05, "loss": 0.0923, "step": 8904 }, { "epoch": 2.722409049220422, "grad_norm": 0.32311055064201355, "learning_rate": 2.955500827990319e-05, "loss": 0.111, "step": 8905 }, { "epoch": 2.722714766126567, "grad_norm": 0.7019002437591553, "learning_rate": 2.9554583669483247e-05, "loss": 0.1622, "step": 8906 }, { "epoch": 2.7230204830327116, "grad_norm": 1.6213916540145874, "learning_rate": 2.955415905906331e-05, "loss": 0.1366, "step": 8907 }, { "epoch": 2.7233261999388567, "grad_norm": 0.5734357237815857, "learning_rate": 2.9553734448643368e-05, "loss": 0.1719, "step": 8908 }, { "epoch": 2.7236319168450014, "grad_norm": 0.7067188024520874, "learning_rate": 2.955330983822343e-05, "loss": 0.1881, "step": 8909 }, { "epoch": 2.7239376337511465, "grad_norm": 0.6177224516868591, "learning_rate": 2.955288522780349e-05, "loss": 0.1816, "step": 8910 }, { "epoch": 2.724243350657291, "grad_norm": 0.8981925249099731, "learning_rate": 2.955246061738355e-05, "loss": 0.2339, "step": 8911 }, { "epoch": 2.7245490675634363, "grad_norm": 0.9672415852546692, "learning_rate": 2.955203600696361e-05, "loss": 0.2221, "step": 8912 }, { "epoch": 2.7248547844695814, "grad_norm": 0.9189679026603699, "learning_rate": 2.9551611396543672e-05, "loss": 0.1904, "step": 8913 }, { "epoch": 2.725160501375726, "grad_norm": 1.4959415197372437, "learning_rate": 2.955118678612373e-05, "loss": 0.2262, "step": 8914 }, { "epoch": 2.7254662182818707, "grad_norm": 0.9269125461578369, "learning_rate": 2.9550762175703793e-05, "loss": 0.202, "step": 8915 }, { "epoch": 2.725771935188016, "grad_norm": 1.3292993307113647, "learning_rate": 2.955033756528385e-05, "loss": 0.2507, "step": 8916 }, { "epoch": 2.726077652094161, "grad_norm": 2.167597770690918, "learning_rate": 2.9549912954863913e-05, "loss": 0.3573, "step": 8917 }, { "epoch": 2.7263833690003056, "grad_norm": 0.7289736270904541, "learning_rate": 2.9549488344443972e-05, "loss": 0.1675, "step": 8918 }, { "epoch": 2.7266890859064508, "grad_norm": 0.2969193160533905, "learning_rate": 2.954906373402403e-05, "loss": 0.1006, "step": 8919 }, { "epoch": 2.7269948028125954, "grad_norm": 0.2221265435218811, "learning_rate": 2.9548639123604093e-05, "loss": 0.0582, "step": 8920 }, { "epoch": 2.7273005197187405, "grad_norm": 0.37762650847435, "learning_rate": 2.954821451318415e-05, "loss": 0.0562, "step": 8921 }, { "epoch": 2.727606236624885, "grad_norm": 0.3681592345237732, "learning_rate": 2.9547789902764214e-05, "loss": 0.0669, "step": 8922 }, { "epoch": 2.7279119535310303, "grad_norm": 0.42542171478271484, "learning_rate": 2.9547365292344272e-05, "loss": 0.0651, "step": 8923 }, { "epoch": 2.728217670437175, "grad_norm": 0.36794573068618774, "learning_rate": 2.9546940681924334e-05, "loss": 0.064, "step": 8924 }, { "epoch": 2.72852338734332, "grad_norm": 0.5130842924118042, "learning_rate": 2.9546516071504393e-05, "loss": 0.111, "step": 8925 }, { "epoch": 2.7288291042494652, "grad_norm": 0.29345470666885376, "learning_rate": 2.9546091461084455e-05, "loss": 0.0555, "step": 8926 }, { "epoch": 2.72913482115561, "grad_norm": 0.45940858125686646, "learning_rate": 2.9545666850664514e-05, "loss": 0.0859, "step": 8927 }, { "epoch": 2.7294405380617546, "grad_norm": 0.5929442048072815, "learning_rate": 2.9545242240244576e-05, "loss": 0.1389, "step": 8928 }, { "epoch": 2.7297462549678997, "grad_norm": 0.30663174390792847, "learning_rate": 2.9544817629824635e-05, "loss": 0.0964, "step": 8929 }, { "epoch": 2.730051971874045, "grad_norm": 0.44828304648399353, "learning_rate": 2.9544393019404697e-05, "loss": 0.0889, "step": 8930 }, { "epoch": 2.7303576887801895, "grad_norm": 0.5692293047904968, "learning_rate": 2.954396840898476e-05, "loss": 0.121, "step": 8931 }, { "epoch": 2.7306634056863346, "grad_norm": 1.150504231452942, "learning_rate": 2.9543543798564818e-05, "loss": 0.1342, "step": 8932 }, { "epoch": 2.7309691225924793, "grad_norm": 0.39157330989837646, "learning_rate": 2.954311918814488e-05, "loss": 0.1268, "step": 8933 }, { "epoch": 2.7312748394986244, "grad_norm": 0.47620639204978943, "learning_rate": 2.954269457772494e-05, "loss": 0.1695, "step": 8934 }, { "epoch": 2.731580556404769, "grad_norm": 0.6399511694908142, "learning_rate": 2.9542269967305e-05, "loss": 0.2101, "step": 8935 }, { "epoch": 2.731886273310914, "grad_norm": 1.5325757265090942, "learning_rate": 2.954184535688506e-05, "loss": 0.1844, "step": 8936 }, { "epoch": 2.732191990217059, "grad_norm": 0.5406950116157532, "learning_rate": 2.954142074646512e-05, "loss": 0.1747, "step": 8937 }, { "epoch": 2.732497707123204, "grad_norm": 1.1743888854980469, "learning_rate": 2.954099613604518e-05, "loss": 0.2049, "step": 8938 }, { "epoch": 2.732803424029349, "grad_norm": 2.830111503601074, "learning_rate": 2.9540571525625242e-05, "loss": 0.2174, "step": 8939 }, { "epoch": 2.7331091409354937, "grad_norm": 0.6311935782432556, "learning_rate": 2.95401469152053e-05, "loss": 0.2146, "step": 8940 }, { "epoch": 2.7334148578416384, "grad_norm": 0.973463237285614, "learning_rate": 2.9539722304785363e-05, "loss": 0.2269, "step": 8941 }, { "epoch": 2.7337205747477835, "grad_norm": 1.560346245765686, "learning_rate": 2.953929769436542e-05, "loss": 0.3051, "step": 8942 }, { "epoch": 2.7340262916539286, "grad_norm": 0.4503558576107025, "learning_rate": 2.9538873083945484e-05, "loss": 0.142, "step": 8943 }, { "epoch": 2.7343320085600733, "grad_norm": 0.5899693965911865, "learning_rate": 2.9538448473525542e-05, "loss": 0.0938, "step": 8944 }, { "epoch": 2.7346377254662184, "grad_norm": 0.41330525279045105, "learning_rate": 2.95380238631056e-05, "loss": 0.0671, "step": 8945 }, { "epoch": 2.734943442372363, "grad_norm": 0.4042695164680481, "learning_rate": 2.9537599252685663e-05, "loss": 0.0823, "step": 8946 }, { "epoch": 2.735249159278508, "grad_norm": 0.21299080550670624, "learning_rate": 2.9537174642265722e-05, "loss": 0.0554, "step": 8947 }, { "epoch": 2.735554876184653, "grad_norm": 0.4766937494277954, "learning_rate": 2.9536750031845784e-05, "loss": 0.0747, "step": 8948 }, { "epoch": 2.735860593090798, "grad_norm": 0.259302020072937, "learning_rate": 2.9536325421425843e-05, "loss": 0.0532, "step": 8949 }, { "epoch": 2.7361663099969427, "grad_norm": 0.33531638979911804, "learning_rate": 2.9535900811005905e-05, "loss": 0.0678, "step": 8950 }, { "epoch": 2.7364720269030878, "grad_norm": 0.39097487926483154, "learning_rate": 2.9535476200585963e-05, "loss": 0.0704, "step": 8951 }, { "epoch": 2.736777743809233, "grad_norm": 0.3441881537437439, "learning_rate": 2.9535051590166026e-05, "loss": 0.0795, "step": 8952 }, { "epoch": 2.7370834607153776, "grad_norm": 0.5173120498657227, "learning_rate": 2.9534626979746084e-05, "loss": 0.0691, "step": 8953 }, { "epoch": 2.7373891776215222, "grad_norm": 0.30571407079696655, "learning_rate": 2.9534202369326146e-05, "loss": 0.0928, "step": 8954 }, { "epoch": 2.7376948945276673, "grad_norm": 0.5133305788040161, "learning_rate": 2.9533777758906205e-05, "loss": 0.1031, "step": 8955 }, { "epoch": 2.7380006114338125, "grad_norm": 0.5291617512702942, "learning_rate": 2.9533353148486267e-05, "loss": 0.1402, "step": 8956 }, { "epoch": 2.738306328339957, "grad_norm": 0.4506904184818268, "learning_rate": 2.9532928538066326e-05, "loss": 0.128, "step": 8957 }, { "epoch": 2.7386120452461022, "grad_norm": 1.1799556016921997, "learning_rate": 2.9532503927646384e-05, "loss": 0.1636, "step": 8958 }, { "epoch": 2.738917762152247, "grad_norm": 0.5274310111999512, "learning_rate": 2.9532079317226447e-05, "loss": 0.1606, "step": 8959 }, { "epoch": 2.739223479058392, "grad_norm": 0.8847830891609192, "learning_rate": 2.9531654706806505e-05, "loss": 0.1741, "step": 8960 }, { "epoch": 2.7395291959645367, "grad_norm": 0.881223201751709, "learning_rate": 2.9531230096386567e-05, "loss": 0.1929, "step": 8961 }, { "epoch": 2.739834912870682, "grad_norm": 0.8177556395530701, "learning_rate": 2.9530805485966626e-05, "loss": 0.1873, "step": 8962 }, { "epoch": 2.7401406297768265, "grad_norm": 1.8757119178771973, "learning_rate": 2.9530380875546688e-05, "loss": 0.2133, "step": 8963 }, { "epoch": 2.7404463466829716, "grad_norm": 1.017235517501831, "learning_rate": 2.9529956265126747e-05, "loss": 0.1908, "step": 8964 }, { "epoch": 2.7407520635891167, "grad_norm": 1.0044530630111694, "learning_rate": 2.952953165470681e-05, "loss": 0.1826, "step": 8965 }, { "epoch": 2.7410577804952614, "grad_norm": 1.1419589519500732, "learning_rate": 2.9529107044286868e-05, "loss": 0.2164, "step": 8966 }, { "epoch": 2.741363497401406, "grad_norm": 1.1615439653396606, "learning_rate": 2.952868243386693e-05, "loss": 0.2801, "step": 8967 }, { "epoch": 2.741669214307551, "grad_norm": 0.3924882411956787, "learning_rate": 2.952825782344699e-05, "loss": 0.157, "step": 8968 }, { "epoch": 2.7419749312136963, "grad_norm": 0.3198155164718628, "learning_rate": 2.952783321302705e-05, "loss": 0.0642, "step": 8969 }, { "epoch": 2.742280648119841, "grad_norm": 0.7029460072517395, "learning_rate": 2.952740860260711e-05, "loss": 0.0679, "step": 8970 }, { "epoch": 2.742586365025986, "grad_norm": 0.35470131039619446, "learning_rate": 2.9526983992187168e-05, "loss": 0.0828, "step": 8971 }, { "epoch": 2.7428920819321307, "grad_norm": 0.30400604009628296, "learning_rate": 2.952655938176723e-05, "loss": 0.058, "step": 8972 }, { "epoch": 2.743197798838276, "grad_norm": 0.22630620002746582, "learning_rate": 2.952613477134729e-05, "loss": 0.0643, "step": 8973 }, { "epoch": 2.7435035157444205, "grad_norm": 0.18172794580459595, "learning_rate": 2.952571016092735e-05, "loss": 0.0547, "step": 8974 }, { "epoch": 2.7438092326505656, "grad_norm": 0.3796636164188385, "learning_rate": 2.952528555050741e-05, "loss": 0.0739, "step": 8975 }, { "epoch": 2.7441149495567103, "grad_norm": 0.3629956841468811, "learning_rate": 2.952486094008747e-05, "loss": 0.1016, "step": 8976 }, { "epoch": 2.7444206664628554, "grad_norm": 0.4247543513774872, "learning_rate": 2.952443632966753e-05, "loss": 0.1087, "step": 8977 }, { "epoch": 2.7447263833690005, "grad_norm": 0.6533893346786499, "learning_rate": 2.9524011719247592e-05, "loss": 0.0776, "step": 8978 }, { "epoch": 2.745032100275145, "grad_norm": 0.34743732213974, "learning_rate": 2.952358710882765e-05, "loss": 0.1024, "step": 8979 }, { "epoch": 2.74533781718129, "grad_norm": 0.4860205054283142, "learning_rate": 2.9523162498407713e-05, "loss": 0.1075, "step": 8980 }, { "epoch": 2.745643534087435, "grad_norm": 0.7481884956359863, "learning_rate": 2.9522737887987772e-05, "loss": 0.1408, "step": 8981 }, { "epoch": 2.74594925099358, "grad_norm": 0.3811589479446411, "learning_rate": 2.952231327756783e-05, "loss": 0.1377, "step": 8982 }, { "epoch": 2.746254967899725, "grad_norm": 0.49341949820518494, "learning_rate": 2.9521888667147893e-05, "loss": 0.1403, "step": 8983 }, { "epoch": 2.74656068480587, "grad_norm": 1.750240445137024, "learning_rate": 2.952146405672795e-05, "loss": 0.1585, "step": 8984 }, { "epoch": 2.7468664017120146, "grad_norm": 1.8921160697937012, "learning_rate": 2.9521039446308013e-05, "loss": 0.2013, "step": 8985 }, { "epoch": 2.7471721186181597, "grad_norm": 0.7457348108291626, "learning_rate": 2.9520614835888072e-05, "loss": 0.202, "step": 8986 }, { "epoch": 2.7474778355243044, "grad_norm": 0.9164198637008667, "learning_rate": 2.9520190225468134e-05, "loss": 0.1973, "step": 8987 }, { "epoch": 2.7477835524304495, "grad_norm": 1.2575277090072632, "learning_rate": 2.9519765615048193e-05, "loss": 0.1973, "step": 8988 }, { "epoch": 2.748089269336594, "grad_norm": 0.7955787181854248, "learning_rate": 2.9519341004628255e-05, "loss": 0.1838, "step": 8989 }, { "epoch": 2.7483949862427393, "grad_norm": 0.786929190158844, "learning_rate": 2.9518916394208314e-05, "loss": 0.2394, "step": 8990 }, { "epoch": 2.7487007031488844, "grad_norm": 0.8670465350151062, "learning_rate": 2.9518491783788376e-05, "loss": 0.2347, "step": 8991 }, { "epoch": 2.749006420055029, "grad_norm": 1.0498706102371216, "learning_rate": 2.9518067173368435e-05, "loss": 0.2638, "step": 8992 }, { "epoch": 2.7493121369611737, "grad_norm": 0.39317432045936584, "learning_rate": 2.9517642562948497e-05, "loss": 0.1788, "step": 8993 }, { "epoch": 2.749617853867319, "grad_norm": 0.28476881980895996, "learning_rate": 2.9517217952528555e-05, "loss": 0.0691, "step": 8994 }, { "epoch": 2.749923570773464, "grad_norm": 0.29670849442481995, "learning_rate": 2.9516793342108614e-05, "loss": 0.1112, "step": 8995 }, { "epoch": 2.7502292876796086, "grad_norm": 0.310567706823349, "learning_rate": 2.9516368731688676e-05, "loss": 0.0571, "step": 8996 }, { "epoch": 2.7505350045857537, "grad_norm": 0.2643621861934662, "learning_rate": 2.9515944121268735e-05, "loss": 0.0599, "step": 8997 }, { "epoch": 2.7508407214918984, "grad_norm": 0.2515692710876465, "learning_rate": 2.9515519510848797e-05, "loss": 0.076, "step": 8998 }, { "epoch": 2.7511464383980435, "grad_norm": 0.14842955768108368, "learning_rate": 2.9515094900428856e-05, "loss": 0.041, "step": 8999 }, { "epoch": 2.751452155304188, "grad_norm": 0.6587350964546204, "learning_rate": 2.9514670290008918e-05, "loss": 0.0627, "step": 9000 }, { "epoch": 2.751452155304188, "eval_cer": 0.19201771900537898, "eval_loss": 0.25483831763267517, "eval_runtime": 19.0005, "eval_samples_per_second": 238.836, "eval_steps_per_second": 0.789, "eval_wer": 0.3412753833691796, "step": 9000 }, { "epoch": 2.7517578722103333, "grad_norm": 0.49182531237602234, "learning_rate": 2.9514245679588976e-05, "loss": 0.0867, "step": 9001 }, { "epoch": 2.752063589116478, "grad_norm": 0.8678860664367676, "learning_rate": 2.951382106916904e-05, "loss": 0.0823, "step": 9002 }, { "epoch": 2.752369306022623, "grad_norm": 0.6429328322410583, "learning_rate": 2.9513396458749097e-05, "loss": 0.0876, "step": 9003 }, { "epoch": 2.752675022928768, "grad_norm": 0.4217776656150818, "learning_rate": 2.951297184832916e-05, "loss": 0.0968, "step": 9004 }, { "epoch": 2.752980739834913, "grad_norm": 0.7343816757202148, "learning_rate": 2.9512547237909218e-05, "loss": 0.1043, "step": 9005 }, { "epoch": 2.7532864567410575, "grad_norm": 0.4483078420162201, "learning_rate": 2.951212262748928e-05, "loss": 0.1602, "step": 9006 }, { "epoch": 2.7535921736472027, "grad_norm": 0.5138184428215027, "learning_rate": 2.951169801706934e-05, "loss": 0.1404, "step": 9007 }, { "epoch": 2.7538978905533478, "grad_norm": 0.4661647379398346, "learning_rate": 2.9511273406649397e-05, "loss": 0.1424, "step": 9008 }, { "epoch": 2.7542036074594924, "grad_norm": 0.5417746901512146, "learning_rate": 2.951084879622946e-05, "loss": 0.171, "step": 9009 }, { "epoch": 2.7545093243656376, "grad_norm": 0.6665751338005066, "learning_rate": 2.9510424185809518e-05, "loss": 0.1646, "step": 9010 }, { "epoch": 2.7548150412717822, "grad_norm": 0.8331485390663147, "learning_rate": 2.950999957538958e-05, "loss": 0.23, "step": 9011 }, { "epoch": 2.7551207581779273, "grad_norm": 0.5234645009040833, "learning_rate": 2.950957496496964e-05, "loss": 0.1926, "step": 9012 }, { "epoch": 2.755426475084072, "grad_norm": 1.4614323377609253, "learning_rate": 2.95091503545497e-05, "loss": 0.1994, "step": 9013 }, { "epoch": 2.755732191990217, "grad_norm": 0.8737764954566956, "learning_rate": 2.950872574412976e-05, "loss": 0.2166, "step": 9014 }, { "epoch": 2.756037908896362, "grad_norm": 1.5223280191421509, "learning_rate": 2.9508301133709822e-05, "loss": 0.2049, "step": 9015 }, { "epoch": 2.756343625802507, "grad_norm": 1.0963563919067383, "learning_rate": 2.950787652328988e-05, "loss": 0.2047, "step": 9016 }, { "epoch": 2.756649342708652, "grad_norm": 2.179558038711548, "learning_rate": 2.9507451912869943e-05, "loss": 0.2804, "step": 9017 }, { "epoch": 2.7569550596147967, "grad_norm": 0.4392877519130707, "learning_rate": 2.950702730245e-05, "loss": 0.1672, "step": 9018 }, { "epoch": 2.7572607765209414, "grad_norm": 0.3778644800186157, "learning_rate": 2.9506602692030063e-05, "loss": 0.0911, "step": 9019 }, { "epoch": 2.7575664934270865, "grad_norm": 0.49859246611595154, "learning_rate": 2.9506178081610122e-05, "loss": 0.0901, "step": 9020 }, { "epoch": 2.7578722103332316, "grad_norm": 0.2865770757198334, "learning_rate": 2.950575347119018e-05, "loss": 0.0557, "step": 9021 }, { "epoch": 2.7581779272393763, "grad_norm": 0.6657822132110596, "learning_rate": 2.9505328860770243e-05, "loss": 0.0653, "step": 9022 }, { "epoch": 2.7584836441455214, "grad_norm": 0.24784083664417267, "learning_rate": 2.95049042503503e-05, "loss": 0.0619, "step": 9023 }, { "epoch": 2.758789361051666, "grad_norm": 0.266912579536438, "learning_rate": 2.9504479639930364e-05, "loss": 0.0468, "step": 9024 }, { "epoch": 2.759095077957811, "grad_norm": 0.38388293981552124, "learning_rate": 2.9504055029510422e-05, "loss": 0.0586, "step": 9025 }, { "epoch": 2.759400794863956, "grad_norm": 0.3397732675075531, "learning_rate": 2.9503630419090485e-05, "loss": 0.0711, "step": 9026 }, { "epoch": 2.759706511770101, "grad_norm": 0.4340386688709259, "learning_rate": 2.9503205808670543e-05, "loss": 0.0958, "step": 9027 }, { "epoch": 2.7600122286762456, "grad_norm": 0.4739077091217041, "learning_rate": 2.9502781198250605e-05, "loss": 0.0988, "step": 9028 }, { "epoch": 2.7603179455823907, "grad_norm": 0.45212918519973755, "learning_rate": 2.9502356587830664e-05, "loss": 0.0913, "step": 9029 }, { "epoch": 2.760623662488536, "grad_norm": 0.49291208386421204, "learning_rate": 2.9501931977410726e-05, "loss": 0.0973, "step": 9030 }, { "epoch": 2.7609293793946805, "grad_norm": 0.6840059757232666, "learning_rate": 2.9501507366990785e-05, "loss": 0.114, "step": 9031 }, { "epoch": 2.761235096300825, "grad_norm": 0.5912116169929504, "learning_rate": 2.9501082756570847e-05, "loss": 0.1293, "step": 9032 }, { "epoch": 2.7615408132069703, "grad_norm": 1.0287690162658691, "learning_rate": 2.950065814615091e-05, "loss": 0.1849, "step": 9033 }, { "epoch": 2.7618465301131154, "grad_norm": 0.7677445411682129, "learning_rate": 2.9500233535730968e-05, "loss": 0.149, "step": 9034 }, { "epoch": 2.76215224701926, "grad_norm": 0.7305231690406799, "learning_rate": 2.949980892531103e-05, "loss": 0.1625, "step": 9035 }, { "epoch": 2.762457963925405, "grad_norm": 0.9717433452606201, "learning_rate": 2.949938431489109e-05, "loss": 0.1748, "step": 9036 }, { "epoch": 2.76276368083155, "grad_norm": 1.2862850427627563, "learning_rate": 2.949895970447115e-05, "loss": 0.2231, "step": 9037 }, { "epoch": 2.763069397737695, "grad_norm": 0.7835288643836975, "learning_rate": 2.949853509405121e-05, "loss": 0.1989, "step": 9038 }, { "epoch": 2.7633751146438397, "grad_norm": 0.9725880026817322, "learning_rate": 2.949811048363127e-05, "loss": 0.2474, "step": 9039 }, { "epoch": 2.763680831549985, "grad_norm": 1.2709404230117798, "learning_rate": 2.949768587321133e-05, "loss": 0.2292, "step": 9040 }, { "epoch": 2.7639865484561295, "grad_norm": 1.3110207319259644, "learning_rate": 2.9497261262791392e-05, "loss": 0.2924, "step": 9041 }, { "epoch": 2.7642922653622746, "grad_norm": 1.3591563701629639, "learning_rate": 2.949683665237145e-05, "loss": 0.2951, "step": 9042 }, { "epoch": 2.7645979822684197, "grad_norm": 0.4792231619358063, "learning_rate": 2.9496412041951513e-05, "loss": 0.1606, "step": 9043 }, { "epoch": 2.7649036991745644, "grad_norm": 0.5724527835845947, "learning_rate": 2.949598743153157e-05, "loss": 0.1063, "step": 9044 }, { "epoch": 2.765209416080709, "grad_norm": 0.29675379395484924, "learning_rate": 2.9495562821111634e-05, "loss": 0.0759, "step": 9045 }, { "epoch": 2.765515132986854, "grad_norm": 0.2943418323993683, "learning_rate": 2.9495138210691692e-05, "loss": 0.0684, "step": 9046 }, { "epoch": 2.7658208498929993, "grad_norm": 0.702014148235321, "learning_rate": 2.949471360027175e-05, "loss": 0.0765, "step": 9047 }, { "epoch": 2.766126566799144, "grad_norm": 0.2939695417881012, "learning_rate": 2.9494288989851813e-05, "loss": 0.0424, "step": 9048 }, { "epoch": 2.766432283705289, "grad_norm": 0.35690099000930786, "learning_rate": 2.9493864379431872e-05, "loss": 0.0709, "step": 9049 }, { "epoch": 2.7667380006114337, "grad_norm": 0.5490335822105408, "learning_rate": 2.9493439769011934e-05, "loss": 0.1115, "step": 9050 }, { "epoch": 2.767043717517579, "grad_norm": 0.48634272813796997, "learning_rate": 2.9493015158591993e-05, "loss": 0.0951, "step": 9051 }, { "epoch": 2.7673494344237235, "grad_norm": 0.26974642276763916, "learning_rate": 2.9492590548172055e-05, "loss": 0.0646, "step": 9052 }, { "epoch": 2.7676551513298686, "grad_norm": 0.574059784412384, "learning_rate": 2.9492165937752113e-05, "loss": 0.1005, "step": 9053 }, { "epoch": 2.7679608682360133, "grad_norm": 0.5247287750244141, "learning_rate": 2.9491741327332176e-05, "loss": 0.1098, "step": 9054 }, { "epoch": 2.7682665851421584, "grad_norm": 0.6501248478889465, "learning_rate": 2.9491316716912234e-05, "loss": 0.0823, "step": 9055 }, { "epoch": 2.7685723020483035, "grad_norm": 0.6826670169830322, "learning_rate": 2.9490892106492296e-05, "loss": 0.1356, "step": 9056 }, { "epoch": 2.768878018954448, "grad_norm": 0.8570169806480408, "learning_rate": 2.9490467496072355e-05, "loss": 0.1158, "step": 9057 }, { "epoch": 2.769183735860593, "grad_norm": 0.5930564403533936, "learning_rate": 2.9490042885652417e-05, "loss": 0.1476, "step": 9058 }, { "epoch": 2.769489452766738, "grad_norm": 0.9929170608520508, "learning_rate": 2.9489618275232476e-05, "loss": 0.173, "step": 9059 }, { "epoch": 2.769795169672883, "grad_norm": 0.6578879356384277, "learning_rate": 2.9489193664812535e-05, "loss": 0.1723, "step": 9060 }, { "epoch": 2.7701008865790278, "grad_norm": 0.7910533547401428, "learning_rate": 2.9488769054392597e-05, "loss": 0.2019, "step": 9061 }, { "epoch": 2.770406603485173, "grad_norm": 0.6556119322776794, "learning_rate": 2.9488344443972655e-05, "loss": 0.1954, "step": 9062 }, { "epoch": 2.7707123203913175, "grad_norm": 1.0392369031906128, "learning_rate": 2.9487919833552717e-05, "loss": 0.2206, "step": 9063 }, { "epoch": 2.7710180372974627, "grad_norm": 1.0344281196594238, "learning_rate": 2.9487495223132776e-05, "loss": 0.1973, "step": 9064 }, { "epoch": 2.7713237542036073, "grad_norm": 1.1121858358383179, "learning_rate": 2.9487070612712838e-05, "loss": 0.2755, "step": 9065 }, { "epoch": 2.7716294711097524, "grad_norm": 1.2131853103637695, "learning_rate": 2.9486646002292897e-05, "loss": 0.2, "step": 9066 }, { "epoch": 2.771935188015897, "grad_norm": 1.9921181201934814, "learning_rate": 2.948622139187296e-05, "loss": 0.311, "step": 9067 }, { "epoch": 2.7722409049220422, "grad_norm": 0.464420348405838, "learning_rate": 2.9485796781453018e-05, "loss": 0.162, "step": 9068 }, { "epoch": 2.7725466218281873, "grad_norm": 0.36489415168762207, "learning_rate": 2.948537217103308e-05, "loss": 0.0766, "step": 9069 }, { "epoch": 2.772852338734332, "grad_norm": 0.25489723682403564, "learning_rate": 2.948494756061314e-05, "loss": 0.0592, "step": 9070 }, { "epoch": 2.7731580556404767, "grad_norm": 0.48280516266822815, "learning_rate": 2.94845229501932e-05, "loss": 0.081, "step": 9071 }, { "epoch": 2.773463772546622, "grad_norm": 0.43164074420928955, "learning_rate": 2.948409833977326e-05, "loss": 0.0833, "step": 9072 }, { "epoch": 2.773769489452767, "grad_norm": 0.3071483373641968, "learning_rate": 2.9483673729353318e-05, "loss": 0.063, "step": 9073 }, { "epoch": 2.7740752063589116, "grad_norm": 0.31256306171417236, "learning_rate": 2.948324911893338e-05, "loss": 0.0709, "step": 9074 }, { "epoch": 2.7743809232650567, "grad_norm": 0.26656582951545715, "learning_rate": 2.948282450851344e-05, "loss": 0.0777, "step": 9075 }, { "epoch": 2.7746866401712014, "grad_norm": 0.43348684906959534, "learning_rate": 2.94823998980935e-05, "loss": 0.1015, "step": 9076 }, { "epoch": 2.7749923570773465, "grad_norm": 0.3962317705154419, "learning_rate": 2.948197528767356e-05, "loss": 0.0782, "step": 9077 }, { "epoch": 2.775298073983491, "grad_norm": 0.45140698552131653, "learning_rate": 2.948155067725362e-05, "loss": 0.0924, "step": 9078 }, { "epoch": 2.7756037908896363, "grad_norm": 0.6072771549224854, "learning_rate": 2.948112606683368e-05, "loss": 0.0911, "step": 9079 }, { "epoch": 2.775909507795781, "grad_norm": 0.38498011231422424, "learning_rate": 2.9480701456413742e-05, "loss": 0.1002, "step": 9080 }, { "epoch": 2.776215224701926, "grad_norm": 0.43087249994277954, "learning_rate": 2.94802768459938e-05, "loss": 0.1235, "step": 9081 }, { "epoch": 2.776520941608071, "grad_norm": 0.7773375511169434, "learning_rate": 2.9479852235573863e-05, "loss": 0.1355, "step": 9082 }, { "epoch": 2.776826658514216, "grad_norm": 0.5676358342170715, "learning_rate": 2.9479427625153922e-05, "loss": 0.155, "step": 9083 }, { "epoch": 2.7771323754203605, "grad_norm": 0.6001033186912537, "learning_rate": 2.9479003014733984e-05, "loss": 0.1811, "step": 9084 }, { "epoch": 2.7774380923265056, "grad_norm": 0.7846354842185974, "learning_rate": 2.9478578404314043e-05, "loss": 0.1601, "step": 9085 }, { "epoch": 2.7777438092326507, "grad_norm": 0.9073473811149597, "learning_rate": 2.94781537938941e-05, "loss": 0.1947, "step": 9086 }, { "epoch": 2.7780495261387954, "grad_norm": 0.7661173939704895, "learning_rate": 2.9477729183474163e-05, "loss": 0.2202, "step": 9087 }, { "epoch": 2.7783552430449405, "grad_norm": 0.9624272584915161, "learning_rate": 2.9477304573054222e-05, "loss": 0.2036, "step": 9088 }, { "epoch": 2.778660959951085, "grad_norm": 0.8020376563072205, "learning_rate": 2.9476879962634284e-05, "loss": 0.2146, "step": 9089 }, { "epoch": 2.7789666768572303, "grad_norm": 0.9402657151222229, "learning_rate": 2.9476455352214343e-05, "loss": 0.2324, "step": 9090 }, { "epoch": 2.779272393763375, "grad_norm": 1.2601616382598877, "learning_rate": 2.9476030741794405e-05, "loss": 0.2354, "step": 9091 }, { "epoch": 2.77957811066952, "grad_norm": 1.3935338258743286, "learning_rate": 2.9475606131374464e-05, "loss": 0.2965, "step": 9092 }, { "epoch": 2.7798838275756648, "grad_norm": 0.338405042886734, "learning_rate": 2.9475181520954526e-05, "loss": 0.1498, "step": 9093 }, { "epoch": 2.78018954448181, "grad_norm": 0.23259153962135315, "learning_rate": 2.9474756910534585e-05, "loss": 0.0847, "step": 9094 }, { "epoch": 2.780495261387955, "grad_norm": 0.6153401732444763, "learning_rate": 2.9474332300114647e-05, "loss": 0.0922, "step": 9095 }, { "epoch": 2.7808009782940997, "grad_norm": 0.26105669140815735, "learning_rate": 2.9473907689694705e-05, "loss": 0.0665, "step": 9096 }, { "epoch": 2.7811066952002443, "grad_norm": 0.2677977383136749, "learning_rate": 2.9473483079274764e-05, "loss": 0.0581, "step": 9097 }, { "epoch": 2.7814124121063895, "grad_norm": 0.23994605243206024, "learning_rate": 2.9473058468854826e-05, "loss": 0.0599, "step": 9098 }, { "epoch": 2.7817181290125346, "grad_norm": 0.3067789077758789, "learning_rate": 2.9472633858434885e-05, "loss": 0.0653, "step": 9099 }, { "epoch": 2.7820238459186792, "grad_norm": 0.3816576302051544, "learning_rate": 2.9472209248014947e-05, "loss": 0.0521, "step": 9100 }, { "epoch": 2.7823295628248244, "grad_norm": 0.5895141959190369, "learning_rate": 2.9471784637595006e-05, "loss": 0.0895, "step": 9101 }, { "epoch": 2.782635279730969, "grad_norm": 0.8460770845413208, "learning_rate": 2.9471360027175068e-05, "loss": 0.0875, "step": 9102 }, { "epoch": 2.782940996637114, "grad_norm": 0.3949074149131775, "learning_rate": 2.9470935416755126e-05, "loss": 0.097, "step": 9103 }, { "epoch": 2.783246713543259, "grad_norm": 0.3166978359222412, "learning_rate": 2.947051080633519e-05, "loss": 0.1028, "step": 9104 }, { "epoch": 2.783552430449404, "grad_norm": 0.31610217690467834, "learning_rate": 2.9470086195915247e-05, "loss": 0.1067, "step": 9105 }, { "epoch": 2.7838581473555486, "grad_norm": 0.3321707844734192, "learning_rate": 2.946966158549531e-05, "loss": 0.1647, "step": 9106 }, { "epoch": 2.7841638642616937, "grad_norm": 0.77250736951828, "learning_rate": 2.9469236975075368e-05, "loss": 0.1085, "step": 9107 }, { "epoch": 2.784469581167839, "grad_norm": 0.383654922246933, "learning_rate": 2.946881236465543e-05, "loss": 0.1494, "step": 9108 }, { "epoch": 2.7847752980739835, "grad_norm": 0.5074355602264404, "learning_rate": 2.946838775423549e-05, "loss": 0.1636, "step": 9109 }, { "epoch": 2.785081014980128, "grad_norm": 0.6937918663024902, "learning_rate": 2.9467963143815547e-05, "loss": 0.183, "step": 9110 }, { "epoch": 2.7853867318862733, "grad_norm": 0.8772748112678528, "learning_rate": 2.946753853339561e-05, "loss": 0.1828, "step": 9111 }, { "epoch": 2.7856924487924184, "grad_norm": 0.6041117906570435, "learning_rate": 2.9467113922975668e-05, "loss": 0.2189, "step": 9112 }, { "epoch": 2.785998165698563, "grad_norm": 1.5182812213897705, "learning_rate": 2.946668931255573e-05, "loss": 0.2601, "step": 9113 }, { "epoch": 2.786303882604708, "grad_norm": 0.496177613735199, "learning_rate": 2.946626470213579e-05, "loss": 0.1616, "step": 9114 }, { "epoch": 2.786609599510853, "grad_norm": 0.793789267539978, "learning_rate": 2.946584009171585e-05, "loss": 0.2291, "step": 9115 }, { "epoch": 2.786915316416998, "grad_norm": 1.1444648504257202, "learning_rate": 2.946541548129591e-05, "loss": 0.2155, "step": 9116 }, { "epoch": 2.7872210333231426, "grad_norm": 1.9690715074539185, "learning_rate": 2.9464990870875972e-05, "loss": 0.2341, "step": 9117 }, { "epoch": 2.7875267502292878, "grad_norm": 0.46023908257484436, "learning_rate": 2.946456626045603e-05, "loss": 0.1926, "step": 9118 }, { "epoch": 2.7878324671354324, "grad_norm": 0.2575492858886719, "learning_rate": 2.9464141650036093e-05, "loss": 0.0597, "step": 9119 }, { "epoch": 2.7881381840415775, "grad_norm": 0.7502312064170837, "learning_rate": 2.946371703961615e-05, "loss": 0.0754, "step": 9120 }, { "epoch": 2.7884439009477227, "grad_norm": 0.2641070783138275, "learning_rate": 2.9463292429196213e-05, "loss": 0.0559, "step": 9121 }, { "epoch": 2.7887496178538673, "grad_norm": 0.23605790734291077, "learning_rate": 2.9462867818776272e-05, "loss": 0.0509, "step": 9122 }, { "epoch": 2.789055334760012, "grad_norm": 0.2565252482891083, "learning_rate": 2.946244320835633e-05, "loss": 0.0524, "step": 9123 }, { "epoch": 2.789361051666157, "grad_norm": 0.34217673540115356, "learning_rate": 2.9462018597936393e-05, "loss": 0.045, "step": 9124 }, { "epoch": 2.7896667685723022, "grad_norm": 0.3018718957901001, "learning_rate": 2.946159398751645e-05, "loss": 0.0605, "step": 9125 }, { "epoch": 2.789972485478447, "grad_norm": 0.48900216817855835, "learning_rate": 2.9461169377096514e-05, "loss": 0.0768, "step": 9126 }, { "epoch": 2.790278202384592, "grad_norm": 0.4745106101036072, "learning_rate": 2.9460744766676572e-05, "loss": 0.0899, "step": 9127 }, { "epoch": 2.7905839192907367, "grad_norm": 1.3188917636871338, "learning_rate": 2.9460320156256635e-05, "loss": 0.1141, "step": 9128 }, { "epoch": 2.790889636196882, "grad_norm": 1.3194125890731812, "learning_rate": 2.9459895545836693e-05, "loss": 0.1214, "step": 9129 }, { "epoch": 2.7911953531030265, "grad_norm": 0.8253881931304932, "learning_rate": 2.9459470935416755e-05, "loss": 0.0907, "step": 9130 }, { "epoch": 2.7915010700091716, "grad_norm": 0.9033648371696472, "learning_rate": 2.9459046324996814e-05, "loss": 0.1361, "step": 9131 }, { "epoch": 2.7918067869153163, "grad_norm": 0.6259943842887878, "learning_rate": 2.9458621714576876e-05, "loss": 0.1172, "step": 9132 }, { "epoch": 2.7921125038214614, "grad_norm": 0.4866819977760315, "learning_rate": 2.9458197104156935e-05, "loss": 0.1463, "step": 9133 }, { "epoch": 2.7924182207276065, "grad_norm": 0.6190839409828186, "learning_rate": 2.9457772493736997e-05, "loss": 0.1843, "step": 9134 }, { "epoch": 2.792723937633751, "grad_norm": 0.5989601612091064, "learning_rate": 2.9457347883317056e-05, "loss": 0.1449, "step": 9135 }, { "epoch": 2.793029654539896, "grad_norm": 1.3023109436035156, "learning_rate": 2.9456923272897118e-05, "loss": 0.2228, "step": 9136 }, { "epoch": 2.793335371446041, "grad_norm": 0.6002767086029053, "learning_rate": 2.945649866247718e-05, "loss": 0.1771, "step": 9137 }, { "epoch": 2.793641088352186, "grad_norm": 1.314915418624878, "learning_rate": 2.945607405205724e-05, "loss": 0.1732, "step": 9138 }, { "epoch": 2.7939468052583307, "grad_norm": 0.623480498790741, "learning_rate": 2.94556494416373e-05, "loss": 0.1635, "step": 9139 }, { "epoch": 2.794252522164476, "grad_norm": 1.5098024606704712, "learning_rate": 2.945522483121736e-05, "loss": 0.2112, "step": 9140 }, { "epoch": 2.7945582390706205, "grad_norm": 0.9926416277885437, "learning_rate": 2.945480022079742e-05, "loss": 0.2447, "step": 9141 }, { "epoch": 2.7948639559767656, "grad_norm": 2.298102617263794, "learning_rate": 2.945437561037748e-05, "loss": 0.2398, "step": 9142 }, { "epoch": 2.7951696728829103, "grad_norm": 0.48549219965934753, "learning_rate": 2.9453950999957542e-05, "loss": 0.17, "step": 9143 }, { "epoch": 2.7954753897890554, "grad_norm": 0.3633997440338135, "learning_rate": 2.94535263895376e-05, "loss": 0.0864, "step": 9144 }, { "epoch": 2.7957811066952, "grad_norm": 0.21263819932937622, "learning_rate": 2.9453101779117663e-05, "loss": 0.0747, "step": 9145 }, { "epoch": 2.796086823601345, "grad_norm": 0.2491714060306549, "learning_rate": 2.945267716869772e-05, "loss": 0.0724, "step": 9146 }, { "epoch": 2.7963925405074903, "grad_norm": 0.9555745720863342, "learning_rate": 2.9452252558277784e-05, "loss": 0.0648, "step": 9147 }, { "epoch": 2.796698257413635, "grad_norm": 0.16992482542991638, "learning_rate": 2.9451827947857842e-05, "loss": 0.0415, "step": 9148 }, { "epoch": 2.7970039743197797, "grad_norm": 0.30638179183006287, "learning_rate": 2.94514033374379e-05, "loss": 0.087, "step": 9149 }, { "epoch": 2.7973096912259248, "grad_norm": 1.1486810445785522, "learning_rate": 2.9450978727017963e-05, "loss": 0.0697, "step": 9150 }, { "epoch": 2.79761540813207, "grad_norm": 0.5087655782699585, "learning_rate": 2.9450554116598022e-05, "loss": 0.1022, "step": 9151 }, { "epoch": 2.7979211250382146, "grad_norm": 0.2412012219429016, "learning_rate": 2.9450129506178084e-05, "loss": 0.0616, "step": 9152 }, { "epoch": 2.7982268419443597, "grad_norm": 0.375735878944397, "learning_rate": 2.9449704895758143e-05, "loss": 0.1014, "step": 9153 }, { "epoch": 2.7985325588505043, "grad_norm": 0.41409480571746826, "learning_rate": 2.9449280285338205e-05, "loss": 0.1018, "step": 9154 }, { "epoch": 2.7988382757566495, "grad_norm": 0.3772088885307312, "learning_rate": 2.9448855674918263e-05, "loss": 0.105, "step": 9155 }, { "epoch": 2.799143992662794, "grad_norm": 1.7011767625808716, "learning_rate": 2.9448431064498326e-05, "loss": 0.1354, "step": 9156 }, { "epoch": 2.7994497095689392, "grad_norm": 0.42013275623321533, "learning_rate": 2.9448006454078384e-05, "loss": 0.1134, "step": 9157 }, { "epoch": 2.799755426475084, "grad_norm": 0.620156466960907, "learning_rate": 2.9447581843658446e-05, "loss": 0.1585, "step": 9158 }, { "epoch": 2.800061143381229, "grad_norm": 0.6288834810256958, "learning_rate": 2.9447157233238505e-05, "loss": 0.1828, "step": 9159 }, { "epoch": 2.800366860287374, "grad_norm": 0.5806153416633606, "learning_rate": 2.9446732622818567e-05, "loss": 0.1897, "step": 9160 }, { "epoch": 2.800672577193519, "grad_norm": 0.9978366494178772, "learning_rate": 2.9446308012398626e-05, "loss": 0.1867, "step": 9161 }, { "epoch": 2.8009782940996635, "grad_norm": 0.8078206181526184, "learning_rate": 2.9445883401978685e-05, "loss": 0.2063, "step": 9162 }, { "epoch": 2.8012840110058086, "grad_norm": 4.702993392944336, "learning_rate": 2.9445458791558747e-05, "loss": 0.1677, "step": 9163 }, { "epoch": 2.8015897279119537, "grad_norm": 1.0795179605484009, "learning_rate": 2.9445034181138805e-05, "loss": 0.169, "step": 9164 }, { "epoch": 2.8018954448180984, "grad_norm": 1.2695293426513672, "learning_rate": 2.9444609570718867e-05, "loss": 0.2469, "step": 9165 }, { "epoch": 2.8022011617242435, "grad_norm": 1.1602694988250732, "learning_rate": 2.9444184960298926e-05, "loss": 0.3301, "step": 9166 }, { "epoch": 2.802506878630388, "grad_norm": 1.485474705696106, "learning_rate": 2.9443760349878988e-05, "loss": 0.2857, "step": 9167 }, { "epoch": 2.8028125955365333, "grad_norm": 0.5019502639770508, "learning_rate": 2.9443335739459047e-05, "loss": 0.1992, "step": 9168 }, { "epoch": 2.803118312442678, "grad_norm": 0.3384079337120056, "learning_rate": 2.944291112903911e-05, "loss": 0.0933, "step": 9169 }, { "epoch": 2.803424029348823, "grad_norm": 0.2961510419845581, "learning_rate": 2.9442486518619168e-05, "loss": 0.0715, "step": 9170 }, { "epoch": 2.8037297462549677, "grad_norm": 0.5455858707427979, "learning_rate": 2.944206190819923e-05, "loss": 0.0666, "step": 9171 }, { "epoch": 2.804035463161113, "grad_norm": 0.2870481610298157, "learning_rate": 2.944163729777929e-05, "loss": 0.0674, "step": 9172 }, { "epoch": 2.804341180067258, "grad_norm": 0.3050087094306946, "learning_rate": 2.944121268735935e-05, "loss": 0.0633, "step": 9173 }, { "epoch": 2.8046468969734026, "grad_norm": 0.3560301661491394, "learning_rate": 2.944078807693941e-05, "loss": 0.0567, "step": 9174 }, { "epoch": 2.8049526138795473, "grad_norm": 0.42825883626937866, "learning_rate": 2.9440363466519468e-05, "loss": 0.1097, "step": 9175 }, { "epoch": 2.8052583307856924, "grad_norm": 0.3266555964946747, "learning_rate": 2.943993885609953e-05, "loss": 0.0594, "step": 9176 }, { "epoch": 2.8055640476918375, "grad_norm": 0.2456534206867218, "learning_rate": 2.943951424567959e-05, "loss": 0.0644, "step": 9177 }, { "epoch": 2.805869764597982, "grad_norm": 0.640224814414978, "learning_rate": 2.943908963525965e-05, "loss": 0.0944, "step": 9178 }, { "epoch": 2.8061754815041273, "grad_norm": 0.389900267124176, "learning_rate": 2.943866502483971e-05, "loss": 0.079, "step": 9179 }, { "epoch": 2.806481198410272, "grad_norm": 1.0564411878585815, "learning_rate": 2.943824041441977e-05, "loss": 0.1507, "step": 9180 }, { "epoch": 2.806786915316417, "grad_norm": 0.41925305128097534, "learning_rate": 2.943781580399983e-05, "loss": 0.1391, "step": 9181 }, { "epoch": 2.807092632222562, "grad_norm": 0.5505188703536987, "learning_rate": 2.9437391193579892e-05, "loss": 0.119, "step": 9182 }, { "epoch": 2.807398349128707, "grad_norm": 0.6211351156234741, "learning_rate": 2.943696658315995e-05, "loss": 0.1659, "step": 9183 }, { "epoch": 2.8077040660348516, "grad_norm": 0.43405136466026306, "learning_rate": 2.9436541972740013e-05, "loss": 0.1717, "step": 9184 }, { "epoch": 2.8080097829409967, "grad_norm": 0.6671894192695618, "learning_rate": 2.9436117362320072e-05, "loss": 0.1863, "step": 9185 }, { "epoch": 2.808315499847142, "grad_norm": 0.7860468626022339, "learning_rate": 2.9435692751900134e-05, "loss": 0.2015, "step": 9186 }, { "epoch": 2.8086212167532865, "grad_norm": 0.7573533058166504, "learning_rate": 2.9435268141480193e-05, "loss": 0.1816, "step": 9187 }, { "epoch": 2.808926933659431, "grad_norm": 0.7623786330223083, "learning_rate": 2.943484353106025e-05, "loss": 0.188, "step": 9188 }, { "epoch": 2.8092326505655763, "grad_norm": 0.646364152431488, "learning_rate": 2.9434418920640314e-05, "loss": 0.2392, "step": 9189 }, { "epoch": 2.8095383674717214, "grad_norm": 0.9237139821052551, "learning_rate": 2.9433994310220372e-05, "loss": 0.1759, "step": 9190 }, { "epoch": 2.809844084377866, "grad_norm": 1.2606315612792969, "learning_rate": 2.9433569699800434e-05, "loss": 0.21, "step": 9191 }, { "epoch": 2.810149801284011, "grad_norm": 1.1647380590438843, "learning_rate": 2.9433145089380493e-05, "loss": 0.2101, "step": 9192 }, { "epoch": 2.810455518190156, "grad_norm": 0.3667152225971222, "learning_rate": 2.9432720478960555e-05, "loss": 0.1692, "step": 9193 }, { "epoch": 2.810761235096301, "grad_norm": 0.5493492484092712, "learning_rate": 2.9432295868540614e-05, "loss": 0.1479, "step": 9194 }, { "epoch": 2.8110669520024456, "grad_norm": 0.4163355231285095, "learning_rate": 2.9431871258120676e-05, "loss": 0.0786, "step": 9195 }, { "epoch": 2.8113726689085907, "grad_norm": 0.35091593861579895, "learning_rate": 2.9431446647700735e-05, "loss": 0.0844, "step": 9196 }, { "epoch": 2.8116783858147354, "grad_norm": 0.2441929280757904, "learning_rate": 2.9431022037280797e-05, "loss": 0.0577, "step": 9197 }, { "epoch": 2.8119841027208805, "grad_norm": 0.24336065351963043, "learning_rate": 2.9430597426860855e-05, "loss": 0.0531, "step": 9198 }, { "epoch": 2.8122898196270256, "grad_norm": 0.2049088031053543, "learning_rate": 2.9430172816440917e-05, "loss": 0.0605, "step": 9199 }, { "epoch": 2.8125955365331703, "grad_norm": 0.42569050192832947, "learning_rate": 2.9429748206020976e-05, "loss": 0.0498, "step": 9200 }, { "epoch": 2.812901253439315, "grad_norm": 0.41998741030693054, "learning_rate": 2.9429323595601035e-05, "loss": 0.077, "step": 9201 }, { "epoch": 2.81320697034546, "grad_norm": 0.38017669320106506, "learning_rate": 2.9428898985181097e-05, "loss": 0.0693, "step": 9202 }, { "epoch": 2.813512687251605, "grad_norm": 0.6832528114318848, "learning_rate": 2.9428474374761156e-05, "loss": 0.0886, "step": 9203 }, { "epoch": 2.81381840415775, "grad_norm": 0.3462420403957367, "learning_rate": 2.9428049764341218e-05, "loss": 0.0901, "step": 9204 }, { "epoch": 2.814124121063895, "grad_norm": 0.5087888836860657, "learning_rate": 2.9427625153921276e-05, "loss": 0.1187, "step": 9205 }, { "epoch": 2.8144298379700396, "grad_norm": 0.8404026031494141, "learning_rate": 2.942720054350134e-05, "loss": 0.1353, "step": 9206 }, { "epoch": 2.8147355548761848, "grad_norm": 0.5535116791725159, "learning_rate": 2.9426775933081397e-05, "loss": 0.1454, "step": 9207 }, { "epoch": 2.8150412717823294, "grad_norm": 1.1997708082199097, "learning_rate": 2.942635132266146e-05, "loss": 0.1694, "step": 9208 }, { "epoch": 2.8153469886884745, "grad_norm": 0.48186609148979187, "learning_rate": 2.9425926712241518e-05, "loss": 0.1874, "step": 9209 }, { "epoch": 2.815652705594619, "grad_norm": 0.5714230537414551, "learning_rate": 2.942550210182158e-05, "loss": 0.1562, "step": 9210 }, { "epoch": 2.8159584225007643, "grad_norm": 0.6950520873069763, "learning_rate": 2.942507749140164e-05, "loss": 0.1722, "step": 9211 }, { "epoch": 2.8162641394069095, "grad_norm": 1.5516431331634521, "learning_rate": 2.94246528809817e-05, "loss": 0.1579, "step": 9212 }, { "epoch": 2.816569856313054, "grad_norm": 1.5693957805633545, "learning_rate": 2.942422827056176e-05, "loss": 0.2341, "step": 9213 }, { "epoch": 2.816875573219199, "grad_norm": 0.9654374718666077, "learning_rate": 2.9423803660141818e-05, "loss": 0.2219, "step": 9214 }, { "epoch": 2.817181290125344, "grad_norm": 1.3613910675048828, "learning_rate": 2.942337904972188e-05, "loss": 0.2002, "step": 9215 }, { "epoch": 2.817487007031489, "grad_norm": 0.8023858070373535, "learning_rate": 2.942295443930194e-05, "loss": 0.2299, "step": 9216 }, { "epoch": 2.8177927239376337, "grad_norm": 1.5474176406860352, "learning_rate": 2.9422529828882e-05, "loss": 0.2462, "step": 9217 }, { "epoch": 2.818098440843779, "grad_norm": 0.4818096458911896, "learning_rate": 2.942210521846206e-05, "loss": 0.1677, "step": 9218 }, { "epoch": 2.8184041577499235, "grad_norm": 0.28879210352897644, "learning_rate": 2.9421680608042122e-05, "loss": 0.0791, "step": 9219 }, { "epoch": 2.8187098746560686, "grad_norm": 0.268905907869339, "learning_rate": 2.942125599762218e-05, "loss": 0.0716, "step": 9220 }, { "epoch": 2.8190155915622133, "grad_norm": 0.21558430790901184, "learning_rate": 2.9420831387202243e-05, "loss": 0.0701, "step": 9221 }, { "epoch": 2.8193213084683584, "grad_norm": 0.2588721215724945, "learning_rate": 2.94204067767823e-05, "loss": 0.0669, "step": 9222 }, { "epoch": 2.819627025374503, "grad_norm": 0.24985797703266144, "learning_rate": 2.9419982166362364e-05, "loss": 0.0543, "step": 9223 }, { "epoch": 2.819932742280648, "grad_norm": 0.2880270481109619, "learning_rate": 2.9419557555942422e-05, "loss": 0.082, "step": 9224 }, { "epoch": 2.8202384591867933, "grad_norm": 0.30928659439086914, "learning_rate": 2.941913294552248e-05, "loss": 0.0552, "step": 9225 }, { "epoch": 2.820544176092938, "grad_norm": 0.3307630121707916, "learning_rate": 2.9418708335102543e-05, "loss": 0.0928, "step": 9226 }, { "epoch": 2.8208498929990826, "grad_norm": 0.37016212940216064, "learning_rate": 2.9418283724682602e-05, "loss": 0.0549, "step": 9227 }, { "epoch": 2.8211556099052277, "grad_norm": 0.44208770990371704, "learning_rate": 2.9417859114262664e-05, "loss": 0.1292, "step": 9228 }, { "epoch": 2.821461326811373, "grad_norm": 0.7620301246643066, "learning_rate": 2.9417434503842722e-05, "loss": 0.104, "step": 9229 }, { "epoch": 2.8217670437175175, "grad_norm": 0.3696455657482147, "learning_rate": 2.9417009893422785e-05, "loss": 0.1055, "step": 9230 }, { "epoch": 2.8220727606236626, "grad_norm": 0.46407485008239746, "learning_rate": 2.9416585283002843e-05, "loss": 0.1352, "step": 9231 }, { "epoch": 2.8223784775298073, "grad_norm": 0.3409471809864044, "learning_rate": 2.9416160672582905e-05, "loss": 0.1075, "step": 9232 }, { "epoch": 2.8226841944359524, "grad_norm": 0.3833938539028168, "learning_rate": 2.9415736062162964e-05, "loss": 0.1463, "step": 9233 }, { "epoch": 2.822989911342097, "grad_norm": 0.4025406539440155, "learning_rate": 2.9415311451743026e-05, "loss": 0.1494, "step": 9234 }, { "epoch": 2.823295628248242, "grad_norm": 0.9446231126785278, "learning_rate": 2.9414886841323085e-05, "loss": 0.1842, "step": 9235 }, { "epoch": 2.823601345154387, "grad_norm": 0.4738118648529053, "learning_rate": 2.9414462230903147e-05, "loss": 0.205, "step": 9236 }, { "epoch": 2.823907062060532, "grad_norm": 1.6123896837234497, "learning_rate": 2.9414037620483206e-05, "loss": 0.2172, "step": 9237 }, { "epoch": 2.824212778966677, "grad_norm": 0.765811026096344, "learning_rate": 2.9413613010063268e-05, "loss": 0.2157, "step": 9238 }, { "epoch": 2.8245184958728218, "grad_norm": 0.7256080508232117, "learning_rate": 2.941318839964333e-05, "loss": 0.213, "step": 9239 }, { "epoch": 2.8248242127789664, "grad_norm": 1.0747121572494507, "learning_rate": 2.941276378922339e-05, "loss": 0.1829, "step": 9240 }, { "epoch": 2.8251299296851116, "grad_norm": 0.8751523494720459, "learning_rate": 2.941233917880345e-05, "loss": 0.2218, "step": 9241 }, { "epoch": 2.8254356465912567, "grad_norm": 1.6202682256698608, "learning_rate": 2.941191456838351e-05, "loss": 0.2785, "step": 9242 }, { "epoch": 2.8257413634974013, "grad_norm": 0.49705302715301514, "learning_rate": 2.941148995796357e-05, "loss": 0.1862, "step": 9243 }, { "epoch": 2.8260470804035465, "grad_norm": 0.2921295464038849, "learning_rate": 2.941106534754363e-05, "loss": 0.0994, "step": 9244 }, { "epoch": 2.826352797309691, "grad_norm": 0.4073435068130493, "learning_rate": 2.9410640737123692e-05, "loss": 0.0849, "step": 9245 }, { "epoch": 2.8266585142158362, "grad_norm": 0.34708723425865173, "learning_rate": 2.941021612670375e-05, "loss": 0.0774, "step": 9246 }, { "epoch": 2.826964231121981, "grad_norm": 0.22792288661003113, "learning_rate": 2.9409791516283813e-05, "loss": 0.06, "step": 9247 }, { "epoch": 2.827269948028126, "grad_norm": 0.3800932466983795, "learning_rate": 2.940936690586387e-05, "loss": 0.0764, "step": 9248 }, { "epoch": 2.8275756649342707, "grad_norm": 0.2800470292568207, "learning_rate": 2.9408942295443934e-05, "loss": 0.0702, "step": 9249 }, { "epoch": 2.827881381840416, "grad_norm": 0.38396212458610535, "learning_rate": 2.9408517685023992e-05, "loss": 0.0869, "step": 9250 }, { "epoch": 2.828187098746561, "grad_norm": 0.30783554911613464, "learning_rate": 2.940809307460405e-05, "loss": 0.0967, "step": 9251 }, { "epoch": 2.8284928156527056, "grad_norm": 0.3546639084815979, "learning_rate": 2.9407668464184113e-05, "loss": 0.0931, "step": 9252 }, { "epoch": 2.8287985325588503, "grad_norm": 0.33912888169288635, "learning_rate": 2.9407243853764172e-05, "loss": 0.0821, "step": 9253 }, { "epoch": 2.8291042494649954, "grad_norm": 5.974740505218506, "learning_rate": 2.9406819243344234e-05, "loss": 0.0754, "step": 9254 }, { "epoch": 2.8294099663711405, "grad_norm": 0.40314948558807373, "learning_rate": 2.9406394632924293e-05, "loss": 0.1254, "step": 9255 }, { "epoch": 2.829715683277285, "grad_norm": 0.3585081100463867, "learning_rate": 2.9405970022504355e-05, "loss": 0.1243, "step": 9256 }, { "epoch": 2.8300214001834303, "grad_norm": 0.351232647895813, "learning_rate": 2.9405545412084414e-05, "loss": 0.1227, "step": 9257 }, { "epoch": 2.830327117089575, "grad_norm": 0.8052029609680176, "learning_rate": 2.9405120801664476e-05, "loss": 0.1847, "step": 9258 }, { "epoch": 2.83063283399572, "grad_norm": 2.062041759490967, "learning_rate": 2.9404696191244534e-05, "loss": 0.1799, "step": 9259 }, { "epoch": 2.8309385509018647, "grad_norm": 3.1157004833221436, "learning_rate": 2.9404271580824596e-05, "loss": 0.1576, "step": 9260 }, { "epoch": 2.83124426780801, "grad_norm": 0.651469886302948, "learning_rate": 2.9403846970404655e-05, "loss": 0.2145, "step": 9261 }, { "epoch": 2.8315499847141545, "grad_norm": 0.6753536462783813, "learning_rate": 2.9403422359984717e-05, "loss": 0.1911, "step": 9262 }, { "epoch": 2.8318557016202996, "grad_norm": 0.9763350486755371, "learning_rate": 2.9402997749564776e-05, "loss": 0.2166, "step": 9263 }, { "epoch": 2.8321614185264448, "grad_norm": 1.0122578144073486, "learning_rate": 2.9402573139144835e-05, "loss": 0.2036, "step": 9264 }, { "epoch": 2.8324671354325894, "grad_norm": 2.9091219902038574, "learning_rate": 2.9402148528724897e-05, "loss": 0.2242, "step": 9265 }, { "epoch": 2.832772852338734, "grad_norm": 1.960984230041504, "learning_rate": 2.9401723918304955e-05, "loss": 0.2525, "step": 9266 }, { "epoch": 2.833078569244879, "grad_norm": 2.0184824466705322, "learning_rate": 2.9401299307885017e-05, "loss": 0.3303, "step": 9267 }, { "epoch": 2.8333842861510243, "grad_norm": 0.5924866199493408, "learning_rate": 2.9400874697465076e-05, "loss": 0.1889, "step": 9268 }, { "epoch": 2.833690003057169, "grad_norm": 0.30335038900375366, "learning_rate": 2.9400450087045138e-05, "loss": 0.1121, "step": 9269 }, { "epoch": 2.833995719963314, "grad_norm": 0.26109376549720764, "learning_rate": 2.9400025476625197e-05, "loss": 0.0854, "step": 9270 }, { "epoch": 2.834301436869459, "grad_norm": 0.7717112302780151, "learning_rate": 2.939960086620526e-05, "loss": 0.0722, "step": 9271 }, { "epoch": 2.834607153775604, "grad_norm": 0.2522926330566406, "learning_rate": 2.9399176255785318e-05, "loss": 0.067, "step": 9272 }, { "epoch": 2.8349128706817486, "grad_norm": 0.4380214512348175, "learning_rate": 2.939875164536538e-05, "loss": 0.053, "step": 9273 }, { "epoch": 2.8352185875878937, "grad_norm": 0.33541813492774963, "learning_rate": 2.939832703494544e-05, "loss": 0.0839, "step": 9274 }, { "epoch": 2.8355243044940384, "grad_norm": 0.31845518946647644, "learning_rate": 2.93979024245255e-05, "loss": 0.0771, "step": 9275 }, { "epoch": 2.8358300214001835, "grad_norm": 0.31087958812713623, "learning_rate": 2.939747781410556e-05, "loss": 0.0848, "step": 9276 }, { "epoch": 2.8361357383063286, "grad_norm": 0.2835655212402344, "learning_rate": 2.9397053203685618e-05, "loss": 0.0932, "step": 9277 }, { "epoch": 2.8364414552124733, "grad_norm": 0.7332214713096619, "learning_rate": 2.939662859326568e-05, "loss": 0.1134, "step": 9278 }, { "epoch": 2.836747172118618, "grad_norm": 0.5354810953140259, "learning_rate": 2.939620398284574e-05, "loss": 0.1203, "step": 9279 }, { "epoch": 2.837052889024763, "grad_norm": 0.44387590885162354, "learning_rate": 2.93957793724258e-05, "loss": 0.1044, "step": 9280 }, { "epoch": 2.837358605930908, "grad_norm": 0.49641290307044983, "learning_rate": 2.939535476200586e-05, "loss": 0.1258, "step": 9281 }, { "epoch": 2.837664322837053, "grad_norm": 0.42640188336372375, "learning_rate": 2.939493015158592e-05, "loss": 0.1304, "step": 9282 }, { "epoch": 2.837970039743198, "grad_norm": 0.7955858707427979, "learning_rate": 2.939450554116598e-05, "loss": 0.1875, "step": 9283 }, { "epoch": 2.8382757566493426, "grad_norm": 0.6774759292602539, "learning_rate": 2.9394080930746042e-05, "loss": 0.1538, "step": 9284 }, { "epoch": 2.8385814735554877, "grad_norm": 0.6606000661849976, "learning_rate": 2.93936563203261e-05, "loss": 0.1575, "step": 9285 }, { "epoch": 2.8388871904616324, "grad_norm": 0.8051173090934753, "learning_rate": 2.9393231709906163e-05, "loss": 0.1957, "step": 9286 }, { "epoch": 2.8391929073677775, "grad_norm": 0.6663312911987305, "learning_rate": 2.9392807099486222e-05, "loss": 0.1775, "step": 9287 }, { "epoch": 2.839498624273922, "grad_norm": 1.358551025390625, "learning_rate": 2.9392382489066284e-05, "loss": 0.1912, "step": 9288 }, { "epoch": 2.8398043411800673, "grad_norm": 0.7614307403564453, "learning_rate": 2.9391957878646343e-05, "loss": 0.2235, "step": 9289 }, { "epoch": 2.8401100580862124, "grad_norm": 0.8926811218261719, "learning_rate": 2.93915332682264e-05, "loss": 0.2289, "step": 9290 }, { "epoch": 2.840415774992357, "grad_norm": 1.5036985874176025, "learning_rate": 2.9391108657806464e-05, "loss": 0.3022, "step": 9291 }, { "epoch": 2.8407214918985018, "grad_norm": 1.0525542497634888, "learning_rate": 2.9390684047386522e-05, "loss": 0.2587, "step": 9292 }, { "epoch": 2.841027208804647, "grad_norm": 0.39394938945770264, "learning_rate": 2.9390259436966584e-05, "loss": 0.1561, "step": 9293 }, { "epoch": 2.841332925710792, "grad_norm": 0.3693227767944336, "learning_rate": 2.9389834826546643e-05, "loss": 0.076, "step": 9294 }, { "epoch": 2.8416386426169367, "grad_norm": 0.318780779838562, "learning_rate": 2.9389410216126705e-05, "loss": 0.076, "step": 9295 }, { "epoch": 2.8419443595230818, "grad_norm": 0.3036315441131592, "learning_rate": 2.9388985605706764e-05, "loss": 0.0779, "step": 9296 }, { "epoch": 2.8422500764292264, "grad_norm": 0.5437350869178772, "learning_rate": 2.9388560995286826e-05, "loss": 0.059, "step": 9297 }, { "epoch": 2.8425557933353716, "grad_norm": 0.38995444774627686, "learning_rate": 2.9388136384866885e-05, "loss": 0.0646, "step": 9298 }, { "epoch": 2.8428615102415162, "grad_norm": 0.2822433114051819, "learning_rate": 2.9387711774446947e-05, "loss": 0.043, "step": 9299 }, { "epoch": 2.8431672271476613, "grad_norm": 1.017906904220581, "learning_rate": 2.9387287164027005e-05, "loss": 0.071, "step": 9300 }, { "epoch": 2.843472944053806, "grad_norm": 0.5862139463424683, "learning_rate": 2.9386862553607067e-05, "loss": 0.0994, "step": 9301 }, { "epoch": 2.843778660959951, "grad_norm": 0.3634767234325409, "learning_rate": 2.9386437943187126e-05, "loss": 0.067, "step": 9302 }, { "epoch": 2.8440843778660962, "grad_norm": 0.744791567325592, "learning_rate": 2.9386013332767185e-05, "loss": 0.1382, "step": 9303 }, { "epoch": 2.844390094772241, "grad_norm": 0.3116419315338135, "learning_rate": 2.9385588722347247e-05, "loss": 0.0663, "step": 9304 }, { "epoch": 2.8446958116783856, "grad_norm": 0.4783865809440613, "learning_rate": 2.9385164111927306e-05, "loss": 0.0976, "step": 9305 }, { "epoch": 2.8450015285845307, "grad_norm": 0.46315646171569824, "learning_rate": 2.9384739501507368e-05, "loss": 0.1206, "step": 9306 }, { "epoch": 2.845307245490676, "grad_norm": 0.4426887035369873, "learning_rate": 2.9384314891087426e-05, "loss": 0.1228, "step": 9307 }, { "epoch": 2.8456129623968205, "grad_norm": 0.5174416899681091, "learning_rate": 2.938389028066749e-05, "loss": 0.1468, "step": 9308 }, { "epoch": 2.8459186793029656, "grad_norm": 0.5482811331748962, "learning_rate": 2.9383465670247547e-05, "loss": 0.132, "step": 9309 }, { "epoch": 2.8462243962091103, "grad_norm": 0.660132884979248, "learning_rate": 2.938304105982761e-05, "loss": 0.2062, "step": 9310 }, { "epoch": 2.8465301131152554, "grad_norm": 0.48538199067115784, "learning_rate": 2.9382616449407668e-05, "loss": 0.1974, "step": 9311 }, { "epoch": 2.8468358300214, "grad_norm": 0.7513828277587891, "learning_rate": 2.938219183898773e-05, "loss": 0.1715, "step": 9312 }, { "epoch": 2.847141546927545, "grad_norm": 0.8947213292121887, "learning_rate": 2.938176722856779e-05, "loss": 0.2084, "step": 9313 }, { "epoch": 2.84744726383369, "grad_norm": 0.7549887299537659, "learning_rate": 2.938134261814785e-05, "loss": 0.2451, "step": 9314 }, { "epoch": 2.847752980739835, "grad_norm": 0.7803683280944824, "learning_rate": 2.938091800772791e-05, "loss": 0.1872, "step": 9315 }, { "epoch": 2.84805869764598, "grad_norm": 0.9259335398674011, "learning_rate": 2.9380493397307968e-05, "loss": 0.2577, "step": 9316 }, { "epoch": 2.8483644145521247, "grad_norm": 1.524087905883789, "learning_rate": 2.938006878688803e-05, "loss": 0.2959, "step": 9317 }, { "epoch": 2.8486701314582694, "grad_norm": 0.3407677710056305, "learning_rate": 2.937964417646809e-05, "loss": 0.1486, "step": 9318 }, { "epoch": 2.8489758483644145, "grad_norm": 0.43380704522132874, "learning_rate": 2.937921956604815e-05, "loss": 0.1128, "step": 9319 }, { "epoch": 2.8492815652705596, "grad_norm": 0.4031316339969635, "learning_rate": 2.937879495562821e-05, "loss": 0.0758, "step": 9320 }, { "epoch": 2.8495872821767043, "grad_norm": 0.2456720769405365, "learning_rate": 2.9378370345208272e-05, "loss": 0.0839, "step": 9321 }, { "epoch": 2.8498929990828494, "grad_norm": 0.24485449492931366, "learning_rate": 2.937794573478833e-05, "loss": 0.0913, "step": 9322 }, { "epoch": 2.850198715988994, "grad_norm": 0.3120526075363159, "learning_rate": 2.9377521124368393e-05, "loss": 0.0435, "step": 9323 }, { "epoch": 2.850504432895139, "grad_norm": 0.41041645407676697, "learning_rate": 2.937709651394845e-05, "loss": 0.0636, "step": 9324 }, { "epoch": 2.850810149801284, "grad_norm": 0.35619476437568665, "learning_rate": 2.9376671903528514e-05, "loss": 0.0703, "step": 9325 }, { "epoch": 2.851115866707429, "grad_norm": 0.3049534857273102, "learning_rate": 2.9376247293108572e-05, "loss": 0.0865, "step": 9326 }, { "epoch": 2.8514215836135737, "grad_norm": 0.5125890374183655, "learning_rate": 2.9375822682688634e-05, "loss": 0.0847, "step": 9327 }, { "epoch": 2.851727300519719, "grad_norm": 0.5877946615219116, "learning_rate": 2.9375398072268693e-05, "loss": 0.0839, "step": 9328 }, { "epoch": 2.852033017425864, "grad_norm": 0.9857901334762573, "learning_rate": 2.9374973461848752e-05, "loss": 0.0747, "step": 9329 }, { "epoch": 2.8523387343320086, "grad_norm": 0.6976868510246277, "learning_rate": 2.9374548851428814e-05, "loss": 0.1242, "step": 9330 }, { "epoch": 2.8526444512381532, "grad_norm": 0.5126802921295166, "learning_rate": 2.9374124241008873e-05, "loss": 0.1765, "step": 9331 }, { "epoch": 2.8529501681442984, "grad_norm": 0.4874782860279083, "learning_rate": 2.9373699630588935e-05, "loss": 0.1461, "step": 9332 }, { "epoch": 2.8532558850504435, "grad_norm": 0.38353681564331055, "learning_rate": 2.9373275020168993e-05, "loss": 0.1634, "step": 9333 }, { "epoch": 2.853561601956588, "grad_norm": 0.5358685255050659, "learning_rate": 2.9372850409749055e-05, "loss": 0.1781, "step": 9334 }, { "epoch": 2.8538673188627333, "grad_norm": 0.5760524868965149, "learning_rate": 2.9372425799329114e-05, "loss": 0.2301, "step": 9335 }, { "epoch": 2.854173035768878, "grad_norm": 1.1444388628005981, "learning_rate": 2.9372001188909176e-05, "loss": 0.2222, "step": 9336 }, { "epoch": 2.854478752675023, "grad_norm": 0.6019194722175598, "learning_rate": 2.9371576578489235e-05, "loss": 0.1774, "step": 9337 }, { "epoch": 2.8547844695811677, "grad_norm": 0.8618525266647339, "learning_rate": 2.9371151968069297e-05, "loss": 0.2125, "step": 9338 }, { "epoch": 2.855090186487313, "grad_norm": 1.6781893968582153, "learning_rate": 2.9370727357649356e-05, "loss": 0.2256, "step": 9339 }, { "epoch": 2.8553959033934575, "grad_norm": 0.6802992820739746, "learning_rate": 2.9370302747229418e-05, "loss": 0.2344, "step": 9340 }, { "epoch": 2.8557016202996026, "grad_norm": 2.523951530456543, "learning_rate": 2.936987813680948e-05, "loss": 0.2688, "step": 9341 }, { "epoch": 2.8560073372057477, "grad_norm": 3.396601676940918, "learning_rate": 2.936945352638954e-05, "loss": 0.2999, "step": 9342 }, { "epoch": 2.8563130541118924, "grad_norm": 0.3948565423488617, "learning_rate": 2.93690289159696e-05, "loss": 0.1257, "step": 9343 }, { "epoch": 2.856618771018037, "grad_norm": 0.2988635301589966, "learning_rate": 2.936860430554966e-05, "loss": 0.096, "step": 9344 }, { "epoch": 2.856924487924182, "grad_norm": 0.37700942158699036, "learning_rate": 2.936817969512972e-05, "loss": 0.091, "step": 9345 }, { "epoch": 2.8572302048303273, "grad_norm": 0.24543876945972443, "learning_rate": 2.936775508470978e-05, "loss": 0.0497, "step": 9346 }, { "epoch": 2.857535921736472, "grad_norm": 0.2470775842666626, "learning_rate": 2.9367330474289842e-05, "loss": 0.0663, "step": 9347 }, { "epoch": 2.857841638642617, "grad_norm": 0.2829510271549225, "learning_rate": 2.93669058638699e-05, "loss": 0.0833, "step": 9348 }, { "epoch": 2.8581473555487618, "grad_norm": 0.22487083077430725, "learning_rate": 2.9366481253449963e-05, "loss": 0.0516, "step": 9349 }, { "epoch": 2.858453072454907, "grad_norm": 0.3188299536705017, "learning_rate": 2.9366056643030022e-05, "loss": 0.0657, "step": 9350 }, { "epoch": 2.8587587893610515, "grad_norm": 0.2807416021823883, "learning_rate": 2.9365632032610084e-05, "loss": 0.0684, "step": 9351 }, { "epoch": 2.8590645062671967, "grad_norm": 0.38296815752983093, "learning_rate": 2.9365207422190142e-05, "loss": 0.0731, "step": 9352 }, { "epoch": 2.8593702231733413, "grad_norm": 0.345012366771698, "learning_rate": 2.93647828117702e-05, "loss": 0.1302, "step": 9353 }, { "epoch": 2.8596759400794864, "grad_norm": 0.531414806842804, "learning_rate": 2.9364358201350263e-05, "loss": 0.0869, "step": 9354 }, { "epoch": 2.8599816569856316, "grad_norm": 0.2881336510181427, "learning_rate": 2.9363933590930322e-05, "loss": 0.0814, "step": 9355 }, { "epoch": 2.8602873738917762, "grad_norm": 0.5646059513092041, "learning_rate": 2.9363508980510384e-05, "loss": 0.1572, "step": 9356 }, { "epoch": 2.860593090797921, "grad_norm": 0.4812685549259186, "learning_rate": 2.9363084370090443e-05, "loss": 0.1103, "step": 9357 }, { "epoch": 2.860898807704066, "grad_norm": 0.41516298055648804, "learning_rate": 2.9362659759670505e-05, "loss": 0.1565, "step": 9358 }, { "epoch": 2.861204524610211, "grad_norm": 0.6346100568771362, "learning_rate": 2.9362235149250564e-05, "loss": 0.1879, "step": 9359 }, { "epoch": 2.861510241516356, "grad_norm": 1.2757924795150757, "learning_rate": 2.9361810538830626e-05, "loss": 0.1687, "step": 9360 }, { "epoch": 2.861815958422501, "grad_norm": 0.7485183477401733, "learning_rate": 2.9361385928410684e-05, "loss": 0.1993, "step": 9361 }, { "epoch": 2.8621216753286456, "grad_norm": 0.6038838028907776, "learning_rate": 2.9360961317990746e-05, "loss": 0.1711, "step": 9362 }, { "epoch": 2.8624273922347907, "grad_norm": 0.6524444818496704, "learning_rate": 2.9360536707570805e-05, "loss": 0.2101, "step": 9363 }, { "epoch": 2.8627331091409354, "grad_norm": 0.771824836730957, "learning_rate": 2.9360112097150867e-05, "loss": 0.1871, "step": 9364 }, { "epoch": 2.8630388260470805, "grad_norm": 0.9809243083000183, "learning_rate": 2.9359687486730926e-05, "loss": 0.2158, "step": 9365 }, { "epoch": 2.863344542953225, "grad_norm": 1.5384182929992676, "learning_rate": 2.9359262876310985e-05, "loss": 0.242, "step": 9366 }, { "epoch": 2.8636502598593703, "grad_norm": 1.138316035270691, "learning_rate": 2.9358838265891047e-05, "loss": 0.2531, "step": 9367 }, { "epoch": 2.8639559767655154, "grad_norm": 0.33694931864738464, "learning_rate": 2.9358413655471105e-05, "loss": 0.1345, "step": 9368 }, { "epoch": 2.86426169367166, "grad_norm": 0.35651901364326477, "learning_rate": 2.9357989045051167e-05, "loss": 0.0988, "step": 9369 }, { "epoch": 2.8645674105778047, "grad_norm": 0.2543579936027527, "learning_rate": 2.9357564434631226e-05, "loss": 0.0651, "step": 9370 }, { "epoch": 2.86487312748395, "grad_norm": 0.29807227849960327, "learning_rate": 2.9357139824211288e-05, "loss": 0.0839, "step": 9371 }, { "epoch": 2.865178844390095, "grad_norm": 0.43762946128845215, "learning_rate": 2.9356715213791347e-05, "loss": 0.0817, "step": 9372 }, { "epoch": 2.8654845612962396, "grad_norm": 0.28190964460372925, "learning_rate": 2.935629060337141e-05, "loss": 0.0692, "step": 9373 }, { "epoch": 2.8657902782023847, "grad_norm": 0.2562788128852844, "learning_rate": 2.9355865992951468e-05, "loss": 0.0486, "step": 9374 }, { "epoch": 2.8660959951085294, "grad_norm": 0.42456334829330444, "learning_rate": 2.935544138253153e-05, "loss": 0.0988, "step": 9375 }, { "epoch": 2.8664017120146745, "grad_norm": 0.5119654536247253, "learning_rate": 2.935501677211159e-05, "loss": 0.0751, "step": 9376 }, { "epoch": 2.866707428920819, "grad_norm": 0.2819218635559082, "learning_rate": 2.935459216169165e-05, "loss": 0.0801, "step": 9377 }, { "epoch": 2.8670131458269643, "grad_norm": 0.25208020210266113, "learning_rate": 2.935416755127171e-05, "loss": 0.1268, "step": 9378 }, { "epoch": 2.867318862733109, "grad_norm": 0.376184344291687, "learning_rate": 2.9353742940851768e-05, "loss": 0.093, "step": 9379 }, { "epoch": 2.867624579639254, "grad_norm": 0.34701624512672424, "learning_rate": 2.935331833043183e-05, "loss": 0.1156, "step": 9380 }, { "epoch": 2.867930296545399, "grad_norm": 0.43348759412765503, "learning_rate": 2.935289372001189e-05, "loss": 0.1069, "step": 9381 }, { "epoch": 2.868236013451544, "grad_norm": 0.9236341714859009, "learning_rate": 2.935246910959195e-05, "loss": 0.1515, "step": 9382 }, { "epoch": 2.8685417303576886, "grad_norm": 0.4919392466545105, "learning_rate": 2.935204449917201e-05, "loss": 0.1809, "step": 9383 }, { "epoch": 2.8688474472638337, "grad_norm": 0.4783887565135956, "learning_rate": 2.9351619888752072e-05, "loss": 0.1879, "step": 9384 }, { "epoch": 2.869153164169979, "grad_norm": 0.6934040188789368, "learning_rate": 2.935119527833213e-05, "loss": 0.1819, "step": 9385 }, { "epoch": 2.8694588810761235, "grad_norm": 1.3298730850219727, "learning_rate": 2.9350770667912192e-05, "loss": 0.1986, "step": 9386 }, { "epoch": 2.8697645979822686, "grad_norm": 0.612007737159729, "learning_rate": 2.935034605749225e-05, "loss": 0.2128, "step": 9387 }, { "epoch": 2.8700703148884132, "grad_norm": 1.034974217414856, "learning_rate": 2.9349921447072313e-05, "loss": 0.1914, "step": 9388 }, { "epoch": 2.8703760317945584, "grad_norm": 0.7899174094200134, "learning_rate": 2.9349496836652372e-05, "loss": 0.2018, "step": 9389 }, { "epoch": 2.870681748700703, "grad_norm": 0.6865609288215637, "learning_rate": 2.9349072226232434e-05, "loss": 0.2218, "step": 9390 }, { "epoch": 2.870987465606848, "grad_norm": 1.4801985025405884, "learning_rate": 2.9348647615812493e-05, "loss": 0.2418, "step": 9391 }, { "epoch": 2.871293182512993, "grad_norm": 3.4522109031677246, "learning_rate": 2.934822300539255e-05, "loss": 0.2654, "step": 9392 }, { "epoch": 2.871598899419138, "grad_norm": 0.5717837810516357, "learning_rate": 2.9347798394972614e-05, "loss": 0.1578, "step": 9393 }, { "epoch": 2.871904616325283, "grad_norm": 0.6548184156417847, "learning_rate": 2.9347373784552672e-05, "loss": 0.0847, "step": 9394 }, { "epoch": 2.8722103332314277, "grad_norm": 0.2801792621612549, "learning_rate": 2.9346949174132734e-05, "loss": 0.0779, "step": 9395 }, { "epoch": 2.8725160501375724, "grad_norm": 0.37702786922454834, "learning_rate": 2.9346524563712793e-05, "loss": 0.0788, "step": 9396 }, { "epoch": 2.8728217670437175, "grad_norm": 0.510862410068512, "learning_rate": 2.9346099953292855e-05, "loss": 0.0751, "step": 9397 }, { "epoch": 2.8731274839498626, "grad_norm": 0.25974610447883606, "learning_rate": 2.9345675342872914e-05, "loss": 0.0661, "step": 9398 }, { "epoch": 2.8734332008560073, "grad_norm": 1.3639581203460693, "learning_rate": 2.9345250732452976e-05, "loss": 0.0515, "step": 9399 }, { "epoch": 2.8737389177621524, "grad_norm": 0.21518273651599884, "learning_rate": 2.9344826122033035e-05, "loss": 0.0657, "step": 9400 }, { "epoch": 2.874044634668297, "grad_norm": 0.25976938009262085, "learning_rate": 2.9344401511613097e-05, "loss": 0.0726, "step": 9401 }, { "epoch": 2.874350351574442, "grad_norm": 0.31919941306114197, "learning_rate": 2.9343976901193155e-05, "loss": 0.0789, "step": 9402 }, { "epoch": 2.874656068480587, "grad_norm": 0.36618906259536743, "learning_rate": 2.9343552290773218e-05, "loss": 0.0725, "step": 9403 }, { "epoch": 2.874961785386732, "grad_norm": 0.3844805955886841, "learning_rate": 2.9343127680353276e-05, "loss": 0.0895, "step": 9404 }, { "epoch": 2.8752675022928766, "grad_norm": 0.35622552037239075, "learning_rate": 2.9342703069933335e-05, "loss": 0.0922, "step": 9405 }, { "epoch": 2.8755732191990218, "grad_norm": 0.5648289322853088, "learning_rate": 2.9342278459513397e-05, "loss": 0.127, "step": 9406 }, { "epoch": 2.875878936105167, "grad_norm": 0.5306695699691772, "learning_rate": 2.9341853849093456e-05, "loss": 0.121, "step": 9407 }, { "epoch": 2.8761846530113115, "grad_norm": 0.5191173553466797, "learning_rate": 2.9341429238673518e-05, "loss": 0.1459, "step": 9408 }, { "epoch": 2.876490369917456, "grad_norm": 0.5780380964279175, "learning_rate": 2.9341004628253576e-05, "loss": 0.1495, "step": 9409 }, { "epoch": 2.8767960868236013, "grad_norm": 0.6245042085647583, "learning_rate": 2.934058001783364e-05, "loss": 0.1735, "step": 9410 }, { "epoch": 2.8771018037297464, "grad_norm": 0.6149858236312866, "learning_rate": 2.9340155407413697e-05, "loss": 0.1704, "step": 9411 }, { "epoch": 2.877407520635891, "grad_norm": 1.2681620121002197, "learning_rate": 2.933973079699376e-05, "loss": 0.2374, "step": 9412 }, { "epoch": 2.8777132375420362, "grad_norm": 0.5481357574462891, "learning_rate": 2.9339306186573818e-05, "loss": 0.2157, "step": 9413 }, { "epoch": 2.878018954448181, "grad_norm": 0.7910399436950684, "learning_rate": 2.933888157615388e-05, "loss": 0.2437, "step": 9414 }, { "epoch": 2.878324671354326, "grad_norm": 1.1935598850250244, "learning_rate": 2.933845696573394e-05, "loss": 0.2134, "step": 9415 }, { "epoch": 2.8786303882604707, "grad_norm": 1.6344749927520752, "learning_rate": 2.9338032355314e-05, "loss": 0.2296, "step": 9416 }, { "epoch": 2.878936105166616, "grad_norm": 1.0975459814071655, "learning_rate": 2.933760774489406e-05, "loss": 0.2602, "step": 9417 }, { "epoch": 2.8792418220727605, "grad_norm": 0.3950599730014801, "learning_rate": 2.933718313447412e-05, "loss": 0.1577, "step": 9418 }, { "epoch": 2.8795475389789056, "grad_norm": 0.31424272060394287, "learning_rate": 2.933675852405418e-05, "loss": 0.1095, "step": 9419 }, { "epoch": 2.8798532558850507, "grad_norm": 0.2665090262889862, "learning_rate": 2.933633391363424e-05, "loss": 0.0883, "step": 9420 }, { "epoch": 2.8801589727911954, "grad_norm": 0.20060937106609344, "learning_rate": 2.93359093032143e-05, "loss": 0.0548, "step": 9421 }, { "epoch": 2.88046468969734, "grad_norm": 0.215310201048851, "learning_rate": 2.933548469279436e-05, "loss": 0.0549, "step": 9422 }, { "epoch": 2.880770406603485, "grad_norm": 0.39521628618240356, "learning_rate": 2.9335060082374422e-05, "loss": 0.0746, "step": 9423 }, { "epoch": 2.8810761235096303, "grad_norm": 0.3869701325893402, "learning_rate": 2.933463547195448e-05, "loss": 0.0667, "step": 9424 }, { "epoch": 2.881381840415775, "grad_norm": 0.284091979265213, "learning_rate": 2.9334210861534543e-05, "loss": 0.0496, "step": 9425 }, { "epoch": 2.88168755732192, "grad_norm": 1.0732332468032837, "learning_rate": 2.93337862511146e-05, "loss": 0.0783, "step": 9426 }, { "epoch": 2.8819932742280647, "grad_norm": 0.6570401787757874, "learning_rate": 2.9333361640694664e-05, "loss": 0.0895, "step": 9427 }, { "epoch": 2.88229899113421, "grad_norm": 0.3570442795753479, "learning_rate": 2.9332937030274722e-05, "loss": 0.0841, "step": 9428 }, { "epoch": 2.8826047080403545, "grad_norm": 0.4200623631477356, "learning_rate": 2.9332512419854784e-05, "loss": 0.0963, "step": 9429 }, { "epoch": 2.8829104249464996, "grad_norm": 0.4682164788246155, "learning_rate": 2.9332087809434843e-05, "loss": 0.1188, "step": 9430 }, { "epoch": 2.8832161418526443, "grad_norm": 0.3601558804512024, "learning_rate": 2.9331663199014902e-05, "loss": 0.1393, "step": 9431 }, { "epoch": 2.8835218587587894, "grad_norm": 0.5151785612106323, "learning_rate": 2.9331238588594964e-05, "loss": 0.138, "step": 9432 }, { "epoch": 2.8838275756649345, "grad_norm": 0.4781738817691803, "learning_rate": 2.9330813978175023e-05, "loss": 0.1634, "step": 9433 }, { "epoch": 2.884133292571079, "grad_norm": 0.745117723941803, "learning_rate": 2.9330389367755085e-05, "loss": 0.2053, "step": 9434 }, { "epoch": 2.884439009477224, "grad_norm": 0.7245194911956787, "learning_rate": 2.9329964757335143e-05, "loss": 0.1957, "step": 9435 }, { "epoch": 2.884744726383369, "grad_norm": 0.8676335215568542, "learning_rate": 2.9329540146915205e-05, "loss": 0.2002, "step": 9436 }, { "epoch": 2.885050443289514, "grad_norm": 1.4087234735488892, "learning_rate": 2.9329115536495264e-05, "loss": 0.2023, "step": 9437 }, { "epoch": 2.8853561601956588, "grad_norm": 1.0741760730743408, "learning_rate": 2.9328690926075326e-05, "loss": 0.1737, "step": 9438 }, { "epoch": 2.885661877101804, "grad_norm": 0.7978000044822693, "learning_rate": 2.9328266315655385e-05, "loss": 0.2076, "step": 9439 }, { "epoch": 2.8859675940079486, "grad_norm": 0.8916339874267578, "learning_rate": 2.9327841705235447e-05, "loss": 0.2268, "step": 9440 }, { "epoch": 2.8862733109140937, "grad_norm": 1.2061653137207031, "learning_rate": 2.9327417094815506e-05, "loss": 0.2161, "step": 9441 }, { "epoch": 2.8865790278202383, "grad_norm": 1.2071104049682617, "learning_rate": 2.9326992484395568e-05, "loss": 0.2803, "step": 9442 }, { "epoch": 2.8868847447263835, "grad_norm": 1.0914556980133057, "learning_rate": 2.932656787397563e-05, "loss": 0.1755, "step": 9443 }, { "epoch": 2.887190461632528, "grad_norm": 0.41432756185531616, "learning_rate": 2.932614326355569e-05, "loss": 0.0942, "step": 9444 }, { "epoch": 2.8874961785386732, "grad_norm": 0.3114290237426758, "learning_rate": 2.932571865313575e-05, "loss": 0.0754, "step": 9445 }, { "epoch": 2.8878018954448184, "grad_norm": 0.5128924250602722, "learning_rate": 2.932529404271581e-05, "loss": 0.0588, "step": 9446 }, { "epoch": 2.888107612350963, "grad_norm": 0.2502950131893158, "learning_rate": 2.932486943229587e-05, "loss": 0.0661, "step": 9447 }, { "epoch": 2.8884133292571077, "grad_norm": 0.25631266832351685, "learning_rate": 2.932444482187593e-05, "loss": 0.0531, "step": 9448 }, { "epoch": 2.888719046163253, "grad_norm": 0.2562286853790283, "learning_rate": 2.9324020211455992e-05, "loss": 0.0687, "step": 9449 }, { "epoch": 2.889024763069398, "grad_norm": 0.31203731894493103, "learning_rate": 2.932359560103605e-05, "loss": 0.0933, "step": 9450 }, { "epoch": 2.8893304799755426, "grad_norm": 0.2307712733745575, "learning_rate": 2.9323170990616113e-05, "loss": 0.0614, "step": 9451 }, { "epoch": 2.8896361968816877, "grad_norm": 0.47585350275039673, "learning_rate": 2.9322746380196172e-05, "loss": 0.0788, "step": 9452 }, { "epoch": 2.8899419137878324, "grad_norm": 0.2532622814178467, "learning_rate": 2.9322321769776234e-05, "loss": 0.0538, "step": 9453 }, { "epoch": 2.8902476306939775, "grad_norm": 0.46241241693496704, "learning_rate": 2.9321897159356293e-05, "loss": 0.0971, "step": 9454 }, { "epoch": 2.890553347600122, "grad_norm": 0.2705094516277313, "learning_rate": 2.932147254893635e-05, "loss": 0.1088, "step": 9455 }, { "epoch": 2.8908590645062673, "grad_norm": 0.5919674634933472, "learning_rate": 2.9321047938516413e-05, "loss": 0.0899, "step": 9456 }, { "epoch": 2.891164781412412, "grad_norm": 0.4188600480556488, "learning_rate": 2.9320623328096472e-05, "loss": 0.1297, "step": 9457 }, { "epoch": 2.891470498318557, "grad_norm": 0.35346519947052, "learning_rate": 2.9320198717676534e-05, "loss": 0.1372, "step": 9458 }, { "epoch": 2.891776215224702, "grad_norm": 0.6553148627281189, "learning_rate": 2.9319774107256593e-05, "loss": 0.1724, "step": 9459 }, { "epoch": 2.892081932130847, "grad_norm": 0.4959946870803833, "learning_rate": 2.9319349496836655e-05, "loss": 0.1402, "step": 9460 }, { "epoch": 2.8923876490369915, "grad_norm": 0.9400476217269897, "learning_rate": 2.9318924886416714e-05, "loss": 0.1897, "step": 9461 }, { "epoch": 2.8926933659431366, "grad_norm": 0.47316303849220276, "learning_rate": 2.9318500275996776e-05, "loss": 0.1673, "step": 9462 }, { "epoch": 2.8929990828492818, "grad_norm": 0.9790865182876587, "learning_rate": 2.9318075665576834e-05, "loss": 0.1739, "step": 9463 }, { "epoch": 2.8933047997554264, "grad_norm": 1.2235937118530273, "learning_rate": 2.9317651055156896e-05, "loss": 0.196, "step": 9464 }, { "epoch": 2.8936105166615715, "grad_norm": 0.7811759114265442, "learning_rate": 2.9317226444736955e-05, "loss": 0.1932, "step": 9465 }, { "epoch": 2.893916233567716, "grad_norm": 1.3016515970230103, "learning_rate": 2.9316801834317017e-05, "loss": 0.1889, "step": 9466 }, { "epoch": 2.8942219504738613, "grad_norm": 1.520200490951538, "learning_rate": 2.9316377223897076e-05, "loss": 0.2594, "step": 9467 }, { "epoch": 2.894527667380006, "grad_norm": 0.39950552582740784, "learning_rate": 2.9315952613477135e-05, "loss": 0.176, "step": 9468 }, { "epoch": 2.894833384286151, "grad_norm": 0.29610830545425415, "learning_rate": 2.9315528003057197e-05, "loss": 0.084, "step": 9469 }, { "epoch": 2.895139101192296, "grad_norm": 0.7358863949775696, "learning_rate": 2.9315103392637255e-05, "loss": 0.0667, "step": 9470 }, { "epoch": 2.895444818098441, "grad_norm": 0.4801211655139923, "learning_rate": 2.9314678782217318e-05, "loss": 0.0589, "step": 9471 }, { "epoch": 2.895750535004586, "grad_norm": 0.6334547400474548, "learning_rate": 2.9314254171797376e-05, "loss": 0.0667, "step": 9472 }, { "epoch": 2.8960562519107307, "grad_norm": 0.24267077445983887, "learning_rate": 2.931382956137744e-05, "loss": 0.0566, "step": 9473 }, { "epoch": 2.8963619688168754, "grad_norm": 0.6073476076126099, "learning_rate": 2.9313404950957497e-05, "loss": 0.0677, "step": 9474 }, { "epoch": 2.8966676857230205, "grad_norm": 0.2238348424434662, "learning_rate": 2.931298034053756e-05, "loss": 0.0563, "step": 9475 }, { "epoch": 2.8969734026291656, "grad_norm": 0.32006755471229553, "learning_rate": 2.9312555730117618e-05, "loss": 0.0861, "step": 9476 }, { "epoch": 2.8972791195353103, "grad_norm": 0.2989276945590973, "learning_rate": 2.931213111969768e-05, "loss": 0.0747, "step": 9477 }, { "epoch": 2.8975848364414554, "grad_norm": 0.4777471125125885, "learning_rate": 2.931170650927774e-05, "loss": 0.0821, "step": 9478 }, { "epoch": 2.8978905533476, "grad_norm": 0.603314995765686, "learning_rate": 2.93112818988578e-05, "loss": 0.0943, "step": 9479 }, { "epoch": 2.898196270253745, "grad_norm": 0.3409748077392578, "learning_rate": 2.931085728843786e-05, "loss": 0.0915, "step": 9480 }, { "epoch": 2.89850198715989, "grad_norm": 0.8656413555145264, "learning_rate": 2.9310432678017918e-05, "loss": 0.1078, "step": 9481 }, { "epoch": 2.898807704066035, "grad_norm": 0.5296049118041992, "learning_rate": 2.931000806759798e-05, "loss": 0.142, "step": 9482 }, { "epoch": 2.8991134209721796, "grad_norm": 0.5345261693000793, "learning_rate": 2.930958345717804e-05, "loss": 0.168, "step": 9483 }, { "epoch": 2.8994191378783247, "grad_norm": 0.7198222875595093, "learning_rate": 2.93091588467581e-05, "loss": 0.1947, "step": 9484 }, { "epoch": 2.89972485478447, "grad_norm": 0.7645419239997864, "learning_rate": 2.930873423633816e-05, "loss": 0.1793, "step": 9485 }, { "epoch": 2.9000305716906145, "grad_norm": 0.7308936715126038, "learning_rate": 2.9308309625918222e-05, "loss": 0.1933, "step": 9486 }, { "epoch": 2.900336288596759, "grad_norm": 1.1526949405670166, "learning_rate": 2.930788501549828e-05, "loss": 0.2134, "step": 9487 }, { "epoch": 2.9006420055029043, "grad_norm": 0.953068733215332, "learning_rate": 2.9307460405078343e-05, "loss": 0.185, "step": 9488 }, { "epoch": 2.9009477224090494, "grad_norm": 0.950156569480896, "learning_rate": 2.93070357946584e-05, "loss": 0.2246, "step": 9489 }, { "epoch": 2.901253439315194, "grad_norm": 2.238238573074341, "learning_rate": 2.9306611184238463e-05, "loss": 0.3712, "step": 9490 }, { "epoch": 2.901559156221339, "grad_norm": 1.2850456237792969, "learning_rate": 2.9306186573818522e-05, "loss": 0.1978, "step": 9491 }, { "epoch": 2.901864873127484, "grad_norm": 1.321852684020996, "learning_rate": 2.9305761963398584e-05, "loss": 0.2978, "step": 9492 }, { "epoch": 2.902170590033629, "grad_norm": 0.4139690399169922, "learning_rate": 2.9305337352978643e-05, "loss": 0.1672, "step": 9493 }, { "epoch": 2.9024763069397737, "grad_norm": 0.3234795331954956, "learning_rate": 2.93049127425587e-05, "loss": 0.089, "step": 9494 }, { "epoch": 2.9027820238459188, "grad_norm": 0.4606790244579315, "learning_rate": 2.9304488132138764e-05, "loss": 0.0925, "step": 9495 }, { "epoch": 2.9030877407520634, "grad_norm": 0.3908299207687378, "learning_rate": 2.9304063521718822e-05, "loss": 0.0578, "step": 9496 }, { "epoch": 2.9033934576582086, "grad_norm": 0.8974146842956543, "learning_rate": 2.9303638911298884e-05, "loss": 0.0677, "step": 9497 }, { "epoch": 2.9036991745643537, "grad_norm": 0.30248603224754333, "learning_rate": 2.9303214300878943e-05, "loss": 0.0582, "step": 9498 }, { "epoch": 2.9040048914704983, "grad_norm": 0.23517438769340515, "learning_rate": 2.9302789690459005e-05, "loss": 0.0776, "step": 9499 }, { "epoch": 2.904310608376643, "grad_norm": 0.22657829523086548, "learning_rate": 2.9302365080039064e-05, "loss": 0.0599, "step": 9500 }, { "epoch": 2.904616325282788, "grad_norm": 0.3924674987792969, "learning_rate": 2.9301940469619126e-05, "loss": 0.0779, "step": 9501 }, { "epoch": 2.9049220421889332, "grad_norm": 0.3648838996887207, "learning_rate": 2.9301515859199185e-05, "loss": 0.0887, "step": 9502 }, { "epoch": 2.905227759095078, "grad_norm": 0.5795908570289612, "learning_rate": 2.9301091248779247e-05, "loss": 0.0873, "step": 9503 }, { "epoch": 2.905533476001223, "grad_norm": 0.37225741147994995, "learning_rate": 2.9300666638359305e-05, "loss": 0.0927, "step": 9504 }, { "epoch": 2.9058391929073677, "grad_norm": 0.4438576400279999, "learning_rate": 2.9300242027939368e-05, "loss": 0.129, "step": 9505 }, { "epoch": 2.906144909813513, "grad_norm": 0.6251547336578369, "learning_rate": 2.9299817417519426e-05, "loss": 0.1013, "step": 9506 }, { "epoch": 2.9064506267196575, "grad_norm": 0.7763168811798096, "learning_rate": 2.9299392807099485e-05, "loss": 0.1464, "step": 9507 }, { "epoch": 2.9067563436258026, "grad_norm": 1.3129101991653442, "learning_rate": 2.9298968196679547e-05, "loss": 0.1751, "step": 9508 }, { "epoch": 2.9070620605319473, "grad_norm": 0.7433951497077942, "learning_rate": 2.9298543586259606e-05, "loss": 0.179, "step": 9509 }, { "epoch": 2.9073677774380924, "grad_norm": 0.5733801126480103, "learning_rate": 2.9298118975839668e-05, "loss": 0.1646, "step": 9510 }, { "epoch": 2.9076734943442375, "grad_norm": 0.6603352427482605, "learning_rate": 2.9297694365419727e-05, "loss": 0.1825, "step": 9511 }, { "epoch": 2.907979211250382, "grad_norm": 1.1890414953231812, "learning_rate": 2.929726975499979e-05, "loss": 0.2028, "step": 9512 }, { "epoch": 2.908284928156527, "grad_norm": 0.6553481817245483, "learning_rate": 2.9296845144579847e-05, "loss": 0.1722, "step": 9513 }, { "epoch": 2.908590645062672, "grad_norm": 0.7633565664291382, "learning_rate": 2.929642053415991e-05, "loss": 0.2266, "step": 9514 }, { "epoch": 2.908896361968817, "grad_norm": 0.8858582377433777, "learning_rate": 2.9295995923739968e-05, "loss": 0.1995, "step": 9515 }, { "epoch": 2.9092020788749617, "grad_norm": 1.4052703380584717, "learning_rate": 2.929557131332003e-05, "loss": 0.2084, "step": 9516 }, { "epoch": 2.909507795781107, "grad_norm": 4.411797523498535, "learning_rate": 2.929514670290009e-05, "loss": 0.2795, "step": 9517 }, { "epoch": 2.9098135126872515, "grad_norm": 0.42636868357658386, "learning_rate": 2.929472209248015e-05, "loss": 0.16, "step": 9518 }, { "epoch": 2.9101192295933966, "grad_norm": 0.256923109292984, "learning_rate": 2.929429748206021e-05, "loss": 0.0739, "step": 9519 }, { "epoch": 2.9104249464995413, "grad_norm": 0.31801077723503113, "learning_rate": 2.929387287164027e-05, "loss": 0.0699, "step": 9520 }, { "epoch": 2.9107306634056864, "grad_norm": 0.5439955592155457, "learning_rate": 2.929344826122033e-05, "loss": 0.0705, "step": 9521 }, { "epoch": 2.911036380311831, "grad_norm": 0.2303752899169922, "learning_rate": 2.929302365080039e-05, "loss": 0.064, "step": 9522 }, { "epoch": 2.911342097217976, "grad_norm": 0.2621385157108307, "learning_rate": 2.929259904038045e-05, "loss": 0.064, "step": 9523 }, { "epoch": 2.9116478141241213, "grad_norm": 0.41638481616973877, "learning_rate": 2.929217442996051e-05, "loss": 0.0821, "step": 9524 }, { "epoch": 2.911953531030266, "grad_norm": 0.3954157829284668, "learning_rate": 2.9291749819540572e-05, "loss": 0.0626, "step": 9525 }, { "epoch": 2.9122592479364107, "grad_norm": 0.34886598587036133, "learning_rate": 2.929132520912063e-05, "loss": 0.0735, "step": 9526 }, { "epoch": 2.912564964842556, "grad_norm": 0.2501238286495209, "learning_rate": 2.9290900598700693e-05, "loss": 0.0832, "step": 9527 }, { "epoch": 2.912870681748701, "grad_norm": 0.35949888825416565, "learning_rate": 2.929047598828075e-05, "loss": 0.0836, "step": 9528 }, { "epoch": 2.9131763986548456, "grad_norm": 0.419342964887619, "learning_rate": 2.9290051377860814e-05, "loss": 0.082, "step": 9529 }, { "epoch": 2.9134821155609907, "grad_norm": 0.28890928626060486, "learning_rate": 2.9289626767440872e-05, "loss": 0.085, "step": 9530 }, { "epoch": 2.9137878324671354, "grad_norm": 1.0739959478378296, "learning_rate": 2.9289202157020934e-05, "loss": 0.1568, "step": 9531 }, { "epoch": 2.9140935493732805, "grad_norm": 0.32217293977737427, "learning_rate": 2.9288777546600993e-05, "loss": 0.1335, "step": 9532 }, { "epoch": 2.914399266279425, "grad_norm": 0.560753583908081, "learning_rate": 2.9288352936181052e-05, "loss": 0.1317, "step": 9533 }, { "epoch": 2.9147049831855703, "grad_norm": 0.6119731664657593, "learning_rate": 2.9287928325761114e-05, "loss": 0.194, "step": 9534 }, { "epoch": 2.915010700091715, "grad_norm": 0.9606346487998962, "learning_rate": 2.9287503715341173e-05, "loss": 0.1666, "step": 9535 }, { "epoch": 2.91531641699786, "grad_norm": 0.6155529618263245, "learning_rate": 2.9287079104921235e-05, "loss": 0.1702, "step": 9536 }, { "epoch": 2.915622133904005, "grad_norm": 0.9791519045829773, "learning_rate": 2.9286654494501293e-05, "loss": 0.1984, "step": 9537 }, { "epoch": 2.91592785081015, "grad_norm": 1.1439716815948486, "learning_rate": 2.9286229884081355e-05, "loss": 0.2214, "step": 9538 }, { "epoch": 2.9162335677162945, "grad_norm": 1.194287896156311, "learning_rate": 2.9285805273661414e-05, "loss": 0.1859, "step": 9539 }, { "epoch": 2.9165392846224396, "grad_norm": 0.9337407350540161, "learning_rate": 2.9285380663241476e-05, "loss": 0.2202, "step": 9540 }, { "epoch": 2.9168450015285847, "grad_norm": 0.882502555847168, "learning_rate": 2.9284956052821535e-05, "loss": 0.1952, "step": 9541 }, { "epoch": 2.9171507184347294, "grad_norm": 1.7699084281921387, "learning_rate": 2.9284531442401597e-05, "loss": 0.2924, "step": 9542 }, { "epoch": 2.9174564353408745, "grad_norm": 0.50885409116745, "learning_rate": 2.9284106831981656e-05, "loss": 0.1603, "step": 9543 }, { "epoch": 2.917762152247019, "grad_norm": 0.3810670077800751, "learning_rate": 2.9283682221561718e-05, "loss": 0.1001, "step": 9544 }, { "epoch": 2.9180678691531643, "grad_norm": 0.2724936306476593, "learning_rate": 2.928325761114178e-05, "loss": 0.0817, "step": 9545 }, { "epoch": 2.918373586059309, "grad_norm": 0.3512064218521118, "learning_rate": 2.928283300072184e-05, "loss": 0.0594, "step": 9546 }, { "epoch": 2.918679302965454, "grad_norm": 0.248258575797081, "learning_rate": 2.92824083903019e-05, "loss": 0.0686, "step": 9547 }, { "epoch": 2.9189850198715988, "grad_norm": 0.28581956028938293, "learning_rate": 2.928198377988196e-05, "loss": 0.0514, "step": 9548 }, { "epoch": 2.919290736777744, "grad_norm": 0.39892300963401794, "learning_rate": 2.928155916946202e-05, "loss": 0.0733, "step": 9549 }, { "epoch": 2.919596453683889, "grad_norm": 0.29443350434303284, "learning_rate": 2.928113455904208e-05, "loss": 0.0696, "step": 9550 }, { "epoch": 2.9199021705900337, "grad_norm": 0.586995005607605, "learning_rate": 2.9280709948622142e-05, "loss": 0.0989, "step": 9551 }, { "epoch": 2.9202078874961783, "grad_norm": 0.3335859775543213, "learning_rate": 2.92802853382022e-05, "loss": 0.0703, "step": 9552 }, { "epoch": 2.9205136044023234, "grad_norm": 0.7376115918159485, "learning_rate": 2.9279860727782263e-05, "loss": 0.1362, "step": 9553 }, { "epoch": 2.9208193213084686, "grad_norm": 0.30719152092933655, "learning_rate": 2.9279436117362322e-05, "loss": 0.1141, "step": 9554 }, { "epoch": 2.9211250382146132, "grad_norm": 0.42819657921791077, "learning_rate": 2.9279011506942384e-05, "loss": 0.1048, "step": 9555 }, { "epoch": 2.9214307551207583, "grad_norm": 0.9159184098243713, "learning_rate": 2.9278586896522443e-05, "loss": 0.1535, "step": 9556 }, { "epoch": 2.921736472026903, "grad_norm": 0.8380901217460632, "learning_rate": 2.9278162286102505e-05, "loss": 0.1067, "step": 9557 }, { "epoch": 2.922042188933048, "grad_norm": 0.9876824617385864, "learning_rate": 2.9277737675682563e-05, "loss": 0.1536, "step": 9558 }, { "epoch": 2.922347905839193, "grad_norm": 1.3185901641845703, "learning_rate": 2.9277313065262622e-05, "loss": 0.1689, "step": 9559 }, { "epoch": 2.922653622745338, "grad_norm": 0.636849582195282, "learning_rate": 2.9276888454842684e-05, "loss": 0.1452, "step": 9560 }, { "epoch": 2.9229593396514826, "grad_norm": 0.8813018202781677, "learning_rate": 2.9276463844422743e-05, "loss": 0.2028, "step": 9561 }, { "epoch": 2.9232650565576277, "grad_norm": 0.6528657674789429, "learning_rate": 2.9276039234002805e-05, "loss": 0.1975, "step": 9562 }, { "epoch": 2.923570773463773, "grad_norm": 1.499597191810608, "learning_rate": 2.9275614623582864e-05, "loss": 0.2105, "step": 9563 }, { "epoch": 2.9238764903699175, "grad_norm": 1.1166744232177734, "learning_rate": 2.9275190013162926e-05, "loss": 0.1703, "step": 9564 }, { "epoch": 2.924182207276062, "grad_norm": 1.3889509439468384, "learning_rate": 2.9274765402742984e-05, "loss": 0.2149, "step": 9565 }, { "epoch": 2.9244879241822073, "grad_norm": 2.0468854904174805, "learning_rate": 2.9274340792323046e-05, "loss": 0.2016, "step": 9566 }, { "epoch": 2.9247936410883524, "grad_norm": 1.1911295652389526, "learning_rate": 2.9273916181903105e-05, "loss": 0.2799, "step": 9567 }, { "epoch": 2.925099357994497, "grad_norm": 0.531600296497345, "learning_rate": 2.9273491571483167e-05, "loss": 0.1626, "step": 9568 }, { "epoch": 2.925405074900642, "grad_norm": 0.29791873693466187, "learning_rate": 2.9273066961063226e-05, "loss": 0.0886, "step": 9569 }, { "epoch": 2.925710791806787, "grad_norm": 0.8431077599525452, "learning_rate": 2.9272642350643285e-05, "loss": 0.0801, "step": 9570 }, { "epoch": 2.926016508712932, "grad_norm": 0.421640008687973, "learning_rate": 2.9272217740223347e-05, "loss": 0.0664, "step": 9571 }, { "epoch": 2.9263222256190766, "grad_norm": 0.19654053449630737, "learning_rate": 2.9271793129803405e-05, "loss": 0.0597, "step": 9572 }, { "epoch": 2.9266279425252217, "grad_norm": 0.38030102849006653, "learning_rate": 2.9271368519383468e-05, "loss": 0.0608, "step": 9573 }, { "epoch": 2.9269336594313664, "grad_norm": 0.445823609828949, "learning_rate": 2.9270943908963526e-05, "loss": 0.0536, "step": 9574 }, { "epoch": 2.9272393763375115, "grad_norm": 0.37717175483703613, "learning_rate": 2.927051929854359e-05, "loss": 0.0775, "step": 9575 }, { "epoch": 2.9275450932436566, "grad_norm": 0.24588361382484436, "learning_rate": 2.9270094688123647e-05, "loss": 0.0635, "step": 9576 }, { "epoch": 2.9278508101498013, "grad_norm": 0.3426460325717926, "learning_rate": 2.926967007770371e-05, "loss": 0.0756, "step": 9577 }, { "epoch": 2.928156527055946, "grad_norm": 0.47611966729164124, "learning_rate": 2.9269245467283768e-05, "loss": 0.0992, "step": 9578 }, { "epoch": 2.928462243962091, "grad_norm": 0.28266358375549316, "learning_rate": 2.926882085686383e-05, "loss": 0.0835, "step": 9579 }, { "epoch": 2.928767960868236, "grad_norm": 0.543343722820282, "learning_rate": 2.926839624644389e-05, "loss": 0.0974, "step": 9580 }, { "epoch": 2.929073677774381, "grad_norm": 1.984086275100708, "learning_rate": 2.926797163602395e-05, "loss": 0.17, "step": 9581 }, { "epoch": 2.929379394680526, "grad_norm": 0.5063874125480652, "learning_rate": 2.926754702560401e-05, "loss": 0.1449, "step": 9582 }, { "epoch": 2.9296851115866707, "grad_norm": 0.589968740940094, "learning_rate": 2.9267122415184068e-05, "loss": 0.1927, "step": 9583 }, { "epoch": 2.929990828492816, "grad_norm": 0.503560483455658, "learning_rate": 2.926669780476413e-05, "loss": 0.155, "step": 9584 }, { "epoch": 2.9302965453989605, "grad_norm": 2.1099300384521484, "learning_rate": 2.926627319434419e-05, "loss": 0.1514, "step": 9585 }, { "epoch": 2.9306022623051056, "grad_norm": 0.7924941182136536, "learning_rate": 2.926584858392425e-05, "loss": 0.1869, "step": 9586 }, { "epoch": 2.9309079792112502, "grad_norm": 1.0975288152694702, "learning_rate": 2.926542397350431e-05, "loss": 0.178, "step": 9587 }, { "epoch": 2.9312136961173954, "grad_norm": 1.2116334438323975, "learning_rate": 2.9264999363084372e-05, "loss": 0.1632, "step": 9588 }, { "epoch": 2.9315194130235405, "grad_norm": 1.0730246305465698, "learning_rate": 2.926457475266443e-05, "loss": 0.1734, "step": 9589 }, { "epoch": 2.931825129929685, "grad_norm": 0.8630696535110474, "learning_rate": 2.9264150142244493e-05, "loss": 0.2235, "step": 9590 }, { "epoch": 2.93213084683583, "grad_norm": 1.8164535760879517, "learning_rate": 2.926372553182455e-05, "loss": 0.2119, "step": 9591 }, { "epoch": 2.932436563741975, "grad_norm": 1.5318471193313599, "learning_rate": 2.9263300921404613e-05, "loss": 0.2746, "step": 9592 }, { "epoch": 2.93274228064812, "grad_norm": 0.41075077652931213, "learning_rate": 2.9262876310984672e-05, "loss": 0.1615, "step": 9593 }, { "epoch": 2.9330479975542647, "grad_norm": 0.3874537944793701, "learning_rate": 2.9262451700564734e-05, "loss": 0.0715, "step": 9594 }, { "epoch": 2.93335371446041, "grad_norm": 0.3434481918811798, "learning_rate": 2.9262027090144793e-05, "loss": 0.0837, "step": 9595 }, { "epoch": 2.9336594313665545, "grad_norm": 1.0147613286972046, "learning_rate": 2.926160247972485e-05, "loss": 0.0577, "step": 9596 }, { "epoch": 2.9339651482726996, "grad_norm": 0.36902227997779846, "learning_rate": 2.9261177869304914e-05, "loss": 0.0413, "step": 9597 }, { "epoch": 2.9342708651788443, "grad_norm": 0.500054121017456, "learning_rate": 2.9260753258884972e-05, "loss": 0.0899, "step": 9598 }, { "epoch": 2.9345765820849894, "grad_norm": 0.5876935124397278, "learning_rate": 2.9260328648465034e-05, "loss": 0.0838, "step": 9599 }, { "epoch": 2.934882298991134, "grad_norm": 0.3902348577976227, "learning_rate": 2.9259904038045093e-05, "loss": 0.0694, "step": 9600 }, { "epoch": 2.935188015897279, "grad_norm": 0.5652562975883484, "learning_rate": 2.9259479427625155e-05, "loss": 0.0876, "step": 9601 }, { "epoch": 2.9354937328034243, "grad_norm": 0.9254360795021057, "learning_rate": 2.9259054817205214e-05, "loss": 0.0847, "step": 9602 }, { "epoch": 2.935799449709569, "grad_norm": 0.40137431025505066, "learning_rate": 2.9258630206785276e-05, "loss": 0.1006, "step": 9603 }, { "epoch": 2.9361051666157136, "grad_norm": 0.9886817932128906, "learning_rate": 2.9258205596365335e-05, "loss": 0.1076, "step": 9604 }, { "epoch": 2.9364108835218588, "grad_norm": 0.8787093162536621, "learning_rate": 2.9257780985945397e-05, "loss": 0.1248, "step": 9605 }, { "epoch": 2.936716600428004, "grad_norm": 0.43787881731987, "learning_rate": 2.9257356375525455e-05, "loss": 0.1071, "step": 9606 }, { "epoch": 2.9370223173341485, "grad_norm": 0.7101043462753296, "learning_rate": 2.9256931765105518e-05, "loss": 0.1712, "step": 9607 }, { "epoch": 2.9373280342402937, "grad_norm": 1.4172983169555664, "learning_rate": 2.9256507154685576e-05, "loss": 0.1364, "step": 9608 }, { "epoch": 2.9376337511464383, "grad_norm": 0.616346001625061, "learning_rate": 2.9256082544265635e-05, "loss": 0.1464, "step": 9609 }, { "epoch": 2.9379394680525834, "grad_norm": 1.3580348491668701, "learning_rate": 2.9255657933845697e-05, "loss": 0.2629, "step": 9610 }, { "epoch": 2.938245184958728, "grad_norm": 3.2972400188446045, "learning_rate": 2.9255233323425756e-05, "loss": 0.192, "step": 9611 }, { "epoch": 2.938550901864873, "grad_norm": 15.364459037780762, "learning_rate": 2.9254808713005818e-05, "loss": 0.2129, "step": 9612 }, { "epoch": 2.938856618771018, "grad_norm": 1.3109256029129028, "learning_rate": 2.9254384102585877e-05, "loss": 0.2051, "step": 9613 }, { "epoch": 2.939162335677163, "grad_norm": 0.8432818651199341, "learning_rate": 2.925395949216594e-05, "loss": 0.1988, "step": 9614 }, { "epoch": 2.939468052583308, "grad_norm": 6.274487018585205, "learning_rate": 2.9253534881745997e-05, "loss": 0.1807, "step": 9615 }, { "epoch": 2.939773769489453, "grad_norm": 11.974018096923828, "learning_rate": 2.925311027132606e-05, "loss": 0.2427, "step": 9616 }, { "epoch": 2.9400794863955975, "grad_norm": 1.485700249671936, "learning_rate": 2.9252685660906118e-05, "loss": 0.2657, "step": 9617 }, { "epoch": 2.9403852033017426, "grad_norm": 0.6357463598251343, "learning_rate": 2.925226105048618e-05, "loss": 0.2, "step": 9618 }, { "epoch": 2.9406909202078877, "grad_norm": 0.35212117433547974, "learning_rate": 2.925183644006624e-05, "loss": 0.0647, "step": 9619 }, { "epoch": 2.9409966371140324, "grad_norm": 0.384356290102005, "learning_rate": 2.92514118296463e-05, "loss": 0.1073, "step": 9620 }, { "epoch": 2.9413023540201775, "grad_norm": 0.3798381984233856, "learning_rate": 2.925098721922636e-05, "loss": 0.0706, "step": 9621 }, { "epoch": 2.941608070926322, "grad_norm": 0.5643787384033203, "learning_rate": 2.925056260880642e-05, "loss": 0.0527, "step": 9622 }, { "epoch": 2.9419137878324673, "grad_norm": 0.5512122511863708, "learning_rate": 2.925013799838648e-05, "loss": 0.0554, "step": 9623 }, { "epoch": 2.942219504738612, "grad_norm": 0.32106801867485046, "learning_rate": 2.924971338796654e-05, "loss": 0.0893, "step": 9624 }, { "epoch": 2.942525221644757, "grad_norm": 0.33639660477638245, "learning_rate": 2.92492887775466e-05, "loss": 0.0502, "step": 9625 }, { "epoch": 2.9428309385509017, "grad_norm": 1.2884845733642578, "learning_rate": 2.924886416712666e-05, "loss": 0.1009, "step": 9626 }, { "epoch": 2.943136655457047, "grad_norm": 0.32614773511886597, "learning_rate": 2.9248439556706722e-05, "loss": 0.0656, "step": 9627 }, { "epoch": 2.943442372363192, "grad_norm": 0.6083450317382812, "learning_rate": 2.924801494628678e-05, "loss": 0.0904, "step": 9628 }, { "epoch": 2.9437480892693366, "grad_norm": 0.6526762843132019, "learning_rate": 2.9247590335866843e-05, "loss": 0.0765, "step": 9629 }, { "epoch": 2.9440538061754813, "grad_norm": 0.5361718535423279, "learning_rate": 2.92471657254469e-05, "loss": 0.1015, "step": 9630 }, { "epoch": 2.9443595230816264, "grad_norm": 0.738547146320343, "learning_rate": 2.9246741115026964e-05, "loss": 0.1475, "step": 9631 }, { "epoch": 2.9446652399877715, "grad_norm": 0.5734702944755554, "learning_rate": 2.9246316504607022e-05, "loss": 0.1347, "step": 9632 }, { "epoch": 2.944970956893916, "grad_norm": 1.1149710416793823, "learning_rate": 2.9245891894187084e-05, "loss": 0.1479, "step": 9633 }, { "epoch": 2.9452766738000613, "grad_norm": 0.6299647688865662, "learning_rate": 2.9245467283767143e-05, "loss": 0.1992, "step": 9634 }, { "epoch": 2.945582390706206, "grad_norm": 0.7792457938194275, "learning_rate": 2.9245042673347202e-05, "loss": 0.1776, "step": 9635 }, { "epoch": 2.945888107612351, "grad_norm": 1.2495981454849243, "learning_rate": 2.9244618062927264e-05, "loss": 0.2032, "step": 9636 }, { "epoch": 2.9461938245184958, "grad_norm": 0.961858332157135, "learning_rate": 2.9244193452507323e-05, "loss": 0.2121, "step": 9637 }, { "epoch": 2.946499541424641, "grad_norm": 0.9031283855438232, "learning_rate": 2.9243768842087385e-05, "loss": 0.2038, "step": 9638 }, { "epoch": 2.9468052583307855, "grad_norm": 0.8768535852432251, "learning_rate": 2.9243344231667443e-05, "loss": 0.2481, "step": 9639 }, { "epoch": 2.9471109752369307, "grad_norm": 1.7492749691009521, "learning_rate": 2.9242919621247505e-05, "loss": 0.2387, "step": 9640 }, { "epoch": 2.947416692143076, "grad_norm": 0.9024081230163574, "learning_rate": 2.9242495010827564e-05, "loss": 0.2309, "step": 9641 }, { "epoch": 2.9477224090492204, "grad_norm": 1.5755265951156616, "learning_rate": 2.9242070400407626e-05, "loss": 0.3043, "step": 9642 }, { "epoch": 2.948028125955365, "grad_norm": 0.7126705050468445, "learning_rate": 2.9241645789987685e-05, "loss": 0.1571, "step": 9643 }, { "epoch": 2.9483338428615102, "grad_norm": 0.27792900800704956, "learning_rate": 2.9241221179567747e-05, "loss": 0.095, "step": 9644 }, { "epoch": 2.9486395597676553, "grad_norm": 0.35485681891441345, "learning_rate": 2.9240796569147806e-05, "loss": 0.07, "step": 9645 }, { "epoch": 2.9489452766738, "grad_norm": 0.35442617535591125, "learning_rate": 2.9240371958727868e-05, "loss": 0.0525, "step": 9646 }, { "epoch": 2.949250993579945, "grad_norm": 0.2754741311073303, "learning_rate": 2.923994734830793e-05, "loss": 0.0888, "step": 9647 }, { "epoch": 2.94955671048609, "grad_norm": 0.2898331880569458, "learning_rate": 2.923952273788799e-05, "loss": 0.0663, "step": 9648 }, { "epoch": 2.949862427392235, "grad_norm": 0.33339443802833557, "learning_rate": 2.923909812746805e-05, "loss": 0.0523, "step": 9649 }, { "epoch": 2.9501681442983796, "grad_norm": 0.24895340204238892, "learning_rate": 2.923867351704811e-05, "loss": 0.074, "step": 9650 }, { "epoch": 2.9504738612045247, "grad_norm": 0.845832347869873, "learning_rate": 2.923824890662817e-05, "loss": 0.057, "step": 9651 }, { "epoch": 2.9507795781106694, "grad_norm": 0.24557135999202728, "learning_rate": 2.923782429620823e-05, "loss": 0.0846, "step": 9652 }, { "epoch": 2.9510852950168145, "grad_norm": 0.43398383259773254, "learning_rate": 2.9237399685788292e-05, "loss": 0.1006, "step": 9653 }, { "epoch": 2.9513910119229596, "grad_norm": 0.49457624554634094, "learning_rate": 2.923697507536835e-05, "loss": 0.0945, "step": 9654 }, { "epoch": 2.9516967288291043, "grad_norm": 0.5440248847007751, "learning_rate": 2.9236550464948413e-05, "loss": 0.1247, "step": 9655 }, { "epoch": 2.952002445735249, "grad_norm": 0.5774962902069092, "learning_rate": 2.9236125854528472e-05, "loss": 0.1152, "step": 9656 }, { "epoch": 2.952308162641394, "grad_norm": 1.3235417604446411, "learning_rate": 2.9235701244108534e-05, "loss": 0.1275, "step": 9657 }, { "epoch": 2.952613879547539, "grad_norm": 0.573714017868042, "learning_rate": 2.9235276633688593e-05, "loss": 0.151, "step": 9658 }, { "epoch": 2.952919596453684, "grad_norm": 0.751115620136261, "learning_rate": 2.9234852023268655e-05, "loss": 0.1555, "step": 9659 }, { "epoch": 2.953225313359829, "grad_norm": 5.768263816833496, "learning_rate": 2.9234427412848713e-05, "loss": 0.2162, "step": 9660 }, { "epoch": 2.9535310302659736, "grad_norm": 0.6129060387611389, "learning_rate": 2.9234002802428772e-05, "loss": 0.1904, "step": 9661 }, { "epoch": 2.9538367471721187, "grad_norm": 0.9399309158325195, "learning_rate": 2.9233578192008834e-05, "loss": 0.2017, "step": 9662 }, { "epoch": 2.9541424640782634, "grad_norm": 0.9905286431312561, "learning_rate": 2.9233153581588893e-05, "loss": 0.2378, "step": 9663 }, { "epoch": 2.9544481809844085, "grad_norm": 1.1140319108963013, "learning_rate": 2.9232728971168955e-05, "loss": 0.2151, "step": 9664 }, { "epoch": 2.954753897890553, "grad_norm": 1.0224595069885254, "learning_rate": 2.9232304360749014e-05, "loss": 0.1836, "step": 9665 }, { "epoch": 2.9550596147966983, "grad_norm": 1.2752197980880737, "learning_rate": 2.9231879750329076e-05, "loss": 0.2281, "step": 9666 }, { "epoch": 2.9553653317028434, "grad_norm": 1.9820750951766968, "learning_rate": 2.9231455139909134e-05, "loss": 0.2194, "step": 9667 }, { "epoch": 2.955671048608988, "grad_norm": 0.3858889043331146, "learning_rate": 2.9231030529489197e-05, "loss": 0.156, "step": 9668 }, { "epoch": 2.9559767655151328, "grad_norm": 0.4256429970264435, "learning_rate": 2.9230605919069255e-05, "loss": 0.0717, "step": 9669 }, { "epoch": 2.956282482421278, "grad_norm": 0.2064121961593628, "learning_rate": 2.9230181308649317e-05, "loss": 0.0767, "step": 9670 }, { "epoch": 2.956588199327423, "grad_norm": 0.43793705105781555, "learning_rate": 2.9229756698229376e-05, "loss": 0.0815, "step": 9671 }, { "epoch": 2.9568939162335677, "grad_norm": 0.7234390377998352, "learning_rate": 2.9229332087809438e-05, "loss": 0.0652, "step": 9672 }, { "epoch": 2.957199633139713, "grad_norm": 0.2895190119743347, "learning_rate": 2.9228907477389497e-05, "loss": 0.0621, "step": 9673 }, { "epoch": 2.9575053500458575, "grad_norm": 0.21968385577201843, "learning_rate": 2.9228482866969555e-05, "loss": 0.0697, "step": 9674 }, { "epoch": 2.9578110669520026, "grad_norm": 0.3578644096851349, "learning_rate": 2.9228058256549618e-05, "loss": 0.0611, "step": 9675 }, { "epoch": 2.9581167838581472, "grad_norm": 0.5867796540260315, "learning_rate": 2.9227633646129676e-05, "loss": 0.068, "step": 9676 }, { "epoch": 2.9584225007642924, "grad_norm": 0.3145062029361725, "learning_rate": 2.922720903570974e-05, "loss": 0.0637, "step": 9677 }, { "epoch": 2.958728217670437, "grad_norm": 0.35029488801956177, "learning_rate": 2.9226784425289797e-05, "loss": 0.0986, "step": 9678 }, { "epoch": 2.959033934576582, "grad_norm": 0.35104143619537354, "learning_rate": 2.922635981486986e-05, "loss": 0.0831, "step": 9679 }, { "epoch": 2.9593396514827273, "grad_norm": 1.1221952438354492, "learning_rate": 2.9225935204449918e-05, "loss": 0.1019, "step": 9680 }, { "epoch": 2.959645368388872, "grad_norm": 5.187539577484131, "learning_rate": 2.922551059402998e-05, "loss": 0.1152, "step": 9681 }, { "epoch": 2.9599510852950166, "grad_norm": 0.33511701226234436, "learning_rate": 2.922508598361004e-05, "loss": 0.1115, "step": 9682 }, { "epoch": 2.9602568022011617, "grad_norm": 1.9611514806747437, "learning_rate": 2.92246613731901e-05, "loss": 0.1728, "step": 9683 }, { "epoch": 2.960562519107307, "grad_norm": 0.5222477912902832, "learning_rate": 2.922423676277016e-05, "loss": 0.1707, "step": 9684 }, { "epoch": 2.9608682360134515, "grad_norm": 1.2353956699371338, "learning_rate": 2.922381215235022e-05, "loss": 0.1684, "step": 9685 }, { "epoch": 2.9611739529195966, "grad_norm": 0.6053135991096497, "learning_rate": 2.922338754193028e-05, "loss": 0.1863, "step": 9686 }, { "epoch": 2.9614796698257413, "grad_norm": 0.7403700351715088, "learning_rate": 2.922296293151034e-05, "loss": 0.2074, "step": 9687 }, { "epoch": 2.9617853867318864, "grad_norm": 1.0289249420166016, "learning_rate": 2.92225383210904e-05, "loss": 0.2556, "step": 9688 }, { "epoch": 2.962091103638031, "grad_norm": 0.7201842069625854, "learning_rate": 2.922211371067046e-05, "loss": 0.1645, "step": 9689 }, { "epoch": 2.962396820544176, "grad_norm": 1.5951884984970093, "learning_rate": 2.9221689100250522e-05, "loss": 0.2015, "step": 9690 }, { "epoch": 2.962702537450321, "grad_norm": 1.1410032510757446, "learning_rate": 2.922126448983058e-05, "loss": 0.2507, "step": 9691 }, { "epoch": 2.963008254356466, "grad_norm": 2.889763355255127, "learning_rate": 2.9220839879410643e-05, "loss": 0.2722, "step": 9692 }, { "epoch": 2.963313971262611, "grad_norm": 0.3931080102920532, "learning_rate": 2.92204152689907e-05, "loss": 0.1663, "step": 9693 }, { "epoch": 2.9636196881687558, "grad_norm": 0.5349857211112976, "learning_rate": 2.9219990658570763e-05, "loss": 0.113, "step": 9694 }, { "epoch": 2.9639254050749004, "grad_norm": 0.2731051743030548, "learning_rate": 2.9219566048150822e-05, "loss": 0.0797, "step": 9695 }, { "epoch": 2.9642311219810455, "grad_norm": 0.2531861364841461, "learning_rate": 2.9219141437730884e-05, "loss": 0.0574, "step": 9696 }, { "epoch": 2.9645368388871907, "grad_norm": 0.24530918896198273, "learning_rate": 2.9218716827310943e-05, "loss": 0.0764, "step": 9697 }, { "epoch": 2.9648425557933353, "grad_norm": 0.4258497655391693, "learning_rate": 2.9218292216891e-05, "loss": 0.0795, "step": 9698 }, { "epoch": 2.9651482726994804, "grad_norm": 0.2433251440525055, "learning_rate": 2.9217867606471064e-05, "loss": 0.063, "step": 9699 }, { "epoch": 2.965453989605625, "grad_norm": 1.3875195980072021, "learning_rate": 2.9217442996051122e-05, "loss": 0.0902, "step": 9700 }, { "epoch": 2.9657597065117702, "grad_norm": 0.5258328914642334, "learning_rate": 2.9217018385631184e-05, "loss": 0.0713, "step": 9701 }, { "epoch": 2.966065423417915, "grad_norm": 0.3151976466178894, "learning_rate": 2.9216593775211243e-05, "loss": 0.0752, "step": 9702 }, { "epoch": 2.96637114032406, "grad_norm": 0.48111051321029663, "learning_rate": 2.9216169164791305e-05, "loss": 0.1346, "step": 9703 }, { "epoch": 2.9666768572302047, "grad_norm": 0.3140221834182739, "learning_rate": 2.9215744554371364e-05, "loss": 0.0966, "step": 9704 }, { "epoch": 2.96698257413635, "grad_norm": 0.6748590469360352, "learning_rate": 2.9215319943951426e-05, "loss": 0.1152, "step": 9705 }, { "epoch": 2.967288291042495, "grad_norm": 0.6426633596420288, "learning_rate": 2.9214895333531485e-05, "loss": 0.1198, "step": 9706 }, { "epoch": 2.9675940079486396, "grad_norm": 0.692663848400116, "learning_rate": 2.9214470723111547e-05, "loss": 0.1468, "step": 9707 }, { "epoch": 2.9678997248547843, "grad_norm": 0.40378281474113464, "learning_rate": 2.9214046112691606e-05, "loss": 0.1122, "step": 9708 }, { "epoch": 2.9682054417609294, "grad_norm": 0.5635777711868286, "learning_rate": 2.9213621502271668e-05, "loss": 0.1793, "step": 9709 }, { "epoch": 2.9685111586670745, "grad_norm": 1.0573458671569824, "learning_rate": 2.9213196891851726e-05, "loss": 0.1994, "step": 9710 }, { "epoch": 2.968816875573219, "grad_norm": 0.8386842608451843, "learning_rate": 2.9212772281431785e-05, "loss": 0.2103, "step": 9711 }, { "epoch": 2.969122592479364, "grad_norm": 1.9114612340927124, "learning_rate": 2.9212347671011847e-05, "loss": 0.1887, "step": 9712 }, { "epoch": 2.969428309385509, "grad_norm": 0.8270473480224609, "learning_rate": 2.9211923060591906e-05, "loss": 0.185, "step": 9713 }, { "epoch": 2.969734026291654, "grad_norm": 0.7191406488418579, "learning_rate": 2.9211498450171968e-05, "loss": 0.1968, "step": 9714 }, { "epoch": 2.9700397431977987, "grad_norm": 1.0168871879577637, "learning_rate": 2.9211073839752027e-05, "loss": 0.2149, "step": 9715 }, { "epoch": 2.970345460103944, "grad_norm": 1.4609034061431885, "learning_rate": 2.921064922933209e-05, "loss": 0.231, "step": 9716 }, { "epoch": 2.9706511770100885, "grad_norm": 1.1511629819869995, "learning_rate": 2.9210224618912147e-05, "loss": 0.2256, "step": 9717 }, { "epoch": 2.9709568939162336, "grad_norm": 0.6515172719955444, "learning_rate": 2.920980000849221e-05, "loss": 0.1543, "step": 9718 }, { "epoch": 2.9712626108223787, "grad_norm": 0.2903732657432556, "learning_rate": 2.9209375398072268e-05, "loss": 0.0879, "step": 9719 }, { "epoch": 2.9715683277285234, "grad_norm": 0.36117124557495117, "learning_rate": 2.920895078765233e-05, "loss": 0.0735, "step": 9720 }, { "epoch": 2.971874044634668, "grad_norm": 0.3154255449771881, "learning_rate": 2.920852617723239e-05, "loss": 0.0867, "step": 9721 }, { "epoch": 2.972179761540813, "grad_norm": 0.42956000566482544, "learning_rate": 2.920810156681245e-05, "loss": 0.0832, "step": 9722 }, { "epoch": 2.9724854784469583, "grad_norm": 0.365443617105484, "learning_rate": 2.920767695639251e-05, "loss": 0.0724, "step": 9723 }, { "epoch": 2.972791195353103, "grad_norm": 0.2845870554447174, "learning_rate": 2.920725234597257e-05, "loss": 0.069, "step": 9724 }, { "epoch": 2.9730969122592477, "grad_norm": 0.3234858810901642, "learning_rate": 2.920682773555263e-05, "loss": 0.0645, "step": 9725 }, { "epoch": 2.9734026291653928, "grad_norm": 0.3770597577095032, "learning_rate": 2.920640312513269e-05, "loss": 0.0982, "step": 9726 }, { "epoch": 2.973708346071538, "grad_norm": 0.5215054154396057, "learning_rate": 2.920597851471275e-05, "loss": 0.0887, "step": 9727 }, { "epoch": 2.9740140629776826, "grad_norm": 0.31996265053749084, "learning_rate": 2.920555390429281e-05, "loss": 0.076, "step": 9728 }, { "epoch": 2.9743197798838277, "grad_norm": 1.06072199344635, "learning_rate": 2.9205129293872872e-05, "loss": 0.0681, "step": 9729 }, { "epoch": 2.9746254967899723, "grad_norm": 0.8706464171409607, "learning_rate": 2.920470468345293e-05, "loss": 0.1166, "step": 9730 }, { "epoch": 2.9749312136961175, "grad_norm": 0.4362679719924927, "learning_rate": 2.9204280073032993e-05, "loss": 0.131, "step": 9731 }, { "epoch": 2.9752369306022626, "grad_norm": 0.4954105615615845, "learning_rate": 2.920385546261305e-05, "loss": 0.1261, "step": 9732 }, { "epoch": 2.9755426475084072, "grad_norm": 0.38442936539649963, "learning_rate": 2.9203430852193114e-05, "loss": 0.1543, "step": 9733 }, { "epoch": 2.975848364414552, "grad_norm": 0.5791740417480469, "learning_rate": 2.9203006241773172e-05, "loss": 0.1511, "step": 9734 }, { "epoch": 2.976154081320697, "grad_norm": 0.8343188762664795, "learning_rate": 2.9202581631353234e-05, "loss": 0.1659, "step": 9735 }, { "epoch": 2.976459798226842, "grad_norm": 0.916337788105011, "learning_rate": 2.9202157020933293e-05, "loss": 0.2028, "step": 9736 }, { "epoch": 2.976765515132987, "grad_norm": 1.1443713903427124, "learning_rate": 2.9201732410513352e-05, "loss": 0.1996, "step": 9737 }, { "epoch": 2.9770712320391315, "grad_norm": 0.8155877590179443, "learning_rate": 2.9201307800093414e-05, "loss": 0.1884, "step": 9738 }, { "epoch": 2.9773769489452766, "grad_norm": 0.9262773990631104, "learning_rate": 2.9200883189673473e-05, "loss": 0.1872, "step": 9739 }, { "epoch": 2.9776826658514217, "grad_norm": 1.2127101421356201, "learning_rate": 2.9200458579253535e-05, "loss": 0.2355, "step": 9740 }, { "epoch": 2.9779883827575664, "grad_norm": 1.7077219486236572, "learning_rate": 2.9200033968833593e-05, "loss": 0.2753, "step": 9741 }, { "epoch": 2.9782940996637115, "grad_norm": 1.5671055316925049, "learning_rate": 2.9199609358413656e-05, "loss": 0.2741, "step": 9742 }, { "epoch": 2.978599816569856, "grad_norm": 0.8227390050888062, "learning_rate": 2.9199184747993714e-05, "loss": 0.1654, "step": 9743 }, { "epoch": 2.9789055334760013, "grad_norm": 0.3199424147605896, "learning_rate": 2.9198760137573776e-05, "loss": 0.0832, "step": 9744 }, { "epoch": 2.9792112503821464, "grad_norm": 0.3443079888820648, "learning_rate": 2.9198335527153835e-05, "loss": 0.0588, "step": 9745 }, { "epoch": 2.979516967288291, "grad_norm": 0.47367164492607117, "learning_rate": 2.9197910916733897e-05, "loss": 0.0714, "step": 9746 }, { "epoch": 2.9798226841944357, "grad_norm": 0.4252077043056488, "learning_rate": 2.9197486306313956e-05, "loss": 0.054, "step": 9747 }, { "epoch": 2.980128401100581, "grad_norm": 0.5180029273033142, "learning_rate": 2.9197061695894018e-05, "loss": 0.0606, "step": 9748 }, { "epoch": 2.980434118006726, "grad_norm": 0.7167211771011353, "learning_rate": 2.919663708547408e-05, "loss": 0.0692, "step": 9749 }, { "epoch": 2.9807398349128706, "grad_norm": 0.5415371656417847, "learning_rate": 2.919621247505414e-05, "loss": 0.098, "step": 9750 }, { "epoch": 2.9810455518190153, "grad_norm": 0.2757478356361389, "learning_rate": 2.91957878646342e-05, "loss": 0.082, "step": 9751 }, { "epoch": 2.9813512687251604, "grad_norm": 0.45699435472488403, "learning_rate": 2.919536325421426e-05, "loss": 0.0777, "step": 9752 }, { "epoch": 2.9816569856313055, "grad_norm": 0.5351483225822449, "learning_rate": 2.919493864379432e-05, "loss": 0.0918, "step": 9753 }, { "epoch": 2.98196270253745, "grad_norm": 0.5953289866447449, "learning_rate": 2.919451403337438e-05, "loss": 0.0741, "step": 9754 }, { "epoch": 2.9822684194435953, "grad_norm": 0.8466144800186157, "learning_rate": 2.9194089422954442e-05, "loss": 0.0995, "step": 9755 }, { "epoch": 2.98257413634974, "grad_norm": 0.8070288896560669, "learning_rate": 2.91936648125345e-05, "loss": 0.1247, "step": 9756 }, { "epoch": 2.982879853255885, "grad_norm": 1.1744111776351929, "learning_rate": 2.9193240202114563e-05, "loss": 0.1138, "step": 9757 }, { "epoch": 2.9831855701620302, "grad_norm": 0.7191773653030396, "learning_rate": 2.9192815591694622e-05, "loss": 0.1799, "step": 9758 }, { "epoch": 2.983491287068175, "grad_norm": 0.6184241771697998, "learning_rate": 2.9192390981274684e-05, "loss": 0.1758, "step": 9759 }, { "epoch": 2.9837970039743196, "grad_norm": 0.7951871752738953, "learning_rate": 2.9191966370854743e-05, "loss": 0.1566, "step": 9760 }, { "epoch": 2.9841027208804647, "grad_norm": 1.35600745677948, "learning_rate": 2.9191541760434805e-05, "loss": 0.1679, "step": 9761 }, { "epoch": 2.98440843778661, "grad_norm": 1.2044991254806519, "learning_rate": 2.9191117150014863e-05, "loss": 0.2331, "step": 9762 }, { "epoch": 2.9847141546927545, "grad_norm": 1.2498493194580078, "learning_rate": 2.9190692539594922e-05, "loss": 0.2149, "step": 9763 }, { "epoch": 2.985019871598899, "grad_norm": 1.3298674821853638, "learning_rate": 2.9190267929174984e-05, "loss": 0.1959, "step": 9764 }, { "epoch": 2.9853255885050443, "grad_norm": 1.2377568483352661, "learning_rate": 2.9189843318755043e-05, "loss": 0.2433, "step": 9765 }, { "epoch": 2.9856313054111894, "grad_norm": 0.9111335277557373, "learning_rate": 2.9189418708335105e-05, "loss": 0.2015, "step": 9766 }, { "epoch": 2.985937022317334, "grad_norm": 1.4288768768310547, "learning_rate": 2.9188994097915164e-05, "loss": 0.2663, "step": 9767 }, { "epoch": 2.986242739223479, "grad_norm": 0.7578693628311157, "learning_rate": 2.9188569487495226e-05, "loss": 0.1813, "step": 9768 }, { "epoch": 2.986548456129624, "grad_norm": 0.5141378045082092, "learning_rate": 2.9188144877075284e-05, "loss": 0.0764, "step": 9769 }, { "epoch": 2.986854173035769, "grad_norm": 0.35434073209762573, "learning_rate": 2.9187720266655347e-05, "loss": 0.0892, "step": 9770 }, { "epoch": 2.987159889941914, "grad_norm": 0.4904114305973053, "learning_rate": 2.9187295656235405e-05, "loss": 0.0883, "step": 9771 }, { "epoch": 2.9874656068480587, "grad_norm": 0.7036105394363403, "learning_rate": 2.9186871045815467e-05, "loss": 0.0612, "step": 9772 }, { "epoch": 2.9877713237542034, "grad_norm": 0.5539510846138, "learning_rate": 2.9186446435395526e-05, "loss": 0.0605, "step": 9773 }, { "epoch": 2.9880770406603485, "grad_norm": 0.28424811363220215, "learning_rate": 2.9186021824975588e-05, "loss": 0.0815, "step": 9774 }, { "epoch": 2.9883827575664936, "grad_norm": 0.56105637550354, "learning_rate": 2.9185597214555647e-05, "loss": 0.0868, "step": 9775 }, { "epoch": 2.9886884744726383, "grad_norm": 0.28852495551109314, "learning_rate": 2.9185172604135706e-05, "loss": 0.0742, "step": 9776 }, { "epoch": 2.988994191378783, "grad_norm": 0.4019556939601898, "learning_rate": 2.9184747993715768e-05, "loss": 0.0746, "step": 9777 }, { "epoch": 2.989299908284928, "grad_norm": 0.2910701632499695, "learning_rate": 2.9184323383295826e-05, "loss": 0.0925, "step": 9778 }, { "epoch": 2.989605625191073, "grad_norm": 0.610960841178894, "learning_rate": 2.918389877287589e-05, "loss": 0.0748, "step": 9779 }, { "epoch": 2.989911342097218, "grad_norm": 0.4478066563606262, "learning_rate": 2.9183474162455947e-05, "loss": 0.0996, "step": 9780 }, { "epoch": 2.990217059003363, "grad_norm": 0.3964863419532776, "learning_rate": 2.918304955203601e-05, "loss": 0.1473, "step": 9781 }, { "epoch": 2.9905227759095077, "grad_norm": 0.608395516872406, "learning_rate": 2.9182624941616068e-05, "loss": 0.1105, "step": 9782 }, { "epoch": 2.9908284928156528, "grad_norm": 1.4929276704788208, "learning_rate": 2.918220033119613e-05, "loss": 0.1467, "step": 9783 }, { "epoch": 2.991134209721798, "grad_norm": 0.6170409321784973, "learning_rate": 2.918177572077619e-05, "loss": 0.1704, "step": 9784 }, { "epoch": 2.9914399266279426, "grad_norm": 0.7112331390380859, "learning_rate": 2.918135111035625e-05, "loss": 0.1517, "step": 9785 }, { "epoch": 2.9917456435340872, "grad_norm": 3.6701245307922363, "learning_rate": 2.918092649993631e-05, "loss": 0.1767, "step": 9786 }, { "epoch": 2.9920513604402323, "grad_norm": 0.9564414620399475, "learning_rate": 2.918050188951637e-05, "loss": 0.1886, "step": 9787 }, { "epoch": 2.9923570773463775, "grad_norm": 0.8200288414955139, "learning_rate": 2.918007727909643e-05, "loss": 0.1993, "step": 9788 }, { "epoch": 2.992662794252522, "grad_norm": 0.8599677085876465, "learning_rate": 2.917965266867649e-05, "loss": 0.2254, "step": 9789 }, { "epoch": 2.992968511158667, "grad_norm": 1.6989431381225586, "learning_rate": 2.917922805825655e-05, "loss": 0.226, "step": 9790 }, { "epoch": 2.993274228064812, "grad_norm": 0.8747321367263794, "learning_rate": 2.917880344783661e-05, "loss": 0.2175, "step": 9791 }, { "epoch": 2.993579944970957, "grad_norm": 1.2007137537002563, "learning_rate": 2.9178378837416672e-05, "loss": 0.3064, "step": 9792 }, { "epoch": 2.9938856618771017, "grad_norm": 0.48190832138061523, "learning_rate": 2.917795422699673e-05, "loss": 0.1492, "step": 9793 }, { "epoch": 2.994191378783247, "grad_norm": 0.5143500566482544, "learning_rate": 2.9177529616576793e-05, "loss": 0.1015, "step": 9794 }, { "epoch": 2.9944970956893915, "grad_norm": 0.328666627407074, "learning_rate": 2.917710500615685e-05, "loss": 0.0575, "step": 9795 }, { "epoch": 2.9948028125955366, "grad_norm": 0.28606873750686646, "learning_rate": 2.9176680395736913e-05, "loss": 0.0689, "step": 9796 }, { "epoch": 2.9951085295016817, "grad_norm": 0.3853461742401123, "learning_rate": 2.9176255785316972e-05, "loss": 0.0533, "step": 9797 }, { "epoch": 2.9954142464078264, "grad_norm": 0.47320476174354553, "learning_rate": 2.9175831174897034e-05, "loss": 0.0633, "step": 9798 }, { "epoch": 2.995719963313971, "grad_norm": 0.4859251379966736, "learning_rate": 2.9175406564477093e-05, "loss": 0.1023, "step": 9799 }, { "epoch": 2.996025680220116, "grad_norm": 0.3544001877307892, "learning_rate": 2.9174981954057155e-05, "loss": 0.0703, "step": 9800 }, { "epoch": 2.9963313971262613, "grad_norm": 1.1854215860366821, "learning_rate": 2.9174557343637214e-05, "loss": 0.1054, "step": 9801 }, { "epoch": 2.996637114032406, "grad_norm": 0.514259397983551, "learning_rate": 2.9174132733217272e-05, "loss": 0.1266, "step": 9802 }, { "epoch": 2.9969428309385506, "grad_norm": 0.450078547000885, "learning_rate": 2.9173708122797334e-05, "loss": 0.1012, "step": 9803 }, { "epoch": 2.9972485478446957, "grad_norm": 0.47836050391197205, "learning_rate": 2.9173283512377393e-05, "loss": 0.1285, "step": 9804 }, { "epoch": 2.997554264750841, "grad_norm": 0.5224893093109131, "learning_rate": 2.9172858901957455e-05, "loss": 0.1433, "step": 9805 }, { "epoch": 2.9978599816569855, "grad_norm": 0.6325882077217102, "learning_rate": 2.9172434291537514e-05, "loss": 0.181, "step": 9806 }, { "epoch": 2.9981656985631306, "grad_norm": 0.7013418078422546, "learning_rate": 2.9172009681117576e-05, "loss": 0.1825, "step": 9807 }, { "epoch": 2.9984714154692753, "grad_norm": 0.8652428388595581, "learning_rate": 2.9171585070697635e-05, "loss": 0.1659, "step": 9808 }, { "epoch": 2.9987771323754204, "grad_norm": 0.8708938956260681, "learning_rate": 2.9171160460277697e-05, "loss": 0.206, "step": 9809 }, { "epoch": 2.9990828492815655, "grad_norm": 1.0328105688095093, "learning_rate": 2.9170735849857756e-05, "loss": 0.2193, "step": 9810 }, { "epoch": 2.99938856618771, "grad_norm": 0.8087608218193054, "learning_rate": 2.9170311239437818e-05, "loss": 0.1795, "step": 9811 }, { "epoch": 2.999694283093855, "grad_norm": 1.4317342042922974, "learning_rate": 2.9169886629017876e-05, "loss": 0.2212, "step": 9812 }, { "epoch": 3.0, "grad_norm": 0.8346335887908936, "learning_rate": 2.9169462018597935e-05, "loss": 0.202, "step": 9813 }, { "epoch": 3.000305716906145, "grad_norm": 0.42346978187561035, "learning_rate": 2.9169037408177997e-05, "loss": 0.1628, "step": 9814 }, { "epoch": 3.00061143381229, "grad_norm": 0.31085702776908875, "learning_rate": 2.9168612797758056e-05, "loss": 0.0816, "step": 9815 }, { "epoch": 3.000917150718435, "grad_norm": 0.44277313351631165, "learning_rate": 2.9168188187338118e-05, "loss": 0.0719, "step": 9816 }, { "epoch": 3.0012228676245796, "grad_norm": 0.3740095794200897, "learning_rate": 2.9167763576918177e-05, "loss": 0.0661, "step": 9817 }, { "epoch": 3.0015285845307247, "grad_norm": 0.3693801164627075, "learning_rate": 2.916733896649824e-05, "loss": 0.0617, "step": 9818 }, { "epoch": 3.0018343014368694, "grad_norm": 0.42890655994415283, "learning_rate": 2.9166914356078297e-05, "loss": 0.0761, "step": 9819 }, { "epoch": 3.0021400183430145, "grad_norm": 0.29669684171676636, "learning_rate": 2.916648974565836e-05, "loss": 0.0759, "step": 9820 }, { "epoch": 3.002445735249159, "grad_norm": 0.4185309112071991, "learning_rate": 2.9166065135238418e-05, "loss": 0.0577, "step": 9821 }, { "epoch": 3.0027514521553043, "grad_norm": 0.4702351689338684, "learning_rate": 2.916564052481848e-05, "loss": 0.0703, "step": 9822 }, { "epoch": 3.003057169061449, "grad_norm": 0.42210209369659424, "learning_rate": 2.916521591439854e-05, "loss": 0.0841, "step": 9823 }, { "epoch": 3.003362885967594, "grad_norm": 0.5109655261039734, "learning_rate": 2.91647913039786e-05, "loss": 0.1116, "step": 9824 }, { "epoch": 3.0036686028737387, "grad_norm": 0.8710323572158813, "learning_rate": 2.916436669355866e-05, "loss": 0.0813, "step": 9825 }, { "epoch": 3.003974319779884, "grad_norm": 0.5578445792198181, "learning_rate": 2.916394208313872e-05, "loss": 0.1078, "step": 9826 }, { "epoch": 3.004280036686029, "grad_norm": 0.3646198809146881, "learning_rate": 2.916351747271878e-05, "loss": 0.1326, "step": 9827 }, { "epoch": 3.0045857535921736, "grad_norm": 0.5661798715591431, "learning_rate": 2.916309286229884e-05, "loss": 0.1348, "step": 9828 }, { "epoch": 3.0048914704983187, "grad_norm": 0.3833496570587158, "learning_rate": 2.91626682518789e-05, "loss": 0.1293, "step": 9829 }, { "epoch": 3.0051971874044634, "grad_norm": 0.6193963289260864, "learning_rate": 2.916224364145896e-05, "loss": 0.2015, "step": 9830 }, { "epoch": 3.0055029043106085, "grad_norm": 2.260901689529419, "learning_rate": 2.9161819031039022e-05, "loss": 0.189, "step": 9831 }, { "epoch": 3.005808621216753, "grad_norm": 0.60114586353302, "learning_rate": 2.916139442061908e-05, "loss": 0.1702, "step": 9832 }, { "epoch": 3.0061143381228983, "grad_norm": 1.8134087324142456, "learning_rate": 2.9160969810199143e-05, "loss": 0.1622, "step": 9833 }, { "epoch": 3.006420055029043, "grad_norm": 0.6778774857521057, "learning_rate": 2.91605451997792e-05, "loss": 0.1648, "step": 9834 }, { "epoch": 3.006725771935188, "grad_norm": 1.268831729888916, "learning_rate": 2.9160120589359264e-05, "loss": 0.1657, "step": 9835 }, { "epoch": 3.0070314888413328, "grad_norm": 1.2071192264556885, "learning_rate": 2.9159695978939322e-05, "loss": 0.2346, "step": 9836 }, { "epoch": 3.007337205747478, "grad_norm": 0.9194573163986206, "learning_rate": 2.9159271368519384e-05, "loss": 0.1903, "step": 9837 }, { "epoch": 3.0076429226536225, "grad_norm": 3.0787129402160645, "learning_rate": 2.9158846758099443e-05, "loss": 0.2312, "step": 9838 }, { "epoch": 3.0079486395597677, "grad_norm": 0.8229395747184753, "learning_rate": 2.9158422147679502e-05, "loss": 0.1684, "step": 9839 }, { "epoch": 3.0082543564659128, "grad_norm": 0.266791433095932, "learning_rate": 2.9157997537259564e-05, "loss": 0.0765, "step": 9840 }, { "epoch": 3.0085600733720574, "grad_norm": 0.7054421901702881, "learning_rate": 2.9157572926839623e-05, "loss": 0.0655, "step": 9841 }, { "epoch": 3.0088657902782026, "grad_norm": 0.4697444438934326, "learning_rate": 2.9157148316419685e-05, "loss": 0.0599, "step": 9842 }, { "epoch": 3.0091715071843472, "grad_norm": 0.23402822017669678, "learning_rate": 2.9156723705999743e-05, "loss": 0.0468, "step": 9843 }, { "epoch": 3.0094772240904923, "grad_norm": 0.3201437294483185, "learning_rate": 2.9156299095579806e-05, "loss": 0.0768, "step": 9844 }, { "epoch": 3.009782940996637, "grad_norm": 0.310086190700531, "learning_rate": 2.9155874485159864e-05, "loss": 0.0757, "step": 9845 }, { "epoch": 3.010088657902782, "grad_norm": 0.2650007903575897, "learning_rate": 2.9155449874739926e-05, "loss": 0.0581, "step": 9846 }, { "epoch": 3.010394374808927, "grad_norm": 0.43977412581443787, "learning_rate": 2.9155025264319985e-05, "loss": 0.074, "step": 9847 }, { "epoch": 3.010700091715072, "grad_norm": 0.33258381485939026, "learning_rate": 2.9154600653900047e-05, "loss": 0.0726, "step": 9848 }, { "epoch": 3.0110058086212166, "grad_norm": 0.26298239827156067, "learning_rate": 2.9154176043480106e-05, "loss": 0.1063, "step": 9849 }, { "epoch": 3.0113115255273617, "grad_norm": 0.33632028102874756, "learning_rate": 2.9153751433060168e-05, "loss": 0.1032, "step": 9850 }, { "epoch": 3.0116172424335064, "grad_norm": 0.5470579862594604, "learning_rate": 2.915332682264023e-05, "loss": 0.133, "step": 9851 }, { "epoch": 3.0119229593396515, "grad_norm": 0.4657822847366333, "learning_rate": 2.915290221222029e-05, "loss": 0.1123, "step": 9852 }, { "epoch": 3.0122286762457966, "grad_norm": 0.41752779483795166, "learning_rate": 2.915247760180035e-05, "loss": 0.1103, "step": 9853 }, { "epoch": 3.0125343931519413, "grad_norm": 1.182564616203308, "learning_rate": 2.915205299138041e-05, "loss": 0.1538, "step": 9854 }, { "epoch": 3.0128401100580864, "grad_norm": 0.5873005986213684, "learning_rate": 2.915162838096047e-05, "loss": 0.1622, "step": 9855 }, { "epoch": 3.013145826964231, "grad_norm": 1.1239440441131592, "learning_rate": 2.915120377054053e-05, "loss": 0.1545, "step": 9856 }, { "epoch": 3.013451543870376, "grad_norm": 2.016841173171997, "learning_rate": 2.9150779160120592e-05, "loss": 0.1795, "step": 9857 }, { "epoch": 3.013757260776521, "grad_norm": 0.9573583006858826, "learning_rate": 2.915035454970065e-05, "loss": 0.2081, "step": 9858 }, { "epoch": 3.014062977682666, "grad_norm": 0.8239682912826538, "learning_rate": 2.9149929939280713e-05, "loss": 0.251, "step": 9859 }, { "epoch": 3.0143686945888106, "grad_norm": 0.8684477806091309, "learning_rate": 2.9149505328860772e-05, "loss": 0.1834, "step": 9860 }, { "epoch": 3.0146744114949557, "grad_norm": 1.8513543605804443, "learning_rate": 2.9149080718440834e-05, "loss": 0.2116, "step": 9861 }, { "epoch": 3.0149801284011004, "grad_norm": 3.9151082038879395, "learning_rate": 2.9148656108020893e-05, "loss": 0.2286, "step": 9862 }, { "epoch": 3.0152858453072455, "grad_norm": 1.8443140983581543, "learning_rate": 2.9148231497600955e-05, "loss": 0.3277, "step": 9863 }, { "epoch": 3.01559156221339, "grad_norm": 0.4562356770038605, "learning_rate": 2.9147806887181013e-05, "loss": 0.1549, "step": 9864 }, { "epoch": 3.0158972791195353, "grad_norm": 0.2976401746273041, "learning_rate": 2.9147382276761072e-05, "loss": 0.1073, "step": 9865 }, { "epoch": 3.0162029960256804, "grad_norm": 0.2427670657634735, "learning_rate": 2.9146957666341134e-05, "loss": 0.0603, "step": 9866 }, { "epoch": 3.016508712931825, "grad_norm": 0.32457560300827026, "learning_rate": 2.9146533055921193e-05, "loss": 0.0573, "step": 9867 }, { "epoch": 3.01681442983797, "grad_norm": 0.5805197358131409, "learning_rate": 2.9146108445501255e-05, "loss": 0.0604, "step": 9868 }, { "epoch": 3.017120146744115, "grad_norm": 0.36472517251968384, "learning_rate": 2.9145683835081314e-05, "loss": 0.0709, "step": 9869 }, { "epoch": 3.01742586365026, "grad_norm": 0.5427183508872986, "learning_rate": 2.9145259224661376e-05, "loss": 0.0792, "step": 9870 }, { "epoch": 3.0177315805564047, "grad_norm": 0.3784457743167877, "learning_rate": 2.9144834614241434e-05, "loss": 0.0841, "step": 9871 }, { "epoch": 3.01803729746255, "grad_norm": 0.3116036057472229, "learning_rate": 2.9144410003821497e-05, "loss": 0.0657, "step": 9872 }, { "epoch": 3.0183430143686945, "grad_norm": 0.3434503376483917, "learning_rate": 2.9143985393401555e-05, "loss": 0.0813, "step": 9873 }, { "epoch": 3.0186487312748396, "grad_norm": 0.3164171576499939, "learning_rate": 2.9143560782981617e-05, "loss": 0.0973, "step": 9874 }, { "epoch": 3.0189544481809842, "grad_norm": 0.3757691979408264, "learning_rate": 2.9143136172561676e-05, "loss": 0.0744, "step": 9875 }, { "epoch": 3.0192601650871294, "grad_norm": 0.6157011389732361, "learning_rate": 2.9142711562141738e-05, "loss": 0.1125, "step": 9876 }, { "epoch": 3.019565881993274, "grad_norm": 0.40145501494407654, "learning_rate": 2.9142286951721797e-05, "loss": 0.1361, "step": 9877 }, { "epoch": 3.019871598899419, "grad_norm": 0.44051530957221985, "learning_rate": 2.9141862341301856e-05, "loss": 0.1879, "step": 9878 }, { "epoch": 3.0201773158055643, "grad_norm": 0.5468085408210754, "learning_rate": 2.9141437730881918e-05, "loss": 0.1528, "step": 9879 }, { "epoch": 3.020483032711709, "grad_norm": 0.5161142349243164, "learning_rate": 2.9141013120461976e-05, "loss": 0.1519, "step": 9880 }, { "epoch": 3.020788749617854, "grad_norm": 0.9319376349449158, "learning_rate": 2.914058851004204e-05, "loss": 0.1767, "step": 9881 }, { "epoch": 3.0210944665239987, "grad_norm": 1.1544052362442017, "learning_rate": 2.9140163899622097e-05, "loss": 0.1777, "step": 9882 }, { "epoch": 3.021400183430144, "grad_norm": 0.7025966644287109, "learning_rate": 2.913973928920216e-05, "loss": 0.1805, "step": 9883 }, { "epoch": 3.0217059003362885, "grad_norm": 1.2526135444641113, "learning_rate": 2.9139314678782218e-05, "loss": 0.1903, "step": 9884 }, { "epoch": 3.0220116172424336, "grad_norm": 1.3519750833511353, "learning_rate": 2.913889006836228e-05, "loss": 0.1953, "step": 9885 }, { "epoch": 3.0223173341485783, "grad_norm": 1.4668052196502686, "learning_rate": 2.913846545794234e-05, "loss": 0.1758, "step": 9886 }, { "epoch": 3.0226230510547234, "grad_norm": 1.5399457216262817, "learning_rate": 2.91380408475224e-05, "loss": 0.2616, "step": 9887 }, { "epoch": 3.022928767960868, "grad_norm": 2.248068332672119, "learning_rate": 2.913761623710246e-05, "loss": 0.253, "step": 9888 }, { "epoch": 3.023234484867013, "grad_norm": 0.4327199161052704, "learning_rate": 2.913719162668252e-05, "loss": 0.1517, "step": 9889 }, { "epoch": 3.023540201773158, "grad_norm": 0.35614001750946045, "learning_rate": 2.913676701626258e-05, "loss": 0.0837, "step": 9890 }, { "epoch": 3.023845918679303, "grad_norm": 0.26878878474235535, "learning_rate": 2.913634240584264e-05, "loss": 0.0898, "step": 9891 }, { "epoch": 3.024151635585448, "grad_norm": 0.20473748445510864, "learning_rate": 2.91359177954227e-05, "loss": 0.0652, "step": 9892 }, { "epoch": 3.0244573524915928, "grad_norm": 0.2138693779706955, "learning_rate": 2.913549318500276e-05, "loss": 0.0576, "step": 9893 }, { "epoch": 3.024763069397738, "grad_norm": 0.21835742890834808, "learning_rate": 2.9135068574582822e-05, "loss": 0.0557, "step": 9894 }, { "epoch": 3.0250687863038825, "grad_norm": 0.30302757024765015, "learning_rate": 2.913464396416288e-05, "loss": 0.0552, "step": 9895 }, { "epoch": 3.0253745032100277, "grad_norm": 0.735779345035553, "learning_rate": 2.9134219353742943e-05, "loss": 0.0782, "step": 9896 }, { "epoch": 3.0256802201161723, "grad_norm": 0.6538363695144653, "learning_rate": 2.9133794743323e-05, "loss": 0.0867, "step": 9897 }, { "epoch": 3.0259859370223174, "grad_norm": 0.5421738624572754, "learning_rate": 2.9133370132903063e-05, "loss": 0.1024, "step": 9898 }, { "epoch": 3.026291653928462, "grad_norm": 0.4476008713245392, "learning_rate": 2.9132945522483122e-05, "loss": 0.0925, "step": 9899 }, { "epoch": 3.0265973708346072, "grad_norm": 0.3618099093437195, "learning_rate": 2.9132520912063184e-05, "loss": 0.0908, "step": 9900 }, { "epoch": 3.026903087740752, "grad_norm": 0.5626158714294434, "learning_rate": 2.9132096301643243e-05, "loss": 0.1332, "step": 9901 }, { "epoch": 3.027208804646897, "grad_norm": 0.43226227164268494, "learning_rate": 2.9131671691223305e-05, "loss": 0.107, "step": 9902 }, { "epoch": 3.0275145215530417, "grad_norm": 0.5577691793441772, "learning_rate": 2.9131247080803364e-05, "loss": 0.1446, "step": 9903 }, { "epoch": 3.027820238459187, "grad_norm": 0.7113911509513855, "learning_rate": 2.9130822470383422e-05, "loss": 0.1416, "step": 9904 }, { "epoch": 3.028125955365332, "grad_norm": 0.7310557961463928, "learning_rate": 2.9130397859963484e-05, "loss": 0.1639, "step": 9905 }, { "epoch": 3.0284316722714766, "grad_norm": 0.7257323265075684, "learning_rate": 2.9129973249543543e-05, "loss": 0.178, "step": 9906 }, { "epoch": 3.0287373891776217, "grad_norm": 0.8134094476699829, "learning_rate": 2.9129548639123605e-05, "loss": 0.1794, "step": 9907 }, { "epoch": 3.0290431060837664, "grad_norm": 1.0326179265975952, "learning_rate": 2.9129124028703664e-05, "loss": 0.2002, "step": 9908 }, { "epoch": 3.0293488229899115, "grad_norm": 1.2176483869552612, "learning_rate": 2.9128699418283726e-05, "loss": 0.2212, "step": 9909 }, { "epoch": 3.029654539896056, "grad_norm": 1.395903468132019, "learning_rate": 2.9128274807863785e-05, "loss": 0.2, "step": 9910 }, { "epoch": 3.0299602568022013, "grad_norm": 0.7877084612846375, "learning_rate": 2.9127850197443847e-05, "loss": 0.2057, "step": 9911 }, { "epoch": 3.030265973708346, "grad_norm": 3.748426675796509, "learning_rate": 2.9127425587023906e-05, "loss": 0.2242, "step": 9912 }, { "epoch": 3.030571690614491, "grad_norm": 1.3642346858978271, "learning_rate": 2.9127000976603968e-05, "loss": 0.254, "step": 9913 }, { "epoch": 3.0308774075206357, "grad_norm": 0.5494166612625122, "learning_rate": 2.9126576366184026e-05, "loss": 0.153, "step": 9914 }, { "epoch": 3.031183124426781, "grad_norm": 0.46273288130760193, "learning_rate": 2.912615175576409e-05, "loss": 0.0853, "step": 9915 }, { "epoch": 3.0314888413329255, "grad_norm": 1.5637967586517334, "learning_rate": 2.9125727145344147e-05, "loss": 0.1002, "step": 9916 }, { "epoch": 3.0317945582390706, "grad_norm": 0.2746178209781647, "learning_rate": 2.9125302534924206e-05, "loss": 0.0687, "step": 9917 }, { "epoch": 3.0321002751452157, "grad_norm": 0.22988256812095642, "learning_rate": 2.9124877924504268e-05, "loss": 0.0622, "step": 9918 }, { "epoch": 3.0324059920513604, "grad_norm": 0.4319975972175598, "learning_rate": 2.9124453314084327e-05, "loss": 0.0721, "step": 9919 }, { "epoch": 3.0327117089575055, "grad_norm": 0.39087164402008057, "learning_rate": 2.912402870366439e-05, "loss": 0.0466, "step": 9920 }, { "epoch": 3.03301742586365, "grad_norm": 0.34156617522239685, "learning_rate": 2.9123604093244447e-05, "loss": 0.0928, "step": 9921 }, { "epoch": 3.0333231427697953, "grad_norm": 0.2406630963087082, "learning_rate": 2.912317948282451e-05, "loss": 0.0669, "step": 9922 }, { "epoch": 3.03362885967594, "grad_norm": 0.5155388116836548, "learning_rate": 2.9122754872404568e-05, "loss": 0.0641, "step": 9923 }, { "epoch": 3.033934576582085, "grad_norm": 0.35716712474823, "learning_rate": 2.912233026198463e-05, "loss": 0.1051, "step": 9924 }, { "epoch": 3.0342402934882298, "grad_norm": 0.38639330863952637, "learning_rate": 2.912190565156469e-05, "loss": 0.0806, "step": 9925 }, { "epoch": 3.034546010394375, "grad_norm": 0.7412813901901245, "learning_rate": 2.912148104114475e-05, "loss": 0.1028, "step": 9926 }, { "epoch": 3.0348517273005196, "grad_norm": 0.5582312941551208, "learning_rate": 2.912105643072481e-05, "loss": 0.1299, "step": 9927 }, { "epoch": 3.0351574442066647, "grad_norm": 1.0793581008911133, "learning_rate": 2.912063182030487e-05, "loss": 0.0964, "step": 9928 }, { "epoch": 3.0354631611128093, "grad_norm": 0.47238636016845703, "learning_rate": 2.912020720988493e-05, "loss": 0.1306, "step": 9929 }, { "epoch": 3.0357688780189545, "grad_norm": 0.7725540399551392, "learning_rate": 2.911978259946499e-05, "loss": 0.1383, "step": 9930 }, { "epoch": 3.0360745949250996, "grad_norm": 0.8116467595100403, "learning_rate": 2.911935798904505e-05, "loss": 0.1694, "step": 9931 }, { "epoch": 3.0363803118312442, "grad_norm": 1.1140550374984741, "learning_rate": 2.911893337862511e-05, "loss": 0.2335, "step": 9932 }, { "epoch": 3.0366860287373894, "grad_norm": 0.5806221961975098, "learning_rate": 2.9118508768205172e-05, "loss": 0.2119, "step": 9933 }, { "epoch": 3.036991745643534, "grad_norm": 0.6003894209861755, "learning_rate": 2.911808415778523e-05, "loss": 0.2082, "step": 9934 }, { "epoch": 3.037297462549679, "grad_norm": 0.7746021747589111, "learning_rate": 2.9117659547365293e-05, "loss": 0.2161, "step": 9935 }, { "epoch": 3.037603179455824, "grad_norm": 1.1458995342254639, "learning_rate": 2.911723493694535e-05, "loss": 0.2251, "step": 9936 }, { "epoch": 3.037908896361969, "grad_norm": 1.239461064338684, "learning_rate": 2.9116810326525414e-05, "loss": 0.2454, "step": 9937 }, { "epoch": 3.0382146132681136, "grad_norm": 2.2767932415008545, "learning_rate": 2.9116385716105472e-05, "loss": 0.3295, "step": 9938 }, { "epoch": 3.0385203301742587, "grad_norm": 0.3298475444316864, "learning_rate": 2.9115961105685535e-05, "loss": 0.1591, "step": 9939 }, { "epoch": 3.0388260470804034, "grad_norm": 0.25040575861930847, "learning_rate": 2.9115536495265593e-05, "loss": 0.0845, "step": 9940 }, { "epoch": 3.0391317639865485, "grad_norm": 0.29983171820640564, "learning_rate": 2.9115111884845652e-05, "loss": 0.0996, "step": 9941 }, { "epoch": 3.039437480892693, "grad_norm": 0.22631274163722992, "learning_rate": 2.9114687274425714e-05, "loss": 0.0651, "step": 9942 }, { "epoch": 3.0397431977988383, "grad_norm": 0.21979133784770966, "learning_rate": 2.9114262664005773e-05, "loss": 0.0504, "step": 9943 }, { "epoch": 3.0400489147049834, "grad_norm": 0.36350563168525696, "learning_rate": 2.9113838053585835e-05, "loss": 0.0806, "step": 9944 }, { "epoch": 3.040354631611128, "grad_norm": 0.26604020595550537, "learning_rate": 2.9113413443165893e-05, "loss": 0.0768, "step": 9945 }, { "epoch": 3.040660348517273, "grad_norm": 0.37508314847946167, "learning_rate": 2.9112988832745956e-05, "loss": 0.0633, "step": 9946 }, { "epoch": 3.040966065423418, "grad_norm": 0.606623113155365, "learning_rate": 2.9112564222326014e-05, "loss": 0.0846, "step": 9947 }, { "epoch": 3.041271782329563, "grad_norm": 0.33359506726264954, "learning_rate": 2.9112139611906076e-05, "loss": 0.0682, "step": 9948 }, { "epoch": 3.0415774992357076, "grad_norm": 0.3643019199371338, "learning_rate": 2.9111715001486135e-05, "loss": 0.0878, "step": 9949 }, { "epoch": 3.0418832161418528, "grad_norm": 0.33096054196357727, "learning_rate": 2.9111290391066197e-05, "loss": 0.0955, "step": 9950 }, { "epoch": 3.0421889330479974, "grad_norm": 0.5743088722229004, "learning_rate": 2.9110865780646256e-05, "loss": 0.114, "step": 9951 }, { "epoch": 3.0424946499541425, "grad_norm": 0.39921554923057556, "learning_rate": 2.9110441170226318e-05, "loss": 0.1, "step": 9952 }, { "epoch": 3.042800366860287, "grad_norm": 0.47699227929115295, "learning_rate": 2.9110016559806377e-05, "loss": 0.1221, "step": 9953 }, { "epoch": 3.0431060837664323, "grad_norm": 0.6024357080459595, "learning_rate": 2.910959194938644e-05, "loss": 0.1179, "step": 9954 }, { "epoch": 3.043411800672577, "grad_norm": 0.7833068370819092, "learning_rate": 2.91091673389665e-05, "loss": 0.1466, "step": 9955 }, { "epoch": 3.043717517578722, "grad_norm": 0.6036311388015747, "learning_rate": 2.910874272854656e-05, "loss": 0.1686, "step": 9956 }, { "epoch": 3.0440232344848672, "grad_norm": 0.891036331653595, "learning_rate": 2.910831811812662e-05, "loss": 0.1765, "step": 9957 }, { "epoch": 3.044328951391012, "grad_norm": 0.9491572380065918, "learning_rate": 2.910789350770668e-05, "loss": 0.1946, "step": 9958 }, { "epoch": 3.044634668297157, "grad_norm": 1.1428550481796265, "learning_rate": 2.9107468897286742e-05, "loss": 0.1926, "step": 9959 }, { "epoch": 3.0449403852033017, "grad_norm": 1.238967776298523, "learning_rate": 2.91070442868668e-05, "loss": 0.1971, "step": 9960 }, { "epoch": 3.045246102109447, "grad_norm": 1.1865068674087524, "learning_rate": 2.9106619676446863e-05, "loss": 0.2006, "step": 9961 }, { "epoch": 3.0455518190155915, "grad_norm": 1.0945185422897339, "learning_rate": 2.9106195066026922e-05, "loss": 0.197, "step": 9962 }, { "epoch": 3.0458575359217366, "grad_norm": 1.7602344751358032, "learning_rate": 2.9105770455606984e-05, "loss": 0.2856, "step": 9963 }, { "epoch": 3.0461632528278813, "grad_norm": 0.4231763184070587, "learning_rate": 2.9105345845187043e-05, "loss": 0.1456, "step": 9964 }, { "epoch": 3.0464689697340264, "grad_norm": 0.3031119406223297, "learning_rate": 2.9104921234767105e-05, "loss": 0.0801, "step": 9965 }, { "epoch": 3.046774686640171, "grad_norm": 0.39212435483932495, "learning_rate": 2.9104496624347163e-05, "loss": 0.077, "step": 9966 }, { "epoch": 3.047080403546316, "grad_norm": 0.314142644405365, "learning_rate": 2.9104072013927222e-05, "loss": 0.0713, "step": 9967 }, { "epoch": 3.047386120452461, "grad_norm": 0.8179952502250671, "learning_rate": 2.9103647403507284e-05, "loss": 0.0609, "step": 9968 }, { "epoch": 3.047691837358606, "grad_norm": 0.20648305118083954, "learning_rate": 2.9103222793087343e-05, "loss": 0.059, "step": 9969 }, { "epoch": 3.047997554264751, "grad_norm": 0.33909523487091064, "learning_rate": 2.9102798182667405e-05, "loss": 0.0915, "step": 9970 }, { "epoch": 3.0483032711708957, "grad_norm": 0.6398138999938965, "learning_rate": 2.9102373572247464e-05, "loss": 0.0833, "step": 9971 }, { "epoch": 3.048608988077041, "grad_norm": 0.5261000394821167, "learning_rate": 2.9101948961827526e-05, "loss": 0.0717, "step": 9972 }, { "epoch": 3.0489147049831855, "grad_norm": 0.3790794014930725, "learning_rate": 2.9101524351407585e-05, "loss": 0.0656, "step": 9973 }, { "epoch": 3.0492204218893306, "grad_norm": 1.417400598526001, "learning_rate": 2.9101099740987647e-05, "loss": 0.1265, "step": 9974 }, { "epoch": 3.0495261387954753, "grad_norm": 0.4987010657787323, "learning_rate": 2.9100675130567705e-05, "loss": 0.0967, "step": 9975 }, { "epoch": 3.0498318557016204, "grad_norm": 0.6948097944259644, "learning_rate": 2.9100250520147767e-05, "loss": 0.1339, "step": 9976 }, { "epoch": 3.050137572607765, "grad_norm": 2.696585178375244, "learning_rate": 2.9099825909727826e-05, "loss": 0.1005, "step": 9977 }, { "epoch": 3.05044328951391, "grad_norm": 0.47487178444862366, "learning_rate": 2.9099401299307888e-05, "loss": 0.1412, "step": 9978 }, { "epoch": 3.050749006420055, "grad_norm": 0.7186842560768127, "learning_rate": 2.9098976688887947e-05, "loss": 0.1624, "step": 9979 }, { "epoch": 3.0510547233262, "grad_norm": 1.3719522953033447, "learning_rate": 2.9098552078468006e-05, "loss": 0.1384, "step": 9980 }, { "epoch": 3.0513604402323447, "grad_norm": 0.8336465358734131, "learning_rate": 2.9098127468048068e-05, "loss": 0.1885, "step": 9981 }, { "epoch": 3.0516661571384898, "grad_norm": 1.1709837913513184, "learning_rate": 2.9097702857628126e-05, "loss": 0.2263, "step": 9982 }, { "epoch": 3.051971874044635, "grad_norm": 0.8848329782485962, "learning_rate": 2.909727824720819e-05, "loss": 0.1936, "step": 9983 }, { "epoch": 3.0522775909507796, "grad_norm": 1.0356723070144653, "learning_rate": 2.9096853636788247e-05, "loss": 0.2118, "step": 9984 }, { "epoch": 3.0525833078569247, "grad_norm": 0.9136974811553955, "learning_rate": 2.909642902636831e-05, "loss": 0.1754, "step": 9985 }, { "epoch": 3.0528890247630693, "grad_norm": 1.1877378225326538, "learning_rate": 2.9096004415948368e-05, "loss": 0.2008, "step": 9986 }, { "epoch": 3.0531947416692145, "grad_norm": 1.0892009735107422, "learning_rate": 2.909557980552843e-05, "loss": 0.2214, "step": 9987 }, { "epoch": 3.053500458575359, "grad_norm": 1.3733361959457397, "learning_rate": 2.909515519510849e-05, "loss": 0.3299, "step": 9988 }, { "epoch": 3.0538061754815042, "grad_norm": 0.6210670471191406, "learning_rate": 2.909473058468855e-05, "loss": 0.1439, "step": 9989 }, { "epoch": 3.054111892387649, "grad_norm": 0.5189393162727356, "learning_rate": 2.909430597426861e-05, "loss": 0.0978, "step": 9990 }, { "epoch": 3.054417609293794, "grad_norm": 0.2215009480714798, "learning_rate": 2.909388136384867e-05, "loss": 0.0775, "step": 9991 }, { "epoch": 3.0547233261999387, "grad_norm": 0.29092761874198914, "learning_rate": 2.909345675342873e-05, "loss": 0.0497, "step": 9992 }, { "epoch": 3.055029043106084, "grad_norm": 0.33225777745246887, "learning_rate": 2.909303214300879e-05, "loss": 0.0521, "step": 9993 }, { "epoch": 3.0553347600122285, "grad_norm": 0.2332017719745636, "learning_rate": 2.909260753258885e-05, "loss": 0.0648, "step": 9994 }, { "epoch": 3.0556404769183736, "grad_norm": 0.28730300068855286, "learning_rate": 2.909218292216891e-05, "loss": 0.0589, "step": 9995 }, { "epoch": 3.0559461938245187, "grad_norm": 0.25589704513549805, "learning_rate": 2.9091758311748972e-05, "loss": 0.0582, "step": 9996 }, { "epoch": 3.0562519107306634, "grad_norm": 0.32279670238494873, "learning_rate": 2.909133370132903e-05, "loss": 0.0727, "step": 9997 }, { "epoch": 3.0565576276368085, "grad_norm": 0.2891189455986023, "learning_rate": 2.9090909090909093e-05, "loss": 0.0689, "step": 9998 }, { "epoch": 3.056863344542953, "grad_norm": 0.5595896244049072, "learning_rate": 2.909048448048915e-05, "loss": 0.1035, "step": 9999 }, { "epoch": 3.0571690614490983, "grad_norm": 0.3263062834739685, "learning_rate": 2.9090059870069213e-05, "loss": 0.0701, "step": 10000 }, { "epoch": 3.0571690614490983, "eval_cer": 0.19142854025509257, "eval_loss": 0.25329089164733887, "eval_runtime": 19.2123, "eval_samples_per_second": 236.203, "eval_steps_per_second": 0.781, "eval_wer": 0.33909667327728277, "step": 10000 }, { "epoch": 3.057474778355243, "grad_norm": 0.7752137184143066, "learning_rate": 2.9089635259649272e-05, "loss": 0.1059, "step": 10001 }, { "epoch": 3.057780495261388, "grad_norm": 0.6599632501602173, "learning_rate": 2.9089210649229334e-05, "loss": 0.1435, "step": 10002 }, { "epoch": 3.0580862121675327, "grad_norm": 0.5732297301292419, "learning_rate": 2.9088786038809393e-05, "loss": 0.157, "step": 10003 }, { "epoch": 3.058391929073678, "grad_norm": 1.5359485149383545, "learning_rate": 2.9088361428389455e-05, "loss": 0.1685, "step": 10004 }, { "epoch": 3.0586976459798225, "grad_norm": 0.49692270159721375, "learning_rate": 2.9087936817969514e-05, "loss": 0.1613, "step": 10005 }, { "epoch": 3.0590033628859676, "grad_norm": 0.6027255654335022, "learning_rate": 2.9087512207549572e-05, "loss": 0.1674, "step": 10006 }, { "epoch": 3.0593090797921123, "grad_norm": 1.4179556369781494, "learning_rate": 2.9087087597129635e-05, "loss": 0.1839, "step": 10007 }, { "epoch": 3.0596147966982574, "grad_norm": 0.8725062608718872, "learning_rate": 2.9086662986709693e-05, "loss": 0.1936, "step": 10008 }, { "epoch": 3.0599205136044025, "grad_norm": 1.0006487369537354, "learning_rate": 2.9086238376289755e-05, "loss": 0.2201, "step": 10009 }, { "epoch": 3.060226230510547, "grad_norm": 0.728834867477417, "learning_rate": 2.9085813765869814e-05, "loss": 0.1788, "step": 10010 }, { "epoch": 3.0605319474166923, "grad_norm": 0.7991411685943604, "learning_rate": 2.9085389155449876e-05, "loss": 0.2309, "step": 10011 }, { "epoch": 3.060837664322837, "grad_norm": 1.9486746788024902, "learning_rate": 2.9084964545029935e-05, "loss": 0.2623, "step": 10012 }, { "epoch": 3.061143381228982, "grad_norm": 2.2473528385162354, "learning_rate": 2.9084539934609997e-05, "loss": 0.3585, "step": 10013 }, { "epoch": 3.061449098135127, "grad_norm": 0.4509549140930176, "learning_rate": 2.9084115324190056e-05, "loss": 0.1467, "step": 10014 }, { "epoch": 3.061754815041272, "grad_norm": 0.2511819005012512, "learning_rate": 2.9083690713770118e-05, "loss": 0.0906, "step": 10015 }, { "epoch": 3.0620605319474166, "grad_norm": 0.5383131504058838, "learning_rate": 2.9083266103350176e-05, "loss": 0.0777, "step": 10016 }, { "epoch": 3.0623662488535617, "grad_norm": 0.2384410947561264, "learning_rate": 2.908284149293024e-05, "loss": 0.0718, "step": 10017 }, { "epoch": 3.0626719657597063, "grad_norm": 0.768470048904419, "learning_rate": 2.9082416882510297e-05, "loss": 0.06, "step": 10018 }, { "epoch": 3.0629776826658515, "grad_norm": 0.34090808033943176, "learning_rate": 2.9081992272090356e-05, "loss": 0.053, "step": 10019 }, { "epoch": 3.063283399571996, "grad_norm": 0.38054850697517395, "learning_rate": 2.9081567661670418e-05, "loss": 0.0743, "step": 10020 }, { "epoch": 3.0635891164781412, "grad_norm": 0.30358803272247314, "learning_rate": 2.9081143051250477e-05, "loss": 0.0562, "step": 10021 }, { "epoch": 3.0638948333842864, "grad_norm": 0.2769474685192108, "learning_rate": 2.908071844083054e-05, "loss": 0.0705, "step": 10022 }, { "epoch": 3.064200550290431, "grad_norm": 0.23718972504138947, "learning_rate": 2.9080293830410597e-05, "loss": 0.0749, "step": 10023 }, { "epoch": 3.064506267196576, "grad_norm": 0.2660408318042755, "learning_rate": 2.907986921999066e-05, "loss": 0.0857, "step": 10024 }, { "epoch": 3.064811984102721, "grad_norm": 0.44107723236083984, "learning_rate": 2.9079444609570718e-05, "loss": 0.0916, "step": 10025 }, { "epoch": 3.065117701008866, "grad_norm": 0.3565465807914734, "learning_rate": 2.907901999915078e-05, "loss": 0.084, "step": 10026 }, { "epoch": 3.0654234179150106, "grad_norm": 0.43784981966018677, "learning_rate": 2.907859538873084e-05, "loss": 0.1371, "step": 10027 }, { "epoch": 3.0657291348211557, "grad_norm": 0.9045982360839844, "learning_rate": 2.90781707783109e-05, "loss": 0.1175, "step": 10028 }, { "epoch": 3.0660348517273004, "grad_norm": 0.41149282455444336, "learning_rate": 2.907774616789096e-05, "loss": 0.1377, "step": 10029 }, { "epoch": 3.0663405686334455, "grad_norm": 0.41112545132637024, "learning_rate": 2.9077321557471022e-05, "loss": 0.1534, "step": 10030 }, { "epoch": 3.06664628553959, "grad_norm": 0.734288215637207, "learning_rate": 2.907689694705108e-05, "loss": 0.1688, "step": 10031 }, { "epoch": 3.0669520024457353, "grad_norm": 0.761220395565033, "learning_rate": 2.907647233663114e-05, "loss": 0.1398, "step": 10032 }, { "epoch": 3.06725771935188, "grad_norm": 1.311556339263916, "learning_rate": 2.90760477262112e-05, "loss": 0.1857, "step": 10033 }, { "epoch": 3.067563436258025, "grad_norm": 0.8490754961967468, "learning_rate": 2.907562311579126e-05, "loss": 0.1979, "step": 10034 }, { "epoch": 3.06786915316417, "grad_norm": 0.883800745010376, "learning_rate": 2.9075198505371322e-05, "loss": 0.212, "step": 10035 }, { "epoch": 3.068174870070315, "grad_norm": 1.0769314765930176, "learning_rate": 2.907477389495138e-05, "loss": 0.2083, "step": 10036 }, { "epoch": 3.06848058697646, "grad_norm": 1.0748541355133057, "learning_rate": 2.9074349284531443e-05, "loss": 0.2459, "step": 10037 }, { "epoch": 3.0687863038826046, "grad_norm": 1.6452142000198364, "learning_rate": 2.90739246741115e-05, "loss": 0.2305, "step": 10038 }, { "epoch": 3.0690920207887498, "grad_norm": 0.41340774297714233, "learning_rate": 2.9073500063691564e-05, "loss": 0.1467, "step": 10039 }, { "epoch": 3.0693977376948944, "grad_norm": 0.20101012289524078, "learning_rate": 2.9073075453271622e-05, "loss": 0.0874, "step": 10040 }, { "epoch": 3.0697034546010395, "grad_norm": 0.4428040683269501, "learning_rate": 2.9072650842851685e-05, "loss": 0.0824, "step": 10041 }, { "epoch": 3.070009171507184, "grad_norm": 0.3742675483226776, "learning_rate": 2.9072226232431743e-05, "loss": 0.0612, "step": 10042 }, { "epoch": 3.0703148884133293, "grad_norm": 0.2696075439453125, "learning_rate": 2.9071801622011802e-05, "loss": 0.0933, "step": 10043 }, { "epoch": 3.070620605319474, "grad_norm": 0.22469530999660492, "learning_rate": 2.9071377011591864e-05, "loss": 0.0601, "step": 10044 }, { "epoch": 3.070926322225619, "grad_norm": 0.21875572204589844, "learning_rate": 2.9070952401171923e-05, "loss": 0.0588, "step": 10045 }, { "epoch": 3.071232039131764, "grad_norm": 0.45943987369537354, "learning_rate": 2.9070527790751985e-05, "loss": 0.0538, "step": 10046 }, { "epoch": 3.071537756037909, "grad_norm": 0.46818557381629944, "learning_rate": 2.9070103180332044e-05, "loss": 0.0714, "step": 10047 }, { "epoch": 3.071843472944054, "grad_norm": 0.24278977513313293, "learning_rate": 2.9069678569912106e-05, "loss": 0.0676, "step": 10048 }, { "epoch": 3.0721491898501987, "grad_norm": 0.4072519540786743, "learning_rate": 2.9069253959492164e-05, "loss": 0.114, "step": 10049 }, { "epoch": 3.072454906756344, "grad_norm": 0.3440033495426178, "learning_rate": 2.9068829349072226e-05, "loss": 0.0796, "step": 10050 }, { "epoch": 3.0727606236624885, "grad_norm": 0.3648938238620758, "learning_rate": 2.9068404738652285e-05, "loss": 0.0928, "step": 10051 }, { "epoch": 3.0730663405686336, "grad_norm": 0.45745426416397095, "learning_rate": 2.9067980128232347e-05, "loss": 0.1283, "step": 10052 }, { "epoch": 3.0733720574747783, "grad_norm": 0.44091612100601196, "learning_rate": 2.9067555517812406e-05, "loss": 0.1575, "step": 10053 }, { "epoch": 3.0736777743809234, "grad_norm": 0.47077178955078125, "learning_rate": 2.9067130907392468e-05, "loss": 0.1572, "step": 10054 }, { "epoch": 3.073983491287068, "grad_norm": 0.6292386054992676, "learning_rate": 2.9066706296972527e-05, "loss": 0.1541, "step": 10055 }, { "epoch": 3.074289208193213, "grad_norm": 0.6726807355880737, "learning_rate": 2.906628168655259e-05, "loss": 0.1609, "step": 10056 }, { "epoch": 3.074594925099358, "grad_norm": 0.5453143119812012, "learning_rate": 2.906585707613265e-05, "loss": 0.1627, "step": 10057 }, { "epoch": 3.074900642005503, "grad_norm": 0.8598425388336182, "learning_rate": 2.906543246571271e-05, "loss": 0.1887, "step": 10058 }, { "epoch": 3.0752063589116476, "grad_norm": 0.7569369673728943, "learning_rate": 2.906500785529277e-05, "loss": 0.1693, "step": 10059 }, { "epoch": 3.0755120758177927, "grad_norm": 1.0183203220367432, "learning_rate": 2.906458324487283e-05, "loss": 0.1702, "step": 10060 }, { "epoch": 3.075817792723938, "grad_norm": 1.0593981742858887, "learning_rate": 2.9064158634452892e-05, "loss": 0.2185, "step": 10061 }, { "epoch": 3.0761235096300825, "grad_norm": 1.2314079999923706, "learning_rate": 2.906373402403295e-05, "loss": 0.2095, "step": 10062 }, { "epoch": 3.0764292265362276, "grad_norm": 1.5018969774246216, "learning_rate": 2.9063309413613013e-05, "loss": 0.2477, "step": 10063 }, { "epoch": 3.0767349434423723, "grad_norm": 0.6881759762763977, "learning_rate": 2.9062884803193072e-05, "loss": 0.1469, "step": 10064 }, { "epoch": 3.0770406603485174, "grad_norm": 0.29284945130348206, "learning_rate": 2.9062460192773134e-05, "loss": 0.0897, "step": 10065 }, { "epoch": 3.077346377254662, "grad_norm": 0.32500287890434265, "learning_rate": 2.9062035582353193e-05, "loss": 0.0671, "step": 10066 }, { "epoch": 3.077652094160807, "grad_norm": 0.22057361900806427, "learning_rate": 2.9061610971933255e-05, "loss": 0.0558, "step": 10067 }, { "epoch": 3.077957811066952, "grad_norm": 0.31360116600990295, "learning_rate": 2.9061186361513313e-05, "loss": 0.0629, "step": 10068 }, { "epoch": 3.078263527973097, "grad_norm": 0.35599932074546814, "learning_rate": 2.9060761751093372e-05, "loss": 0.0657, "step": 10069 }, { "epoch": 3.0785692448792417, "grad_norm": 0.24790656566619873, "learning_rate": 2.9060337140673434e-05, "loss": 0.0611, "step": 10070 }, { "epoch": 3.0788749617853868, "grad_norm": 0.22037404775619507, "learning_rate": 2.9059912530253493e-05, "loss": 0.0832, "step": 10071 }, { "epoch": 3.0791806786915314, "grad_norm": 0.19495049118995667, "learning_rate": 2.9059487919833555e-05, "loss": 0.0553, "step": 10072 }, { "epoch": 3.0794863955976766, "grad_norm": 0.2453378289937973, "learning_rate": 2.9059063309413614e-05, "loss": 0.0665, "step": 10073 }, { "epoch": 3.0797921125038217, "grad_norm": 0.37647029757499695, "learning_rate": 2.9058638698993676e-05, "loss": 0.0977, "step": 10074 }, { "epoch": 3.0800978294099663, "grad_norm": 0.8393172025680542, "learning_rate": 2.9058214088573735e-05, "loss": 0.0803, "step": 10075 }, { "epoch": 3.0804035463161115, "grad_norm": 0.5765260457992554, "learning_rate": 2.9057789478153797e-05, "loss": 0.1148, "step": 10076 }, { "epoch": 3.080709263222256, "grad_norm": 0.4038340747356415, "learning_rate": 2.9057364867733855e-05, "loss": 0.0784, "step": 10077 }, { "epoch": 3.0810149801284012, "grad_norm": 0.45533373951911926, "learning_rate": 2.9056940257313917e-05, "loss": 0.1293, "step": 10078 }, { "epoch": 3.081320697034546, "grad_norm": 0.5102387070655823, "learning_rate": 2.9056515646893976e-05, "loss": 0.1517, "step": 10079 }, { "epoch": 3.081626413940691, "grad_norm": 0.6716141700744629, "learning_rate": 2.9056091036474038e-05, "loss": 0.1627, "step": 10080 }, { "epoch": 3.0819321308468357, "grad_norm": 0.9886074066162109, "learning_rate": 2.9055666426054097e-05, "loss": 0.1749, "step": 10081 }, { "epoch": 3.082237847752981, "grad_norm": 1.2411950826644897, "learning_rate": 2.9055241815634156e-05, "loss": 0.209, "step": 10082 }, { "epoch": 3.0825435646591255, "grad_norm": 0.9770867824554443, "learning_rate": 2.9054817205214218e-05, "loss": 0.1927, "step": 10083 }, { "epoch": 3.0828492815652706, "grad_norm": 1.4293723106384277, "learning_rate": 2.9054392594794276e-05, "loss": 0.1859, "step": 10084 }, { "epoch": 3.0831549984714153, "grad_norm": 4.110073089599609, "learning_rate": 2.905396798437434e-05, "loss": 0.1948, "step": 10085 }, { "epoch": 3.0834607153775604, "grad_norm": 2.401430130004883, "learning_rate": 2.9053543373954397e-05, "loss": 0.2138, "step": 10086 }, { "epoch": 3.0837664322837055, "grad_norm": 1.3293367624282837, "learning_rate": 2.905311876353446e-05, "loss": 0.2221, "step": 10087 }, { "epoch": 3.08407214918985, "grad_norm": 2.2296526432037354, "learning_rate": 2.9052694153114518e-05, "loss": 0.2648, "step": 10088 }, { "epoch": 3.0843778660959953, "grad_norm": 0.3734959661960602, "learning_rate": 2.905226954269458e-05, "loss": 0.1779, "step": 10089 }, { "epoch": 3.08468358300214, "grad_norm": 0.2198134809732437, "learning_rate": 2.905184493227464e-05, "loss": 0.0739, "step": 10090 }, { "epoch": 3.084989299908285, "grad_norm": 0.6042577028274536, "learning_rate": 2.90514203218547e-05, "loss": 0.0626, "step": 10091 }, { "epoch": 3.0852950168144297, "grad_norm": 0.49336203932762146, "learning_rate": 2.905099571143476e-05, "loss": 0.0707, "step": 10092 }, { "epoch": 3.085600733720575, "grad_norm": 0.17436914145946503, "learning_rate": 2.905057110101482e-05, "loss": 0.0707, "step": 10093 }, { "epoch": 3.0859064506267195, "grad_norm": 0.29508382081985474, "learning_rate": 2.905014649059488e-05, "loss": 0.0496, "step": 10094 }, { "epoch": 3.0862121675328646, "grad_norm": 0.2724146246910095, "learning_rate": 2.904972188017494e-05, "loss": 0.0455, "step": 10095 }, { "epoch": 3.0865178844390093, "grad_norm": 0.3551980257034302, "learning_rate": 2.9049297269755e-05, "loss": 0.0492, "step": 10096 }, { "epoch": 3.0868236013451544, "grad_norm": 0.6239196062088013, "learning_rate": 2.904887265933506e-05, "loss": 0.0643, "step": 10097 }, { "epoch": 3.087129318251299, "grad_norm": 0.30639052391052246, "learning_rate": 2.9048448048915122e-05, "loss": 0.0779, "step": 10098 }, { "epoch": 3.087435035157444, "grad_norm": 0.33164364099502563, "learning_rate": 2.904802343849518e-05, "loss": 0.1159, "step": 10099 }, { "epoch": 3.0877407520635893, "grad_norm": 0.9641338586807251, "learning_rate": 2.9047598828075243e-05, "loss": 0.1069, "step": 10100 }, { "epoch": 3.088046468969734, "grad_norm": 0.5976400375366211, "learning_rate": 2.90471742176553e-05, "loss": 0.0972, "step": 10101 }, { "epoch": 3.088352185875879, "grad_norm": 0.36687833070755005, "learning_rate": 2.9046749607235363e-05, "loss": 0.1195, "step": 10102 }, { "epoch": 3.088657902782024, "grad_norm": 0.5719929337501526, "learning_rate": 2.9046324996815422e-05, "loss": 0.1179, "step": 10103 }, { "epoch": 3.088963619688169, "grad_norm": 0.8923267722129822, "learning_rate": 2.9045900386395484e-05, "loss": 0.1364, "step": 10104 }, { "epoch": 3.0892693365943136, "grad_norm": 0.8033274412155151, "learning_rate": 2.9045475775975543e-05, "loss": 0.1296, "step": 10105 }, { "epoch": 3.0895750535004587, "grad_norm": 0.6403385996818542, "learning_rate": 2.9045051165555605e-05, "loss": 0.1978, "step": 10106 }, { "epoch": 3.0898807704066034, "grad_norm": 1.561601996421814, "learning_rate": 2.9044626555135664e-05, "loss": 0.19, "step": 10107 }, { "epoch": 3.0901864873127485, "grad_norm": 0.6780383586883545, "learning_rate": 2.9044201944715722e-05, "loss": 0.2097, "step": 10108 }, { "epoch": 3.090492204218893, "grad_norm": 0.898051917552948, "learning_rate": 2.9043777334295785e-05, "loss": 0.1773, "step": 10109 }, { "epoch": 3.0907979211250383, "grad_norm": 0.6717727780342102, "learning_rate": 2.9043352723875843e-05, "loss": 0.1669, "step": 10110 }, { "epoch": 3.091103638031183, "grad_norm": 0.8607982397079468, "learning_rate": 2.9042928113455905e-05, "loss": 0.277, "step": 10111 }, { "epoch": 3.091409354937328, "grad_norm": 1.249191164970398, "learning_rate": 2.9042503503035964e-05, "loss": 0.2385, "step": 10112 }, { "epoch": 3.091715071843473, "grad_norm": 2.7037391662597656, "learning_rate": 2.9042078892616026e-05, "loss": 0.2414, "step": 10113 }, { "epoch": 3.092020788749618, "grad_norm": 1.494218349456787, "learning_rate": 2.9041654282196085e-05, "loss": 0.1572, "step": 10114 }, { "epoch": 3.092326505655763, "grad_norm": 0.33083781599998474, "learning_rate": 2.9041229671776147e-05, "loss": 0.0927, "step": 10115 }, { "epoch": 3.0926322225619076, "grad_norm": 0.48243531584739685, "learning_rate": 2.9040805061356206e-05, "loss": 0.0782, "step": 10116 }, { "epoch": 3.0929379394680527, "grad_norm": 0.28220224380493164, "learning_rate": 2.9040380450936268e-05, "loss": 0.0786, "step": 10117 }, { "epoch": 3.0932436563741974, "grad_norm": 0.2587142288684845, "learning_rate": 2.9039955840516326e-05, "loss": 0.0704, "step": 10118 }, { "epoch": 3.0935493732803425, "grad_norm": 0.38914063572883606, "learning_rate": 2.903953123009639e-05, "loss": 0.0794, "step": 10119 }, { "epoch": 3.093855090186487, "grad_norm": 0.40007615089416504, "learning_rate": 2.9039106619676447e-05, "loss": 0.0905, "step": 10120 }, { "epoch": 3.0941608070926323, "grad_norm": 0.38032257556915283, "learning_rate": 2.9038682009256506e-05, "loss": 0.0605, "step": 10121 }, { "epoch": 3.094466523998777, "grad_norm": 0.4262148439884186, "learning_rate": 2.9038257398836568e-05, "loss": 0.0858, "step": 10122 }, { "epoch": 3.094772240904922, "grad_norm": 0.4910809099674225, "learning_rate": 2.9037832788416627e-05, "loss": 0.0837, "step": 10123 }, { "epoch": 3.0950779578110668, "grad_norm": 0.3735189437866211, "learning_rate": 2.903740817799669e-05, "loss": 0.1009, "step": 10124 }, { "epoch": 3.095383674717212, "grad_norm": 0.3343009650707245, "learning_rate": 2.9036983567576747e-05, "loss": 0.1001, "step": 10125 }, { "epoch": 3.095689391623357, "grad_norm": 0.44826170802116394, "learning_rate": 2.903655895715681e-05, "loss": 0.1112, "step": 10126 }, { "epoch": 3.0959951085295017, "grad_norm": 0.691718339920044, "learning_rate": 2.9036134346736868e-05, "loss": 0.1257, "step": 10127 }, { "epoch": 3.0963008254356468, "grad_norm": 0.5176783204078674, "learning_rate": 2.903570973631693e-05, "loss": 0.137, "step": 10128 }, { "epoch": 3.0966065423417914, "grad_norm": 1.0548324584960938, "learning_rate": 2.903528512589699e-05, "loss": 0.1632, "step": 10129 }, { "epoch": 3.0969122592479366, "grad_norm": 0.9064885973930359, "learning_rate": 2.903486051547705e-05, "loss": 0.1722, "step": 10130 }, { "epoch": 3.0972179761540812, "grad_norm": 0.8339161276817322, "learning_rate": 2.903443590505711e-05, "loss": 0.1846, "step": 10131 }, { "epoch": 3.0975236930602263, "grad_norm": 0.7952666878700256, "learning_rate": 2.9034011294637172e-05, "loss": 0.1759, "step": 10132 }, { "epoch": 3.097829409966371, "grad_norm": 0.7220894694328308, "learning_rate": 2.903358668421723e-05, "loss": 0.1733, "step": 10133 }, { "epoch": 3.098135126872516, "grad_norm": 1.673033595085144, "learning_rate": 2.903316207379729e-05, "loss": 0.1807, "step": 10134 }, { "epoch": 3.098440843778661, "grad_norm": 0.8954816460609436, "learning_rate": 2.903273746337735e-05, "loss": 0.2433, "step": 10135 }, { "epoch": 3.098746560684806, "grad_norm": 1.0838631391525269, "learning_rate": 2.903231285295741e-05, "loss": 0.1907, "step": 10136 }, { "epoch": 3.0990522775909506, "grad_norm": 1.8889449834823608, "learning_rate": 2.9031888242537472e-05, "loss": 0.1823, "step": 10137 }, { "epoch": 3.0993579944970957, "grad_norm": 1.4438962936401367, "learning_rate": 2.903146363211753e-05, "loss": 0.2267, "step": 10138 }, { "epoch": 3.099663711403241, "grad_norm": 2.6031017303466797, "learning_rate": 2.9031039021697593e-05, "loss": 0.1471, "step": 10139 }, { "epoch": 3.0999694283093855, "grad_norm": 0.24160915613174438, "learning_rate": 2.903061441127765e-05, "loss": 0.0796, "step": 10140 }, { "epoch": 3.1002751452155306, "grad_norm": 0.37818053364753723, "learning_rate": 2.9030189800857714e-05, "loss": 0.087, "step": 10141 }, { "epoch": 3.1005808621216753, "grad_norm": 0.2767764627933502, "learning_rate": 2.9029765190437772e-05, "loss": 0.0847, "step": 10142 }, { "epoch": 3.1008865790278204, "grad_norm": 0.2580049932003021, "learning_rate": 2.9029340580017835e-05, "loss": 0.0587, "step": 10143 }, { "epoch": 3.101192295933965, "grad_norm": 0.3440234661102295, "learning_rate": 2.9028915969597893e-05, "loss": 0.0639, "step": 10144 }, { "epoch": 3.10149801284011, "grad_norm": 1.4063260555267334, "learning_rate": 2.9028491359177955e-05, "loss": 0.0572, "step": 10145 }, { "epoch": 3.101803729746255, "grad_norm": 0.491294264793396, "learning_rate": 2.9028066748758014e-05, "loss": 0.0723, "step": 10146 }, { "epoch": 3.1021094466524, "grad_norm": 0.6703107953071594, "learning_rate": 2.9027642138338073e-05, "loss": 0.1295, "step": 10147 }, { "epoch": 3.1024151635585446, "grad_norm": 0.5219569802284241, "learning_rate": 2.9027217527918135e-05, "loss": 0.0644, "step": 10148 }, { "epoch": 3.1027208804646897, "grad_norm": 0.47150567173957825, "learning_rate": 2.9026792917498194e-05, "loss": 0.0795, "step": 10149 }, { "epoch": 3.1030265973708344, "grad_norm": 0.4649871289730072, "learning_rate": 2.9026368307078256e-05, "loss": 0.113, "step": 10150 }, { "epoch": 3.1033323142769795, "grad_norm": 0.45036011934280396, "learning_rate": 2.9025943696658314e-05, "loss": 0.0947, "step": 10151 }, { "epoch": 3.1036380311831246, "grad_norm": 0.3624693751335144, "learning_rate": 2.9025519086238376e-05, "loss": 0.0962, "step": 10152 }, { "epoch": 3.1039437480892693, "grad_norm": 0.4273260831832886, "learning_rate": 2.9025094475818435e-05, "loss": 0.1379, "step": 10153 }, { "epoch": 3.1042494649954144, "grad_norm": 17.695415496826172, "learning_rate": 2.9024669865398497e-05, "loss": 0.1546, "step": 10154 }, { "epoch": 3.104555181901559, "grad_norm": 0.7503246665000916, "learning_rate": 2.9024245254978556e-05, "loss": 0.2488, "step": 10155 }, { "epoch": 3.104860898807704, "grad_norm": 1.7017892599105835, "learning_rate": 2.9023820644558618e-05, "loss": 0.1404, "step": 10156 }, { "epoch": 3.105166615713849, "grad_norm": 1.3110790252685547, "learning_rate": 2.9023396034138677e-05, "loss": 0.2032, "step": 10157 }, { "epoch": 3.105472332619994, "grad_norm": 0.9492514729499817, "learning_rate": 2.902297142371874e-05, "loss": 0.2054, "step": 10158 }, { "epoch": 3.1057780495261387, "grad_norm": 1.781901478767395, "learning_rate": 2.90225468132988e-05, "loss": 0.1687, "step": 10159 }, { "epoch": 3.106083766432284, "grad_norm": 0.8614996671676636, "learning_rate": 2.902212220287886e-05, "loss": 0.2007, "step": 10160 }, { "epoch": 3.1063894833384285, "grad_norm": 0.923270583152771, "learning_rate": 2.902169759245892e-05, "loss": 0.1795, "step": 10161 }, { "epoch": 3.1066952002445736, "grad_norm": 1.0844855308532715, "learning_rate": 2.902127298203898e-05, "loss": 0.1994, "step": 10162 }, { "epoch": 3.1070009171507182, "grad_norm": 3.3363242149353027, "learning_rate": 2.9020848371619042e-05, "loss": 0.2449, "step": 10163 }, { "epoch": 3.1073066340568634, "grad_norm": 0.40046781301498413, "learning_rate": 2.90204237611991e-05, "loss": 0.1576, "step": 10164 }, { "epoch": 3.1076123509630085, "grad_norm": 0.31198650598526, "learning_rate": 2.9019999150779163e-05, "loss": 0.0839, "step": 10165 }, { "epoch": 3.107918067869153, "grad_norm": 0.1707192212343216, "learning_rate": 2.9019574540359222e-05, "loss": 0.062, "step": 10166 }, { "epoch": 3.1082237847752983, "grad_norm": 0.286409854888916, "learning_rate": 2.9019149929939284e-05, "loss": 0.078, "step": 10167 }, { "epoch": 3.108529501681443, "grad_norm": 0.23307833075523376, "learning_rate": 2.9018725319519343e-05, "loss": 0.0863, "step": 10168 }, { "epoch": 3.108835218587588, "grad_norm": 0.2747732102870941, "learning_rate": 2.9018300709099405e-05, "loss": 0.0619, "step": 10169 }, { "epoch": 3.1091409354937327, "grad_norm": 0.37478092312812805, "learning_rate": 2.9017876098679464e-05, "loss": 0.0641, "step": 10170 }, { "epoch": 3.109446652399878, "grad_norm": 0.21448466181755066, "learning_rate": 2.9017451488259522e-05, "loss": 0.0481, "step": 10171 }, { "epoch": 3.1097523693060225, "grad_norm": 0.374193012714386, "learning_rate": 2.9017026877839584e-05, "loss": 0.0852, "step": 10172 }, { "epoch": 3.1100580862121676, "grad_norm": 0.24993543326854706, "learning_rate": 2.9016602267419643e-05, "loss": 0.0665, "step": 10173 }, { "epoch": 3.1103638031183123, "grad_norm": 0.4172515273094177, "learning_rate": 2.9016177656999705e-05, "loss": 0.0785, "step": 10174 }, { "epoch": 3.1106695200244574, "grad_norm": 0.34482377767562866, "learning_rate": 2.9015753046579764e-05, "loss": 0.1055, "step": 10175 }, { "epoch": 3.110975236930602, "grad_norm": 0.5373492240905762, "learning_rate": 2.9015328436159826e-05, "loss": 0.1248, "step": 10176 }, { "epoch": 3.111280953836747, "grad_norm": 0.5519682765007019, "learning_rate": 2.9014903825739885e-05, "loss": 0.1012, "step": 10177 }, { "epoch": 3.1115866707428923, "grad_norm": 0.43474748730659485, "learning_rate": 2.9014479215319947e-05, "loss": 0.1433, "step": 10178 }, { "epoch": 3.111892387649037, "grad_norm": 0.7042310237884521, "learning_rate": 2.9014054604900005e-05, "loss": 0.1522, "step": 10179 }, { "epoch": 3.112198104555182, "grad_norm": 0.6776449084281921, "learning_rate": 2.9013629994480067e-05, "loss": 0.1561, "step": 10180 }, { "epoch": 3.1125038214613268, "grad_norm": 0.557202935218811, "learning_rate": 2.9013205384060126e-05, "loss": 0.1789, "step": 10181 }, { "epoch": 3.112809538367472, "grad_norm": 1.0361992120742798, "learning_rate": 2.9012780773640188e-05, "loss": 0.2056, "step": 10182 }, { "epoch": 3.1131152552736165, "grad_norm": 0.8286022543907166, "learning_rate": 2.9012356163220247e-05, "loss": 0.2312, "step": 10183 }, { "epoch": 3.1134209721797617, "grad_norm": 0.6650339365005493, "learning_rate": 2.9011931552800306e-05, "loss": 0.1994, "step": 10184 }, { "epoch": 3.1137266890859063, "grad_norm": 1.1288816928863525, "learning_rate": 2.9011506942380368e-05, "loss": 0.1864, "step": 10185 }, { "epoch": 3.1140324059920514, "grad_norm": 1.2911580801010132, "learning_rate": 2.9011082331960426e-05, "loss": 0.1859, "step": 10186 }, { "epoch": 3.114338122898196, "grad_norm": 0.9096306562423706, "learning_rate": 2.901065772154049e-05, "loss": 0.2175, "step": 10187 }, { "epoch": 3.1146438398043412, "grad_norm": 1.7718554735183716, "learning_rate": 2.9010233111120547e-05, "loss": 0.2373, "step": 10188 }, { "epoch": 3.114949556710486, "grad_norm": 0.3925233781337738, "learning_rate": 2.900980850070061e-05, "loss": 0.1643, "step": 10189 }, { "epoch": 3.115255273616631, "grad_norm": 0.39127621054649353, "learning_rate": 2.9009383890280668e-05, "loss": 0.0821, "step": 10190 }, { "epoch": 3.115560990522776, "grad_norm": 0.24411433935165405, "learning_rate": 2.900895927986073e-05, "loss": 0.0577, "step": 10191 }, { "epoch": 3.115866707428921, "grad_norm": 0.3197753131389618, "learning_rate": 2.900853466944079e-05, "loss": 0.066, "step": 10192 }, { "epoch": 3.116172424335066, "grad_norm": 0.24409236013889313, "learning_rate": 2.900811005902085e-05, "loss": 0.0571, "step": 10193 }, { "epoch": 3.1164781412412106, "grad_norm": 0.23779897391796112, "learning_rate": 2.900768544860091e-05, "loss": 0.0805, "step": 10194 }, { "epoch": 3.1167838581473557, "grad_norm": 0.26942920684814453, "learning_rate": 2.900726083818097e-05, "loss": 0.0588, "step": 10195 }, { "epoch": 3.1170895750535004, "grad_norm": 0.245290607213974, "learning_rate": 2.900683622776103e-05, "loss": 0.064, "step": 10196 }, { "epoch": 3.1173952919596455, "grad_norm": 0.5660454630851746, "learning_rate": 2.900641161734109e-05, "loss": 0.0867, "step": 10197 }, { "epoch": 3.11770100886579, "grad_norm": 0.27667325735092163, "learning_rate": 2.900598700692115e-05, "loss": 0.0731, "step": 10198 }, { "epoch": 3.1180067257719353, "grad_norm": 0.4201212525367737, "learning_rate": 2.900556239650121e-05, "loss": 0.0831, "step": 10199 }, { "epoch": 3.11831244267808, "grad_norm": 0.5643944144248962, "learning_rate": 2.9005137786081272e-05, "loss": 0.0887, "step": 10200 }, { "epoch": 3.118618159584225, "grad_norm": 0.4706017076969147, "learning_rate": 2.900471317566133e-05, "loss": 0.0949, "step": 10201 }, { "epoch": 3.1189238764903697, "grad_norm": 0.6028682589530945, "learning_rate": 2.9004288565241393e-05, "loss": 0.1311, "step": 10202 }, { "epoch": 3.119229593396515, "grad_norm": 0.5093600153923035, "learning_rate": 2.900386395482145e-05, "loss": 0.1223, "step": 10203 }, { "epoch": 3.11953531030266, "grad_norm": 1.010223388671875, "learning_rate": 2.9003439344401514e-05, "loss": 0.1602, "step": 10204 }, { "epoch": 3.1198410272088046, "grad_norm": 0.3855401873588562, "learning_rate": 2.9003014733981572e-05, "loss": 0.167, "step": 10205 }, { "epoch": 3.1201467441149497, "grad_norm": 0.7130523324012756, "learning_rate": 2.9002590123561634e-05, "loss": 0.19, "step": 10206 }, { "epoch": 3.1204524610210944, "grad_norm": 0.633550763130188, "learning_rate": 2.9002165513141693e-05, "loss": 0.2067, "step": 10207 }, { "epoch": 3.1207581779272395, "grad_norm": 0.6608103513717651, "learning_rate": 2.9001740902721755e-05, "loss": 0.1592, "step": 10208 }, { "epoch": 3.121063894833384, "grad_norm": 1.0225163698196411, "learning_rate": 2.9001316292301814e-05, "loss": 0.1546, "step": 10209 }, { "epoch": 3.1213696117395293, "grad_norm": 0.6879536509513855, "learning_rate": 2.9000891681881872e-05, "loss": 0.1904, "step": 10210 }, { "epoch": 3.121675328645674, "grad_norm": 1.033057451248169, "learning_rate": 2.9000467071461935e-05, "loss": 0.2291, "step": 10211 }, { "epoch": 3.121981045551819, "grad_norm": 1.1008775234222412, "learning_rate": 2.9000042461041993e-05, "loss": 0.253, "step": 10212 }, { "epoch": 3.1222867624579638, "grad_norm": 1.4021543264389038, "learning_rate": 2.8999617850622055e-05, "loss": 0.276, "step": 10213 }, { "epoch": 3.122592479364109, "grad_norm": 0.44944489002227783, "learning_rate": 2.8999193240202114e-05, "loss": 0.1654, "step": 10214 }, { "epoch": 3.1228981962702536, "grad_norm": 0.3052438497543335, "learning_rate": 2.8998768629782176e-05, "loss": 0.0846, "step": 10215 }, { "epoch": 3.1232039131763987, "grad_norm": 0.2623600959777832, "learning_rate": 2.8998344019362235e-05, "loss": 0.0811, "step": 10216 }, { "epoch": 3.123509630082544, "grad_norm": 0.2474243938922882, "learning_rate": 2.8997919408942297e-05, "loss": 0.0653, "step": 10217 }, { "epoch": 3.1238153469886885, "grad_norm": 0.23398780822753906, "learning_rate": 2.8997494798522356e-05, "loss": 0.0691, "step": 10218 }, { "epoch": 3.1241210638948336, "grad_norm": 0.2907038927078247, "learning_rate": 2.8997070188102418e-05, "loss": 0.0498, "step": 10219 }, { "epoch": 3.1244267808009782, "grad_norm": 0.25155186653137207, "learning_rate": 2.8996645577682476e-05, "loss": 0.0491, "step": 10220 }, { "epoch": 3.1247324977071234, "grad_norm": 0.3254307210445404, "learning_rate": 2.899622096726254e-05, "loss": 0.0621, "step": 10221 }, { "epoch": 3.125038214613268, "grad_norm": 0.747490406036377, "learning_rate": 2.8995796356842597e-05, "loss": 0.0672, "step": 10222 }, { "epoch": 3.125343931519413, "grad_norm": 0.2551405131816864, "learning_rate": 2.8995371746422656e-05, "loss": 0.076, "step": 10223 }, { "epoch": 3.125649648425558, "grad_norm": 0.559005618095398, "learning_rate": 2.8994947136002718e-05, "loss": 0.0812, "step": 10224 }, { "epoch": 3.125955365331703, "grad_norm": 0.3586134612560272, "learning_rate": 2.8994522525582777e-05, "loss": 0.0955, "step": 10225 }, { "epoch": 3.1262610822378476, "grad_norm": 0.847227156162262, "learning_rate": 2.899409791516284e-05, "loss": 0.102, "step": 10226 }, { "epoch": 3.1265667991439927, "grad_norm": 0.3877527713775635, "learning_rate": 2.8993673304742898e-05, "loss": 0.1259, "step": 10227 }, { "epoch": 3.1268725160501374, "grad_norm": 0.45991000533103943, "learning_rate": 2.899324869432296e-05, "loss": 0.151, "step": 10228 }, { "epoch": 3.1271782329562825, "grad_norm": 0.5128366947174072, "learning_rate": 2.8992824083903018e-05, "loss": 0.1479, "step": 10229 }, { "epoch": 3.1274839498624276, "grad_norm": 0.9297137260437012, "learning_rate": 2.899239947348308e-05, "loss": 0.1857, "step": 10230 }, { "epoch": 3.1277896667685723, "grad_norm": 1.7098102569580078, "learning_rate": 2.899197486306314e-05, "loss": 0.154, "step": 10231 }, { "epoch": 3.1280953836747174, "grad_norm": 0.6439261436462402, "learning_rate": 2.89915502526432e-05, "loss": 0.1808, "step": 10232 }, { "epoch": 3.128401100580862, "grad_norm": 2.1485469341278076, "learning_rate": 2.899112564222326e-05, "loss": 0.1594, "step": 10233 }, { "epoch": 3.128706817487007, "grad_norm": 1.0043927431106567, "learning_rate": 2.8990701031803322e-05, "loss": 0.1859, "step": 10234 }, { "epoch": 3.129012534393152, "grad_norm": 0.7461325526237488, "learning_rate": 2.899027642138338e-05, "loss": 0.1951, "step": 10235 }, { "epoch": 3.129318251299297, "grad_norm": 1.5041502714157104, "learning_rate": 2.898985181096344e-05, "loss": 0.2023, "step": 10236 }, { "epoch": 3.1296239682054416, "grad_norm": 1.4212908744812012, "learning_rate": 2.89894272005435e-05, "loss": 0.1842, "step": 10237 }, { "epoch": 3.1299296851115868, "grad_norm": 1.2839913368225098, "learning_rate": 2.898900259012356e-05, "loss": 0.243, "step": 10238 }, { "epoch": 3.1302354020177314, "grad_norm": 0.4903572201728821, "learning_rate": 2.8988577979703622e-05, "loss": 0.1547, "step": 10239 }, { "epoch": 3.1305411189238765, "grad_norm": 0.5030423402786255, "learning_rate": 2.898815336928368e-05, "loss": 0.0922, "step": 10240 }, { "epoch": 3.130846835830021, "grad_norm": 0.2477397322654724, "learning_rate": 2.8987728758863743e-05, "loss": 0.0584, "step": 10241 }, { "epoch": 3.1311525527361663, "grad_norm": 0.7632263898849487, "learning_rate": 2.8987304148443802e-05, "loss": 0.0737, "step": 10242 }, { "epoch": 3.1314582696423114, "grad_norm": 0.16258756816387177, "learning_rate": 2.8986879538023864e-05, "loss": 0.0428, "step": 10243 }, { "epoch": 3.131763986548456, "grad_norm": 0.29201072454452515, "learning_rate": 2.8986454927603923e-05, "loss": 0.0487, "step": 10244 }, { "epoch": 3.1320697034546012, "grad_norm": 0.2376793920993805, "learning_rate": 2.8986030317183985e-05, "loss": 0.0598, "step": 10245 }, { "epoch": 3.132375420360746, "grad_norm": 0.24498184025287628, "learning_rate": 2.8985605706764043e-05, "loss": 0.0455, "step": 10246 }, { "epoch": 3.132681137266891, "grad_norm": 0.24297019839286804, "learning_rate": 2.8985181096344105e-05, "loss": 0.0789, "step": 10247 }, { "epoch": 3.1329868541730357, "grad_norm": 0.2842842936515808, "learning_rate": 2.8984756485924164e-05, "loss": 0.0666, "step": 10248 }, { "epoch": 3.133292571079181, "grad_norm": 0.2996571362018585, "learning_rate": 2.8984331875504223e-05, "loss": 0.0865, "step": 10249 }, { "epoch": 3.1335982879853255, "grad_norm": 0.31700843572616577, "learning_rate": 2.8983907265084285e-05, "loss": 0.0939, "step": 10250 }, { "epoch": 3.1339040048914706, "grad_norm": 0.6342162489891052, "learning_rate": 2.8983482654664344e-05, "loss": 0.1039, "step": 10251 }, { "epoch": 3.1342097217976153, "grad_norm": 0.42932021617889404, "learning_rate": 2.8983058044244406e-05, "loss": 0.108, "step": 10252 }, { "epoch": 3.1345154387037604, "grad_norm": 0.42954373359680176, "learning_rate": 2.8982633433824464e-05, "loss": 0.1236, "step": 10253 }, { "epoch": 3.134821155609905, "grad_norm": 0.5402647852897644, "learning_rate": 2.8982208823404526e-05, "loss": 0.1848, "step": 10254 }, { "epoch": 3.13512687251605, "grad_norm": 0.7672491669654846, "learning_rate": 2.8981784212984585e-05, "loss": 0.1537, "step": 10255 }, { "epoch": 3.1354325894221953, "grad_norm": 0.8763360381126404, "learning_rate": 2.8981359602564647e-05, "loss": 0.1928, "step": 10256 }, { "epoch": 3.13573830632834, "grad_norm": 0.8873546719551086, "learning_rate": 2.8980934992144706e-05, "loss": 0.1643, "step": 10257 }, { "epoch": 3.136044023234485, "grad_norm": 1.3063812255859375, "learning_rate": 2.8980510381724768e-05, "loss": 0.1753, "step": 10258 }, { "epoch": 3.1363497401406297, "grad_norm": 0.8618035912513733, "learning_rate": 2.8980085771304827e-05, "loss": 0.1772, "step": 10259 }, { "epoch": 3.136655457046775, "grad_norm": 0.7086197733879089, "learning_rate": 2.897966116088489e-05, "loss": 0.1948, "step": 10260 }, { "epoch": 3.1369611739529195, "grad_norm": 0.9924372434616089, "learning_rate": 2.897923655046495e-05, "loss": 0.1776, "step": 10261 }, { "epoch": 3.1372668908590646, "grad_norm": 2.723292589187622, "learning_rate": 2.897881194004501e-05, "loss": 0.2298, "step": 10262 }, { "epoch": 3.1375726077652093, "grad_norm": 1.7389355897903442, "learning_rate": 2.897838732962507e-05, "loss": 0.3162, "step": 10263 }, { "epoch": 3.1378783246713544, "grad_norm": 0.5222474336624146, "learning_rate": 2.897796271920513e-05, "loss": 0.1637, "step": 10264 }, { "epoch": 3.138184041577499, "grad_norm": 0.3561881184577942, "learning_rate": 2.8977538108785192e-05, "loss": 0.0708, "step": 10265 }, { "epoch": 3.138489758483644, "grad_norm": 0.2615278661251068, "learning_rate": 2.897711349836525e-05, "loss": 0.0821, "step": 10266 }, { "epoch": 3.138795475389789, "grad_norm": 0.18635202944278717, "learning_rate": 2.8976688887945313e-05, "loss": 0.0498, "step": 10267 }, { "epoch": 3.139101192295934, "grad_norm": 0.704046368598938, "learning_rate": 2.8976264277525372e-05, "loss": 0.0992, "step": 10268 }, { "epoch": 3.139406909202079, "grad_norm": 0.21113035082817078, "learning_rate": 2.8975839667105434e-05, "loss": 0.0615, "step": 10269 }, { "epoch": 3.1397126261082238, "grad_norm": 0.185187429189682, "learning_rate": 2.8975415056685493e-05, "loss": 0.0378, "step": 10270 }, { "epoch": 3.140018343014369, "grad_norm": 0.24964606761932373, "learning_rate": 2.8974990446265555e-05, "loss": 0.0927, "step": 10271 }, { "epoch": 3.1403240599205136, "grad_norm": 0.38263237476348877, "learning_rate": 2.8974565835845614e-05, "loss": 0.0607, "step": 10272 }, { "epoch": 3.1406297768266587, "grad_norm": 0.48742803931236267, "learning_rate": 2.8974141225425676e-05, "loss": 0.0811, "step": 10273 }, { "epoch": 3.1409354937328033, "grad_norm": 0.3578076660633087, "learning_rate": 2.8973716615005734e-05, "loss": 0.0901, "step": 10274 }, { "epoch": 3.1412412106389485, "grad_norm": 0.3351646661758423, "learning_rate": 2.8973292004585793e-05, "loss": 0.0794, "step": 10275 }, { "epoch": 3.141546927545093, "grad_norm": 0.2636694610118866, "learning_rate": 2.8972867394165855e-05, "loss": 0.0673, "step": 10276 }, { "epoch": 3.1418526444512382, "grad_norm": 0.4289308786392212, "learning_rate": 2.8972442783745914e-05, "loss": 0.1494, "step": 10277 }, { "epoch": 3.142158361357383, "grad_norm": 0.42700907588005066, "learning_rate": 2.8972018173325976e-05, "loss": 0.1685, "step": 10278 }, { "epoch": 3.142464078263528, "grad_norm": 0.6224977374076843, "learning_rate": 2.8971593562906035e-05, "loss": 0.1658, "step": 10279 }, { "epoch": 3.1427697951696727, "grad_norm": 0.39091455936431885, "learning_rate": 2.8971168952486097e-05, "loss": 0.1767, "step": 10280 }, { "epoch": 3.143075512075818, "grad_norm": 0.4942493140697479, "learning_rate": 2.8970744342066155e-05, "loss": 0.1558, "step": 10281 }, { "epoch": 3.143381228981963, "grad_norm": 0.516236424446106, "learning_rate": 2.8970319731646217e-05, "loss": 0.1587, "step": 10282 }, { "epoch": 3.1436869458881076, "grad_norm": 0.44774961471557617, "learning_rate": 2.8969895121226276e-05, "loss": 0.1675, "step": 10283 }, { "epoch": 3.1439926627942527, "grad_norm": 0.9338566064834595, "learning_rate": 2.8969470510806338e-05, "loss": 0.1929, "step": 10284 }, { "epoch": 3.1442983797003974, "grad_norm": 2.035860776901245, "learning_rate": 2.8969045900386397e-05, "loss": 0.2098, "step": 10285 }, { "epoch": 3.1446040966065425, "grad_norm": 1.1478455066680908, "learning_rate": 2.8968621289966456e-05, "loss": 0.2592, "step": 10286 }, { "epoch": 3.144909813512687, "grad_norm": 0.7525178790092468, "learning_rate": 2.8968196679546518e-05, "loss": 0.2135, "step": 10287 }, { "epoch": 3.1452155304188323, "grad_norm": 1.7376807928085327, "learning_rate": 2.8967772069126576e-05, "loss": 0.388, "step": 10288 }, { "epoch": 3.145521247324977, "grad_norm": 0.41085371375083923, "learning_rate": 2.896734745870664e-05, "loss": 0.147, "step": 10289 }, { "epoch": 3.145826964231122, "grad_norm": 0.24897074699401855, "learning_rate": 2.8966922848286697e-05, "loss": 0.0917, "step": 10290 }, { "epoch": 3.1461326811372667, "grad_norm": 0.24241597950458527, "learning_rate": 2.896649823786676e-05, "loss": 0.0819, "step": 10291 }, { "epoch": 3.146438398043412, "grad_norm": 0.1660022884607315, "learning_rate": 2.8966073627446818e-05, "loss": 0.053, "step": 10292 }, { "epoch": 3.1467441149495565, "grad_norm": 0.18616977334022522, "learning_rate": 2.896564901702688e-05, "loss": 0.095, "step": 10293 }, { "epoch": 3.1470498318557016, "grad_norm": 0.2733456492424011, "learning_rate": 2.896522440660694e-05, "loss": 0.0465, "step": 10294 }, { "epoch": 3.1473555487618468, "grad_norm": 0.2541136145591736, "learning_rate": 2.8964799796187e-05, "loss": 0.0617, "step": 10295 }, { "epoch": 3.1476612656679914, "grad_norm": 0.37824615836143494, "learning_rate": 2.896437518576706e-05, "loss": 0.0931, "step": 10296 }, { "epoch": 3.1479669825741365, "grad_norm": 0.2931547462940216, "learning_rate": 2.8963950575347122e-05, "loss": 0.0622, "step": 10297 }, { "epoch": 3.148272699480281, "grad_norm": 0.4609910547733307, "learning_rate": 2.896352596492718e-05, "loss": 0.0563, "step": 10298 }, { "epoch": 3.1485784163864263, "grad_norm": 0.7953585982322693, "learning_rate": 2.896310135450724e-05, "loss": 0.1048, "step": 10299 }, { "epoch": 3.148884133292571, "grad_norm": 0.3550972044467926, "learning_rate": 2.89626767440873e-05, "loss": 0.0785, "step": 10300 }, { "epoch": 3.149189850198716, "grad_norm": 0.22483272850513458, "learning_rate": 2.896225213366736e-05, "loss": 0.0795, "step": 10301 }, { "epoch": 3.149495567104861, "grad_norm": 0.30775538086891174, "learning_rate": 2.8961827523247422e-05, "loss": 0.1231, "step": 10302 }, { "epoch": 3.149801284011006, "grad_norm": 0.41634538769721985, "learning_rate": 2.896140291282748e-05, "loss": 0.116, "step": 10303 }, { "epoch": 3.1501070009171506, "grad_norm": 0.4577889144420624, "learning_rate": 2.8960978302407543e-05, "loss": 0.135, "step": 10304 }, { "epoch": 3.1504127178232957, "grad_norm": 0.7097747325897217, "learning_rate": 2.89605536919876e-05, "loss": 0.1577, "step": 10305 }, { "epoch": 3.1507184347294404, "grad_norm": 0.42533960938453674, "learning_rate": 2.8960129081567664e-05, "loss": 0.1883, "step": 10306 }, { "epoch": 3.1510241516355855, "grad_norm": 0.7189053893089294, "learning_rate": 2.8959704471147722e-05, "loss": 0.1599, "step": 10307 }, { "epoch": 3.1513298685417306, "grad_norm": 0.7183886170387268, "learning_rate": 2.8959279860727784e-05, "loss": 0.2024, "step": 10308 }, { "epoch": 3.1516355854478753, "grad_norm": 0.5874846577644348, "learning_rate": 2.8958855250307843e-05, "loss": 0.1969, "step": 10309 }, { "epoch": 3.1519413023540204, "grad_norm": 0.5393203496932983, "learning_rate": 2.8958430639887905e-05, "loss": 0.181, "step": 10310 }, { "epoch": 3.152247019260165, "grad_norm": 1.3771523237228394, "learning_rate": 2.8958006029467964e-05, "loss": 0.212, "step": 10311 }, { "epoch": 3.15255273616631, "grad_norm": 1.3864179849624634, "learning_rate": 2.8957581419048023e-05, "loss": 0.2033, "step": 10312 }, { "epoch": 3.152858453072455, "grad_norm": 2.5158560276031494, "learning_rate": 2.8957156808628085e-05, "loss": 0.2634, "step": 10313 }, { "epoch": 3.1531641699786, "grad_norm": 0.3594142496585846, "learning_rate": 2.8956732198208143e-05, "loss": 0.148, "step": 10314 }, { "epoch": 3.1534698868847446, "grad_norm": 0.21951749920845032, "learning_rate": 2.8956307587788205e-05, "loss": 0.0745, "step": 10315 }, { "epoch": 3.1537756037908897, "grad_norm": 0.21078631281852722, "learning_rate": 2.8955882977368264e-05, "loss": 0.0633, "step": 10316 }, { "epoch": 3.1540813206970344, "grad_norm": 0.15636320412158966, "learning_rate": 2.8955458366948326e-05, "loss": 0.0551, "step": 10317 }, { "epoch": 3.1543870376031795, "grad_norm": 0.2793445289134979, "learning_rate": 2.8955033756528385e-05, "loss": 0.0733, "step": 10318 }, { "epoch": 3.154692754509324, "grad_norm": 0.2408265769481659, "learning_rate": 2.8954609146108447e-05, "loss": 0.0495, "step": 10319 }, { "epoch": 3.1549984714154693, "grad_norm": 0.573138952255249, "learning_rate": 2.8954184535688506e-05, "loss": 0.0631, "step": 10320 }, { "epoch": 3.1553041883216144, "grad_norm": 0.5133402347564697, "learning_rate": 2.8953759925268568e-05, "loss": 0.0899, "step": 10321 }, { "epoch": 3.155609905227759, "grad_norm": 0.3015466630458832, "learning_rate": 2.8953335314848626e-05, "loss": 0.0697, "step": 10322 }, { "epoch": 3.155915622133904, "grad_norm": 0.35292646288871765, "learning_rate": 2.895291070442869e-05, "loss": 0.0592, "step": 10323 }, { "epoch": 3.156221339040049, "grad_norm": 0.3003939688205719, "learning_rate": 2.8952486094008747e-05, "loss": 0.0745, "step": 10324 }, { "epoch": 3.156527055946194, "grad_norm": 0.348146915435791, "learning_rate": 2.8952061483588806e-05, "loss": 0.0701, "step": 10325 }, { "epoch": 3.1568327728523387, "grad_norm": 0.6308395266532898, "learning_rate": 2.8951636873168868e-05, "loss": 0.1365, "step": 10326 }, { "epoch": 3.1571384897584838, "grad_norm": 0.49866145849227905, "learning_rate": 2.8951212262748927e-05, "loss": 0.1083, "step": 10327 }, { "epoch": 3.1574442066646284, "grad_norm": 0.31493446230888367, "learning_rate": 2.895078765232899e-05, "loss": 0.1205, "step": 10328 }, { "epoch": 3.1577499235707736, "grad_norm": 0.4502767026424408, "learning_rate": 2.8950363041909048e-05, "loss": 0.1707, "step": 10329 }, { "epoch": 3.1580556404769182, "grad_norm": 0.49606582522392273, "learning_rate": 2.894993843148911e-05, "loss": 0.1338, "step": 10330 }, { "epoch": 3.1583613573830633, "grad_norm": 0.5330062508583069, "learning_rate": 2.894951382106917e-05, "loss": 0.1906, "step": 10331 }, { "epoch": 3.158667074289208, "grad_norm": 0.68075031042099, "learning_rate": 2.894908921064923e-05, "loss": 0.174, "step": 10332 }, { "epoch": 3.158972791195353, "grad_norm": 0.6249681115150452, "learning_rate": 2.894866460022929e-05, "loss": 0.1865, "step": 10333 }, { "epoch": 3.1592785081014982, "grad_norm": 0.5545948147773743, "learning_rate": 2.894823998980935e-05, "loss": 0.2217, "step": 10334 }, { "epoch": 3.159584225007643, "grad_norm": 0.8028237819671631, "learning_rate": 2.894781537938941e-05, "loss": 0.1695, "step": 10335 }, { "epoch": 3.159889941913788, "grad_norm": 1.0245047807693481, "learning_rate": 2.8947390768969472e-05, "loss": 0.2096, "step": 10336 }, { "epoch": 3.1601956588199327, "grad_norm": 1.0390950441360474, "learning_rate": 2.894696615854953e-05, "loss": 0.2181, "step": 10337 }, { "epoch": 3.160501375726078, "grad_norm": 1.5423967838287354, "learning_rate": 2.894654154812959e-05, "loss": 0.2268, "step": 10338 }, { "epoch": 3.1608070926322225, "grad_norm": 0.7271143794059753, "learning_rate": 2.894611693770965e-05, "loss": 0.1419, "step": 10339 }, { "epoch": 3.1611128095383676, "grad_norm": 0.3446357250213623, "learning_rate": 2.894569232728971e-05, "loss": 0.106, "step": 10340 }, { "epoch": 3.1614185264445123, "grad_norm": 0.37693551182746887, "learning_rate": 2.8945267716869772e-05, "loss": 0.0953, "step": 10341 }, { "epoch": 3.1617242433506574, "grad_norm": 0.43417707085609436, "learning_rate": 2.894484310644983e-05, "loss": 0.0699, "step": 10342 }, { "epoch": 3.162029960256802, "grad_norm": 0.29777127504348755, "learning_rate": 2.8944418496029893e-05, "loss": 0.0557, "step": 10343 }, { "epoch": 3.162335677162947, "grad_norm": 0.32569563388824463, "learning_rate": 2.8943993885609952e-05, "loss": 0.0708, "step": 10344 }, { "epoch": 3.162641394069092, "grad_norm": 0.4559873342514038, "learning_rate": 2.8943569275190014e-05, "loss": 0.0627, "step": 10345 }, { "epoch": 3.162947110975237, "grad_norm": 0.2646925747394562, "learning_rate": 2.8943144664770073e-05, "loss": 0.079, "step": 10346 }, { "epoch": 3.163252827881382, "grad_norm": 0.28513431549072266, "learning_rate": 2.8942720054350135e-05, "loss": 0.051, "step": 10347 }, { "epoch": 3.1635585447875267, "grad_norm": 0.3178243339061737, "learning_rate": 2.8942295443930193e-05, "loss": 0.0875, "step": 10348 }, { "epoch": 3.163864261693672, "grad_norm": 0.39546123147010803, "learning_rate": 2.8941870833510255e-05, "loss": 0.0994, "step": 10349 }, { "epoch": 3.1641699785998165, "grad_norm": 0.5625643134117126, "learning_rate": 2.8941446223090314e-05, "loss": 0.076, "step": 10350 }, { "epoch": 3.1644756955059616, "grad_norm": 0.3211381435394287, "learning_rate": 2.8941021612670373e-05, "loss": 0.1209, "step": 10351 }, { "epoch": 3.1647814124121063, "grad_norm": 0.9273998737335205, "learning_rate": 2.8940597002250435e-05, "loss": 0.1601, "step": 10352 }, { "epoch": 3.1650871293182514, "grad_norm": 0.3119290769100189, "learning_rate": 2.8940172391830494e-05, "loss": 0.113, "step": 10353 }, { "epoch": 3.165392846224396, "grad_norm": 0.5781148672103882, "learning_rate": 2.8939747781410556e-05, "loss": 0.1474, "step": 10354 }, { "epoch": 3.165698563130541, "grad_norm": 0.8124240040779114, "learning_rate": 2.8939323170990614e-05, "loss": 0.1659, "step": 10355 }, { "epoch": 3.166004280036686, "grad_norm": 0.6231951117515564, "learning_rate": 2.8938898560570676e-05, "loss": 0.1523, "step": 10356 }, { "epoch": 3.166309996942831, "grad_norm": 0.6175893545150757, "learning_rate": 2.8938473950150735e-05, "loss": 0.2108, "step": 10357 }, { "epoch": 3.1666157138489757, "grad_norm": 0.5559443235397339, "learning_rate": 2.8938049339730797e-05, "loss": 0.189, "step": 10358 }, { "epoch": 3.166921430755121, "grad_norm": 0.656129777431488, "learning_rate": 2.8937624729310856e-05, "loss": 0.1953, "step": 10359 }, { "epoch": 3.167227147661266, "grad_norm": 1.3767762184143066, "learning_rate": 2.8937200118890918e-05, "loss": 0.1829, "step": 10360 }, { "epoch": 3.1675328645674106, "grad_norm": 1.7427644729614258, "learning_rate": 2.8936775508470977e-05, "loss": 0.2144, "step": 10361 }, { "epoch": 3.1678385814735557, "grad_norm": 1.0417771339416504, "learning_rate": 2.893635089805104e-05, "loss": 0.251, "step": 10362 }, { "epoch": 3.1681442983797004, "grad_norm": 1.4137701988220215, "learning_rate": 2.89359262876311e-05, "loss": 0.2476, "step": 10363 }, { "epoch": 3.1684500152858455, "grad_norm": 0.32380592823028564, "learning_rate": 2.893550167721116e-05, "loss": 0.1262, "step": 10364 }, { "epoch": 3.16875573219199, "grad_norm": 0.2249813675880432, "learning_rate": 2.8935077066791222e-05, "loss": 0.0788, "step": 10365 }, { "epoch": 3.1690614490981353, "grad_norm": 0.2198571115732193, "learning_rate": 2.893465245637128e-05, "loss": 0.0493, "step": 10366 }, { "epoch": 3.16936716600428, "grad_norm": 0.4216362535953522, "learning_rate": 2.8934227845951343e-05, "loss": 0.1058, "step": 10367 }, { "epoch": 3.169672882910425, "grad_norm": 0.26356038451194763, "learning_rate": 2.89338032355314e-05, "loss": 0.0532, "step": 10368 }, { "epoch": 3.1699785998165697, "grad_norm": 0.2522503733634949, "learning_rate": 2.8933378625111463e-05, "loss": 0.0625, "step": 10369 }, { "epoch": 3.170284316722715, "grad_norm": 0.5914239883422852, "learning_rate": 2.8932954014691522e-05, "loss": 0.0714, "step": 10370 }, { "epoch": 3.1705900336288595, "grad_norm": 0.5363079905509949, "learning_rate": 2.8932529404271584e-05, "loss": 0.055, "step": 10371 }, { "epoch": 3.1708957505350046, "grad_norm": 0.19637542963027954, "learning_rate": 2.8932104793851643e-05, "loss": 0.0685, "step": 10372 }, { "epoch": 3.1712014674411497, "grad_norm": 0.35744789242744446, "learning_rate": 2.8931680183431705e-05, "loss": 0.0879, "step": 10373 }, { "epoch": 3.1715071843472944, "grad_norm": 0.7409740090370178, "learning_rate": 2.8931255573011764e-05, "loss": 0.084, "step": 10374 }, { "epoch": 3.1718129012534395, "grad_norm": 0.2922605872154236, "learning_rate": 2.8930830962591826e-05, "loss": 0.0855, "step": 10375 }, { "epoch": 3.172118618159584, "grad_norm": 0.31801140308380127, "learning_rate": 2.8930406352171884e-05, "loss": 0.0797, "step": 10376 }, { "epoch": 3.1724243350657293, "grad_norm": 1.4916596412658691, "learning_rate": 2.8929981741751943e-05, "loss": 0.177, "step": 10377 }, { "epoch": 3.172730051971874, "grad_norm": 0.4245353043079376, "learning_rate": 2.8929557131332005e-05, "loss": 0.1026, "step": 10378 }, { "epoch": 3.173035768878019, "grad_norm": 0.7588163018226624, "learning_rate": 2.8929132520912064e-05, "loss": 0.1487, "step": 10379 }, { "epoch": 3.1733414857841638, "grad_norm": 0.5029361248016357, "learning_rate": 2.8928707910492126e-05, "loss": 0.1501, "step": 10380 }, { "epoch": 3.173647202690309, "grad_norm": 0.4696407616138458, "learning_rate": 2.8928283300072185e-05, "loss": 0.1449, "step": 10381 }, { "epoch": 3.1739529195964535, "grad_norm": 1.378650188446045, "learning_rate": 2.8927858689652247e-05, "loss": 0.2273, "step": 10382 }, { "epoch": 3.1742586365025987, "grad_norm": 0.6728714108467102, "learning_rate": 2.8927434079232305e-05, "loss": 0.1676, "step": 10383 }, { "epoch": 3.1745643534087433, "grad_norm": 0.9252486228942871, "learning_rate": 2.8927009468812368e-05, "loss": 0.1855, "step": 10384 }, { "epoch": 3.1748700703148884, "grad_norm": 1.0324466228485107, "learning_rate": 2.8926584858392426e-05, "loss": 0.1779, "step": 10385 }, { "epoch": 3.1751757872210336, "grad_norm": 1.3853899240493774, "learning_rate": 2.8926160247972488e-05, "loss": 0.2021, "step": 10386 }, { "epoch": 3.1754815041271782, "grad_norm": 0.9265398383140564, "learning_rate": 2.8925735637552547e-05, "loss": 0.2099, "step": 10387 }, { "epoch": 3.1757872210333233, "grad_norm": 1.2718372344970703, "learning_rate": 2.892531102713261e-05, "loss": 0.3014, "step": 10388 }, { "epoch": 3.176092937939468, "grad_norm": 0.3673943281173706, "learning_rate": 2.8924886416712668e-05, "loss": 0.1685, "step": 10389 }, { "epoch": 3.176398654845613, "grad_norm": 0.3225771486759186, "learning_rate": 2.8924461806292726e-05, "loss": 0.1149, "step": 10390 }, { "epoch": 3.176704371751758, "grad_norm": 0.30433714389801025, "learning_rate": 2.892403719587279e-05, "loss": 0.0766, "step": 10391 }, { "epoch": 3.177010088657903, "grad_norm": 0.34489667415618896, "learning_rate": 2.8923612585452847e-05, "loss": 0.0632, "step": 10392 }, { "epoch": 3.1773158055640476, "grad_norm": 0.33075135946273804, "learning_rate": 2.892318797503291e-05, "loss": 0.0664, "step": 10393 }, { "epoch": 3.1776215224701927, "grad_norm": 0.2004011571407318, "learning_rate": 2.8922763364612968e-05, "loss": 0.0549, "step": 10394 }, { "epoch": 3.1779272393763374, "grad_norm": 0.3098849058151245, "learning_rate": 2.892233875419303e-05, "loss": 0.0627, "step": 10395 }, { "epoch": 3.1782329562824825, "grad_norm": 0.2252720445394516, "learning_rate": 2.892191414377309e-05, "loss": 0.0653, "step": 10396 }, { "epoch": 3.178538673188627, "grad_norm": 0.3029820919036865, "learning_rate": 2.892148953335315e-05, "loss": 0.0636, "step": 10397 }, { "epoch": 3.1788443900947723, "grad_norm": 0.3069455027580261, "learning_rate": 2.892106492293321e-05, "loss": 0.0823, "step": 10398 }, { "epoch": 3.1791501070009174, "grad_norm": 0.6973271369934082, "learning_rate": 2.8920640312513272e-05, "loss": 0.1147, "step": 10399 }, { "epoch": 3.179455823907062, "grad_norm": 0.314800888299942, "learning_rate": 2.892021570209333e-05, "loss": 0.0726, "step": 10400 }, { "epoch": 3.179761540813207, "grad_norm": 0.6784534454345703, "learning_rate": 2.891979109167339e-05, "loss": 0.1257, "step": 10401 }, { "epoch": 3.180067257719352, "grad_norm": 0.49673548340797424, "learning_rate": 2.891936648125345e-05, "loss": 0.1066, "step": 10402 }, { "epoch": 3.180372974625497, "grad_norm": 0.3788943886756897, "learning_rate": 2.891894187083351e-05, "loss": 0.119, "step": 10403 }, { "epoch": 3.1806786915316416, "grad_norm": 0.4808180034160614, "learning_rate": 2.8918517260413572e-05, "loss": 0.139, "step": 10404 }, { "epoch": 3.1809844084377867, "grad_norm": 0.5215859413146973, "learning_rate": 2.891809264999363e-05, "loss": 0.1554, "step": 10405 }, { "epoch": 3.1812901253439314, "grad_norm": 0.49991804361343384, "learning_rate": 2.8917668039573693e-05, "loss": 0.1688, "step": 10406 }, { "epoch": 3.1815958422500765, "grad_norm": 0.5255736708641052, "learning_rate": 2.891724342915375e-05, "loss": 0.1328, "step": 10407 }, { "epoch": 3.181901559156221, "grad_norm": 0.9352663159370422, "learning_rate": 2.8916818818733814e-05, "loss": 0.1923, "step": 10408 }, { "epoch": 3.1822072760623663, "grad_norm": 1.0829075574874878, "learning_rate": 2.8916394208313872e-05, "loss": 0.2084, "step": 10409 }, { "epoch": 3.182512992968511, "grad_norm": 1.3158094882965088, "learning_rate": 2.8915969597893934e-05, "loss": 0.2251, "step": 10410 }, { "epoch": 3.182818709874656, "grad_norm": 1.7487733364105225, "learning_rate": 2.8915544987473993e-05, "loss": 0.2021, "step": 10411 }, { "epoch": 3.183124426780801, "grad_norm": 2.011556386947632, "learning_rate": 2.8915120377054055e-05, "loss": 0.2526, "step": 10412 }, { "epoch": 3.183430143686946, "grad_norm": 0.940179169178009, "learning_rate": 2.8914695766634114e-05, "loss": 0.278, "step": 10413 }, { "epoch": 3.183735860593091, "grad_norm": 0.45186689496040344, "learning_rate": 2.8914271156214173e-05, "loss": 0.1329, "step": 10414 }, { "epoch": 3.1840415774992357, "grad_norm": 0.3452359437942505, "learning_rate": 2.8913846545794235e-05, "loss": 0.0763, "step": 10415 }, { "epoch": 3.184347294405381, "grad_norm": 0.3351374566555023, "learning_rate": 2.8913421935374293e-05, "loss": 0.0664, "step": 10416 }, { "epoch": 3.1846530113115255, "grad_norm": 0.3940337002277374, "learning_rate": 2.8912997324954355e-05, "loss": 0.0762, "step": 10417 }, { "epoch": 3.1849587282176706, "grad_norm": 0.3181935250759125, "learning_rate": 2.8912572714534414e-05, "loss": 0.0874, "step": 10418 }, { "epoch": 3.1852644451238152, "grad_norm": 0.3074764907360077, "learning_rate": 2.8912148104114476e-05, "loss": 0.063, "step": 10419 }, { "epoch": 3.1855701620299604, "grad_norm": 0.1907963901758194, "learning_rate": 2.8911723493694535e-05, "loss": 0.0473, "step": 10420 }, { "epoch": 3.185875878936105, "grad_norm": 0.2486201822757721, "learning_rate": 2.8911298883274597e-05, "loss": 0.0699, "step": 10421 }, { "epoch": 3.18618159584225, "grad_norm": 0.2709200978279114, "learning_rate": 2.8910874272854656e-05, "loss": 0.07, "step": 10422 }, { "epoch": 3.186487312748395, "grad_norm": 0.46359696984291077, "learning_rate": 2.8910449662434718e-05, "loss": 0.0825, "step": 10423 }, { "epoch": 3.18679302965454, "grad_norm": 0.2988613247871399, "learning_rate": 2.8910025052014776e-05, "loss": 0.1171, "step": 10424 }, { "epoch": 3.187098746560685, "grad_norm": 0.3177892565727234, "learning_rate": 2.890960044159484e-05, "loss": 0.0845, "step": 10425 }, { "epoch": 3.1874044634668297, "grad_norm": 0.3639858365058899, "learning_rate": 2.8909175831174897e-05, "loss": 0.0824, "step": 10426 }, { "epoch": 3.187710180372975, "grad_norm": 0.635928213596344, "learning_rate": 2.8908751220754956e-05, "loss": 0.1258, "step": 10427 }, { "epoch": 3.1880158972791195, "grad_norm": 0.36352095007896423, "learning_rate": 2.8908326610335018e-05, "loss": 0.1356, "step": 10428 }, { "epoch": 3.1883216141852646, "grad_norm": 0.472891241312027, "learning_rate": 2.8907901999915077e-05, "loss": 0.1354, "step": 10429 }, { "epoch": 3.1886273310914093, "grad_norm": 0.5582836270332336, "learning_rate": 2.890747738949514e-05, "loss": 0.1456, "step": 10430 }, { "epoch": 3.1889330479975544, "grad_norm": 0.9827665686607361, "learning_rate": 2.8907052779075198e-05, "loss": 0.1676, "step": 10431 }, { "epoch": 3.189238764903699, "grad_norm": 1.1098436117172241, "learning_rate": 2.890662816865526e-05, "loss": 0.1689, "step": 10432 }, { "epoch": 3.189544481809844, "grad_norm": 0.7669288516044617, "learning_rate": 2.890620355823532e-05, "loss": 0.1806, "step": 10433 }, { "epoch": 3.189850198715989, "grad_norm": 0.6207071542739868, "learning_rate": 2.890577894781538e-05, "loss": 0.2033, "step": 10434 }, { "epoch": 3.190155915622134, "grad_norm": 0.8880026936531067, "learning_rate": 2.890535433739544e-05, "loss": 0.1997, "step": 10435 }, { "epoch": 3.1904616325282786, "grad_norm": 0.6784281730651855, "learning_rate": 2.89049297269755e-05, "loss": 0.213, "step": 10436 }, { "epoch": 3.1907673494344237, "grad_norm": 0.9664744734764099, "learning_rate": 2.890450511655556e-05, "loss": 0.189, "step": 10437 }, { "epoch": 3.191073066340569, "grad_norm": 1.1475801467895508, "learning_rate": 2.8904080506135622e-05, "loss": 0.2362, "step": 10438 }, { "epoch": 3.1913787832467135, "grad_norm": 1.0642118453979492, "learning_rate": 2.890365589571568e-05, "loss": 0.1466, "step": 10439 }, { "epoch": 3.1916845001528587, "grad_norm": 0.3936113715171814, "learning_rate": 2.890323128529574e-05, "loss": 0.0858, "step": 10440 }, { "epoch": 3.1919902170590033, "grad_norm": 0.1616409868001938, "learning_rate": 2.89028066748758e-05, "loss": 0.0608, "step": 10441 }, { "epoch": 3.1922959339651484, "grad_norm": 0.6516021490097046, "learning_rate": 2.890238206445586e-05, "loss": 0.0718, "step": 10442 }, { "epoch": 3.192601650871293, "grad_norm": 0.3273179233074188, "learning_rate": 2.8901957454035922e-05, "loss": 0.0649, "step": 10443 }, { "epoch": 3.192907367777438, "grad_norm": 0.25446441769599915, "learning_rate": 2.890153284361598e-05, "loss": 0.0524, "step": 10444 }, { "epoch": 3.193213084683583, "grad_norm": 0.2964603900909424, "learning_rate": 2.8901108233196043e-05, "loss": 0.0693, "step": 10445 }, { "epoch": 3.193518801589728, "grad_norm": 0.7051530480384827, "learning_rate": 2.8900683622776102e-05, "loss": 0.0604, "step": 10446 }, { "epoch": 3.1938245184958727, "grad_norm": 0.490668922662735, "learning_rate": 2.8900259012356164e-05, "loss": 0.0883, "step": 10447 }, { "epoch": 3.194130235402018, "grad_norm": 0.2496608942747116, "learning_rate": 2.8899834401936223e-05, "loss": 0.0861, "step": 10448 }, { "epoch": 3.1944359523081625, "grad_norm": 0.2783708870410919, "learning_rate": 2.8899409791516285e-05, "loss": 0.0991, "step": 10449 }, { "epoch": 3.1947416692143076, "grad_norm": 0.5877504348754883, "learning_rate": 2.8898985181096343e-05, "loss": 0.0829, "step": 10450 }, { "epoch": 3.1950473861204527, "grad_norm": 0.37703952193260193, "learning_rate": 2.8898560570676405e-05, "loss": 0.1089, "step": 10451 }, { "epoch": 3.1953531030265974, "grad_norm": 0.37443143129348755, "learning_rate": 2.8898135960256464e-05, "loss": 0.129, "step": 10452 }, { "epoch": 3.1956588199327425, "grad_norm": 2.275608539581299, "learning_rate": 2.8897711349836523e-05, "loss": 0.1356, "step": 10453 }, { "epoch": 3.195964536838887, "grad_norm": 0.6629512310028076, "learning_rate": 2.8897286739416585e-05, "loss": 0.1569, "step": 10454 }, { "epoch": 3.1962702537450323, "grad_norm": 0.6007782816886902, "learning_rate": 2.8896862128996644e-05, "loss": 0.1634, "step": 10455 }, { "epoch": 3.196575970651177, "grad_norm": 0.5642523765563965, "learning_rate": 2.8896437518576706e-05, "loss": 0.1569, "step": 10456 }, { "epoch": 3.196881687557322, "grad_norm": 0.9696415662765503, "learning_rate": 2.8896012908156764e-05, "loss": 0.1684, "step": 10457 }, { "epoch": 3.1971874044634667, "grad_norm": 0.912240743637085, "learning_rate": 2.8895588297736827e-05, "loss": 0.1618, "step": 10458 }, { "epoch": 3.197493121369612, "grad_norm": 1.2138128280639648, "learning_rate": 2.8895163687316885e-05, "loss": 0.1891, "step": 10459 }, { "epoch": 3.1977988382757565, "grad_norm": 1.0734429359436035, "learning_rate": 2.8894739076896947e-05, "loss": 0.221, "step": 10460 }, { "epoch": 3.1981045551819016, "grad_norm": 0.6671456098556519, "learning_rate": 2.8894314466477006e-05, "loss": 0.2086, "step": 10461 }, { "epoch": 3.1984102720880463, "grad_norm": 1.5091363191604614, "learning_rate": 2.8893889856057068e-05, "loss": 0.2151, "step": 10462 }, { "epoch": 3.1987159889941914, "grad_norm": 1.1134912967681885, "learning_rate": 2.8893465245637127e-05, "loss": 0.2207, "step": 10463 }, { "epoch": 3.1990217059003365, "grad_norm": 0.4297177791595459, "learning_rate": 2.889304063521719e-05, "loss": 0.1708, "step": 10464 }, { "epoch": 3.199327422806481, "grad_norm": 0.30579692125320435, "learning_rate": 2.889261602479725e-05, "loss": 0.084, "step": 10465 }, { "epoch": 3.1996331397126263, "grad_norm": 0.1835135966539383, "learning_rate": 2.889219141437731e-05, "loss": 0.1038, "step": 10466 }, { "epoch": 3.199938856618771, "grad_norm": 0.38419240713119507, "learning_rate": 2.8891766803957372e-05, "loss": 0.068, "step": 10467 }, { "epoch": 3.200244573524916, "grad_norm": 0.2361433207988739, "learning_rate": 2.889134219353743e-05, "loss": 0.0637, "step": 10468 }, { "epoch": 3.2005502904310608, "grad_norm": 0.16327723860740662, "learning_rate": 2.8890917583117493e-05, "loss": 0.0492, "step": 10469 }, { "epoch": 3.200856007337206, "grad_norm": 0.23947305977344513, "learning_rate": 2.889049297269755e-05, "loss": 0.0694, "step": 10470 }, { "epoch": 3.2011617242433505, "grad_norm": 0.43292081356048584, "learning_rate": 2.8890068362277613e-05, "loss": 0.0682, "step": 10471 }, { "epoch": 3.2014674411494957, "grad_norm": 0.35495826601982117, "learning_rate": 2.8889643751857672e-05, "loss": 0.0967, "step": 10472 }, { "epoch": 3.2017731580556403, "grad_norm": 0.22964023053646088, "learning_rate": 2.8889219141437734e-05, "loss": 0.0869, "step": 10473 }, { "epoch": 3.2020788749617854, "grad_norm": 0.29759687185287476, "learning_rate": 2.8888794531017793e-05, "loss": 0.1033, "step": 10474 }, { "epoch": 3.20238459186793, "grad_norm": 0.45244723558425903, "learning_rate": 2.8888369920597855e-05, "loss": 0.0768, "step": 10475 }, { "epoch": 3.2026903087740752, "grad_norm": 0.776054322719574, "learning_rate": 2.8887945310177914e-05, "loss": 0.0944, "step": 10476 }, { "epoch": 3.2029960256802203, "grad_norm": 0.4086386561393738, "learning_rate": 2.8887520699757976e-05, "loss": 0.1236, "step": 10477 }, { "epoch": 3.203301742586365, "grad_norm": 0.3771533668041229, "learning_rate": 2.8887096089338034e-05, "loss": 0.1438, "step": 10478 }, { "epoch": 3.20360745949251, "grad_norm": 0.5628329515457153, "learning_rate": 2.8886671478918093e-05, "loss": 0.1379, "step": 10479 }, { "epoch": 3.203913176398655, "grad_norm": 0.6276693940162659, "learning_rate": 2.8886246868498155e-05, "loss": 0.1733, "step": 10480 }, { "epoch": 3.2042188933048, "grad_norm": 0.4830339252948761, "learning_rate": 2.8885822258078214e-05, "loss": 0.1597, "step": 10481 }, { "epoch": 3.2045246102109446, "grad_norm": 0.8605450987815857, "learning_rate": 2.8885397647658276e-05, "loss": 0.1951, "step": 10482 }, { "epoch": 3.2048303271170897, "grad_norm": 1.2452090978622437, "learning_rate": 2.8884973037238335e-05, "loss": 0.213, "step": 10483 }, { "epoch": 3.2051360440232344, "grad_norm": 0.8625948429107666, "learning_rate": 2.8884548426818397e-05, "loss": 0.2271, "step": 10484 }, { "epoch": 3.2054417609293795, "grad_norm": 1.3434405326843262, "learning_rate": 2.8884123816398455e-05, "loss": 0.17, "step": 10485 }, { "epoch": 3.205747477835524, "grad_norm": 0.6889351606369019, "learning_rate": 2.8883699205978518e-05, "loss": 0.2204, "step": 10486 }, { "epoch": 3.2060531947416693, "grad_norm": 1.5690975189208984, "learning_rate": 2.8883274595558576e-05, "loss": 0.1901, "step": 10487 }, { "epoch": 3.206358911647814, "grad_norm": 1.207532286643982, "learning_rate": 2.888284998513864e-05, "loss": 0.2688, "step": 10488 }, { "epoch": 3.206664628553959, "grad_norm": 0.3346012830734253, "learning_rate": 2.8882425374718697e-05, "loss": 0.1862, "step": 10489 }, { "epoch": 3.206970345460104, "grad_norm": 0.265266478061676, "learning_rate": 2.888200076429876e-05, "loss": 0.0833, "step": 10490 }, { "epoch": 3.207276062366249, "grad_norm": 0.3270065188407898, "learning_rate": 2.8881576153878818e-05, "loss": 0.0742, "step": 10491 }, { "epoch": 3.207581779272394, "grad_norm": 0.2365746647119522, "learning_rate": 2.8881151543458877e-05, "loss": 0.0655, "step": 10492 }, { "epoch": 3.2078874961785386, "grad_norm": 0.730700671672821, "learning_rate": 2.888072693303894e-05, "loss": 0.0638, "step": 10493 }, { "epoch": 3.2081932130846837, "grad_norm": 0.2832150459289551, "learning_rate": 2.8880302322618997e-05, "loss": 0.0456, "step": 10494 }, { "epoch": 3.2084989299908284, "grad_norm": 0.5017862915992737, "learning_rate": 2.887987771219906e-05, "loss": 0.0731, "step": 10495 }, { "epoch": 3.2088046468969735, "grad_norm": 0.2933886647224426, "learning_rate": 2.8879453101779118e-05, "loss": 0.0761, "step": 10496 }, { "epoch": 3.209110363803118, "grad_norm": 0.39577987790107727, "learning_rate": 2.887902849135918e-05, "loss": 0.086, "step": 10497 }, { "epoch": 3.2094160807092633, "grad_norm": 0.3237619400024414, "learning_rate": 2.887860388093924e-05, "loss": 0.0887, "step": 10498 }, { "epoch": 3.209721797615408, "grad_norm": 3.906724691390991, "learning_rate": 2.88781792705193e-05, "loss": 0.0839, "step": 10499 }, { "epoch": 3.210027514521553, "grad_norm": 0.5090740919113159, "learning_rate": 2.887775466009936e-05, "loss": 0.1071, "step": 10500 }, { "epoch": 3.2103332314276978, "grad_norm": 0.9490135908126831, "learning_rate": 2.8877330049679422e-05, "loss": 0.0857, "step": 10501 }, { "epoch": 3.210638948333843, "grad_norm": 0.325226753950119, "learning_rate": 2.887690543925948e-05, "loss": 0.1156, "step": 10502 }, { "epoch": 3.210944665239988, "grad_norm": 0.3100927174091339, "learning_rate": 2.8876480828839543e-05, "loss": 0.1068, "step": 10503 }, { "epoch": 3.2112503821461327, "grad_norm": 0.4618784487247467, "learning_rate": 2.88760562184196e-05, "loss": 0.1699, "step": 10504 }, { "epoch": 3.211556099052278, "grad_norm": 0.9104285836219788, "learning_rate": 2.887563160799966e-05, "loss": 0.1663, "step": 10505 }, { "epoch": 3.2118618159584225, "grad_norm": 0.7820422649383545, "learning_rate": 2.8875206997579722e-05, "loss": 0.1837, "step": 10506 }, { "epoch": 3.2121675328645676, "grad_norm": 1.349163293838501, "learning_rate": 2.887478238715978e-05, "loss": 0.1787, "step": 10507 }, { "epoch": 3.2124732497707122, "grad_norm": 1.0367317199707031, "learning_rate": 2.8874357776739843e-05, "loss": 0.1898, "step": 10508 }, { "epoch": 3.2127789666768574, "grad_norm": 0.8935732841491699, "learning_rate": 2.88739331663199e-05, "loss": 0.1544, "step": 10509 }, { "epoch": 3.213084683583002, "grad_norm": 0.5975874066352844, "learning_rate": 2.8873508555899964e-05, "loss": 0.1827, "step": 10510 }, { "epoch": 3.213390400489147, "grad_norm": 3.2081265449523926, "learning_rate": 2.8873083945480022e-05, "loss": 0.3157, "step": 10511 }, { "epoch": 3.213696117395292, "grad_norm": 0.8635193705558777, "learning_rate": 2.8872659335060084e-05, "loss": 0.1924, "step": 10512 }, { "epoch": 3.214001834301437, "grad_norm": 1.4656599760055542, "learning_rate": 2.8872234724640143e-05, "loss": 0.2539, "step": 10513 }, { "epoch": 3.2143075512075816, "grad_norm": 0.5244922637939453, "learning_rate": 2.8871810114220205e-05, "loss": 0.1297, "step": 10514 }, { "epoch": 3.2146132681137267, "grad_norm": 0.3010903298854828, "learning_rate": 2.8871385503800264e-05, "loss": 0.1047, "step": 10515 }, { "epoch": 3.214918985019872, "grad_norm": 0.17653028666973114, "learning_rate": 2.8870960893380323e-05, "loss": 0.0691, "step": 10516 }, { "epoch": 3.2152247019260165, "grad_norm": 0.20219215750694275, "learning_rate": 2.8870536282960385e-05, "loss": 0.0891, "step": 10517 }, { "epoch": 3.2155304188321616, "grad_norm": 0.31432196497917175, "learning_rate": 2.8870111672540443e-05, "loss": 0.0707, "step": 10518 }, { "epoch": 3.2158361357383063, "grad_norm": 0.19383615255355835, "learning_rate": 2.8869687062120505e-05, "loss": 0.0384, "step": 10519 }, { "epoch": 3.2161418526444514, "grad_norm": 0.2937846779823303, "learning_rate": 2.8869262451700564e-05, "loss": 0.0697, "step": 10520 }, { "epoch": 3.216447569550596, "grad_norm": 0.26363709568977356, "learning_rate": 2.8868837841280626e-05, "loss": 0.0769, "step": 10521 }, { "epoch": 3.216753286456741, "grad_norm": 0.27902868390083313, "learning_rate": 2.8868413230860685e-05, "loss": 0.094, "step": 10522 }, { "epoch": 3.217059003362886, "grad_norm": 0.3403518795967102, "learning_rate": 2.8867988620440747e-05, "loss": 0.0868, "step": 10523 }, { "epoch": 3.217364720269031, "grad_norm": 0.2806042730808258, "learning_rate": 2.8867564010020806e-05, "loss": 0.105, "step": 10524 }, { "epoch": 3.2176704371751756, "grad_norm": 0.4355252981185913, "learning_rate": 2.8867139399600868e-05, "loss": 0.104, "step": 10525 }, { "epoch": 3.2179761540813208, "grad_norm": 0.5004198551177979, "learning_rate": 2.8866714789180927e-05, "loss": 0.103, "step": 10526 }, { "epoch": 3.2182818709874654, "grad_norm": 0.4165099263191223, "learning_rate": 2.886629017876099e-05, "loss": 0.1318, "step": 10527 }, { "epoch": 3.2185875878936105, "grad_norm": 0.6774531006813049, "learning_rate": 2.8865865568341047e-05, "loss": 0.1339, "step": 10528 }, { "epoch": 3.2188933047997557, "grad_norm": 0.6157504916191101, "learning_rate": 2.8865440957921106e-05, "loss": 0.1425, "step": 10529 }, { "epoch": 3.2191990217059003, "grad_norm": 0.5181717872619629, "learning_rate": 2.8865016347501168e-05, "loss": 0.1377, "step": 10530 }, { "epoch": 3.2195047386120454, "grad_norm": 0.7362261414527893, "learning_rate": 2.8864591737081227e-05, "loss": 0.1842, "step": 10531 }, { "epoch": 3.21981045551819, "grad_norm": 0.6333439946174622, "learning_rate": 2.886416712666129e-05, "loss": 0.2243, "step": 10532 }, { "epoch": 3.2201161724243352, "grad_norm": 0.8444836735725403, "learning_rate": 2.8863742516241348e-05, "loss": 0.1728, "step": 10533 }, { "epoch": 3.22042188933048, "grad_norm": 0.9363928437232971, "learning_rate": 2.886331790582141e-05, "loss": 0.1942, "step": 10534 }, { "epoch": 3.220727606236625, "grad_norm": 0.8696133494377136, "learning_rate": 2.886289329540147e-05, "loss": 0.2318, "step": 10535 }, { "epoch": 3.2210333231427697, "grad_norm": 1.2795121669769287, "learning_rate": 2.886246868498153e-05, "loss": 0.2105, "step": 10536 }, { "epoch": 3.221339040048915, "grad_norm": 0.6588760018348694, "learning_rate": 2.886204407456159e-05, "loss": 0.1801, "step": 10537 }, { "epoch": 3.2216447569550595, "grad_norm": 1.087823510169983, "learning_rate": 2.886161946414165e-05, "loss": 0.2525, "step": 10538 }, { "epoch": 3.2219504738612046, "grad_norm": 0.31320691108703613, "learning_rate": 2.886119485372171e-05, "loss": 0.1356, "step": 10539 }, { "epoch": 3.2222561907673493, "grad_norm": 0.2404547780752182, "learning_rate": 2.8860770243301772e-05, "loss": 0.079, "step": 10540 }, { "epoch": 3.2225619076734944, "grad_norm": 0.2688133418560028, "learning_rate": 2.886034563288183e-05, "loss": 0.0762, "step": 10541 }, { "epoch": 3.2228676245796395, "grad_norm": 0.17369739711284637, "learning_rate": 2.885992102246189e-05, "loss": 0.0539, "step": 10542 }, { "epoch": 3.223173341485784, "grad_norm": 1.0433924198150635, "learning_rate": 2.885949641204195e-05, "loss": 0.0497, "step": 10543 }, { "epoch": 3.2234790583919293, "grad_norm": 0.2458207756280899, "learning_rate": 2.885907180162201e-05, "loss": 0.0628, "step": 10544 }, { "epoch": 3.223784775298074, "grad_norm": 3.3590919971466064, "learning_rate": 2.8858647191202072e-05, "loss": 0.0499, "step": 10545 }, { "epoch": 3.224090492204219, "grad_norm": 0.4284380376338959, "learning_rate": 2.885822258078213e-05, "loss": 0.0831, "step": 10546 }, { "epoch": 3.2243962091103637, "grad_norm": 0.876947820186615, "learning_rate": 2.8857797970362193e-05, "loss": 0.1038, "step": 10547 }, { "epoch": 3.224701926016509, "grad_norm": 0.3458676040172577, "learning_rate": 2.8857373359942252e-05, "loss": 0.0896, "step": 10548 }, { "epoch": 3.2250076429226535, "grad_norm": 0.6172247529029846, "learning_rate": 2.8856948749522314e-05, "loss": 0.1024, "step": 10549 }, { "epoch": 3.2253133598287986, "grad_norm": 1.2004575729370117, "learning_rate": 2.8856524139102373e-05, "loss": 0.092, "step": 10550 }, { "epoch": 3.2256190767349433, "grad_norm": 0.2850377857685089, "learning_rate": 2.8856099528682435e-05, "loss": 0.0872, "step": 10551 }, { "epoch": 3.2259247936410884, "grad_norm": 0.5246469974517822, "learning_rate": 2.8855674918262493e-05, "loss": 0.1601, "step": 10552 }, { "epoch": 3.226230510547233, "grad_norm": 0.3577025532722473, "learning_rate": 2.8855250307842555e-05, "loss": 0.152, "step": 10553 }, { "epoch": 3.226536227453378, "grad_norm": 0.6962556838989258, "learning_rate": 2.8854825697422614e-05, "loss": 0.1692, "step": 10554 }, { "epoch": 3.226841944359523, "grad_norm": 0.5429767966270447, "learning_rate": 2.8854401087002673e-05, "loss": 0.1737, "step": 10555 }, { "epoch": 3.227147661265668, "grad_norm": 0.6646235585212708, "learning_rate": 2.8853976476582735e-05, "loss": 0.1783, "step": 10556 }, { "epoch": 3.227453378171813, "grad_norm": 0.6350944638252258, "learning_rate": 2.8853551866162794e-05, "loss": 0.1917, "step": 10557 }, { "epoch": 3.2277590950779578, "grad_norm": 0.4968664348125458, "learning_rate": 2.8853127255742856e-05, "loss": 0.1652, "step": 10558 }, { "epoch": 3.228064811984103, "grad_norm": 0.965681791305542, "learning_rate": 2.8852702645322914e-05, "loss": 0.1761, "step": 10559 }, { "epoch": 3.2283705288902476, "grad_norm": 0.5754183530807495, "learning_rate": 2.8852278034902977e-05, "loss": 0.1777, "step": 10560 }, { "epoch": 3.2286762457963927, "grad_norm": 0.8060497641563416, "learning_rate": 2.8851853424483035e-05, "loss": 0.1695, "step": 10561 }, { "epoch": 3.2289819627025373, "grad_norm": 1.2986674308776855, "learning_rate": 2.8851428814063097e-05, "loss": 0.2127, "step": 10562 }, { "epoch": 3.2292876796086825, "grad_norm": 1.3918981552124023, "learning_rate": 2.8851004203643156e-05, "loss": 0.2655, "step": 10563 }, { "epoch": 3.229593396514827, "grad_norm": 0.3538292944431305, "learning_rate": 2.8850579593223218e-05, "loss": 0.171, "step": 10564 }, { "epoch": 3.2298991134209722, "grad_norm": 0.32746621966362, "learning_rate": 2.8850154982803277e-05, "loss": 0.1037, "step": 10565 }, { "epoch": 3.230204830327117, "grad_norm": 0.22989407181739807, "learning_rate": 2.884973037238334e-05, "loss": 0.0694, "step": 10566 }, { "epoch": 3.230510547233262, "grad_norm": 0.6170952916145325, "learning_rate": 2.88493057619634e-05, "loss": 0.0831, "step": 10567 }, { "epoch": 3.2308162641394067, "grad_norm": 0.21069766581058502, "learning_rate": 2.884888115154346e-05, "loss": 0.0649, "step": 10568 }, { "epoch": 3.231121981045552, "grad_norm": 0.3830619752407074, "learning_rate": 2.8848456541123522e-05, "loss": 0.0567, "step": 10569 }, { "epoch": 3.231427697951697, "grad_norm": 1.032784342765808, "learning_rate": 2.884803193070358e-05, "loss": 0.0596, "step": 10570 }, { "epoch": 3.2317334148578416, "grad_norm": 0.5984843373298645, "learning_rate": 2.8847607320283643e-05, "loss": 0.1132, "step": 10571 }, { "epoch": 3.2320391317639867, "grad_norm": 0.3863191306591034, "learning_rate": 2.88471827098637e-05, "loss": 0.0698, "step": 10572 }, { "epoch": 3.2323448486701314, "grad_norm": 0.26459136605262756, "learning_rate": 2.8846758099443763e-05, "loss": 0.0714, "step": 10573 }, { "epoch": 3.2326505655762765, "grad_norm": 0.21648091077804565, "learning_rate": 2.8846333489023822e-05, "loss": 0.1001, "step": 10574 }, { "epoch": 3.232956282482421, "grad_norm": 0.5180677771568298, "learning_rate": 2.8845908878603884e-05, "loss": 0.1175, "step": 10575 }, { "epoch": 3.2332619993885663, "grad_norm": 0.2746424078941345, "learning_rate": 2.8845484268183943e-05, "loss": 0.105, "step": 10576 }, { "epoch": 3.233567716294711, "grad_norm": 0.42433252930641174, "learning_rate": 2.8845059657764005e-05, "loss": 0.091, "step": 10577 }, { "epoch": 3.233873433200856, "grad_norm": 0.38742348551750183, "learning_rate": 2.8844635047344064e-05, "loss": 0.1307, "step": 10578 }, { "epoch": 3.2341791501070007, "grad_norm": 0.49261441826820374, "learning_rate": 2.8844210436924126e-05, "loss": 0.1263, "step": 10579 }, { "epoch": 3.234484867013146, "grad_norm": 1.2088640928268433, "learning_rate": 2.8843785826504184e-05, "loss": 0.1362, "step": 10580 }, { "epoch": 3.2347905839192905, "grad_norm": 0.795027494430542, "learning_rate": 2.8843361216084243e-05, "loss": 0.226, "step": 10581 }, { "epoch": 3.2350963008254356, "grad_norm": 1.8557071685791016, "learning_rate": 2.8842936605664305e-05, "loss": 0.1719, "step": 10582 }, { "epoch": 3.2354020177315808, "grad_norm": 0.7851055860519409, "learning_rate": 2.8842511995244364e-05, "loss": 0.1915, "step": 10583 }, { "epoch": 3.2357077346377254, "grad_norm": 1.3364362716674805, "learning_rate": 2.8842087384824426e-05, "loss": 0.2069, "step": 10584 }, { "epoch": 3.2360134515438705, "grad_norm": 1.414033055305481, "learning_rate": 2.8841662774404485e-05, "loss": 0.1829, "step": 10585 }, { "epoch": 3.236319168450015, "grad_norm": 1.2417123317718506, "learning_rate": 2.8841238163984547e-05, "loss": 0.191, "step": 10586 }, { "epoch": 3.2366248853561603, "grad_norm": 1.0209914445877075, "learning_rate": 2.8840813553564605e-05, "loss": 0.2172, "step": 10587 }, { "epoch": 3.236930602262305, "grad_norm": 2.0340640544891357, "learning_rate": 2.8840388943144668e-05, "loss": 0.2105, "step": 10588 }, { "epoch": 3.23723631916845, "grad_norm": 0.3232780694961548, "learning_rate": 2.8839964332724726e-05, "loss": 0.1489, "step": 10589 }, { "epoch": 3.237542036074595, "grad_norm": 0.3749852180480957, "learning_rate": 2.883953972230479e-05, "loss": 0.0863, "step": 10590 }, { "epoch": 3.23784775298074, "grad_norm": 0.3401715159416199, "learning_rate": 2.8839115111884847e-05, "loss": 0.0848, "step": 10591 }, { "epoch": 3.2381534698868846, "grad_norm": 0.26285645365715027, "learning_rate": 2.883869050146491e-05, "loss": 0.0917, "step": 10592 }, { "epoch": 3.2384591867930297, "grad_norm": 0.3813059329986572, "learning_rate": 2.8838265891044968e-05, "loss": 0.0543, "step": 10593 }, { "epoch": 3.2387649036991744, "grad_norm": 0.3047633171081543, "learning_rate": 2.8837841280625027e-05, "loss": 0.0625, "step": 10594 }, { "epoch": 3.2390706206053195, "grad_norm": 0.26845523715019226, "learning_rate": 2.883741667020509e-05, "loss": 0.0715, "step": 10595 }, { "epoch": 3.2393763375114646, "grad_norm": 0.22751019895076752, "learning_rate": 2.8836992059785147e-05, "loss": 0.0759, "step": 10596 }, { "epoch": 3.2396820544176093, "grad_norm": 0.3508749008178711, "learning_rate": 2.883656744936521e-05, "loss": 0.0732, "step": 10597 }, { "epoch": 3.2399877713237544, "grad_norm": 0.33490097522735596, "learning_rate": 2.8836142838945268e-05, "loss": 0.0709, "step": 10598 }, { "epoch": 3.240293488229899, "grad_norm": 0.4641721844673157, "learning_rate": 2.883571822852533e-05, "loss": 0.0892, "step": 10599 }, { "epoch": 3.240599205136044, "grad_norm": 0.34391018748283386, "learning_rate": 2.883529361810539e-05, "loss": 0.1049, "step": 10600 }, { "epoch": 3.240904922042189, "grad_norm": 0.6434691548347473, "learning_rate": 2.883486900768545e-05, "loss": 0.0973, "step": 10601 }, { "epoch": 3.241210638948334, "grad_norm": 0.38857799768447876, "learning_rate": 2.883444439726551e-05, "loss": 0.1381, "step": 10602 }, { "epoch": 3.2415163558544786, "grad_norm": 0.37987077236175537, "learning_rate": 2.8834019786845572e-05, "loss": 0.117, "step": 10603 }, { "epoch": 3.2418220727606237, "grad_norm": 1.1203927993774414, "learning_rate": 2.883359517642563e-05, "loss": 0.1614, "step": 10604 }, { "epoch": 3.2421277896667684, "grad_norm": 0.5886783599853516, "learning_rate": 2.8833170566005693e-05, "loss": 0.16, "step": 10605 }, { "epoch": 3.2424335065729135, "grad_norm": 0.865328848361969, "learning_rate": 2.883274595558575e-05, "loss": 0.2065, "step": 10606 }, { "epoch": 3.242739223479058, "grad_norm": 0.5364003777503967, "learning_rate": 2.883232134516581e-05, "loss": 0.2018, "step": 10607 }, { "epoch": 3.2430449403852033, "grad_norm": 0.662782609462738, "learning_rate": 2.8831896734745872e-05, "loss": 0.162, "step": 10608 }, { "epoch": 3.2433506572913484, "grad_norm": 0.9395014047622681, "learning_rate": 2.883147212432593e-05, "loss": 0.211, "step": 10609 }, { "epoch": 3.243656374197493, "grad_norm": 0.8677689433097839, "learning_rate": 2.8831047513905993e-05, "loss": 0.2034, "step": 10610 }, { "epoch": 3.243962091103638, "grad_norm": 1.3243541717529297, "learning_rate": 2.883062290348605e-05, "loss": 0.2023, "step": 10611 }, { "epoch": 3.244267808009783, "grad_norm": 0.844549834728241, "learning_rate": 2.8830198293066114e-05, "loss": 0.2322, "step": 10612 }, { "epoch": 3.244573524915928, "grad_norm": 1.7006962299346924, "learning_rate": 2.8829773682646172e-05, "loss": 0.2368, "step": 10613 }, { "epoch": 3.2448792418220727, "grad_norm": 0.3696390390396118, "learning_rate": 2.8829349072226234e-05, "loss": 0.1441, "step": 10614 }, { "epoch": 3.2451849587282178, "grad_norm": 0.2191426008939743, "learning_rate": 2.8828924461806293e-05, "loss": 0.0659, "step": 10615 }, { "epoch": 3.2454906756343624, "grad_norm": 0.24405750632286072, "learning_rate": 2.8828499851386355e-05, "loss": 0.0775, "step": 10616 }, { "epoch": 3.2457963925405076, "grad_norm": 0.2000950276851654, "learning_rate": 2.8828075240966414e-05, "loss": 0.0672, "step": 10617 }, { "epoch": 3.2461021094466522, "grad_norm": 0.45259755849838257, "learning_rate": 2.8827650630546476e-05, "loss": 0.042, "step": 10618 }, { "epoch": 3.2464078263527973, "grad_norm": 0.352444052696228, "learning_rate": 2.8827226020126535e-05, "loss": 0.0794, "step": 10619 }, { "epoch": 3.246713543258942, "grad_norm": 0.3878990113735199, "learning_rate": 2.8826801409706593e-05, "loss": 0.1026, "step": 10620 }, { "epoch": 3.247019260165087, "grad_norm": 0.3835465908050537, "learning_rate": 2.8826376799286655e-05, "loss": 0.0549, "step": 10621 }, { "epoch": 3.2473249770712322, "grad_norm": 0.6886758208274841, "learning_rate": 2.8825952188866714e-05, "loss": 0.0764, "step": 10622 }, { "epoch": 3.247630693977377, "grad_norm": 0.31115177273750305, "learning_rate": 2.8825527578446776e-05, "loss": 0.0808, "step": 10623 }, { "epoch": 3.247936410883522, "grad_norm": 0.27329227328300476, "learning_rate": 2.8825102968026835e-05, "loss": 0.0713, "step": 10624 }, { "epoch": 3.2482421277896667, "grad_norm": 0.3351220488548279, "learning_rate": 2.8824678357606897e-05, "loss": 0.092, "step": 10625 }, { "epoch": 3.248547844695812, "grad_norm": 0.4506594240665436, "learning_rate": 2.8824253747186956e-05, "loss": 0.1205, "step": 10626 }, { "epoch": 3.2488535616019565, "grad_norm": 0.45526260137557983, "learning_rate": 2.8823829136767018e-05, "loss": 0.1452, "step": 10627 }, { "epoch": 3.2491592785081016, "grad_norm": 0.570706844329834, "learning_rate": 2.8823404526347077e-05, "loss": 0.173, "step": 10628 }, { "epoch": 3.2494649954142463, "grad_norm": 0.32429951429367065, "learning_rate": 2.882297991592714e-05, "loss": 0.1297, "step": 10629 }, { "epoch": 3.2497707123203914, "grad_norm": 0.9133560061454773, "learning_rate": 2.8822555305507197e-05, "loss": 0.1857, "step": 10630 }, { "epoch": 3.250076429226536, "grad_norm": 0.7829674482345581, "learning_rate": 2.8822130695087256e-05, "loss": 0.1693, "step": 10631 }, { "epoch": 3.250382146132681, "grad_norm": 0.5562377572059631, "learning_rate": 2.8821706084667318e-05, "loss": 0.1795, "step": 10632 }, { "epoch": 3.2506878630388263, "grad_norm": 1.1683475971221924, "learning_rate": 2.8821281474247377e-05, "loss": 0.1909, "step": 10633 }, { "epoch": 3.250993579944971, "grad_norm": 0.6457872986793518, "learning_rate": 2.882085686382744e-05, "loss": 0.1794, "step": 10634 }, { "epoch": 3.2512992968511156, "grad_norm": 1.2373071908950806, "learning_rate": 2.8820432253407498e-05, "loss": 0.2174, "step": 10635 }, { "epoch": 3.2516050137572607, "grad_norm": 0.8832749128341675, "learning_rate": 2.882000764298756e-05, "loss": 0.2057, "step": 10636 }, { "epoch": 3.251910730663406, "grad_norm": 1.7998733520507812, "learning_rate": 2.881958303256762e-05, "loss": 0.2146, "step": 10637 }, { "epoch": 3.2522164475695505, "grad_norm": 1.196232557296753, "learning_rate": 2.881915842214768e-05, "loss": 0.3205, "step": 10638 }, { "epoch": 3.2525221644756956, "grad_norm": 0.3697861433029175, "learning_rate": 2.881873381172774e-05, "loss": 0.1645, "step": 10639 }, { "epoch": 3.2528278813818403, "grad_norm": 0.24572545289993286, "learning_rate": 2.88183092013078e-05, "loss": 0.0837, "step": 10640 }, { "epoch": 3.2531335982879854, "grad_norm": 0.2672721743583679, "learning_rate": 2.881788459088786e-05, "loss": 0.0729, "step": 10641 }, { "epoch": 3.25343931519413, "grad_norm": 0.24402928352355957, "learning_rate": 2.8817459980467922e-05, "loss": 0.0773, "step": 10642 }, { "epoch": 3.253745032100275, "grad_norm": 0.2168385088443756, "learning_rate": 2.881703537004798e-05, "loss": 0.0418, "step": 10643 }, { "epoch": 3.25405074900642, "grad_norm": 0.3791157603263855, "learning_rate": 2.881661075962804e-05, "loss": 0.0775, "step": 10644 }, { "epoch": 3.254356465912565, "grad_norm": 0.27287837862968445, "learning_rate": 2.88161861492081e-05, "loss": 0.0823, "step": 10645 }, { "epoch": 3.25466218281871, "grad_norm": 0.31663569808006287, "learning_rate": 2.881576153878816e-05, "loss": 0.0764, "step": 10646 }, { "epoch": 3.254967899724855, "grad_norm": 0.41135355830192566, "learning_rate": 2.8815336928368222e-05, "loss": 0.091, "step": 10647 }, { "epoch": 3.2552736166309995, "grad_norm": 0.25356772541999817, "learning_rate": 2.881491231794828e-05, "loss": 0.0938, "step": 10648 }, { "epoch": 3.2555793335371446, "grad_norm": 0.35714828968048096, "learning_rate": 2.8814487707528343e-05, "loss": 0.099, "step": 10649 }, { "epoch": 3.2558850504432897, "grad_norm": 0.31674861907958984, "learning_rate": 2.8814063097108402e-05, "loss": 0.0955, "step": 10650 }, { "epoch": 3.2561907673494344, "grad_norm": 0.3944692611694336, "learning_rate": 2.8813638486688464e-05, "loss": 0.1263, "step": 10651 }, { "epoch": 3.2564964842555795, "grad_norm": 0.680392861366272, "learning_rate": 2.8813213876268523e-05, "loss": 0.1599, "step": 10652 }, { "epoch": 3.256802201161724, "grad_norm": 0.4755667448043823, "learning_rate": 2.8812789265848585e-05, "loss": 0.1319, "step": 10653 }, { "epoch": 3.2571079180678693, "grad_norm": 1.076014757156372, "learning_rate": 2.8812364655428643e-05, "loss": 0.1674, "step": 10654 }, { "epoch": 3.257413634974014, "grad_norm": 0.6228817701339722, "learning_rate": 2.8811940045008705e-05, "loss": 0.1616, "step": 10655 }, { "epoch": 3.257719351880159, "grad_norm": 0.5858290791511536, "learning_rate": 2.8811515434588764e-05, "loss": 0.2325, "step": 10656 }, { "epoch": 3.2580250687863037, "grad_norm": 0.5437281131744385, "learning_rate": 2.8811090824168823e-05, "loss": 0.1582, "step": 10657 }, { "epoch": 3.258330785692449, "grad_norm": 0.5313270688056946, "learning_rate": 2.8810666213748885e-05, "loss": 0.18, "step": 10658 }, { "epoch": 3.258636502598594, "grad_norm": 0.8504934310913086, "learning_rate": 2.8810241603328944e-05, "loss": 0.1815, "step": 10659 }, { "epoch": 3.2589422195047386, "grad_norm": 0.8175503015518188, "learning_rate": 2.8809816992909006e-05, "loss": 0.1614, "step": 10660 }, { "epoch": 3.2592479364108833, "grad_norm": 1.5359829664230347, "learning_rate": 2.8809392382489064e-05, "loss": 0.1994, "step": 10661 }, { "epoch": 3.2595536533170284, "grad_norm": 1.8059628009796143, "learning_rate": 2.8808967772069127e-05, "loss": 0.2316, "step": 10662 }, { "epoch": 3.2598593702231735, "grad_norm": 3.1630735397338867, "learning_rate": 2.8808543161649185e-05, "loss": 0.2494, "step": 10663 }, { "epoch": 3.260165087129318, "grad_norm": 0.36769482493400574, "learning_rate": 2.8808118551229247e-05, "loss": 0.1858, "step": 10664 }, { "epoch": 3.2604708040354633, "grad_norm": 0.29835453629493713, "learning_rate": 2.8807693940809306e-05, "loss": 0.0854, "step": 10665 }, { "epoch": 3.260776520941608, "grad_norm": 0.24888582527637482, "learning_rate": 2.8807269330389368e-05, "loss": 0.0727, "step": 10666 }, { "epoch": 3.261082237847753, "grad_norm": 0.343437522649765, "learning_rate": 2.8806844719969427e-05, "loss": 0.0561, "step": 10667 }, { "epoch": 3.2613879547538978, "grad_norm": 0.256984144449234, "learning_rate": 2.880642010954949e-05, "loss": 0.0513, "step": 10668 }, { "epoch": 3.261693671660043, "grad_norm": 0.23697586357593536, "learning_rate": 2.880599549912955e-05, "loss": 0.042, "step": 10669 }, { "epoch": 3.2619993885661875, "grad_norm": 0.23875004053115845, "learning_rate": 2.880557088870961e-05, "loss": 0.0618, "step": 10670 }, { "epoch": 3.2623051054723327, "grad_norm": 0.25510522723197937, "learning_rate": 2.8805146278289672e-05, "loss": 0.0544, "step": 10671 }, { "epoch": 3.2626108223784778, "grad_norm": 0.5669688582420349, "learning_rate": 2.880472166786973e-05, "loss": 0.0586, "step": 10672 }, { "epoch": 3.2629165392846224, "grad_norm": 0.40235692262649536, "learning_rate": 2.8804297057449793e-05, "loss": 0.0696, "step": 10673 }, { "epoch": 3.263222256190767, "grad_norm": 0.6165787577629089, "learning_rate": 2.880387244702985e-05, "loss": 0.1326, "step": 10674 }, { "epoch": 3.2635279730969122, "grad_norm": 0.4380878210067749, "learning_rate": 2.8803447836609913e-05, "loss": 0.105, "step": 10675 }, { "epoch": 3.2638336900030573, "grad_norm": 0.31923383474349976, "learning_rate": 2.8803023226189972e-05, "loss": 0.1195, "step": 10676 }, { "epoch": 3.264139406909202, "grad_norm": 0.3893606960773468, "learning_rate": 2.8802598615770034e-05, "loss": 0.1122, "step": 10677 }, { "epoch": 3.264445123815347, "grad_norm": 0.37783151865005493, "learning_rate": 2.8802174005350093e-05, "loss": 0.1209, "step": 10678 }, { "epoch": 3.264750840721492, "grad_norm": 0.7017829418182373, "learning_rate": 2.8801749394930155e-05, "loss": 0.1467, "step": 10679 }, { "epoch": 3.265056557627637, "grad_norm": 0.5038353204727173, "learning_rate": 2.8801324784510214e-05, "loss": 0.1539, "step": 10680 }, { "epoch": 3.2653622745337816, "grad_norm": 0.44213229417800903, "learning_rate": 2.8800900174090276e-05, "loss": 0.1779, "step": 10681 }, { "epoch": 3.2656679914399267, "grad_norm": 0.5116874575614929, "learning_rate": 2.8800475563670334e-05, "loss": 0.1463, "step": 10682 }, { "epoch": 3.2659737083460714, "grad_norm": 0.6302129030227661, "learning_rate": 2.8800050953250393e-05, "loss": 0.1678, "step": 10683 }, { "epoch": 3.2662794252522165, "grad_norm": 1.5101094245910645, "learning_rate": 2.8799626342830455e-05, "loss": 0.1718, "step": 10684 }, { "epoch": 3.2665851421583616, "grad_norm": 0.8594716191291809, "learning_rate": 2.8799201732410514e-05, "loss": 0.222, "step": 10685 }, { "epoch": 3.2668908590645063, "grad_norm": 1.0215739011764526, "learning_rate": 2.8798777121990576e-05, "loss": 0.2286, "step": 10686 }, { "epoch": 3.267196575970651, "grad_norm": 2.967313766479492, "learning_rate": 2.8798352511570635e-05, "loss": 0.2699, "step": 10687 }, { "epoch": 3.267502292876796, "grad_norm": 3.1747796535491943, "learning_rate": 2.8797927901150697e-05, "loss": 0.276, "step": 10688 }, { "epoch": 3.267808009782941, "grad_norm": 0.3615938425064087, "learning_rate": 2.8797503290730756e-05, "loss": 0.1684, "step": 10689 }, { "epoch": 3.268113726689086, "grad_norm": 0.38801634311676025, "learning_rate": 2.8797078680310818e-05, "loss": 0.1086, "step": 10690 }, { "epoch": 3.268419443595231, "grad_norm": 0.2132461816072464, "learning_rate": 2.8796654069890876e-05, "loss": 0.0697, "step": 10691 }, { "epoch": 3.2687251605013756, "grad_norm": 0.18976598978042603, "learning_rate": 2.879622945947094e-05, "loss": 0.0566, "step": 10692 }, { "epoch": 3.2690308774075207, "grad_norm": 0.21518374979496002, "learning_rate": 2.8795804849050997e-05, "loss": 0.0573, "step": 10693 }, { "epoch": 3.2693365943136654, "grad_norm": 0.31047406792640686, "learning_rate": 2.879538023863106e-05, "loss": 0.0669, "step": 10694 }, { "epoch": 3.2696423112198105, "grad_norm": 0.3666713535785675, "learning_rate": 2.8794955628211118e-05, "loss": 0.0572, "step": 10695 }, { "epoch": 3.269948028125955, "grad_norm": 0.2654256224632263, "learning_rate": 2.8794531017791177e-05, "loss": 0.097, "step": 10696 }, { "epoch": 3.2702537450321003, "grad_norm": 0.31913110613822937, "learning_rate": 2.879410640737124e-05, "loss": 0.0632, "step": 10697 }, { "epoch": 3.2705594619382454, "grad_norm": 1.0832406282424927, "learning_rate": 2.8793681796951297e-05, "loss": 0.0651, "step": 10698 }, { "epoch": 3.27086517884439, "grad_norm": 0.33610478043556213, "learning_rate": 2.879325718653136e-05, "loss": 0.1045, "step": 10699 }, { "epoch": 3.2711708957505348, "grad_norm": 0.32436931133270264, "learning_rate": 2.8792832576111418e-05, "loss": 0.0735, "step": 10700 }, { "epoch": 3.27147661265668, "grad_norm": 0.40848997235298157, "learning_rate": 2.879240796569148e-05, "loss": 0.1205, "step": 10701 }, { "epoch": 3.271782329562825, "grad_norm": 0.43018895387649536, "learning_rate": 2.879198335527154e-05, "loss": 0.1002, "step": 10702 }, { "epoch": 3.2720880464689697, "grad_norm": 2.2641985416412354, "learning_rate": 2.87915587448516e-05, "loss": 0.1339, "step": 10703 }, { "epoch": 3.272393763375115, "grad_norm": 0.6063635349273682, "learning_rate": 2.879113413443166e-05, "loss": 0.1289, "step": 10704 }, { "epoch": 3.2726994802812595, "grad_norm": 0.7622556090354919, "learning_rate": 2.8790709524011722e-05, "loss": 0.205, "step": 10705 }, { "epoch": 3.2730051971874046, "grad_norm": 0.8083246946334839, "learning_rate": 2.879028491359178e-05, "loss": 0.1379, "step": 10706 }, { "epoch": 3.2733109140935492, "grad_norm": 0.9113272428512573, "learning_rate": 2.8789860303171843e-05, "loss": 0.1782, "step": 10707 }, { "epoch": 3.2736166309996944, "grad_norm": 0.6485589742660522, "learning_rate": 2.87894356927519e-05, "loss": 0.1875, "step": 10708 }, { "epoch": 3.273922347905839, "grad_norm": 0.9749108552932739, "learning_rate": 2.878901108233196e-05, "loss": 0.2298, "step": 10709 }, { "epoch": 3.274228064811984, "grad_norm": 1.2186684608459473, "learning_rate": 2.8788586471912022e-05, "loss": 0.2112, "step": 10710 }, { "epoch": 3.2745337817181293, "grad_norm": 1.8179634809494019, "learning_rate": 2.878816186149208e-05, "loss": 0.2365, "step": 10711 }, { "epoch": 3.274839498624274, "grad_norm": 0.8932722210884094, "learning_rate": 2.8787737251072143e-05, "loss": 0.221, "step": 10712 }, { "epoch": 3.2751452155304186, "grad_norm": 1.7751054763793945, "learning_rate": 2.87873126406522e-05, "loss": 0.2944, "step": 10713 }, { "epoch": 3.2754509324365637, "grad_norm": 0.5727121233940125, "learning_rate": 2.8786888030232264e-05, "loss": 0.1491, "step": 10714 }, { "epoch": 3.275756649342709, "grad_norm": 0.21116290986537933, "learning_rate": 2.8786463419812322e-05, "loss": 0.0754, "step": 10715 }, { "epoch": 3.2760623662488535, "grad_norm": 0.40427422523498535, "learning_rate": 2.8786038809392384e-05, "loss": 0.0631, "step": 10716 }, { "epoch": 3.2763680831549986, "grad_norm": 0.48542359471321106, "learning_rate": 2.8785614198972443e-05, "loss": 0.0746, "step": 10717 }, { "epoch": 3.2766738000611433, "grad_norm": 0.4819718301296234, "learning_rate": 2.8785189588552505e-05, "loss": 0.0698, "step": 10718 }, { "epoch": 3.2769795169672884, "grad_norm": 0.21388912200927734, "learning_rate": 2.8784764978132564e-05, "loss": 0.0302, "step": 10719 }, { "epoch": 3.277285233873433, "grad_norm": 0.9101128578186035, "learning_rate": 2.8784340367712626e-05, "loss": 0.0809, "step": 10720 }, { "epoch": 3.277590950779578, "grad_norm": 0.4006093740463257, "learning_rate": 2.8783915757292685e-05, "loss": 0.0674, "step": 10721 }, { "epoch": 3.277896667685723, "grad_norm": 0.41506829857826233, "learning_rate": 2.8783491146872743e-05, "loss": 0.0821, "step": 10722 }, { "epoch": 3.278202384591868, "grad_norm": 0.345859169960022, "learning_rate": 2.8783066536452806e-05, "loss": 0.073, "step": 10723 }, { "epoch": 3.278508101498013, "grad_norm": 0.30490997433662415, "learning_rate": 2.8782641926032864e-05, "loss": 0.0919, "step": 10724 }, { "epoch": 3.2788138184041578, "grad_norm": 0.3451857566833496, "learning_rate": 2.8782217315612926e-05, "loss": 0.0809, "step": 10725 }, { "epoch": 3.2791195353103024, "grad_norm": 0.5089197754859924, "learning_rate": 2.8781792705192985e-05, "loss": 0.1122, "step": 10726 }, { "epoch": 3.2794252522164475, "grad_norm": 0.5335745215415955, "learning_rate": 2.8781368094773047e-05, "loss": 0.1177, "step": 10727 }, { "epoch": 3.2797309691225927, "grad_norm": 0.6627233624458313, "learning_rate": 2.8780943484353106e-05, "loss": 0.1335, "step": 10728 }, { "epoch": 3.2800366860287373, "grad_norm": 0.8184587955474854, "learning_rate": 2.8780518873933168e-05, "loss": 0.1499, "step": 10729 }, { "epoch": 3.2803424029348824, "grad_norm": 0.43036577105522156, "learning_rate": 2.8780094263513227e-05, "loss": 0.148, "step": 10730 }, { "epoch": 3.280648119841027, "grad_norm": 0.4171397387981415, "learning_rate": 2.877966965309329e-05, "loss": 0.1602, "step": 10731 }, { "epoch": 3.2809538367471722, "grad_norm": 0.6603798270225525, "learning_rate": 2.8779245042673347e-05, "loss": 0.2175, "step": 10732 }, { "epoch": 3.281259553653317, "grad_norm": 1.1336698532104492, "learning_rate": 2.877882043225341e-05, "loss": 0.1801, "step": 10733 }, { "epoch": 3.281565270559462, "grad_norm": 0.8512790203094482, "learning_rate": 2.8778395821833468e-05, "loss": 0.2049, "step": 10734 }, { "epoch": 3.2818709874656067, "grad_norm": 0.736292839050293, "learning_rate": 2.8777971211413527e-05, "loss": 0.1875, "step": 10735 }, { "epoch": 3.282176704371752, "grad_norm": 0.7586683034896851, "learning_rate": 2.877754660099359e-05, "loss": 0.2091, "step": 10736 }, { "epoch": 3.282482421277897, "grad_norm": 0.8950480818748474, "learning_rate": 2.8777121990573648e-05, "loss": 0.1906, "step": 10737 }, { "epoch": 3.2827881381840416, "grad_norm": 3.5015971660614014, "learning_rate": 2.877669738015371e-05, "loss": 0.3228, "step": 10738 }, { "epoch": 3.2830938550901863, "grad_norm": 0.426777720451355, "learning_rate": 2.877627276973377e-05, "loss": 0.1331, "step": 10739 }, { "epoch": 3.2833995719963314, "grad_norm": 0.5875369310379028, "learning_rate": 2.877584815931383e-05, "loss": 0.0908, "step": 10740 }, { "epoch": 3.2837052889024765, "grad_norm": 0.40554705262184143, "learning_rate": 2.877542354889389e-05, "loss": 0.0649, "step": 10741 }, { "epoch": 3.284011005808621, "grad_norm": 0.18960227072238922, "learning_rate": 2.877499893847395e-05, "loss": 0.056, "step": 10742 }, { "epoch": 3.2843167227147663, "grad_norm": 1.0669682025909424, "learning_rate": 2.877457432805401e-05, "loss": 0.049, "step": 10743 }, { "epoch": 3.284622439620911, "grad_norm": 0.2994231879711151, "learning_rate": 2.8774149717634072e-05, "loss": 0.0786, "step": 10744 }, { "epoch": 3.284928156527056, "grad_norm": 0.2284844070672989, "learning_rate": 2.877372510721413e-05, "loss": 0.0474, "step": 10745 }, { "epoch": 3.2852338734332007, "grad_norm": 0.263779878616333, "learning_rate": 2.8773300496794193e-05, "loss": 0.053, "step": 10746 }, { "epoch": 3.285539590339346, "grad_norm": 3.6279489994049072, "learning_rate": 2.877287588637425e-05, "loss": 0.1017, "step": 10747 }, { "epoch": 3.2858453072454905, "grad_norm": 0.40871700644493103, "learning_rate": 2.877245127595431e-05, "loss": 0.0977, "step": 10748 }, { "epoch": 3.2861510241516356, "grad_norm": 0.3413103520870209, "learning_rate": 2.8772026665534372e-05, "loss": 0.0841, "step": 10749 }, { "epoch": 3.2864567410577807, "grad_norm": 0.9112032651901245, "learning_rate": 2.877160205511443e-05, "loss": 0.0896, "step": 10750 }, { "epoch": 3.2867624579639254, "grad_norm": 0.38378995656967163, "learning_rate": 2.8771177444694493e-05, "loss": 0.0966, "step": 10751 }, { "epoch": 3.28706817487007, "grad_norm": 1.1581456661224365, "learning_rate": 2.8770752834274552e-05, "loss": 0.1379, "step": 10752 }, { "epoch": 3.287373891776215, "grad_norm": 0.5630114674568176, "learning_rate": 2.8770328223854614e-05, "loss": 0.1108, "step": 10753 }, { "epoch": 3.2876796086823603, "grad_norm": 0.5144164562225342, "learning_rate": 2.8769903613434673e-05, "loss": 0.1739, "step": 10754 }, { "epoch": 3.287985325588505, "grad_norm": 0.5745360255241394, "learning_rate": 2.8769479003014735e-05, "loss": 0.1469, "step": 10755 }, { "epoch": 3.28829104249465, "grad_norm": 0.3722344636917114, "learning_rate": 2.8769054392594793e-05, "loss": 0.1384, "step": 10756 }, { "epoch": 3.2885967594007948, "grad_norm": 1.2080944776535034, "learning_rate": 2.8768629782174856e-05, "loss": 0.1396, "step": 10757 }, { "epoch": 3.28890247630694, "grad_norm": 1.1683650016784668, "learning_rate": 2.8768205171754914e-05, "loss": 0.2072, "step": 10758 }, { "epoch": 3.2892081932130846, "grad_norm": 0.822648286819458, "learning_rate": 2.8767780561334973e-05, "loss": 0.1866, "step": 10759 }, { "epoch": 3.2895139101192297, "grad_norm": 0.8499592542648315, "learning_rate": 2.8767355950915035e-05, "loss": 0.2077, "step": 10760 }, { "epoch": 3.2898196270253743, "grad_norm": 0.8292199373245239, "learning_rate": 2.8766931340495094e-05, "loss": 0.174, "step": 10761 }, { "epoch": 3.2901253439315195, "grad_norm": 0.7958196997642517, "learning_rate": 2.8766506730075156e-05, "loss": 0.221, "step": 10762 }, { "epoch": 3.2904310608376646, "grad_norm": 1.4527833461761475, "learning_rate": 2.8766082119655215e-05, "loss": 0.2662, "step": 10763 }, { "epoch": 3.2907367777438092, "grad_norm": 0.5617910623550415, "learning_rate": 2.8765657509235277e-05, "loss": 0.1929, "step": 10764 }, { "epoch": 3.291042494649954, "grad_norm": 0.2234809696674347, "learning_rate": 2.8765232898815335e-05, "loss": 0.0921, "step": 10765 }, { "epoch": 3.291348211556099, "grad_norm": 0.3536331057548523, "learning_rate": 2.8764808288395397e-05, "loss": 0.0616, "step": 10766 }, { "epoch": 3.291653928462244, "grad_norm": 0.5773003697395325, "learning_rate": 2.8764383677975456e-05, "loss": 0.0511, "step": 10767 }, { "epoch": 3.291959645368389, "grad_norm": 0.20229850709438324, "learning_rate": 2.8763959067555518e-05, "loss": 0.059, "step": 10768 }, { "epoch": 3.292265362274534, "grad_norm": 0.21263274550437927, "learning_rate": 2.8763534457135577e-05, "loss": 0.0651, "step": 10769 }, { "epoch": 3.2925710791806786, "grad_norm": 0.1813049465417862, "learning_rate": 2.876310984671564e-05, "loss": 0.0487, "step": 10770 }, { "epoch": 3.2928767960868237, "grad_norm": 0.46538370847702026, "learning_rate": 2.8762685236295698e-05, "loss": 0.0887, "step": 10771 }, { "epoch": 3.2931825129929684, "grad_norm": 0.5248755812644958, "learning_rate": 2.876226062587576e-05, "loss": 0.0794, "step": 10772 }, { "epoch": 3.2934882298991135, "grad_norm": 0.4038351774215698, "learning_rate": 2.8761836015455822e-05, "loss": 0.0783, "step": 10773 }, { "epoch": 3.293793946805258, "grad_norm": 0.3153238594532013, "learning_rate": 2.876141140503588e-05, "loss": 0.0812, "step": 10774 }, { "epoch": 3.2940996637114033, "grad_norm": 1.3917553424835205, "learning_rate": 2.8760986794615943e-05, "loss": 0.1045, "step": 10775 }, { "epoch": 3.2944053806175484, "grad_norm": 0.42406174540519714, "learning_rate": 2.8760562184196e-05, "loss": 0.1128, "step": 10776 }, { "epoch": 3.294711097523693, "grad_norm": 0.554750382900238, "learning_rate": 2.8760137573776063e-05, "loss": 0.1434, "step": 10777 }, { "epoch": 3.2950168144298377, "grad_norm": 0.487321674823761, "learning_rate": 2.8759712963356122e-05, "loss": 0.1052, "step": 10778 }, { "epoch": 3.295322531335983, "grad_norm": 0.4848528206348419, "learning_rate": 2.8759288352936184e-05, "loss": 0.1212, "step": 10779 }, { "epoch": 3.295628248242128, "grad_norm": 0.5452450513839722, "learning_rate": 2.8758863742516243e-05, "loss": 0.174, "step": 10780 }, { "epoch": 3.2959339651482726, "grad_norm": 0.6016244292259216, "learning_rate": 2.8758439132096305e-05, "loss": 0.1885, "step": 10781 }, { "epoch": 3.2962396820544178, "grad_norm": 0.5833252668380737, "learning_rate": 2.8758014521676364e-05, "loss": 0.1654, "step": 10782 }, { "epoch": 3.2965453989605624, "grad_norm": 0.5758500695228577, "learning_rate": 2.8757589911256426e-05, "loss": 0.1571, "step": 10783 }, { "epoch": 3.2968511158667075, "grad_norm": 0.6010109782218933, "learning_rate": 2.8757165300836484e-05, "loss": 0.1737, "step": 10784 }, { "epoch": 3.297156832772852, "grad_norm": 0.7657299637794495, "learning_rate": 2.8756740690416543e-05, "loss": 0.2095, "step": 10785 }, { "epoch": 3.2974625496789973, "grad_norm": 0.8192263841629028, "learning_rate": 2.8756316079996605e-05, "loss": 0.1837, "step": 10786 }, { "epoch": 3.297768266585142, "grad_norm": 1.4528484344482422, "learning_rate": 2.8755891469576664e-05, "loss": 0.2167, "step": 10787 }, { "epoch": 3.298073983491287, "grad_norm": 1.1863517761230469, "learning_rate": 2.8755466859156726e-05, "loss": 0.2407, "step": 10788 }, { "epoch": 3.2983797003974322, "grad_norm": 0.45743146538734436, "learning_rate": 2.8755042248736785e-05, "loss": 0.1333, "step": 10789 }, { "epoch": 3.298685417303577, "grad_norm": 0.3433409631252289, "learning_rate": 2.8754617638316847e-05, "loss": 0.0735, "step": 10790 }, { "epoch": 3.2989911342097216, "grad_norm": 0.27888643741607666, "learning_rate": 2.8754193027896906e-05, "loss": 0.0657, "step": 10791 }, { "epoch": 3.2992968511158667, "grad_norm": 0.5174843668937683, "learning_rate": 2.8753768417476968e-05, "loss": 0.0698, "step": 10792 }, { "epoch": 3.299602568022012, "grad_norm": 0.3891867399215698, "learning_rate": 2.8753343807057026e-05, "loss": 0.0888, "step": 10793 }, { "epoch": 3.2999082849281565, "grad_norm": 0.25440189242362976, "learning_rate": 2.875291919663709e-05, "loss": 0.0727, "step": 10794 }, { "epoch": 3.3002140018343016, "grad_norm": 0.4507249593734741, "learning_rate": 2.8752494586217147e-05, "loss": 0.0471, "step": 10795 }, { "epoch": 3.3005197187404463, "grad_norm": 0.5717864036560059, "learning_rate": 2.875206997579721e-05, "loss": 0.0787, "step": 10796 }, { "epoch": 3.3008254356465914, "grad_norm": 0.4366248846054077, "learning_rate": 2.8751645365377268e-05, "loss": 0.0932, "step": 10797 }, { "epoch": 3.301131152552736, "grad_norm": 0.36396709084510803, "learning_rate": 2.8751220754957327e-05, "loss": 0.0712, "step": 10798 }, { "epoch": 3.301436869458881, "grad_norm": 0.22137142717838287, "learning_rate": 2.875079614453739e-05, "loss": 0.0814, "step": 10799 }, { "epoch": 3.301742586365026, "grad_norm": 0.29697155952453613, "learning_rate": 2.8750371534117447e-05, "loss": 0.0689, "step": 10800 }, { "epoch": 3.302048303271171, "grad_norm": 0.39076414704322815, "learning_rate": 2.874994692369751e-05, "loss": 0.1042, "step": 10801 }, { "epoch": 3.302354020177316, "grad_norm": 0.5555484890937805, "learning_rate": 2.8749522313277568e-05, "loss": 0.1377, "step": 10802 }, { "epoch": 3.3026597370834607, "grad_norm": 0.45504453778266907, "learning_rate": 2.874909770285763e-05, "loss": 0.1377, "step": 10803 }, { "epoch": 3.3029654539896054, "grad_norm": 0.7065227031707764, "learning_rate": 2.874867309243769e-05, "loss": 0.1653, "step": 10804 }, { "epoch": 3.3032711708957505, "grad_norm": 0.6112934947013855, "learning_rate": 2.874824848201775e-05, "loss": 0.1635, "step": 10805 }, { "epoch": 3.3035768878018956, "grad_norm": 0.573981523513794, "learning_rate": 2.874782387159781e-05, "loss": 0.1532, "step": 10806 }, { "epoch": 3.3038826047080403, "grad_norm": 0.8004670143127441, "learning_rate": 2.8747399261177872e-05, "loss": 0.1663, "step": 10807 }, { "epoch": 3.3041883216141854, "grad_norm": 0.7381240725517273, "learning_rate": 2.874697465075793e-05, "loss": 0.2076, "step": 10808 }, { "epoch": 3.30449403852033, "grad_norm": 0.4958325922489166, "learning_rate": 2.8746550040337993e-05, "loss": 0.2042, "step": 10809 }, { "epoch": 3.304799755426475, "grad_norm": 0.5863160490989685, "learning_rate": 2.874612542991805e-05, "loss": 0.1826, "step": 10810 }, { "epoch": 3.30510547233262, "grad_norm": 0.702994704246521, "learning_rate": 2.874570081949811e-05, "loss": 0.2022, "step": 10811 }, { "epoch": 3.305411189238765, "grad_norm": 1.175748586654663, "learning_rate": 2.8745276209078172e-05, "loss": 0.2518, "step": 10812 }, { "epoch": 3.3057169061449097, "grad_norm": 0.7727872729301453, "learning_rate": 2.874485159865823e-05, "loss": 0.2793, "step": 10813 }, { "epoch": 3.3060226230510548, "grad_norm": 0.5323163270950317, "learning_rate": 2.8744426988238293e-05, "loss": 0.1579, "step": 10814 }, { "epoch": 3.3063283399572, "grad_norm": 0.1975831240415573, "learning_rate": 2.874400237781835e-05, "loss": 0.0757, "step": 10815 }, { "epoch": 3.3066340568633446, "grad_norm": 0.25469502806663513, "learning_rate": 2.8743577767398414e-05, "loss": 0.0611, "step": 10816 }, { "epoch": 3.306939773769489, "grad_norm": 0.4336417317390442, "learning_rate": 2.8743153156978472e-05, "loss": 0.0943, "step": 10817 }, { "epoch": 3.3072454906756343, "grad_norm": 0.857454776763916, "learning_rate": 2.8742728546558534e-05, "loss": 0.0718, "step": 10818 }, { "epoch": 3.3075512075817795, "grad_norm": 0.18481704592704773, "learning_rate": 2.8742303936138593e-05, "loss": 0.0562, "step": 10819 }, { "epoch": 3.307856924487924, "grad_norm": 0.3835495412349701, "learning_rate": 2.8741879325718655e-05, "loss": 0.075, "step": 10820 }, { "epoch": 3.3081626413940692, "grad_norm": 0.3440777063369751, "learning_rate": 2.8741454715298714e-05, "loss": 0.0816, "step": 10821 }, { "epoch": 3.308468358300214, "grad_norm": 0.29217275977134705, "learning_rate": 2.8741030104878776e-05, "loss": 0.0743, "step": 10822 }, { "epoch": 3.308774075206359, "grad_norm": 0.22667473554611206, "learning_rate": 2.8740605494458835e-05, "loss": 0.0823, "step": 10823 }, { "epoch": 3.3090797921125037, "grad_norm": 0.4493618607521057, "learning_rate": 2.8740180884038893e-05, "loss": 0.1182, "step": 10824 }, { "epoch": 3.309385509018649, "grad_norm": 0.5626574754714966, "learning_rate": 2.8739756273618956e-05, "loss": 0.0805, "step": 10825 }, { "epoch": 3.3096912259247935, "grad_norm": 0.31479412317276, "learning_rate": 2.8739331663199014e-05, "loss": 0.1026, "step": 10826 }, { "epoch": 3.3099969428309386, "grad_norm": 0.37255042791366577, "learning_rate": 2.8738907052779076e-05, "loss": 0.111, "step": 10827 }, { "epoch": 3.3103026597370837, "grad_norm": 0.38234949111938477, "learning_rate": 2.8738482442359135e-05, "loss": 0.0988, "step": 10828 }, { "epoch": 3.3106083766432284, "grad_norm": 0.4813401401042938, "learning_rate": 2.8738057831939197e-05, "loss": 0.1388, "step": 10829 }, { "epoch": 3.310914093549373, "grad_norm": 0.43951788544654846, "learning_rate": 2.8737633221519256e-05, "loss": 0.1498, "step": 10830 }, { "epoch": 3.311219810455518, "grad_norm": 2.005922555923462, "learning_rate": 2.8737208611099318e-05, "loss": 0.2064, "step": 10831 }, { "epoch": 3.3115255273616633, "grad_norm": 0.7702115774154663, "learning_rate": 2.8736784000679377e-05, "loss": 0.1743, "step": 10832 }, { "epoch": 3.311831244267808, "grad_norm": 0.6138361692428589, "learning_rate": 2.873635939025944e-05, "loss": 0.1757, "step": 10833 }, { "epoch": 3.312136961173953, "grad_norm": 0.7565755844116211, "learning_rate": 2.8735934779839497e-05, "loss": 0.2077, "step": 10834 }, { "epoch": 3.3124426780800977, "grad_norm": 0.6130148768424988, "learning_rate": 2.873551016941956e-05, "loss": 0.1946, "step": 10835 }, { "epoch": 3.312748394986243, "grad_norm": 1.1529450416564941, "learning_rate": 2.8735085558999618e-05, "loss": 0.2058, "step": 10836 }, { "epoch": 3.3130541118923875, "grad_norm": 0.749747097492218, "learning_rate": 2.8734660948579677e-05, "loss": 0.2438, "step": 10837 }, { "epoch": 3.3133598287985326, "grad_norm": 1.0993163585662842, "learning_rate": 2.873423633815974e-05, "loss": 0.2597, "step": 10838 }, { "epoch": 3.3136655457046773, "grad_norm": 0.6964902281761169, "learning_rate": 2.8733811727739798e-05, "loss": 0.1456, "step": 10839 }, { "epoch": 3.3139712626108224, "grad_norm": 0.335894912481308, "learning_rate": 2.873338711731986e-05, "loss": 0.089, "step": 10840 }, { "epoch": 3.3142769795169675, "grad_norm": 0.28697165846824646, "learning_rate": 2.873296250689992e-05, "loss": 0.0843, "step": 10841 }, { "epoch": 3.314582696423112, "grad_norm": 0.16110359132289886, "learning_rate": 2.873253789647998e-05, "loss": 0.0335, "step": 10842 }, { "epoch": 3.314888413329257, "grad_norm": 0.28468745946884155, "learning_rate": 2.873211328606004e-05, "loss": 0.0754, "step": 10843 }, { "epoch": 3.315194130235402, "grad_norm": 0.31090909242630005, "learning_rate": 2.87316886756401e-05, "loss": 0.071, "step": 10844 }, { "epoch": 3.315499847141547, "grad_norm": 0.3314419388771057, "learning_rate": 2.873126406522016e-05, "loss": 0.0521, "step": 10845 }, { "epoch": 3.315805564047692, "grad_norm": 0.177435502409935, "learning_rate": 2.8730839454800222e-05, "loss": 0.0503, "step": 10846 }, { "epoch": 3.316111280953837, "grad_norm": 0.7089155912399292, "learning_rate": 2.873041484438028e-05, "loss": 0.1124, "step": 10847 }, { "epoch": 3.3164169978599816, "grad_norm": 0.29377809166908264, "learning_rate": 2.8729990233960343e-05, "loss": 0.0617, "step": 10848 }, { "epoch": 3.3167227147661267, "grad_norm": 0.2646845877170563, "learning_rate": 2.87295656235404e-05, "loss": 0.0821, "step": 10849 }, { "epoch": 3.3170284316722713, "grad_norm": 0.8221212029457092, "learning_rate": 2.872914101312046e-05, "loss": 0.0853, "step": 10850 }, { "epoch": 3.3173341485784165, "grad_norm": 0.44431158900260925, "learning_rate": 2.8728716402700522e-05, "loss": 0.0843, "step": 10851 }, { "epoch": 3.317639865484561, "grad_norm": 0.497318297624588, "learning_rate": 2.872829179228058e-05, "loss": 0.1299, "step": 10852 }, { "epoch": 3.3179455823907062, "grad_norm": 0.4159863293170929, "learning_rate": 2.8727867181860643e-05, "loss": 0.1134, "step": 10853 }, { "epoch": 3.3182512992968514, "grad_norm": 0.48789873719215393, "learning_rate": 2.8727442571440702e-05, "loss": 0.1471, "step": 10854 }, { "epoch": 3.318557016202996, "grad_norm": 0.9466536641120911, "learning_rate": 2.8727017961020764e-05, "loss": 0.159, "step": 10855 }, { "epoch": 3.3188627331091407, "grad_norm": 0.8276566863059998, "learning_rate": 2.8726593350600823e-05, "loss": 0.1697, "step": 10856 }, { "epoch": 3.319168450015286, "grad_norm": 0.8239809274673462, "learning_rate": 2.8726168740180885e-05, "loss": 0.173, "step": 10857 }, { "epoch": 3.319474166921431, "grad_norm": 1.2764016389846802, "learning_rate": 2.8725744129760943e-05, "loss": 0.1924, "step": 10858 }, { "epoch": 3.3197798838275756, "grad_norm": 0.7297049760818481, "learning_rate": 2.8725319519341006e-05, "loss": 0.2407, "step": 10859 }, { "epoch": 3.3200856007337207, "grad_norm": 0.6638231873512268, "learning_rate": 2.8724894908921064e-05, "loss": 0.178, "step": 10860 }, { "epoch": 3.3203913176398654, "grad_norm": 1.2940967082977295, "learning_rate": 2.8724470298501126e-05, "loss": 0.1996, "step": 10861 }, { "epoch": 3.3206970345460105, "grad_norm": 1.3549937009811401, "learning_rate": 2.8724045688081185e-05, "loss": 0.2544, "step": 10862 }, { "epoch": 3.321002751452155, "grad_norm": 1.4954406023025513, "learning_rate": 2.8723621077661244e-05, "loss": 0.272, "step": 10863 }, { "epoch": 3.3213084683583003, "grad_norm": 0.3934219181537628, "learning_rate": 2.8723196467241306e-05, "loss": 0.1501, "step": 10864 }, { "epoch": 3.321614185264445, "grad_norm": 0.1561182290315628, "learning_rate": 2.8722771856821365e-05, "loss": 0.067, "step": 10865 }, { "epoch": 3.32191990217059, "grad_norm": 0.4002782702445984, "learning_rate": 2.8722347246401427e-05, "loss": 0.0881, "step": 10866 }, { "epoch": 3.322225619076735, "grad_norm": 0.3729495108127594, "learning_rate": 2.8721922635981485e-05, "loss": 0.0828, "step": 10867 }, { "epoch": 3.32253133598288, "grad_norm": 0.47391772270202637, "learning_rate": 2.8721498025561547e-05, "loss": 0.0585, "step": 10868 }, { "epoch": 3.3228370528890245, "grad_norm": 0.2959035336971283, "learning_rate": 2.8721073415141606e-05, "loss": 0.0546, "step": 10869 }, { "epoch": 3.3231427697951696, "grad_norm": 0.24149341881275177, "learning_rate": 2.8720648804721668e-05, "loss": 0.0664, "step": 10870 }, { "epoch": 3.3234484867013148, "grad_norm": 0.1871393620967865, "learning_rate": 2.8720224194301727e-05, "loss": 0.0613, "step": 10871 }, { "epoch": 3.3237542036074594, "grad_norm": 0.27904433012008667, "learning_rate": 2.871979958388179e-05, "loss": 0.0919, "step": 10872 }, { "epoch": 3.3240599205136045, "grad_norm": 0.2309628427028656, "learning_rate": 2.8719374973461848e-05, "loss": 0.0717, "step": 10873 }, { "epoch": 3.324365637419749, "grad_norm": 0.35104987025260925, "learning_rate": 2.871895036304191e-05, "loss": 0.1089, "step": 10874 }, { "epoch": 3.3246713543258943, "grad_norm": 0.6507737636566162, "learning_rate": 2.8718525752621972e-05, "loss": 0.1082, "step": 10875 }, { "epoch": 3.324977071232039, "grad_norm": 0.5482450127601624, "learning_rate": 2.871810114220203e-05, "loss": 0.1078, "step": 10876 }, { "epoch": 3.325282788138184, "grad_norm": 0.3898242712020874, "learning_rate": 2.8717676531782093e-05, "loss": 0.1564, "step": 10877 }, { "epoch": 3.325588505044329, "grad_norm": 0.5510368347167969, "learning_rate": 2.871725192136215e-05, "loss": 0.1287, "step": 10878 }, { "epoch": 3.325894221950474, "grad_norm": 1.4357450008392334, "learning_rate": 2.8716827310942213e-05, "loss": 0.136, "step": 10879 }, { "epoch": 3.326199938856619, "grad_norm": 0.5817170143127441, "learning_rate": 2.8716402700522272e-05, "loss": 0.1523, "step": 10880 }, { "epoch": 3.3265056557627637, "grad_norm": 0.6185963153839111, "learning_rate": 2.8715978090102334e-05, "loss": 0.1562, "step": 10881 }, { "epoch": 3.3268113726689084, "grad_norm": 0.6660125851631165, "learning_rate": 2.8715553479682393e-05, "loss": 0.176, "step": 10882 }, { "epoch": 3.3271170895750535, "grad_norm": 0.5563603639602661, "learning_rate": 2.8715128869262455e-05, "loss": 0.1678, "step": 10883 }, { "epoch": 3.3274228064811986, "grad_norm": 0.7631221413612366, "learning_rate": 2.8714704258842514e-05, "loss": 0.1918, "step": 10884 }, { "epoch": 3.3277285233873433, "grad_norm": 0.9008687734603882, "learning_rate": 2.8714279648422576e-05, "loss": 0.2109, "step": 10885 }, { "epoch": 3.3280342402934884, "grad_norm": 0.8422077298164368, "learning_rate": 2.8713855038002635e-05, "loss": 0.2216, "step": 10886 }, { "epoch": 3.328339957199633, "grad_norm": 1.3036868572235107, "learning_rate": 2.8713430427582693e-05, "loss": 0.1769, "step": 10887 }, { "epoch": 3.328645674105778, "grad_norm": 3.225583076477051, "learning_rate": 2.8713005817162755e-05, "loss": 0.2524, "step": 10888 }, { "epoch": 3.328951391011923, "grad_norm": 0.504551887512207, "learning_rate": 2.8712581206742814e-05, "loss": 0.1326, "step": 10889 }, { "epoch": 3.329257107918068, "grad_norm": 0.3924224078655243, "learning_rate": 2.8712156596322876e-05, "loss": 0.0771, "step": 10890 }, { "epoch": 3.3295628248242126, "grad_norm": 0.23273049294948578, "learning_rate": 2.8711731985902935e-05, "loss": 0.0721, "step": 10891 }, { "epoch": 3.3298685417303577, "grad_norm": 0.359000027179718, "learning_rate": 2.8711307375482997e-05, "loss": 0.0677, "step": 10892 }, { "epoch": 3.3301742586365024, "grad_norm": 0.6842906475067139, "learning_rate": 2.8710882765063056e-05, "loss": 0.057, "step": 10893 }, { "epoch": 3.3304799755426475, "grad_norm": 0.32186782360076904, "learning_rate": 2.8710458154643118e-05, "loss": 0.0367, "step": 10894 }, { "epoch": 3.330785692448792, "grad_norm": 0.3936348259449005, "learning_rate": 2.8710033544223176e-05, "loss": 0.0587, "step": 10895 }, { "epoch": 3.3310914093549373, "grad_norm": 0.23228178918361664, "learning_rate": 2.870960893380324e-05, "loss": 0.0684, "step": 10896 }, { "epoch": 3.3313971262610824, "grad_norm": 0.2560083568096161, "learning_rate": 2.8709184323383297e-05, "loss": 0.0556, "step": 10897 }, { "epoch": 3.331702843167227, "grad_norm": 0.40281298756599426, "learning_rate": 2.870875971296336e-05, "loss": 0.0684, "step": 10898 }, { "epoch": 3.332008560073372, "grad_norm": 0.6132236123085022, "learning_rate": 2.8708335102543418e-05, "loss": 0.11, "step": 10899 }, { "epoch": 3.332314276979517, "grad_norm": 0.6787664890289307, "learning_rate": 2.8707910492123477e-05, "loss": 0.0992, "step": 10900 }, { "epoch": 3.332619993885662, "grad_norm": 1.6368705034255981, "learning_rate": 2.870748588170354e-05, "loss": 0.0781, "step": 10901 }, { "epoch": 3.3329257107918067, "grad_norm": 3.3713338375091553, "learning_rate": 2.8707061271283597e-05, "loss": 0.1516, "step": 10902 }, { "epoch": 3.3332314276979518, "grad_norm": 0.505166232585907, "learning_rate": 2.870663666086366e-05, "loss": 0.1129, "step": 10903 }, { "epoch": 3.3335371446040964, "grad_norm": 0.49432358145713806, "learning_rate": 2.8706212050443718e-05, "loss": 0.1362, "step": 10904 }, { "epoch": 3.3338428615102416, "grad_norm": 0.5585086941719055, "learning_rate": 2.870578744002378e-05, "loss": 0.1591, "step": 10905 }, { "epoch": 3.3341485784163862, "grad_norm": 0.8165041208267212, "learning_rate": 2.870536282960384e-05, "loss": 0.1878, "step": 10906 }, { "epoch": 3.3344542953225313, "grad_norm": 1.7563066482543945, "learning_rate": 2.87049382191839e-05, "loss": 0.1744, "step": 10907 }, { "epoch": 3.334760012228676, "grad_norm": 0.8639978170394897, "learning_rate": 2.870451360876396e-05, "loss": 0.191, "step": 10908 }, { "epoch": 3.335065729134821, "grad_norm": 0.9599329233169556, "learning_rate": 2.8704088998344022e-05, "loss": 0.203, "step": 10909 }, { "epoch": 3.3353714460409662, "grad_norm": 0.825532078742981, "learning_rate": 2.870366438792408e-05, "loss": 0.1834, "step": 10910 }, { "epoch": 3.335677162947111, "grad_norm": 0.8041403293609619, "learning_rate": 2.8703239777504143e-05, "loss": 0.2048, "step": 10911 }, { "epoch": 3.335982879853256, "grad_norm": 1.1289196014404297, "learning_rate": 2.87028151670842e-05, "loss": 0.2394, "step": 10912 }, { "epoch": 3.3362885967594007, "grad_norm": 1.0648143291473389, "learning_rate": 2.870239055666426e-05, "loss": 0.2203, "step": 10913 }, { "epoch": 3.336594313665546, "grad_norm": 0.39454203844070435, "learning_rate": 2.8701965946244322e-05, "loss": 0.1704, "step": 10914 }, { "epoch": 3.3369000305716905, "grad_norm": 0.5799201726913452, "learning_rate": 2.870154133582438e-05, "loss": 0.093, "step": 10915 }, { "epoch": 3.3372057474778356, "grad_norm": 0.24499940872192383, "learning_rate": 2.8701116725404443e-05, "loss": 0.0763, "step": 10916 }, { "epoch": 3.3375114643839803, "grad_norm": 0.25673723220825195, "learning_rate": 2.87006921149845e-05, "loss": 0.0508, "step": 10917 }, { "epoch": 3.3378171812901254, "grad_norm": 0.22389745712280273, "learning_rate": 2.8700267504564564e-05, "loss": 0.066, "step": 10918 }, { "epoch": 3.33812289819627, "grad_norm": 0.20704272389411926, "learning_rate": 2.8699842894144622e-05, "loss": 0.0501, "step": 10919 }, { "epoch": 3.338428615102415, "grad_norm": 0.5257928967475891, "learning_rate": 2.8699418283724685e-05, "loss": 0.0523, "step": 10920 }, { "epoch": 3.33873433200856, "grad_norm": 0.2660631239414215, "learning_rate": 2.8698993673304743e-05, "loss": 0.086, "step": 10921 }, { "epoch": 3.339040048914705, "grad_norm": 0.35499846935272217, "learning_rate": 2.8698569062884805e-05, "loss": 0.0706, "step": 10922 }, { "epoch": 3.33934576582085, "grad_norm": 0.322788268327713, "learning_rate": 2.8698144452464864e-05, "loss": 0.0765, "step": 10923 }, { "epoch": 3.3396514827269947, "grad_norm": 0.3422314524650574, "learning_rate": 2.8697719842044926e-05, "loss": 0.1145, "step": 10924 }, { "epoch": 3.33995719963314, "grad_norm": 0.4071977138519287, "learning_rate": 2.8697295231624985e-05, "loss": 0.074, "step": 10925 }, { "epoch": 3.3402629165392845, "grad_norm": 0.3223609924316406, "learning_rate": 2.8696870621205043e-05, "loss": 0.1185, "step": 10926 }, { "epoch": 3.3405686334454296, "grad_norm": 0.6447814702987671, "learning_rate": 2.8696446010785106e-05, "loss": 0.1079, "step": 10927 }, { "epoch": 3.3408743503515743, "grad_norm": 1.1135859489440918, "learning_rate": 2.8696021400365164e-05, "loss": 0.1187, "step": 10928 }, { "epoch": 3.3411800672577194, "grad_norm": 0.6486056447029114, "learning_rate": 2.8695596789945226e-05, "loss": 0.1589, "step": 10929 }, { "epoch": 3.341485784163864, "grad_norm": 0.46527186036109924, "learning_rate": 2.8695172179525285e-05, "loss": 0.1431, "step": 10930 }, { "epoch": 3.341791501070009, "grad_norm": 0.6787195205688477, "learning_rate": 2.8694747569105347e-05, "loss": 0.2007, "step": 10931 }, { "epoch": 3.342097217976154, "grad_norm": 0.9213535189628601, "learning_rate": 2.8694322958685406e-05, "loss": 0.2163, "step": 10932 }, { "epoch": 3.342402934882299, "grad_norm": 1.0492933988571167, "learning_rate": 2.8693898348265468e-05, "loss": 0.1902, "step": 10933 }, { "epoch": 3.3427086517884437, "grad_norm": 0.9340243935585022, "learning_rate": 2.8693473737845527e-05, "loss": 0.1925, "step": 10934 }, { "epoch": 3.343014368694589, "grad_norm": 0.9391844868659973, "learning_rate": 2.869304912742559e-05, "loss": 0.1956, "step": 10935 }, { "epoch": 3.343320085600734, "grad_norm": 0.7215834259986877, "learning_rate": 2.8692624517005647e-05, "loss": 0.2059, "step": 10936 }, { "epoch": 3.3436258025068786, "grad_norm": 0.8093318939208984, "learning_rate": 2.869219990658571e-05, "loss": 0.2042, "step": 10937 }, { "epoch": 3.3439315194130237, "grad_norm": 1.2211278676986694, "learning_rate": 2.8691775296165768e-05, "loss": 0.23, "step": 10938 }, { "epoch": 3.3442372363191684, "grad_norm": 0.6094898581504822, "learning_rate": 2.8691350685745827e-05, "loss": 0.17, "step": 10939 }, { "epoch": 3.3445429532253135, "grad_norm": 0.35562458634376526, "learning_rate": 2.869092607532589e-05, "loss": 0.0978, "step": 10940 }, { "epoch": 3.344848670131458, "grad_norm": 0.3956395387649536, "learning_rate": 2.8690501464905948e-05, "loss": 0.0857, "step": 10941 }, { "epoch": 3.3451543870376033, "grad_norm": 0.2733890414237976, "learning_rate": 2.869007685448601e-05, "loss": 0.0786, "step": 10942 }, { "epoch": 3.345460103943748, "grad_norm": 0.24697129428386688, "learning_rate": 2.868965224406607e-05, "loss": 0.0853, "step": 10943 }, { "epoch": 3.345765820849893, "grad_norm": 0.27841705083847046, "learning_rate": 2.868922763364613e-05, "loss": 0.0477, "step": 10944 }, { "epoch": 3.3460715377560377, "grad_norm": 0.42057427763938904, "learning_rate": 2.868880302322619e-05, "loss": 0.0705, "step": 10945 }, { "epoch": 3.346377254662183, "grad_norm": 0.222965806722641, "learning_rate": 2.868837841280625e-05, "loss": 0.0642, "step": 10946 }, { "epoch": 3.3466829715683275, "grad_norm": 0.4708074927330017, "learning_rate": 2.868795380238631e-05, "loss": 0.0706, "step": 10947 }, { "epoch": 3.3469886884744726, "grad_norm": 1.1123851537704468, "learning_rate": 2.8687529191966372e-05, "loss": 0.0904, "step": 10948 }, { "epoch": 3.3472944053806177, "grad_norm": 0.2999650835990906, "learning_rate": 2.868710458154643e-05, "loss": 0.0972, "step": 10949 }, { "epoch": 3.3476001222867624, "grad_norm": 0.4670831561088562, "learning_rate": 2.8686679971126493e-05, "loss": 0.0875, "step": 10950 }, { "epoch": 3.3479058391929075, "grad_norm": 0.4502103626728058, "learning_rate": 2.868625536070655e-05, "loss": 0.0975, "step": 10951 }, { "epoch": 3.348211556099052, "grad_norm": 0.4629995822906494, "learning_rate": 2.868583075028661e-05, "loss": 0.1469, "step": 10952 }, { "epoch": 3.3485172730051973, "grad_norm": 0.4278769791126251, "learning_rate": 2.8685406139866672e-05, "loss": 0.1347, "step": 10953 }, { "epoch": 3.348822989911342, "grad_norm": 0.6294943690299988, "learning_rate": 2.868498152944673e-05, "loss": 0.1612, "step": 10954 }, { "epoch": 3.349128706817487, "grad_norm": 0.7668391466140747, "learning_rate": 2.8684556919026793e-05, "loss": 0.2228, "step": 10955 }, { "epoch": 3.3494344237236318, "grad_norm": 0.44285404682159424, "learning_rate": 2.8684132308606852e-05, "loss": 0.1731, "step": 10956 }, { "epoch": 3.349740140629777, "grad_norm": 0.5275416374206543, "learning_rate": 2.8683707698186914e-05, "loss": 0.1846, "step": 10957 }, { "epoch": 3.3500458575359215, "grad_norm": 0.6214660406112671, "learning_rate": 2.8683283087766973e-05, "loss": 0.2434, "step": 10958 }, { "epoch": 3.3503515744420667, "grad_norm": 0.5198252201080322, "learning_rate": 2.8682858477347035e-05, "loss": 0.1744, "step": 10959 }, { "epoch": 3.3506572913482113, "grad_norm": 1.065997838973999, "learning_rate": 2.8682433866927093e-05, "loss": 0.2113, "step": 10960 }, { "epoch": 3.3509630082543564, "grad_norm": 1.063598394393921, "learning_rate": 2.8682009256507156e-05, "loss": 0.2262, "step": 10961 }, { "epoch": 3.3512687251605016, "grad_norm": 0.9519305229187012, "learning_rate": 2.8681584646087214e-05, "loss": 0.1782, "step": 10962 }, { "epoch": 3.3515744420666462, "grad_norm": 2.2721149921417236, "learning_rate": 2.8681160035667276e-05, "loss": 0.241, "step": 10963 }, { "epoch": 3.3518801589727913, "grad_norm": 0.38734158873558044, "learning_rate": 2.8680735425247335e-05, "loss": 0.1807, "step": 10964 }, { "epoch": 3.352185875878936, "grad_norm": 0.353187620639801, "learning_rate": 2.8680310814827394e-05, "loss": 0.0981, "step": 10965 }, { "epoch": 3.352491592785081, "grad_norm": 0.25627610087394714, "learning_rate": 2.8679886204407456e-05, "loss": 0.0926, "step": 10966 }, { "epoch": 3.352797309691226, "grad_norm": 0.29596826434135437, "learning_rate": 2.8679461593987515e-05, "loss": 0.0585, "step": 10967 }, { "epoch": 3.353103026597371, "grad_norm": 0.2503947913646698, "learning_rate": 2.8679036983567577e-05, "loss": 0.0697, "step": 10968 }, { "epoch": 3.3534087435035156, "grad_norm": 0.2105199247598648, "learning_rate": 2.8678612373147635e-05, "loss": 0.0478, "step": 10969 }, { "epoch": 3.3537144604096607, "grad_norm": 0.3283345699310303, "learning_rate": 2.8678187762727697e-05, "loss": 0.0704, "step": 10970 }, { "epoch": 3.3540201773158054, "grad_norm": 0.23772189021110535, "learning_rate": 2.8677763152307756e-05, "loss": 0.0701, "step": 10971 }, { "epoch": 3.3543258942219505, "grad_norm": 0.20785237848758698, "learning_rate": 2.8677338541887818e-05, "loss": 0.0566, "step": 10972 }, { "epoch": 3.354631611128095, "grad_norm": 0.23355494439601898, "learning_rate": 2.8676913931467877e-05, "loss": 0.0729, "step": 10973 }, { "epoch": 3.3549373280342403, "grad_norm": 0.3440993130207062, "learning_rate": 2.867648932104794e-05, "loss": 0.0815, "step": 10974 }, { "epoch": 3.3552430449403854, "grad_norm": 0.45965784788131714, "learning_rate": 2.8676064710627998e-05, "loss": 0.0897, "step": 10975 }, { "epoch": 3.35554876184653, "grad_norm": 0.33877554535865784, "learning_rate": 2.867564010020806e-05, "loss": 0.0875, "step": 10976 }, { "epoch": 3.355854478752675, "grad_norm": 0.5566742420196533, "learning_rate": 2.8675215489788122e-05, "loss": 0.1212, "step": 10977 }, { "epoch": 3.35616019565882, "grad_norm": 0.4582141637802124, "learning_rate": 2.867479087936818e-05, "loss": 0.133, "step": 10978 }, { "epoch": 3.356465912564965, "grad_norm": 0.48002511262893677, "learning_rate": 2.8674366268948243e-05, "loss": 0.1636, "step": 10979 }, { "epoch": 3.3567716294711096, "grad_norm": 0.6613157987594604, "learning_rate": 2.86739416585283e-05, "loss": 0.1631, "step": 10980 }, { "epoch": 3.3570773463772547, "grad_norm": 0.4610394835472107, "learning_rate": 2.8673517048108363e-05, "loss": 0.1387, "step": 10981 }, { "epoch": 3.3573830632833994, "grad_norm": 0.8438497185707092, "learning_rate": 2.8673092437688422e-05, "loss": 0.2126, "step": 10982 }, { "epoch": 3.3576887801895445, "grad_norm": 0.5415909886360168, "learning_rate": 2.8672667827268484e-05, "loss": 0.1781, "step": 10983 }, { "epoch": 3.357994497095689, "grad_norm": 0.6382995247840881, "learning_rate": 2.8672243216848543e-05, "loss": 0.1699, "step": 10984 }, { "epoch": 3.3583002140018343, "grad_norm": 0.7932626605033875, "learning_rate": 2.8671818606428605e-05, "loss": 0.1867, "step": 10985 }, { "epoch": 3.358605930907979, "grad_norm": 0.9882655143737793, "learning_rate": 2.8671393996008664e-05, "loss": 0.1835, "step": 10986 }, { "epoch": 3.358911647814124, "grad_norm": 1.3365654945373535, "learning_rate": 2.8670969385588726e-05, "loss": 0.194, "step": 10987 }, { "epoch": 3.359217364720269, "grad_norm": 1.6371628046035767, "learning_rate": 2.8670544775168785e-05, "loss": 0.2397, "step": 10988 }, { "epoch": 3.359523081626414, "grad_norm": 0.3053208291530609, "learning_rate": 2.8670120164748843e-05, "loss": 0.1801, "step": 10989 }, { "epoch": 3.359828798532559, "grad_norm": 0.27863234281539917, "learning_rate": 2.8669695554328905e-05, "loss": 0.0652, "step": 10990 }, { "epoch": 3.3601345154387037, "grad_norm": 0.27088072896003723, "learning_rate": 2.8669270943908964e-05, "loss": 0.0667, "step": 10991 }, { "epoch": 3.360440232344849, "grad_norm": 0.2601951062679291, "learning_rate": 2.8668846333489026e-05, "loss": 0.0657, "step": 10992 }, { "epoch": 3.3607459492509935, "grad_norm": 0.23795567452907562, "learning_rate": 2.8668421723069085e-05, "loss": 0.0505, "step": 10993 }, { "epoch": 3.3610516661571386, "grad_norm": 0.3159230053424835, "learning_rate": 2.8667997112649147e-05, "loss": 0.0617, "step": 10994 }, { "epoch": 3.3613573830632832, "grad_norm": 0.24406912922859192, "learning_rate": 2.8667572502229206e-05, "loss": 0.0593, "step": 10995 }, { "epoch": 3.3616630999694284, "grad_norm": 0.8313982486724854, "learning_rate": 2.8667147891809268e-05, "loss": 0.0642, "step": 10996 }, { "epoch": 3.361968816875573, "grad_norm": 0.45065468549728394, "learning_rate": 2.8666723281389326e-05, "loss": 0.0887, "step": 10997 }, { "epoch": 3.362274533781718, "grad_norm": 0.39944037795066833, "learning_rate": 2.866629867096939e-05, "loss": 0.0736, "step": 10998 }, { "epoch": 3.362580250687863, "grad_norm": 0.5261447429656982, "learning_rate": 2.8665874060549447e-05, "loss": 0.1095, "step": 10999 }, { "epoch": 3.362885967594008, "grad_norm": 0.3151904046535492, "learning_rate": 2.866544945012951e-05, "loss": 0.0698, "step": 11000 }, { "epoch": 3.362885967594008, "eval_cer": 0.1902883702661124, "eval_loss": 0.24274443089962006, "eval_runtime": 19.0896, "eval_samples_per_second": 237.721, "eval_steps_per_second": 0.786, "eval_wer": 0.334655456551493, "step": 11000 }, { "epoch": 3.363191684500153, "grad_norm": 0.40215015411376953, "learning_rate": 2.8665024839709568e-05, "loss": 0.088, "step": 11001 }, { "epoch": 3.3634974014062977, "grad_norm": 0.30165529251098633, "learning_rate": 2.8664600229289627e-05, "loss": 0.1085, "step": 11002 }, { "epoch": 3.363803118312443, "grad_norm": 0.6867185235023499, "learning_rate": 2.866417561886969e-05, "loss": 0.1553, "step": 11003 }, { "epoch": 3.3641088352185875, "grad_norm": 0.7095072865486145, "learning_rate": 2.8663751008449747e-05, "loss": 0.1669, "step": 11004 }, { "epoch": 3.3644145521247326, "grad_norm": 0.7562287449836731, "learning_rate": 2.866332639802981e-05, "loss": 0.142, "step": 11005 }, { "epoch": 3.3647202690308773, "grad_norm": 0.693246603012085, "learning_rate": 2.8662901787609868e-05, "loss": 0.1565, "step": 11006 }, { "epoch": 3.3650259859370224, "grad_norm": 0.6014534831047058, "learning_rate": 2.866247717718993e-05, "loss": 0.1992, "step": 11007 }, { "epoch": 3.365331702843167, "grad_norm": 0.6453886032104492, "learning_rate": 2.866205256676999e-05, "loss": 0.146, "step": 11008 }, { "epoch": 3.365637419749312, "grad_norm": 0.755530059337616, "learning_rate": 2.866162795635005e-05, "loss": 0.1536, "step": 11009 }, { "epoch": 3.365943136655457, "grad_norm": 0.9337927103042603, "learning_rate": 2.866120334593011e-05, "loss": 0.1667, "step": 11010 }, { "epoch": 3.366248853561602, "grad_norm": 1.51158607006073, "learning_rate": 2.8660778735510172e-05, "loss": 0.2007, "step": 11011 }, { "epoch": 3.3665545704677466, "grad_norm": 2.4377951622009277, "learning_rate": 2.866035412509023e-05, "loss": 0.189, "step": 11012 }, { "epoch": 3.3668602873738918, "grad_norm": 2.2643561363220215, "learning_rate": 2.8659929514670293e-05, "loss": 0.2072, "step": 11013 }, { "epoch": 3.367166004280037, "grad_norm": 0.372999906539917, "learning_rate": 2.865950490425035e-05, "loss": 0.1638, "step": 11014 }, { "epoch": 3.3674717211861815, "grad_norm": 0.6373817920684814, "learning_rate": 2.865908029383041e-05, "loss": 0.0759, "step": 11015 }, { "epoch": 3.3677774380923267, "grad_norm": 0.24770371615886688, "learning_rate": 2.8658655683410472e-05, "loss": 0.0619, "step": 11016 }, { "epoch": 3.3680831549984713, "grad_norm": 0.41431474685668945, "learning_rate": 2.865823107299053e-05, "loss": 0.0796, "step": 11017 }, { "epoch": 3.3683888719046164, "grad_norm": 0.6104310154914856, "learning_rate": 2.8657806462570593e-05, "loss": 0.078, "step": 11018 }, { "epoch": 3.368694588810761, "grad_norm": 0.887670636177063, "learning_rate": 2.865738185215065e-05, "loss": 0.0925, "step": 11019 }, { "epoch": 3.3690003057169062, "grad_norm": 0.331271767616272, "learning_rate": 2.8656957241730714e-05, "loss": 0.0619, "step": 11020 }, { "epoch": 3.369306022623051, "grad_norm": 0.23046717047691345, "learning_rate": 2.8656532631310772e-05, "loss": 0.0489, "step": 11021 }, { "epoch": 3.369611739529196, "grad_norm": 0.7297018766403198, "learning_rate": 2.8656108020890835e-05, "loss": 0.0843, "step": 11022 }, { "epoch": 3.3699174564353407, "grad_norm": 0.8349404335021973, "learning_rate": 2.8655683410470893e-05, "loss": 0.0646, "step": 11023 }, { "epoch": 3.370223173341486, "grad_norm": 0.2331169843673706, "learning_rate": 2.8655258800050955e-05, "loss": 0.0793, "step": 11024 }, { "epoch": 3.3705288902476305, "grad_norm": 0.573587954044342, "learning_rate": 2.8654834189631014e-05, "loss": 0.0839, "step": 11025 }, { "epoch": 3.3708346071537756, "grad_norm": 3.483689069747925, "learning_rate": 2.8654409579211076e-05, "loss": 0.082, "step": 11026 }, { "epoch": 3.3711403240599207, "grad_norm": 0.35768070816993713, "learning_rate": 2.8653984968791135e-05, "loss": 0.1253, "step": 11027 }, { "epoch": 3.3714460409660654, "grad_norm": 0.41181617975234985, "learning_rate": 2.8653560358371194e-05, "loss": 0.1032, "step": 11028 }, { "epoch": 3.3717517578722105, "grad_norm": 0.5587811470031738, "learning_rate": 2.8653135747951256e-05, "loss": 0.1466, "step": 11029 }, { "epoch": 3.372057474778355, "grad_norm": 0.4432947337627411, "learning_rate": 2.8652711137531314e-05, "loss": 0.107, "step": 11030 }, { "epoch": 3.3723631916845003, "grad_norm": 0.4617578387260437, "learning_rate": 2.8652286527111376e-05, "loss": 0.152, "step": 11031 }, { "epoch": 3.372668908590645, "grad_norm": 0.5859404802322388, "learning_rate": 2.8651861916691435e-05, "loss": 0.2014, "step": 11032 }, { "epoch": 3.37297462549679, "grad_norm": 0.8241380453109741, "learning_rate": 2.8651437306271497e-05, "loss": 0.1935, "step": 11033 }, { "epoch": 3.3732803424029347, "grad_norm": 0.5669066309928894, "learning_rate": 2.8651012695851556e-05, "loss": 0.2178, "step": 11034 }, { "epoch": 3.37358605930908, "grad_norm": 0.8821679949760437, "learning_rate": 2.8650588085431618e-05, "loss": 0.1787, "step": 11035 }, { "epoch": 3.3738917762152245, "grad_norm": 0.855734646320343, "learning_rate": 2.8650163475011677e-05, "loss": 0.1803, "step": 11036 }, { "epoch": 3.3741974931213696, "grad_norm": 1.6160091161727905, "learning_rate": 2.864973886459174e-05, "loss": 0.192, "step": 11037 }, { "epoch": 3.3745032100275143, "grad_norm": 1.5723912715911865, "learning_rate": 2.8649314254171797e-05, "loss": 0.2759, "step": 11038 }, { "epoch": 3.3748089269336594, "grad_norm": 0.3113328218460083, "learning_rate": 2.864888964375186e-05, "loss": 0.1326, "step": 11039 }, { "epoch": 3.3751146438398045, "grad_norm": 0.39645764231681824, "learning_rate": 2.8648465033331918e-05, "loss": 0.0934, "step": 11040 }, { "epoch": 3.375420360745949, "grad_norm": 0.2672535479068756, "learning_rate": 2.8648040422911977e-05, "loss": 0.0855, "step": 11041 }, { "epoch": 3.3757260776520943, "grad_norm": 0.2784367501735687, "learning_rate": 2.864761581249204e-05, "loss": 0.0715, "step": 11042 }, { "epoch": 3.376031794558239, "grad_norm": 0.4318382441997528, "learning_rate": 2.8647191202072098e-05, "loss": 0.0553, "step": 11043 }, { "epoch": 3.376337511464384, "grad_norm": 0.15545929968357086, "learning_rate": 2.864676659165216e-05, "loss": 0.0414, "step": 11044 }, { "epoch": 3.3766432283705288, "grad_norm": 0.34027111530303955, "learning_rate": 2.864634198123222e-05, "loss": 0.0524, "step": 11045 }, { "epoch": 3.376948945276674, "grad_norm": 0.36154812574386597, "learning_rate": 2.864591737081228e-05, "loss": 0.0715, "step": 11046 }, { "epoch": 3.3772546621828186, "grad_norm": 0.2341439425945282, "learning_rate": 2.864549276039234e-05, "loss": 0.0768, "step": 11047 }, { "epoch": 3.3775603790889637, "grad_norm": 0.1983947604894638, "learning_rate": 2.86450681499724e-05, "loss": 0.047, "step": 11048 }, { "epoch": 3.3778660959951083, "grad_norm": 0.41315576434135437, "learning_rate": 2.864464353955246e-05, "loss": 0.1205, "step": 11049 }, { "epoch": 3.3781718129012535, "grad_norm": 0.6048937439918518, "learning_rate": 2.8644218929132522e-05, "loss": 0.073, "step": 11050 }, { "epoch": 3.378477529807398, "grad_norm": 0.25351670384407043, "learning_rate": 2.864379431871258e-05, "loss": 0.1076, "step": 11051 }, { "epoch": 3.3787832467135432, "grad_norm": 0.39158695936203003, "learning_rate": 2.8643369708292643e-05, "loss": 0.0906, "step": 11052 }, { "epoch": 3.3790889636196884, "grad_norm": 0.6705509424209595, "learning_rate": 2.86429450978727e-05, "loss": 0.1323, "step": 11053 }, { "epoch": 3.379394680525833, "grad_norm": 0.4507009983062744, "learning_rate": 2.864252048745276e-05, "loss": 0.1306, "step": 11054 }, { "epoch": 3.379700397431978, "grad_norm": 0.9302592873573303, "learning_rate": 2.8642095877032822e-05, "loss": 0.162, "step": 11055 }, { "epoch": 3.380006114338123, "grad_norm": 1.054091453552246, "learning_rate": 2.864167126661288e-05, "loss": 0.1696, "step": 11056 }, { "epoch": 3.380311831244268, "grad_norm": 0.5973097085952759, "learning_rate": 2.8641246656192943e-05, "loss": 0.1648, "step": 11057 }, { "epoch": 3.3806175481504126, "grad_norm": 0.6645049452781677, "learning_rate": 2.8640822045773002e-05, "loss": 0.164, "step": 11058 }, { "epoch": 3.3809232650565577, "grad_norm": 0.9313128590583801, "learning_rate": 2.8640397435353064e-05, "loss": 0.176, "step": 11059 }, { "epoch": 3.3812289819627024, "grad_norm": 1.921385407447815, "learning_rate": 2.8639972824933123e-05, "loss": 0.2027, "step": 11060 }, { "epoch": 3.3815346988688475, "grad_norm": 2.224595785140991, "learning_rate": 2.8639548214513185e-05, "loss": 0.2362, "step": 11061 }, { "epoch": 3.381840415774992, "grad_norm": 1.0008517503738403, "learning_rate": 2.8639123604093244e-05, "loss": 0.2209, "step": 11062 }, { "epoch": 3.3821461326811373, "grad_norm": 1.840315341949463, "learning_rate": 2.8638698993673306e-05, "loss": 0.2313, "step": 11063 }, { "epoch": 3.382451849587282, "grad_norm": 0.3402838408946991, "learning_rate": 2.8638274383253364e-05, "loss": 0.1449, "step": 11064 }, { "epoch": 3.382757566493427, "grad_norm": 0.4807402789592743, "learning_rate": 2.8637849772833426e-05, "loss": 0.1001, "step": 11065 }, { "epoch": 3.383063283399572, "grad_norm": 0.30192625522613525, "learning_rate": 2.8637425162413485e-05, "loss": 0.0946, "step": 11066 }, { "epoch": 3.383369000305717, "grad_norm": 1.0920705795288086, "learning_rate": 2.8637000551993544e-05, "loss": 0.0607, "step": 11067 }, { "epoch": 3.383674717211862, "grad_norm": 0.35483071208000183, "learning_rate": 2.8636575941573606e-05, "loss": 0.0673, "step": 11068 }, { "epoch": 3.3839804341180066, "grad_norm": 0.40638643503189087, "learning_rate": 2.8636151331153665e-05, "loss": 0.0851, "step": 11069 }, { "epoch": 3.3842861510241518, "grad_norm": 0.16627974808216095, "learning_rate": 2.8635726720733727e-05, "loss": 0.0545, "step": 11070 }, { "epoch": 3.3845918679302964, "grad_norm": 0.7012414336204529, "learning_rate": 2.8635302110313785e-05, "loss": 0.1117, "step": 11071 }, { "epoch": 3.3848975848364415, "grad_norm": 1.6224665641784668, "learning_rate": 2.8634877499893847e-05, "loss": 0.0745, "step": 11072 }, { "epoch": 3.385203301742586, "grad_norm": 0.2740088403224945, "learning_rate": 2.8634452889473906e-05, "loss": 0.071, "step": 11073 }, { "epoch": 3.3855090186487313, "grad_norm": 0.22916315495967865, "learning_rate": 2.8634028279053968e-05, "loss": 0.0669, "step": 11074 }, { "epoch": 3.385814735554876, "grad_norm": 0.6614203453063965, "learning_rate": 2.8633603668634027e-05, "loss": 0.0937, "step": 11075 }, { "epoch": 3.386120452461021, "grad_norm": 1.0529242753982544, "learning_rate": 2.863317905821409e-05, "loss": 0.1468, "step": 11076 }, { "epoch": 3.386426169367166, "grad_norm": 0.48655271530151367, "learning_rate": 2.8632754447794148e-05, "loss": 0.1338, "step": 11077 }, { "epoch": 3.386731886273311, "grad_norm": 0.5450708866119385, "learning_rate": 2.863232983737421e-05, "loss": 0.1253, "step": 11078 }, { "epoch": 3.387037603179456, "grad_norm": 0.6146869659423828, "learning_rate": 2.8631905226954272e-05, "loss": 0.1543, "step": 11079 }, { "epoch": 3.3873433200856007, "grad_norm": 2.2542314529418945, "learning_rate": 2.863148061653433e-05, "loss": 0.1626, "step": 11080 }, { "epoch": 3.387649036991746, "grad_norm": 1.0433634519577026, "learning_rate": 2.8631056006114393e-05, "loss": 0.1868, "step": 11081 }, { "epoch": 3.3879547538978905, "grad_norm": 0.6640366911888123, "learning_rate": 2.863063139569445e-05, "loss": 0.1683, "step": 11082 }, { "epoch": 3.3882604708040356, "grad_norm": 1.3204659223556519, "learning_rate": 2.8630206785274513e-05, "loss": 0.2142, "step": 11083 }, { "epoch": 3.3885661877101803, "grad_norm": 0.6997808814048767, "learning_rate": 2.8629782174854572e-05, "loss": 0.2118, "step": 11084 }, { "epoch": 3.3888719046163254, "grad_norm": 2.6383376121520996, "learning_rate": 2.8629357564434634e-05, "loss": 0.212, "step": 11085 }, { "epoch": 3.38917762152247, "grad_norm": 3.2764742374420166, "learning_rate": 2.8628932954014693e-05, "loss": 0.2072, "step": 11086 }, { "epoch": 3.389483338428615, "grad_norm": 1.4518520832061768, "learning_rate": 2.8628508343594755e-05, "loss": 0.2518, "step": 11087 }, { "epoch": 3.38978905533476, "grad_norm": 1.7917150259017944, "learning_rate": 2.8628083733174814e-05, "loss": 0.3037, "step": 11088 }, { "epoch": 3.390094772240905, "grad_norm": 0.5822088718414307, "learning_rate": 2.8627659122754876e-05, "loss": 0.1508, "step": 11089 }, { "epoch": 3.3904004891470496, "grad_norm": 0.7069395780563354, "learning_rate": 2.8627234512334935e-05, "loss": 0.0761, "step": 11090 }, { "epoch": 3.3907062060531947, "grad_norm": 0.44877105951309204, "learning_rate": 2.8626809901914997e-05, "loss": 0.0651, "step": 11091 }, { "epoch": 3.39101192295934, "grad_norm": 0.3514990508556366, "learning_rate": 2.8626385291495055e-05, "loss": 0.0717, "step": 11092 }, { "epoch": 3.3913176398654845, "grad_norm": 0.5522249341011047, "learning_rate": 2.8625960681075114e-05, "loss": 0.0589, "step": 11093 }, { "epoch": 3.3916233567716296, "grad_norm": 0.4326910078525543, "learning_rate": 2.8625536070655176e-05, "loss": 0.05, "step": 11094 }, { "epoch": 3.3919290736777743, "grad_norm": 0.3183179497718811, "learning_rate": 2.8625111460235235e-05, "loss": 0.074, "step": 11095 }, { "epoch": 3.3922347905839194, "grad_norm": 0.36221566796302795, "learning_rate": 2.8624686849815297e-05, "loss": 0.07, "step": 11096 }, { "epoch": 3.392540507490064, "grad_norm": 0.5829149484634399, "learning_rate": 2.8624262239395356e-05, "loss": 0.0853, "step": 11097 }, { "epoch": 3.392846224396209, "grad_norm": 0.1641683578491211, "learning_rate": 2.8623837628975418e-05, "loss": 0.0411, "step": 11098 }, { "epoch": 3.393151941302354, "grad_norm": 0.45554542541503906, "learning_rate": 2.8623413018555476e-05, "loss": 0.1054, "step": 11099 }, { "epoch": 3.393457658208499, "grad_norm": 0.24489067494869232, "learning_rate": 2.862298840813554e-05, "loss": 0.0827, "step": 11100 }, { "epoch": 3.3937633751146437, "grad_norm": 0.5242631435394287, "learning_rate": 2.8622563797715597e-05, "loss": 0.1372, "step": 11101 }, { "epoch": 3.3940690920207888, "grad_norm": 1.810405969619751, "learning_rate": 2.862213918729566e-05, "loss": 0.1228, "step": 11102 }, { "epoch": 3.3943748089269334, "grad_norm": 1.388286828994751, "learning_rate": 2.8621714576875718e-05, "loss": 0.124, "step": 11103 }, { "epoch": 3.3946805258330786, "grad_norm": 1.122571349143982, "learning_rate": 2.8621289966455777e-05, "loss": 0.1442, "step": 11104 }, { "epoch": 3.3949862427392237, "grad_norm": 0.7035204768180847, "learning_rate": 2.862086535603584e-05, "loss": 0.1817, "step": 11105 }, { "epoch": 3.3952919596453683, "grad_norm": 2.2535481452941895, "learning_rate": 2.8620440745615897e-05, "loss": 0.1574, "step": 11106 }, { "epoch": 3.3955976765515135, "grad_norm": 4.784183502197266, "learning_rate": 2.862001613519596e-05, "loss": 0.1901, "step": 11107 }, { "epoch": 3.395903393457658, "grad_norm": 0.7781633138656616, "learning_rate": 2.8619591524776018e-05, "loss": 0.235, "step": 11108 }, { "epoch": 3.3962091103638032, "grad_norm": 1.3449037075042725, "learning_rate": 2.861916691435608e-05, "loss": 0.1965, "step": 11109 }, { "epoch": 3.396514827269948, "grad_norm": 1.2555925846099854, "learning_rate": 2.861874230393614e-05, "loss": 0.1741, "step": 11110 }, { "epoch": 3.396820544176093, "grad_norm": 0.9551922678947449, "learning_rate": 2.86183176935162e-05, "loss": 0.2295, "step": 11111 }, { "epoch": 3.3971262610822377, "grad_norm": 3.2238211631774902, "learning_rate": 2.861789308309626e-05, "loss": 0.2249, "step": 11112 }, { "epoch": 3.397431977988383, "grad_norm": 4.050937175750732, "learning_rate": 2.8617468472676322e-05, "loss": 0.2686, "step": 11113 }, { "epoch": 3.3977376948945275, "grad_norm": 0.386522114276886, "learning_rate": 2.861704386225638e-05, "loss": 0.1748, "step": 11114 }, { "epoch": 3.3980434118006726, "grad_norm": 0.3709333837032318, "learning_rate": 2.8616619251836443e-05, "loss": 0.0676, "step": 11115 }, { "epoch": 3.3983491287068173, "grad_norm": 0.6221264600753784, "learning_rate": 2.86161946414165e-05, "loss": 0.0669, "step": 11116 }, { "epoch": 3.3986548456129624, "grad_norm": 0.24095310270786285, "learning_rate": 2.861577003099656e-05, "loss": 0.0555, "step": 11117 }, { "epoch": 3.3989605625191075, "grad_norm": 0.38488227128982544, "learning_rate": 2.8615345420576622e-05, "loss": 0.0672, "step": 11118 }, { "epoch": 3.399266279425252, "grad_norm": 0.3087623417377472, "learning_rate": 2.861492081015668e-05, "loss": 0.0616, "step": 11119 }, { "epoch": 3.3995719963313973, "grad_norm": 0.43310612440109253, "learning_rate": 2.8614496199736743e-05, "loss": 0.0478, "step": 11120 }, { "epoch": 3.399877713237542, "grad_norm": 0.2166462242603302, "learning_rate": 2.86140715893168e-05, "loss": 0.0654, "step": 11121 }, { "epoch": 3.400183430143687, "grad_norm": 0.5031372308731079, "learning_rate": 2.8613646978896864e-05, "loss": 0.0698, "step": 11122 }, { "epoch": 3.4004891470498317, "grad_norm": 0.49083027243614197, "learning_rate": 2.8613222368476922e-05, "loss": 0.0509, "step": 11123 }, { "epoch": 3.400794863955977, "grad_norm": 0.45799243450164795, "learning_rate": 2.8612797758056985e-05, "loss": 0.0848, "step": 11124 }, { "epoch": 3.4011005808621215, "grad_norm": 1.273734450340271, "learning_rate": 2.8612373147637043e-05, "loss": 0.0867, "step": 11125 }, { "epoch": 3.4014062977682666, "grad_norm": 0.7074138522148132, "learning_rate": 2.8611948537217105e-05, "loss": 0.0965, "step": 11126 }, { "epoch": 3.4017120146744113, "grad_norm": 0.5903763771057129, "learning_rate": 2.8611523926797164e-05, "loss": 0.1105, "step": 11127 }, { "epoch": 3.4020177315805564, "grad_norm": 0.6371526122093201, "learning_rate": 2.8611099316377226e-05, "loss": 0.1035, "step": 11128 }, { "epoch": 3.402323448486701, "grad_norm": 0.7191242575645447, "learning_rate": 2.8610674705957285e-05, "loss": 0.18, "step": 11129 }, { "epoch": 3.402629165392846, "grad_norm": 0.8074448108673096, "learning_rate": 2.8610250095537344e-05, "loss": 0.1802, "step": 11130 }, { "epoch": 3.4029348822989913, "grad_norm": 1.0130277872085571, "learning_rate": 2.8609825485117406e-05, "loss": 0.167, "step": 11131 }, { "epoch": 3.403240599205136, "grad_norm": 0.9214343428611755, "learning_rate": 2.8609400874697464e-05, "loss": 0.1708, "step": 11132 }, { "epoch": 3.403546316111281, "grad_norm": 0.8381365537643433, "learning_rate": 2.8608976264277526e-05, "loss": 0.1836, "step": 11133 }, { "epoch": 3.403852033017426, "grad_norm": 2.686288356781006, "learning_rate": 2.8608551653857585e-05, "loss": 0.1822, "step": 11134 }, { "epoch": 3.404157749923571, "grad_norm": 1.0897866487503052, "learning_rate": 2.8608127043437647e-05, "loss": 0.1905, "step": 11135 }, { "epoch": 3.4044634668297156, "grad_norm": 1.2061734199523926, "learning_rate": 2.8607702433017706e-05, "loss": 0.2136, "step": 11136 }, { "epoch": 3.4047691837358607, "grad_norm": 2.327174425125122, "learning_rate": 2.8607277822597768e-05, "loss": 0.2289, "step": 11137 }, { "epoch": 3.4050749006420054, "grad_norm": 6.682974338531494, "learning_rate": 2.8606853212177827e-05, "loss": 0.285, "step": 11138 }, { "epoch": 3.4053806175481505, "grad_norm": 0.7041379809379578, "learning_rate": 2.860642860175789e-05, "loss": 0.1733, "step": 11139 }, { "epoch": 3.405686334454295, "grad_norm": 0.2937558591365814, "learning_rate": 2.8606003991337947e-05, "loss": 0.1056, "step": 11140 }, { "epoch": 3.4059920513604403, "grad_norm": 0.2297447770833969, "learning_rate": 2.860557938091801e-05, "loss": 0.0571, "step": 11141 }, { "epoch": 3.406297768266585, "grad_norm": 0.3587099313735962, "learning_rate": 2.8605154770498068e-05, "loss": 0.0536, "step": 11142 }, { "epoch": 3.40660348517273, "grad_norm": 0.37594541907310486, "learning_rate": 2.8604730160078127e-05, "loss": 0.0676, "step": 11143 }, { "epoch": 3.406909202078875, "grad_norm": 0.7717876434326172, "learning_rate": 2.860430554965819e-05, "loss": 0.0638, "step": 11144 }, { "epoch": 3.40721491898502, "grad_norm": 0.4143736660480499, "learning_rate": 2.8603880939238248e-05, "loss": 0.0803, "step": 11145 }, { "epoch": 3.407520635891165, "grad_norm": 0.3413622975349426, "learning_rate": 2.860345632881831e-05, "loss": 0.0697, "step": 11146 }, { "epoch": 3.4078263527973096, "grad_norm": 0.4460757076740265, "learning_rate": 2.860303171839837e-05, "loss": 0.075, "step": 11147 }, { "epoch": 3.4081320697034547, "grad_norm": 0.42595136165618896, "learning_rate": 2.860260710797843e-05, "loss": 0.0802, "step": 11148 }, { "epoch": 3.4084377866095994, "grad_norm": 0.3969530761241913, "learning_rate": 2.860218249755849e-05, "loss": 0.0823, "step": 11149 }, { "epoch": 3.4087435035157445, "grad_norm": 4.497835159301758, "learning_rate": 2.860175788713855e-05, "loss": 0.0791, "step": 11150 }, { "epoch": 3.409049220421889, "grad_norm": 0.36849135160446167, "learning_rate": 2.860133327671861e-05, "loss": 0.1057, "step": 11151 }, { "epoch": 3.4093549373280343, "grad_norm": 0.6750729084014893, "learning_rate": 2.8600908666298672e-05, "loss": 0.1194, "step": 11152 }, { "epoch": 3.409660654234179, "grad_norm": 0.5330031514167786, "learning_rate": 2.860048405587873e-05, "loss": 0.1021, "step": 11153 }, { "epoch": 3.409966371140324, "grad_norm": 0.9343262910842896, "learning_rate": 2.8600059445458793e-05, "loss": 0.1649, "step": 11154 }, { "epoch": 3.4102720880464688, "grad_norm": 0.7922360897064209, "learning_rate": 2.8599634835038852e-05, "loss": 0.1422, "step": 11155 }, { "epoch": 3.410577804952614, "grad_norm": 1.2635822296142578, "learning_rate": 2.859921022461891e-05, "loss": 0.2127, "step": 11156 }, { "epoch": 3.410883521858759, "grad_norm": 0.9164182543754578, "learning_rate": 2.8598785614198972e-05, "loss": 0.1776, "step": 11157 }, { "epoch": 3.4111892387649037, "grad_norm": 1.1359444856643677, "learning_rate": 2.859836100377903e-05, "loss": 0.1887, "step": 11158 }, { "epoch": 3.4114949556710488, "grad_norm": 0.9731099009513855, "learning_rate": 2.8597936393359093e-05, "loss": 0.2041, "step": 11159 }, { "epoch": 3.4118006725771934, "grad_norm": 1.0292668342590332, "learning_rate": 2.8597511782939152e-05, "loss": 0.2144, "step": 11160 }, { "epoch": 3.4121063894833386, "grad_norm": 0.9869859218597412, "learning_rate": 2.8597087172519214e-05, "loss": 0.201, "step": 11161 }, { "epoch": 3.4124121063894832, "grad_norm": 1.2989439964294434, "learning_rate": 2.8596662562099273e-05, "loss": 0.1945, "step": 11162 }, { "epoch": 3.4127178232956283, "grad_norm": 0.9073262214660645, "learning_rate": 2.8596237951679335e-05, "loss": 0.2377, "step": 11163 }, { "epoch": 3.413023540201773, "grad_norm": 0.4860975444316864, "learning_rate": 2.8595813341259394e-05, "loss": 0.1482, "step": 11164 }, { "epoch": 3.413329257107918, "grad_norm": 0.31408655643463135, "learning_rate": 2.8595388730839456e-05, "loss": 0.1009, "step": 11165 }, { "epoch": 3.413634974014063, "grad_norm": 0.47527575492858887, "learning_rate": 2.8594964120419514e-05, "loss": 0.0861, "step": 11166 }, { "epoch": 3.413940690920208, "grad_norm": 0.29472240805625916, "learning_rate": 2.8594539509999576e-05, "loss": 0.0748, "step": 11167 }, { "epoch": 3.4142464078263526, "grad_norm": 0.2872104346752167, "learning_rate": 2.8594114899579635e-05, "loss": 0.0825, "step": 11168 }, { "epoch": 3.4145521247324977, "grad_norm": 0.3630920946598053, "learning_rate": 2.8593690289159694e-05, "loss": 0.0759, "step": 11169 }, { "epoch": 3.414857841638643, "grad_norm": 0.29541411995887756, "learning_rate": 2.8593265678739756e-05, "loss": 0.077, "step": 11170 }, { "epoch": 3.4151635585447875, "grad_norm": 0.23007957637310028, "learning_rate": 2.8592841068319815e-05, "loss": 0.0563, "step": 11171 }, { "epoch": 3.4154692754509326, "grad_norm": 0.2559294104576111, "learning_rate": 2.8592416457899877e-05, "loss": 0.0587, "step": 11172 }, { "epoch": 3.4157749923570773, "grad_norm": 0.5289781093597412, "learning_rate": 2.8591991847479935e-05, "loss": 0.0972, "step": 11173 }, { "epoch": 3.4160807092632224, "grad_norm": 0.5510900616645813, "learning_rate": 2.8591567237059997e-05, "loss": 0.0927, "step": 11174 }, { "epoch": 3.416386426169367, "grad_norm": 0.521876335144043, "learning_rate": 2.8591142626640056e-05, "loss": 0.1149, "step": 11175 }, { "epoch": 3.416692143075512, "grad_norm": 0.7328980565071106, "learning_rate": 2.8590718016220118e-05, "loss": 0.1395, "step": 11176 }, { "epoch": 3.416997859981657, "grad_norm": 0.48018497228622437, "learning_rate": 2.8590293405800177e-05, "loss": 0.1095, "step": 11177 }, { "epoch": 3.417303576887802, "grad_norm": 0.35432639718055725, "learning_rate": 2.858986879538024e-05, "loss": 0.1225, "step": 11178 }, { "epoch": 3.4176092937939466, "grad_norm": 0.7398841381072998, "learning_rate": 2.8589444184960298e-05, "loss": 0.1713, "step": 11179 }, { "epoch": 3.4179150107000917, "grad_norm": 3.287006139755249, "learning_rate": 2.858901957454036e-05, "loss": 0.1645, "step": 11180 }, { "epoch": 3.4182207276062364, "grad_norm": 0.3344568908214569, "learning_rate": 2.8588594964120422e-05, "loss": 0.1449, "step": 11181 }, { "epoch": 3.4185264445123815, "grad_norm": 0.9108774662017822, "learning_rate": 2.858817035370048e-05, "loss": 0.226, "step": 11182 }, { "epoch": 3.4188321614185266, "grad_norm": 1.4169617891311646, "learning_rate": 2.8587745743280543e-05, "loss": 0.215, "step": 11183 }, { "epoch": 3.4191378783246713, "grad_norm": 0.8283886313438416, "learning_rate": 2.85873211328606e-05, "loss": 0.2102, "step": 11184 }, { "epoch": 3.4194435952308164, "grad_norm": 1.9084337949752808, "learning_rate": 2.8586896522440664e-05, "loss": 0.2009, "step": 11185 }, { "epoch": 3.419749312136961, "grad_norm": 1.08633553981781, "learning_rate": 2.8586471912020722e-05, "loss": 0.2174, "step": 11186 }, { "epoch": 3.420055029043106, "grad_norm": 0.8826468586921692, "learning_rate": 2.8586047301600784e-05, "loss": 0.1917, "step": 11187 }, { "epoch": 3.420360745949251, "grad_norm": 0.9814870953559875, "learning_rate": 2.8585622691180843e-05, "loss": 0.2464, "step": 11188 }, { "epoch": 3.420666462855396, "grad_norm": 0.4930233061313629, "learning_rate": 2.8585198080760905e-05, "loss": 0.1495, "step": 11189 }, { "epoch": 3.4209721797615407, "grad_norm": 0.47526657581329346, "learning_rate": 2.8584773470340964e-05, "loss": 0.1079, "step": 11190 }, { "epoch": 3.421277896667686, "grad_norm": 0.1978556364774704, "learning_rate": 2.8584348859921026e-05, "loss": 0.0711, "step": 11191 }, { "epoch": 3.4215836135738305, "grad_norm": 0.22381241619586945, "learning_rate": 2.8583924249501085e-05, "loss": 0.0528, "step": 11192 }, { "epoch": 3.4218893304799756, "grad_norm": 1.0993322134017944, "learning_rate": 2.8583499639081147e-05, "loss": 0.0668, "step": 11193 }, { "epoch": 3.4221950473861202, "grad_norm": 0.41804319620132446, "learning_rate": 2.8583075028661205e-05, "loss": 0.0732, "step": 11194 }, { "epoch": 3.4225007642922654, "grad_norm": 0.23038138449192047, "learning_rate": 2.8582650418241264e-05, "loss": 0.0648, "step": 11195 }, { "epoch": 3.4228064811984105, "grad_norm": 0.4321121573448181, "learning_rate": 2.8582225807821326e-05, "loss": 0.0732, "step": 11196 }, { "epoch": 3.423112198104555, "grad_norm": 0.18544644117355347, "learning_rate": 2.8581801197401385e-05, "loss": 0.0587, "step": 11197 }, { "epoch": 3.4234179150107003, "grad_norm": 0.36278408765792847, "learning_rate": 2.8581376586981447e-05, "loss": 0.0859, "step": 11198 }, { "epoch": 3.423723631916845, "grad_norm": 0.35932570695877075, "learning_rate": 2.8580951976561506e-05, "loss": 0.0799, "step": 11199 }, { "epoch": 3.42402934882299, "grad_norm": 0.2522822618484497, "learning_rate": 2.8580527366141568e-05, "loss": 0.0768, "step": 11200 }, { "epoch": 3.4243350657291347, "grad_norm": 0.43644312024116516, "learning_rate": 2.8580102755721626e-05, "loss": 0.0939, "step": 11201 }, { "epoch": 3.42464078263528, "grad_norm": 0.34795263409614563, "learning_rate": 2.857967814530169e-05, "loss": 0.0944, "step": 11202 }, { "epoch": 3.4249464995414245, "grad_norm": 0.8085882663726807, "learning_rate": 2.8579253534881747e-05, "loss": 0.1738, "step": 11203 }, { "epoch": 3.4252522164475696, "grad_norm": 1.2353655099868774, "learning_rate": 2.857882892446181e-05, "loss": 0.1589, "step": 11204 }, { "epoch": 3.4255579333537143, "grad_norm": 0.7843267917633057, "learning_rate": 2.8578404314041868e-05, "loss": 0.1604, "step": 11205 }, { "epoch": 3.4258636502598594, "grad_norm": 1.0672463178634644, "learning_rate": 2.857797970362193e-05, "loss": 0.1719, "step": 11206 }, { "epoch": 3.426169367166004, "grad_norm": 0.7055094838142395, "learning_rate": 2.857755509320199e-05, "loss": 0.1702, "step": 11207 }, { "epoch": 3.426475084072149, "grad_norm": 0.49495846033096313, "learning_rate": 2.8577130482782048e-05, "loss": 0.1484, "step": 11208 }, { "epoch": 3.4267808009782943, "grad_norm": 1.1161152124404907, "learning_rate": 2.857670587236211e-05, "loss": 0.185, "step": 11209 }, { "epoch": 3.427086517884439, "grad_norm": 2.031763792037964, "learning_rate": 2.8576281261942168e-05, "loss": 0.1818, "step": 11210 }, { "epoch": 3.427392234790584, "grad_norm": 0.57451993227005, "learning_rate": 2.857585665152223e-05, "loss": 0.1998, "step": 11211 }, { "epoch": 3.4276979516967288, "grad_norm": 3.0181312561035156, "learning_rate": 2.857543204110229e-05, "loss": 0.1926, "step": 11212 }, { "epoch": 3.428003668602874, "grad_norm": 2.4608497619628906, "learning_rate": 2.857500743068235e-05, "loss": 0.2407, "step": 11213 }, { "epoch": 3.4283093855090185, "grad_norm": 0.33264708518981934, "learning_rate": 2.857458282026241e-05, "loss": 0.1504, "step": 11214 }, { "epoch": 3.4286151024151637, "grad_norm": 0.3837297856807709, "learning_rate": 2.8574158209842472e-05, "loss": 0.0687, "step": 11215 }, { "epoch": 3.4289208193213083, "grad_norm": 0.4726308584213257, "learning_rate": 2.857373359942253e-05, "loss": 0.109, "step": 11216 }, { "epoch": 3.4292265362274534, "grad_norm": 0.2989474833011627, "learning_rate": 2.8573308989002593e-05, "loss": 0.0581, "step": 11217 }, { "epoch": 3.429532253133598, "grad_norm": 0.4340091943740845, "learning_rate": 2.857288437858265e-05, "loss": 0.0714, "step": 11218 }, { "epoch": 3.4298379700397432, "grad_norm": 1.098655343055725, "learning_rate": 2.857245976816271e-05, "loss": 0.0509, "step": 11219 }, { "epoch": 3.430143686945888, "grad_norm": 0.3345983326435089, "learning_rate": 2.8572035157742772e-05, "loss": 0.0605, "step": 11220 }, { "epoch": 3.430449403852033, "grad_norm": 0.357097327709198, "learning_rate": 2.857161054732283e-05, "loss": 0.074, "step": 11221 }, { "epoch": 3.430755120758178, "grad_norm": 0.20017167925834656, "learning_rate": 2.8571185936902893e-05, "loss": 0.06, "step": 11222 }, { "epoch": 3.431060837664323, "grad_norm": 0.4276970326900482, "learning_rate": 2.8570761326482952e-05, "loss": 0.0608, "step": 11223 }, { "epoch": 3.431366554570468, "grad_norm": 1.2892823219299316, "learning_rate": 2.8570336716063014e-05, "loss": 0.1039, "step": 11224 }, { "epoch": 3.4316722714766126, "grad_norm": 0.29828134179115295, "learning_rate": 2.8569912105643073e-05, "loss": 0.0863, "step": 11225 }, { "epoch": 3.4319779883827577, "grad_norm": 0.42545345425605774, "learning_rate": 2.8569487495223135e-05, "loss": 0.1376, "step": 11226 }, { "epoch": 3.4322837052889024, "grad_norm": 0.5995786190032959, "learning_rate": 2.8569062884803193e-05, "loss": 0.1147, "step": 11227 }, { "epoch": 3.4325894221950475, "grad_norm": 0.5276011228561401, "learning_rate": 2.8568638274383255e-05, "loss": 0.1414, "step": 11228 }, { "epoch": 3.432895139101192, "grad_norm": 0.4868644177913666, "learning_rate": 2.8568213663963314e-05, "loss": 0.196, "step": 11229 }, { "epoch": 3.4332008560073373, "grad_norm": 0.5763110518455505, "learning_rate": 2.8567789053543376e-05, "loss": 0.1709, "step": 11230 }, { "epoch": 3.433506572913482, "grad_norm": 2.0266921520233154, "learning_rate": 2.8567364443123435e-05, "loss": 0.1769, "step": 11231 }, { "epoch": 3.433812289819627, "grad_norm": 0.7400306463241577, "learning_rate": 2.8566939832703494e-05, "loss": 0.1842, "step": 11232 }, { "epoch": 3.4341180067257717, "grad_norm": 0.5746641755104065, "learning_rate": 2.8566515222283556e-05, "loss": 0.2036, "step": 11233 }, { "epoch": 3.434423723631917, "grad_norm": 0.8421096205711365, "learning_rate": 2.8566090611863614e-05, "loss": 0.2867, "step": 11234 }, { "epoch": 3.434729440538062, "grad_norm": 0.7130597829818726, "learning_rate": 2.8565666001443676e-05, "loss": 0.2161, "step": 11235 }, { "epoch": 3.4350351574442066, "grad_norm": 1.2640281915664673, "learning_rate": 2.8565241391023735e-05, "loss": 0.2122, "step": 11236 }, { "epoch": 3.4353408743503517, "grad_norm": 0.7829792499542236, "learning_rate": 2.8564816780603797e-05, "loss": 0.2117, "step": 11237 }, { "epoch": 3.4356465912564964, "grad_norm": 1.7060266733169556, "learning_rate": 2.8564392170183856e-05, "loss": 0.245, "step": 11238 }, { "epoch": 3.4359523081626415, "grad_norm": 1.5544251203536987, "learning_rate": 2.8563967559763918e-05, "loss": 0.1545, "step": 11239 }, { "epoch": 3.436258025068786, "grad_norm": 0.3039417266845703, "learning_rate": 2.8563542949343977e-05, "loss": 0.0771, "step": 11240 }, { "epoch": 3.4365637419749313, "grad_norm": 0.3587471842765808, "learning_rate": 2.856311833892404e-05, "loss": 0.0854, "step": 11241 }, { "epoch": 3.436869458881076, "grad_norm": 0.3013661503791809, "learning_rate": 2.8562693728504098e-05, "loss": 0.0874, "step": 11242 }, { "epoch": 3.437175175787221, "grad_norm": 0.4879480302333832, "learning_rate": 2.856226911808416e-05, "loss": 0.0635, "step": 11243 }, { "epoch": 3.4374808926933658, "grad_norm": 0.2938022017478943, "learning_rate": 2.8561844507664218e-05, "loss": 0.0882, "step": 11244 }, { "epoch": 3.437786609599511, "grad_norm": 0.6890708208084106, "learning_rate": 2.8561419897244277e-05, "loss": 0.0745, "step": 11245 }, { "epoch": 3.4380923265056555, "grad_norm": 0.36310383677482605, "learning_rate": 2.856099528682434e-05, "loss": 0.0685, "step": 11246 }, { "epoch": 3.4383980434118007, "grad_norm": 0.7234895825386047, "learning_rate": 2.8560570676404398e-05, "loss": 0.096, "step": 11247 }, { "epoch": 3.438703760317946, "grad_norm": 0.35860487818717957, "learning_rate": 2.856014606598446e-05, "loss": 0.0693, "step": 11248 }, { "epoch": 3.4390094772240904, "grad_norm": 0.4805558919906616, "learning_rate": 2.855972145556452e-05, "loss": 0.1167, "step": 11249 }, { "epoch": 3.4393151941302356, "grad_norm": 0.33242037892341614, "learning_rate": 2.855929684514458e-05, "loss": 0.07, "step": 11250 }, { "epoch": 3.4396209110363802, "grad_norm": 0.5215994119644165, "learning_rate": 2.855887223472464e-05, "loss": 0.133, "step": 11251 }, { "epoch": 3.4399266279425254, "grad_norm": 0.5715591311454773, "learning_rate": 2.85584476243047e-05, "loss": 0.1207, "step": 11252 }, { "epoch": 3.44023234484867, "grad_norm": 0.48361289501190186, "learning_rate": 2.855802301388476e-05, "loss": 0.1524, "step": 11253 }, { "epoch": 3.440538061754815, "grad_norm": 0.7075240612030029, "learning_rate": 2.8557598403464822e-05, "loss": 0.1464, "step": 11254 }, { "epoch": 3.44084377866096, "grad_norm": 2.535809278488159, "learning_rate": 2.855717379304488e-05, "loss": 0.166, "step": 11255 }, { "epoch": 3.441149495567105, "grad_norm": 0.5684098601341248, "learning_rate": 2.8556749182624943e-05, "loss": 0.146, "step": 11256 }, { "epoch": 3.4414552124732496, "grad_norm": 0.6821333765983582, "learning_rate": 2.8556324572205002e-05, "loss": 0.1521, "step": 11257 }, { "epoch": 3.4417609293793947, "grad_norm": 1.4391567707061768, "learning_rate": 2.855589996178506e-05, "loss": 0.2309, "step": 11258 }, { "epoch": 3.4420666462855394, "grad_norm": 0.7324361801147461, "learning_rate": 2.8555475351365123e-05, "loss": 0.2007, "step": 11259 }, { "epoch": 3.4423723631916845, "grad_norm": 0.8995396494865417, "learning_rate": 2.855505074094518e-05, "loss": 0.1802, "step": 11260 }, { "epoch": 3.4426780800978296, "grad_norm": 0.7046141624450684, "learning_rate": 2.8554626130525243e-05, "loss": 0.171, "step": 11261 }, { "epoch": 3.4429837970039743, "grad_norm": 2.1006903648376465, "learning_rate": 2.8554201520105302e-05, "loss": 0.267, "step": 11262 }, { "epoch": 3.4432895139101194, "grad_norm": 1.4310050010681152, "learning_rate": 2.8553776909685364e-05, "loss": 0.2522, "step": 11263 }, { "epoch": 3.443595230816264, "grad_norm": 0.5349249243736267, "learning_rate": 2.8553352299265423e-05, "loss": 0.1425, "step": 11264 }, { "epoch": 3.443900947722409, "grad_norm": 0.2685914635658264, "learning_rate": 2.8552927688845485e-05, "loss": 0.0866, "step": 11265 }, { "epoch": 3.444206664628554, "grad_norm": 0.24599753320217133, "learning_rate": 2.8552503078425544e-05, "loss": 0.0846, "step": 11266 }, { "epoch": 3.444512381534699, "grad_norm": 0.26190057396888733, "learning_rate": 2.8552078468005606e-05, "loss": 0.0524, "step": 11267 }, { "epoch": 3.4448180984408436, "grad_norm": 0.23265700042247772, "learning_rate": 2.8551653857585664e-05, "loss": 0.0701, "step": 11268 }, { "epoch": 3.4451238153469887, "grad_norm": 0.5232430696487427, "learning_rate": 2.8551229247165726e-05, "loss": 0.0802, "step": 11269 }, { "epoch": 3.4454295322531334, "grad_norm": 0.3303511142730713, "learning_rate": 2.8550804636745785e-05, "loss": 0.0597, "step": 11270 }, { "epoch": 3.4457352491592785, "grad_norm": 0.5395406484603882, "learning_rate": 2.8550380026325844e-05, "loss": 0.0832, "step": 11271 }, { "epoch": 3.446040966065423, "grad_norm": 0.3607071042060852, "learning_rate": 2.8549955415905906e-05, "loss": 0.0721, "step": 11272 }, { "epoch": 3.4463466829715683, "grad_norm": 0.2960799038410187, "learning_rate": 2.8549530805485965e-05, "loss": 0.0609, "step": 11273 }, { "epoch": 3.4466523998777134, "grad_norm": 2.580223798751831, "learning_rate": 2.8549106195066027e-05, "loss": 0.0997, "step": 11274 }, { "epoch": 3.446958116783858, "grad_norm": 0.35427916049957275, "learning_rate": 2.8548681584646085e-05, "loss": 0.0819, "step": 11275 }, { "epoch": 3.447263833690003, "grad_norm": 0.6593188047409058, "learning_rate": 2.8548256974226148e-05, "loss": 0.1418, "step": 11276 }, { "epoch": 3.447569550596148, "grad_norm": 0.4378340244293213, "learning_rate": 2.8547832363806206e-05, "loss": 0.1042, "step": 11277 }, { "epoch": 3.447875267502293, "grad_norm": 0.7397594451904297, "learning_rate": 2.854740775338627e-05, "loss": 0.1702, "step": 11278 }, { "epoch": 3.4481809844084377, "grad_norm": 0.5291993618011475, "learning_rate": 2.8546983142966327e-05, "loss": 0.1135, "step": 11279 }, { "epoch": 3.448486701314583, "grad_norm": 0.9286319017410278, "learning_rate": 2.854655853254639e-05, "loss": 0.1759, "step": 11280 }, { "epoch": 3.4487924182207275, "grad_norm": 0.48664259910583496, "learning_rate": 2.8546133922126448e-05, "loss": 0.157, "step": 11281 }, { "epoch": 3.4490981351268726, "grad_norm": 0.6817853450775146, "learning_rate": 2.854570931170651e-05, "loss": 0.2009, "step": 11282 }, { "epoch": 3.4494038520330172, "grad_norm": 0.5224918723106384, "learning_rate": 2.8545284701286572e-05, "loss": 0.1781, "step": 11283 }, { "epoch": 3.4497095689391624, "grad_norm": 1.0849593877792358, "learning_rate": 2.854486009086663e-05, "loss": 0.2102, "step": 11284 }, { "epoch": 3.450015285845307, "grad_norm": 0.9128221273422241, "learning_rate": 2.8544435480446693e-05, "loss": 0.2104, "step": 11285 }, { "epoch": 3.450321002751452, "grad_norm": 0.7959086298942566, "learning_rate": 2.854401087002675e-05, "loss": 0.2615, "step": 11286 }, { "epoch": 3.4506267196575973, "grad_norm": 1.0221836566925049, "learning_rate": 2.8543586259606814e-05, "loss": 0.1992, "step": 11287 }, { "epoch": 3.450932436563742, "grad_norm": 3.9314517974853516, "learning_rate": 2.8543161649186872e-05, "loss": 0.2745, "step": 11288 }, { "epoch": 3.451238153469887, "grad_norm": 1.0184390544891357, "learning_rate": 2.8542737038766934e-05, "loss": 0.163, "step": 11289 }, { "epoch": 3.4515438703760317, "grad_norm": 0.2449783831834793, "learning_rate": 2.8542312428346993e-05, "loss": 0.0984, "step": 11290 }, { "epoch": 3.451849587282177, "grad_norm": 0.6575819849967957, "learning_rate": 2.8541887817927055e-05, "loss": 0.068, "step": 11291 }, { "epoch": 3.4521553041883215, "grad_norm": 0.2224055975675583, "learning_rate": 2.8541463207507114e-05, "loss": 0.0574, "step": 11292 }, { "epoch": 3.4524610210944666, "grad_norm": 0.2879335582256317, "learning_rate": 2.8541038597087176e-05, "loss": 0.0607, "step": 11293 }, { "epoch": 3.4527667380006113, "grad_norm": 0.18428242206573486, "learning_rate": 2.8540613986667235e-05, "loss": 0.0618, "step": 11294 }, { "epoch": 3.4530724549067564, "grad_norm": 0.30078479647636414, "learning_rate": 2.8540189376247297e-05, "loss": 0.0618, "step": 11295 }, { "epoch": 3.453378171812901, "grad_norm": 0.3593904674053192, "learning_rate": 2.8539764765827355e-05, "loss": 0.0736, "step": 11296 }, { "epoch": 3.453683888719046, "grad_norm": 1.1135681867599487, "learning_rate": 2.8539340155407414e-05, "loss": 0.0602, "step": 11297 }, { "epoch": 3.453989605625191, "grad_norm": 0.5326744318008423, "learning_rate": 2.8538915544987476e-05, "loss": 0.0734, "step": 11298 }, { "epoch": 3.454295322531336, "grad_norm": 0.46083229780197144, "learning_rate": 2.8538490934567535e-05, "loss": 0.108, "step": 11299 }, { "epoch": 3.454601039437481, "grad_norm": 0.2609047293663025, "learning_rate": 2.8538066324147597e-05, "loss": 0.0931, "step": 11300 }, { "epoch": 3.4549067563436258, "grad_norm": 0.34027621150016785, "learning_rate": 2.8537641713727656e-05, "loss": 0.1112, "step": 11301 }, { "epoch": 3.455212473249771, "grad_norm": 0.5679340362548828, "learning_rate": 2.8537217103307718e-05, "loss": 0.1145, "step": 11302 }, { "epoch": 3.4555181901559155, "grad_norm": 0.6877142190933228, "learning_rate": 2.8536792492887776e-05, "loss": 0.1468, "step": 11303 }, { "epoch": 3.4558239070620607, "grad_norm": 0.9959607124328613, "learning_rate": 2.853636788246784e-05, "loss": 0.1555, "step": 11304 }, { "epoch": 3.4561296239682053, "grad_norm": 0.647853672504425, "learning_rate": 2.8535943272047897e-05, "loss": 0.131, "step": 11305 }, { "epoch": 3.4564353408743504, "grad_norm": 0.4793001115322113, "learning_rate": 2.853551866162796e-05, "loss": 0.1599, "step": 11306 }, { "epoch": 3.456741057780495, "grad_norm": 0.7990772724151611, "learning_rate": 2.8535094051208018e-05, "loss": 0.1966, "step": 11307 }, { "epoch": 3.4570467746866402, "grad_norm": 0.870517373085022, "learning_rate": 2.853466944078808e-05, "loss": 0.1743, "step": 11308 }, { "epoch": 3.457352491592785, "grad_norm": 0.9628502130508423, "learning_rate": 2.853424483036814e-05, "loss": 0.17, "step": 11309 }, { "epoch": 3.45765820849893, "grad_norm": 0.7041844725608826, "learning_rate": 2.8533820219948198e-05, "loss": 0.1977, "step": 11310 }, { "epoch": 3.4579639254050747, "grad_norm": 1.8725132942199707, "learning_rate": 2.853339560952826e-05, "loss": 0.2444, "step": 11311 }, { "epoch": 3.45826964231122, "grad_norm": 0.755177915096283, "learning_rate": 2.853297099910832e-05, "loss": 0.1728, "step": 11312 }, { "epoch": 3.458575359217365, "grad_norm": 1.8864519596099854, "learning_rate": 2.853254638868838e-05, "loss": 0.2813, "step": 11313 }, { "epoch": 3.4588810761235096, "grad_norm": 0.39647915959358215, "learning_rate": 2.853212177826844e-05, "loss": 0.1455, "step": 11314 }, { "epoch": 3.4591867930296547, "grad_norm": 0.5808576941490173, "learning_rate": 2.85316971678485e-05, "loss": 0.0975, "step": 11315 }, { "epoch": 3.4594925099357994, "grad_norm": 0.37843021750450134, "learning_rate": 2.853127255742856e-05, "loss": 0.0679, "step": 11316 }, { "epoch": 3.4597982268419445, "grad_norm": 0.3310873508453369, "learning_rate": 2.8530847947008622e-05, "loss": 0.0454, "step": 11317 }, { "epoch": 3.460103943748089, "grad_norm": 0.22694648802280426, "learning_rate": 2.853042333658868e-05, "loss": 0.0529, "step": 11318 }, { "epoch": 3.4604096606542343, "grad_norm": 0.2615300714969635, "learning_rate": 2.8529998726168743e-05, "loss": 0.0492, "step": 11319 }, { "epoch": 3.460715377560379, "grad_norm": 0.33539271354675293, "learning_rate": 2.85295741157488e-05, "loss": 0.1052, "step": 11320 }, { "epoch": 3.461021094466524, "grad_norm": 0.6923887729644775, "learning_rate": 2.8529149505328864e-05, "loss": 0.0859, "step": 11321 }, { "epoch": 3.4613268113726687, "grad_norm": 0.27677854895591736, "learning_rate": 2.8528724894908922e-05, "loss": 0.0829, "step": 11322 }, { "epoch": 3.461632528278814, "grad_norm": 0.3146294355392456, "learning_rate": 2.852830028448898e-05, "loss": 0.0816, "step": 11323 }, { "epoch": 3.4619382451849585, "grad_norm": 0.25200799107551575, "learning_rate": 2.8527875674069043e-05, "loss": 0.0814, "step": 11324 }, { "epoch": 3.4622439620911036, "grad_norm": 0.22961124777793884, "learning_rate": 2.8527451063649102e-05, "loss": 0.0906, "step": 11325 }, { "epoch": 3.4625496789972487, "grad_norm": 0.32707926630973816, "learning_rate": 2.8527026453229164e-05, "loss": 0.0789, "step": 11326 }, { "epoch": 3.4628553959033934, "grad_norm": 0.8964866399765015, "learning_rate": 2.8526601842809223e-05, "loss": 0.1498, "step": 11327 }, { "epoch": 3.4631611128095385, "grad_norm": 0.5053001046180725, "learning_rate": 2.8526177232389285e-05, "loss": 0.1205, "step": 11328 }, { "epoch": 3.463466829715683, "grad_norm": 0.44359588623046875, "learning_rate": 2.8525752621969343e-05, "loss": 0.1586, "step": 11329 }, { "epoch": 3.4637725466218283, "grad_norm": 0.5006656050682068, "learning_rate": 2.8525328011549405e-05, "loss": 0.1662, "step": 11330 }, { "epoch": 3.464078263527973, "grad_norm": 0.46795547008514404, "learning_rate": 2.8524903401129464e-05, "loss": 0.1584, "step": 11331 }, { "epoch": 3.464383980434118, "grad_norm": 0.4886009991168976, "learning_rate": 2.8524478790709526e-05, "loss": 0.1798, "step": 11332 }, { "epoch": 3.4646896973402628, "grad_norm": 1.4820148944854736, "learning_rate": 2.8524054180289585e-05, "loss": 0.1769, "step": 11333 }, { "epoch": 3.464995414246408, "grad_norm": 0.776319146156311, "learning_rate": 2.8523629569869647e-05, "loss": 0.2182, "step": 11334 }, { "epoch": 3.4653011311525526, "grad_norm": 0.9147388935089111, "learning_rate": 2.8523204959449706e-05, "loss": 0.2069, "step": 11335 }, { "epoch": 3.4656068480586977, "grad_norm": 1.1325361728668213, "learning_rate": 2.8522780349029764e-05, "loss": 0.2008, "step": 11336 }, { "epoch": 3.4659125649648423, "grad_norm": 1.6940199136734009, "learning_rate": 2.8522355738609826e-05, "loss": 0.2546, "step": 11337 }, { "epoch": 3.4662182818709875, "grad_norm": 1.9625250101089478, "learning_rate": 2.8521931128189885e-05, "loss": 0.2265, "step": 11338 }, { "epoch": 3.4665239987771326, "grad_norm": 0.3467671871185303, "learning_rate": 2.8521506517769947e-05, "loss": 0.1554, "step": 11339 }, { "epoch": 3.4668297156832772, "grad_norm": 0.4298381507396698, "learning_rate": 2.8521081907350006e-05, "loss": 0.0932, "step": 11340 }, { "epoch": 3.4671354325894224, "grad_norm": 0.438882440328598, "learning_rate": 2.8520657296930068e-05, "loss": 0.1074, "step": 11341 }, { "epoch": 3.467441149495567, "grad_norm": 0.22480154037475586, "learning_rate": 2.8520232686510127e-05, "loss": 0.0775, "step": 11342 }, { "epoch": 3.467746866401712, "grad_norm": 0.3221060037612915, "learning_rate": 2.851980807609019e-05, "loss": 0.0594, "step": 11343 }, { "epoch": 3.468052583307857, "grad_norm": 0.16212166845798492, "learning_rate": 2.8519383465670248e-05, "loss": 0.0374, "step": 11344 }, { "epoch": 3.468358300214002, "grad_norm": 0.22855263948440552, "learning_rate": 2.851895885525031e-05, "loss": 0.0546, "step": 11345 }, { "epoch": 3.4686640171201466, "grad_norm": 0.45545798540115356, "learning_rate": 2.851853424483037e-05, "loss": 0.0938, "step": 11346 }, { "epoch": 3.4689697340262917, "grad_norm": 0.2434130758047104, "learning_rate": 2.8518109634410427e-05, "loss": 0.083, "step": 11347 }, { "epoch": 3.4692754509324364, "grad_norm": 0.5001268982887268, "learning_rate": 2.851768502399049e-05, "loss": 0.062, "step": 11348 }, { "epoch": 3.4695811678385815, "grad_norm": 0.230449840426445, "learning_rate": 2.8517260413570548e-05, "loss": 0.0949, "step": 11349 }, { "epoch": 3.469886884744726, "grad_norm": 0.22864101827144623, "learning_rate": 2.851683580315061e-05, "loss": 0.0805, "step": 11350 }, { "epoch": 3.4701926016508713, "grad_norm": 0.4223034381866455, "learning_rate": 2.851641119273067e-05, "loss": 0.0846, "step": 11351 }, { "epoch": 3.4704983185570164, "grad_norm": 1.5232298374176025, "learning_rate": 2.851598658231073e-05, "loss": 0.1278, "step": 11352 }, { "epoch": 3.470804035463161, "grad_norm": 0.5497438311576843, "learning_rate": 2.851556197189079e-05, "loss": 0.131, "step": 11353 }, { "epoch": 3.471109752369306, "grad_norm": 0.5384012460708618, "learning_rate": 2.851513736147085e-05, "loss": 0.1453, "step": 11354 }, { "epoch": 3.471415469275451, "grad_norm": 0.7203657627105713, "learning_rate": 2.851471275105091e-05, "loss": 0.1918, "step": 11355 }, { "epoch": 3.471721186181596, "grad_norm": 0.9012494683265686, "learning_rate": 2.8514288140630972e-05, "loss": 0.172, "step": 11356 }, { "epoch": 3.4720269030877406, "grad_norm": 0.9350446462631226, "learning_rate": 2.851386353021103e-05, "loss": 0.1775, "step": 11357 }, { "epoch": 3.4723326199938858, "grad_norm": 1.311978816986084, "learning_rate": 2.8513438919791093e-05, "loss": 0.1518, "step": 11358 }, { "epoch": 3.4726383369000304, "grad_norm": 0.6597777605056763, "learning_rate": 2.8513014309371152e-05, "loss": 0.1536, "step": 11359 }, { "epoch": 3.4729440538061755, "grad_norm": 1.2731540203094482, "learning_rate": 2.851258969895121e-05, "loss": 0.2142, "step": 11360 }, { "epoch": 3.47324977071232, "grad_norm": 1.3904006481170654, "learning_rate": 2.8512165088531273e-05, "loss": 0.1906, "step": 11361 }, { "epoch": 3.4735554876184653, "grad_norm": 0.9410531520843506, "learning_rate": 2.851174047811133e-05, "loss": 0.2257, "step": 11362 }, { "epoch": 3.47386120452461, "grad_norm": 1.1796958446502686, "learning_rate": 2.8511315867691393e-05, "loss": 0.2648, "step": 11363 }, { "epoch": 3.474166921430755, "grad_norm": 0.36743414402008057, "learning_rate": 2.8510891257271452e-05, "loss": 0.1415, "step": 11364 }, { "epoch": 3.4744726383369002, "grad_norm": 0.6382285356521606, "learning_rate": 2.8510466646851514e-05, "loss": 0.104, "step": 11365 }, { "epoch": 3.474778355243045, "grad_norm": 0.3812567889690399, "learning_rate": 2.8510042036431573e-05, "loss": 0.0653, "step": 11366 }, { "epoch": 3.47508407214919, "grad_norm": 0.24122491478919983, "learning_rate": 2.8509617426011635e-05, "loss": 0.0796, "step": 11367 }, { "epoch": 3.4753897890553347, "grad_norm": 0.15925545990467072, "learning_rate": 2.8509192815591694e-05, "loss": 0.0478, "step": 11368 }, { "epoch": 3.47569550596148, "grad_norm": 0.15356849133968353, "learning_rate": 2.8508768205171756e-05, "loss": 0.063, "step": 11369 }, { "epoch": 3.4760012228676245, "grad_norm": 0.2225913107395172, "learning_rate": 2.8508343594751814e-05, "loss": 0.0647, "step": 11370 }, { "epoch": 3.4763069397737696, "grad_norm": 0.2522819936275482, "learning_rate": 2.8507918984331876e-05, "loss": 0.0922, "step": 11371 }, { "epoch": 3.4766126566799143, "grad_norm": 0.8713470101356506, "learning_rate": 2.8507494373911935e-05, "loss": 0.0736, "step": 11372 }, { "epoch": 3.4769183735860594, "grad_norm": 0.19900065660476685, "learning_rate": 2.8507069763491994e-05, "loss": 0.0621, "step": 11373 }, { "epoch": 3.477224090492204, "grad_norm": 0.31019866466522217, "learning_rate": 2.8506645153072056e-05, "loss": 0.1036, "step": 11374 }, { "epoch": 3.477529807398349, "grad_norm": 0.4666471481323242, "learning_rate": 2.8506220542652115e-05, "loss": 0.1006, "step": 11375 }, { "epoch": 3.477835524304494, "grad_norm": 0.3982682526111603, "learning_rate": 2.8505795932232177e-05, "loss": 0.1194, "step": 11376 }, { "epoch": 3.478141241210639, "grad_norm": 0.6024423241615295, "learning_rate": 2.8505371321812235e-05, "loss": 0.1021, "step": 11377 }, { "epoch": 3.478446958116784, "grad_norm": 0.5077764391899109, "learning_rate": 2.8504946711392298e-05, "loss": 0.1172, "step": 11378 }, { "epoch": 3.4787526750229287, "grad_norm": 0.44966500997543335, "learning_rate": 2.8504522100972356e-05, "loss": 0.1292, "step": 11379 }, { "epoch": 3.479058391929074, "grad_norm": 0.9931373596191406, "learning_rate": 2.850409749055242e-05, "loss": 0.146, "step": 11380 }, { "epoch": 3.4793641088352185, "grad_norm": 0.5340157747268677, "learning_rate": 2.8503672880132477e-05, "loss": 0.2046, "step": 11381 }, { "epoch": 3.4796698257413636, "grad_norm": 0.6756018400192261, "learning_rate": 2.850324826971254e-05, "loss": 0.1812, "step": 11382 }, { "epoch": 3.4799755426475083, "grad_norm": 0.9688618779182434, "learning_rate": 2.8502823659292598e-05, "loss": 0.2145, "step": 11383 }, { "epoch": 3.4802812595536534, "grad_norm": 0.832491397857666, "learning_rate": 2.850239904887266e-05, "loss": 0.229, "step": 11384 }, { "epoch": 3.480586976459798, "grad_norm": 0.6163973808288574, "learning_rate": 2.8501974438452722e-05, "loss": 0.1627, "step": 11385 }, { "epoch": 3.480892693365943, "grad_norm": 0.7795174717903137, "learning_rate": 2.850154982803278e-05, "loss": 0.1592, "step": 11386 }, { "epoch": 3.481198410272088, "grad_norm": 0.665359377861023, "learning_rate": 2.8501125217612843e-05, "loss": 0.2091, "step": 11387 }, { "epoch": 3.481504127178233, "grad_norm": 1.4279351234436035, "learning_rate": 2.85007006071929e-05, "loss": 0.2429, "step": 11388 }, { "epoch": 3.4818098440843777, "grad_norm": 0.35106366872787476, "learning_rate": 2.8500275996772964e-05, "loss": 0.2002, "step": 11389 }, { "epoch": 3.4821155609905228, "grad_norm": 0.5045731663703918, "learning_rate": 2.8499851386353022e-05, "loss": 0.0829, "step": 11390 }, { "epoch": 3.482421277896668, "grad_norm": 0.23386335372924805, "learning_rate": 2.8499426775933084e-05, "loss": 0.0921, "step": 11391 }, { "epoch": 3.4827269948028126, "grad_norm": 0.8404673933982849, "learning_rate": 2.8499002165513143e-05, "loss": 0.0814, "step": 11392 }, { "epoch": 3.4830327117089577, "grad_norm": 0.1676418036222458, "learning_rate": 2.8498577555093205e-05, "loss": 0.0553, "step": 11393 }, { "epoch": 3.4833384286151023, "grad_norm": 0.22680307924747467, "learning_rate": 2.8498152944673264e-05, "loss": 0.0328, "step": 11394 }, { "epoch": 3.4836441455212475, "grad_norm": 0.324039489030838, "learning_rate": 2.8497728334253326e-05, "loss": 0.0809, "step": 11395 }, { "epoch": 3.483949862427392, "grad_norm": 0.2235116958618164, "learning_rate": 2.8497303723833385e-05, "loss": 0.062, "step": 11396 }, { "epoch": 3.4842555793335372, "grad_norm": 0.5210524201393127, "learning_rate": 2.8496879113413447e-05, "loss": 0.078, "step": 11397 }, { "epoch": 3.484561296239682, "grad_norm": 0.4013563096523285, "learning_rate": 2.8496454502993505e-05, "loss": 0.0627, "step": 11398 }, { "epoch": 3.484867013145827, "grad_norm": 0.2912479639053345, "learning_rate": 2.8496029892573564e-05, "loss": 0.1093, "step": 11399 }, { "epoch": 3.4851727300519717, "grad_norm": 0.7112475633621216, "learning_rate": 2.8495605282153626e-05, "loss": 0.0804, "step": 11400 }, { "epoch": 3.485478446958117, "grad_norm": 0.38377124071121216, "learning_rate": 2.8495180671733685e-05, "loss": 0.0943, "step": 11401 }, { "epoch": 3.4857841638642615, "grad_norm": 0.40957850217819214, "learning_rate": 2.8494756061313747e-05, "loss": 0.136, "step": 11402 }, { "epoch": 3.4860898807704066, "grad_norm": 0.5405424237251282, "learning_rate": 2.8494331450893806e-05, "loss": 0.119, "step": 11403 }, { "epoch": 3.4863955976765517, "grad_norm": 1.1068066358566284, "learning_rate": 2.8493906840473868e-05, "loss": 0.1969, "step": 11404 }, { "epoch": 3.4867013145826964, "grad_norm": 0.7953932881355286, "learning_rate": 2.8493482230053927e-05, "loss": 0.1794, "step": 11405 }, { "epoch": 3.4870070314888415, "grad_norm": 0.578853189945221, "learning_rate": 2.849305761963399e-05, "loss": 0.1845, "step": 11406 }, { "epoch": 3.487312748394986, "grad_norm": 1.1918203830718994, "learning_rate": 2.8492633009214047e-05, "loss": 0.1686, "step": 11407 }, { "epoch": 3.4876184653011313, "grad_norm": 0.8109108209609985, "learning_rate": 2.849220839879411e-05, "loss": 0.2157, "step": 11408 }, { "epoch": 3.487924182207276, "grad_norm": 1.6038247346878052, "learning_rate": 2.8491783788374168e-05, "loss": 0.1656, "step": 11409 }, { "epoch": 3.488229899113421, "grad_norm": 5.733242034912109, "learning_rate": 2.849135917795423e-05, "loss": 0.2307, "step": 11410 }, { "epoch": 3.4885356160195657, "grad_norm": 1.5683422088623047, "learning_rate": 2.849093456753429e-05, "loss": 0.1972, "step": 11411 }, { "epoch": 3.488841332925711, "grad_norm": 0.995733916759491, "learning_rate": 2.8490509957114348e-05, "loss": 0.2022, "step": 11412 }, { "epoch": 3.4891470498318555, "grad_norm": 1.551689863204956, "learning_rate": 2.849008534669441e-05, "loss": 0.2289, "step": 11413 }, { "epoch": 3.4894527667380006, "grad_norm": 1.3901945352554321, "learning_rate": 2.848966073627447e-05, "loss": 0.1975, "step": 11414 }, { "epoch": 3.4897584836441453, "grad_norm": 0.2698350250720978, "learning_rate": 2.848923612585453e-05, "loss": 0.0724, "step": 11415 }, { "epoch": 3.4900642005502904, "grad_norm": 0.21980762481689453, "learning_rate": 2.848881151543459e-05, "loss": 0.0603, "step": 11416 }, { "epoch": 3.4903699174564355, "grad_norm": 0.3083381652832031, "learning_rate": 2.848838690501465e-05, "loss": 0.0888, "step": 11417 }, { "epoch": 3.49067563436258, "grad_norm": 0.22719649970531464, "learning_rate": 2.848796229459471e-05, "loss": 0.0609, "step": 11418 }, { "epoch": 3.4909813512687253, "grad_norm": 0.19541360437870026, "learning_rate": 2.8487537684174772e-05, "loss": 0.0528, "step": 11419 }, { "epoch": 3.49128706817487, "grad_norm": 0.34072354435920715, "learning_rate": 2.848711307375483e-05, "loss": 0.084, "step": 11420 }, { "epoch": 3.491592785081015, "grad_norm": 0.34657979011535645, "learning_rate": 2.8486688463334893e-05, "loss": 0.0741, "step": 11421 }, { "epoch": 3.49189850198716, "grad_norm": 0.38249915838241577, "learning_rate": 2.848626385291495e-05, "loss": 0.0771, "step": 11422 }, { "epoch": 3.492204218893305, "grad_norm": 0.40435683727264404, "learning_rate": 2.8485839242495014e-05, "loss": 0.0788, "step": 11423 }, { "epoch": 3.4925099357994496, "grad_norm": 0.2765819728374481, "learning_rate": 2.8485414632075072e-05, "loss": 0.082, "step": 11424 }, { "epoch": 3.4928156527055947, "grad_norm": 0.5292683243751526, "learning_rate": 2.848499002165513e-05, "loss": 0.0755, "step": 11425 }, { "epoch": 3.4931213696117394, "grad_norm": 0.3266717791557312, "learning_rate": 2.8484565411235193e-05, "loss": 0.1012, "step": 11426 }, { "epoch": 3.4934270865178845, "grad_norm": 0.2891128957271576, "learning_rate": 2.8484140800815252e-05, "loss": 0.102, "step": 11427 }, { "epoch": 3.493732803424029, "grad_norm": 0.4062090516090393, "learning_rate": 2.8483716190395314e-05, "loss": 0.1328, "step": 11428 }, { "epoch": 3.4940385203301743, "grad_norm": 0.9411286115646362, "learning_rate": 2.8483291579975373e-05, "loss": 0.1358, "step": 11429 }, { "epoch": 3.4943442372363194, "grad_norm": 0.7439152598381042, "learning_rate": 2.8482866969555435e-05, "loss": 0.161, "step": 11430 }, { "epoch": 3.494649954142464, "grad_norm": 1.036895990371704, "learning_rate": 2.8482442359135493e-05, "loss": 0.1552, "step": 11431 }, { "epoch": 3.494955671048609, "grad_norm": 0.6867927312850952, "learning_rate": 2.8482017748715555e-05, "loss": 0.2111, "step": 11432 }, { "epoch": 3.495261387954754, "grad_norm": 0.6055060625076294, "learning_rate": 2.8481593138295614e-05, "loss": 0.1625, "step": 11433 }, { "epoch": 3.495567104860899, "grad_norm": 0.7361211180686951, "learning_rate": 2.8481168527875676e-05, "loss": 0.1627, "step": 11434 }, { "epoch": 3.4958728217670436, "grad_norm": 0.9004834890365601, "learning_rate": 2.8480743917455735e-05, "loss": 0.1995, "step": 11435 }, { "epoch": 3.4961785386731887, "grad_norm": 0.970980703830719, "learning_rate": 2.8480319307035797e-05, "loss": 0.2172, "step": 11436 }, { "epoch": 3.4964842555793334, "grad_norm": 0.654110848903656, "learning_rate": 2.8479894696615856e-05, "loss": 0.1695, "step": 11437 }, { "epoch": 3.4967899724854785, "grad_norm": 0.9024820327758789, "learning_rate": 2.8479470086195914e-05, "loss": 0.2248, "step": 11438 }, { "epoch": 3.497095689391623, "grad_norm": 0.6135653257369995, "learning_rate": 2.8479045475775977e-05, "loss": 0.1519, "step": 11439 }, { "epoch": 3.4974014062977683, "grad_norm": 0.26746511459350586, "learning_rate": 2.8478620865356035e-05, "loss": 0.0882, "step": 11440 }, { "epoch": 3.497707123203913, "grad_norm": 0.3774074912071228, "learning_rate": 2.8478196254936097e-05, "loss": 0.1094, "step": 11441 }, { "epoch": 3.498012840110058, "grad_norm": 0.2779954671859741, "learning_rate": 2.8477771644516156e-05, "loss": 0.0651, "step": 11442 }, { "epoch": 3.498318557016203, "grad_norm": 0.3161855638027191, "learning_rate": 2.8477347034096218e-05, "loss": 0.0953, "step": 11443 }, { "epoch": 3.498624273922348, "grad_norm": 0.6093910336494446, "learning_rate": 2.8476922423676277e-05, "loss": 0.0489, "step": 11444 }, { "epoch": 3.498929990828493, "grad_norm": 0.2108757644891739, "learning_rate": 2.847649781325634e-05, "loss": 0.07, "step": 11445 }, { "epoch": 3.4992357077346377, "grad_norm": 0.5073111057281494, "learning_rate": 2.8476073202836398e-05, "loss": 0.0473, "step": 11446 }, { "epoch": 3.4995414246407828, "grad_norm": 0.39042380452156067, "learning_rate": 2.847564859241646e-05, "loss": 0.0818, "step": 11447 }, { "epoch": 3.4998471415469274, "grad_norm": 0.5247715711593628, "learning_rate": 2.847522398199652e-05, "loss": 0.094, "step": 11448 }, { "epoch": 3.5001528584530726, "grad_norm": 0.4532693028450012, "learning_rate": 2.847479937157658e-05, "loss": 0.0884, "step": 11449 }, { "epoch": 3.5004585753592172, "grad_norm": 0.5646331310272217, "learning_rate": 2.847437476115664e-05, "loss": 0.0973, "step": 11450 }, { "epoch": 3.5007642922653623, "grad_norm": 0.7378602623939514, "learning_rate": 2.8473950150736698e-05, "loss": 0.0938, "step": 11451 }, { "epoch": 3.5010700091715075, "grad_norm": 0.6067817211151123, "learning_rate": 2.847352554031676e-05, "loss": 0.1398, "step": 11452 }, { "epoch": 3.501375726077652, "grad_norm": 0.7781251072883606, "learning_rate": 2.847310092989682e-05, "loss": 0.1383, "step": 11453 }, { "epoch": 3.501681442983797, "grad_norm": 0.9678758382797241, "learning_rate": 2.847267631947688e-05, "loss": 0.1579, "step": 11454 }, { "epoch": 3.501987159889942, "grad_norm": 0.49234700202941895, "learning_rate": 2.847225170905694e-05, "loss": 0.159, "step": 11455 }, { "epoch": 3.502292876796087, "grad_norm": 0.5465881824493408, "learning_rate": 2.8471827098637e-05, "loss": 0.1633, "step": 11456 }, { "epoch": 3.5025985937022317, "grad_norm": 0.7654823064804077, "learning_rate": 2.847140248821706e-05, "loss": 0.1991, "step": 11457 }, { "epoch": 3.5029043106083764, "grad_norm": 0.5092651844024658, "learning_rate": 2.8470977877797122e-05, "loss": 0.2011, "step": 11458 }, { "epoch": 3.5032100275145215, "grad_norm": 0.9590121507644653, "learning_rate": 2.847055326737718e-05, "loss": 0.2444, "step": 11459 }, { "epoch": 3.5035157444206666, "grad_norm": 0.9401979446411133, "learning_rate": 2.8470128656957243e-05, "loss": 0.2006, "step": 11460 }, { "epoch": 3.5038214613268113, "grad_norm": 0.7927969098091125, "learning_rate": 2.8469704046537302e-05, "loss": 0.1876, "step": 11461 }, { "epoch": 3.5041271782329564, "grad_norm": 0.9923880100250244, "learning_rate": 2.846927943611736e-05, "loss": 0.2595, "step": 11462 }, { "epoch": 3.504432895139101, "grad_norm": 2.0358598232269287, "learning_rate": 2.8468854825697423e-05, "loss": 0.2977, "step": 11463 }, { "epoch": 3.504738612045246, "grad_norm": 0.5781574845314026, "learning_rate": 2.846843021527748e-05, "loss": 0.1588, "step": 11464 }, { "epoch": 3.5050443289513913, "grad_norm": 0.32750701904296875, "learning_rate": 2.8468005604857543e-05, "loss": 0.0885, "step": 11465 }, { "epoch": 3.505350045857536, "grad_norm": 0.3895922303199768, "learning_rate": 2.8467580994437602e-05, "loss": 0.0836, "step": 11466 }, { "epoch": 3.5056557627636806, "grad_norm": 0.3422072231769562, "learning_rate": 2.8467156384017664e-05, "loss": 0.0703, "step": 11467 }, { "epoch": 3.5059614796698257, "grad_norm": 0.30199843645095825, "learning_rate": 2.8466731773597723e-05, "loss": 0.0651, "step": 11468 }, { "epoch": 3.506267196575971, "grad_norm": 0.3430488705635071, "learning_rate": 2.8466307163177785e-05, "loss": 0.0598, "step": 11469 }, { "epoch": 3.5065729134821155, "grad_norm": 0.4778077304363251, "learning_rate": 2.8465882552757844e-05, "loss": 0.0613, "step": 11470 }, { "epoch": 3.50687863038826, "grad_norm": 0.24532267451286316, "learning_rate": 2.8465457942337906e-05, "loss": 0.0644, "step": 11471 }, { "epoch": 3.5071843472944053, "grad_norm": 0.4214217960834503, "learning_rate": 2.8465033331917964e-05, "loss": 0.1036, "step": 11472 }, { "epoch": 3.5074900642005504, "grad_norm": 0.34652963280677795, "learning_rate": 2.8464608721498027e-05, "loss": 0.0809, "step": 11473 }, { "epoch": 3.507795781106695, "grad_norm": 0.3354351222515106, "learning_rate": 2.8464184111078085e-05, "loss": 0.0958, "step": 11474 }, { "epoch": 3.50810149801284, "grad_norm": 0.6133198142051697, "learning_rate": 2.8463759500658144e-05, "loss": 0.1048, "step": 11475 }, { "epoch": 3.508407214918985, "grad_norm": 0.2999522387981415, "learning_rate": 2.8463334890238206e-05, "loss": 0.1005, "step": 11476 }, { "epoch": 3.50871293182513, "grad_norm": 0.5687159895896912, "learning_rate": 2.8462910279818265e-05, "loss": 0.1234, "step": 11477 }, { "epoch": 3.509018648731275, "grad_norm": 0.29086631536483765, "learning_rate": 2.8462485669398327e-05, "loss": 0.0976, "step": 11478 }, { "epoch": 3.50932436563742, "grad_norm": 0.34547746181488037, "learning_rate": 2.8462061058978385e-05, "loss": 0.1282, "step": 11479 }, { "epoch": 3.5096300825435645, "grad_norm": 0.5128869414329529, "learning_rate": 2.8461636448558448e-05, "loss": 0.1769, "step": 11480 }, { "epoch": 3.5099357994497096, "grad_norm": 1.495835304260254, "learning_rate": 2.8461211838138506e-05, "loss": 0.1636, "step": 11481 }, { "epoch": 3.5102415163558547, "grad_norm": 0.7294812798500061, "learning_rate": 2.846078722771857e-05, "loss": 0.2106, "step": 11482 }, { "epoch": 3.5105472332619994, "grad_norm": 0.6004093885421753, "learning_rate": 2.8460362617298627e-05, "loss": 0.1858, "step": 11483 }, { "epoch": 3.510852950168144, "grad_norm": 0.5743131041526794, "learning_rate": 2.845993800687869e-05, "loss": 0.1817, "step": 11484 }, { "epoch": 3.511158667074289, "grad_norm": 0.571833610534668, "learning_rate": 2.8459513396458748e-05, "loss": 0.1843, "step": 11485 }, { "epoch": 3.5114643839804343, "grad_norm": 2.9073407649993896, "learning_rate": 2.845908878603881e-05, "loss": 0.1993, "step": 11486 }, { "epoch": 3.511770100886579, "grad_norm": 1.7250263690948486, "learning_rate": 2.8458664175618872e-05, "loss": 0.2037, "step": 11487 }, { "epoch": 3.512075817792724, "grad_norm": 1.559813380241394, "learning_rate": 2.845823956519893e-05, "loss": 0.2848, "step": 11488 }, { "epoch": 3.5123815346988687, "grad_norm": 0.6587136387825012, "learning_rate": 2.8457814954778993e-05, "loss": 0.1413, "step": 11489 }, { "epoch": 3.512687251605014, "grad_norm": 0.5518613457679749, "learning_rate": 2.845739034435905e-05, "loss": 0.0729, "step": 11490 }, { "epoch": 3.512992968511159, "grad_norm": 0.3848462402820587, "learning_rate": 2.8456965733939114e-05, "loss": 0.0906, "step": 11491 }, { "epoch": 3.5132986854173036, "grad_norm": 0.27591460943222046, "learning_rate": 2.8456541123519172e-05, "loss": 0.054, "step": 11492 }, { "epoch": 3.5136044023234483, "grad_norm": 0.22865070402622223, "learning_rate": 2.8456116513099234e-05, "loss": 0.0675, "step": 11493 }, { "epoch": 3.5139101192295934, "grad_norm": 0.3022361695766449, "learning_rate": 2.8455691902679293e-05, "loss": 0.0549, "step": 11494 }, { "epoch": 3.5142158361357385, "grad_norm": 0.24340510368347168, "learning_rate": 2.8455267292259355e-05, "loss": 0.051, "step": 11495 }, { "epoch": 3.514521553041883, "grad_norm": 0.2980298101902008, "learning_rate": 2.8454842681839414e-05, "loss": 0.0616, "step": 11496 }, { "epoch": 3.514827269948028, "grad_norm": 0.29164668917655945, "learning_rate": 2.8454418071419476e-05, "loss": 0.0868, "step": 11497 }, { "epoch": 3.515132986854173, "grad_norm": 0.3395784795284271, "learning_rate": 2.8453993460999535e-05, "loss": 0.0647, "step": 11498 }, { "epoch": 3.515438703760318, "grad_norm": 0.47711285948753357, "learning_rate": 2.8453568850579597e-05, "loss": 0.0984, "step": 11499 }, { "epoch": 3.5157444206664628, "grad_norm": 0.2944624423980713, "learning_rate": 2.8453144240159655e-05, "loss": 0.0799, "step": 11500 }, { "epoch": 3.516050137572608, "grad_norm": 0.4849626421928406, "learning_rate": 2.8452719629739714e-05, "loss": 0.1013, "step": 11501 }, { "epoch": 3.5163558544787525, "grad_norm": 0.5351730585098267, "learning_rate": 2.8452295019319776e-05, "loss": 0.1634, "step": 11502 }, { "epoch": 3.5166615713848977, "grad_norm": 0.5417189598083496, "learning_rate": 2.8451870408899835e-05, "loss": 0.1367, "step": 11503 }, { "epoch": 3.5169672882910428, "grad_norm": 0.5286691188812256, "learning_rate": 2.8451445798479897e-05, "loss": 0.131, "step": 11504 }, { "epoch": 3.5172730051971874, "grad_norm": 0.6983892917633057, "learning_rate": 2.8451021188059956e-05, "loss": 0.2037, "step": 11505 }, { "epoch": 3.517578722103332, "grad_norm": 0.8080576658248901, "learning_rate": 2.8450596577640018e-05, "loss": 0.1634, "step": 11506 }, { "epoch": 3.5178844390094772, "grad_norm": 0.4139341711997986, "learning_rate": 2.8450171967220077e-05, "loss": 0.1923, "step": 11507 }, { "epoch": 3.5181901559156223, "grad_norm": 0.7558646202087402, "learning_rate": 2.844974735680014e-05, "loss": 0.1598, "step": 11508 }, { "epoch": 3.518495872821767, "grad_norm": 0.9610670804977417, "learning_rate": 2.8449322746380197e-05, "loss": 0.2225, "step": 11509 }, { "epoch": 3.5188015897279117, "grad_norm": 0.9078764319419861, "learning_rate": 2.844889813596026e-05, "loss": 0.174, "step": 11510 }, { "epoch": 3.519107306634057, "grad_norm": 1.1578933000564575, "learning_rate": 2.8448473525540318e-05, "loss": 0.1966, "step": 11511 }, { "epoch": 3.519413023540202, "grad_norm": 1.5946648120880127, "learning_rate": 2.844804891512038e-05, "loss": 0.2247, "step": 11512 }, { "epoch": 3.5197187404463466, "grad_norm": 1.275651216506958, "learning_rate": 2.844762430470044e-05, "loss": 0.2121, "step": 11513 }, { "epoch": 3.5200244573524917, "grad_norm": 0.32618406414985657, "learning_rate": 2.8447199694280498e-05, "loss": 0.1375, "step": 11514 }, { "epoch": 3.5203301742586364, "grad_norm": 0.25047048926353455, "learning_rate": 2.844677508386056e-05, "loss": 0.0785, "step": 11515 }, { "epoch": 3.5206358911647815, "grad_norm": 0.229536235332489, "learning_rate": 2.844635047344062e-05, "loss": 0.0668, "step": 11516 }, { "epoch": 3.5209416080709266, "grad_norm": 0.4554395079612732, "learning_rate": 2.844592586302068e-05, "loss": 0.065, "step": 11517 }, { "epoch": 3.5212473249770713, "grad_norm": 0.21899832785129547, "learning_rate": 2.844550125260074e-05, "loss": 0.0633, "step": 11518 }, { "epoch": 3.521553041883216, "grad_norm": 0.24704717099666595, "learning_rate": 2.84450766421808e-05, "loss": 0.0494, "step": 11519 }, { "epoch": 3.521858758789361, "grad_norm": 0.23019635677337646, "learning_rate": 2.844465203176086e-05, "loss": 0.0754, "step": 11520 }, { "epoch": 3.522164475695506, "grad_norm": 0.37012794613838196, "learning_rate": 2.8444227421340922e-05, "loss": 0.0572, "step": 11521 }, { "epoch": 3.522470192601651, "grad_norm": 0.3373015522956848, "learning_rate": 2.844380281092098e-05, "loss": 0.089, "step": 11522 }, { "epoch": 3.5227759095077955, "grad_norm": 0.39675235748291016, "learning_rate": 2.8443378200501043e-05, "loss": 0.0723, "step": 11523 }, { "epoch": 3.5230816264139406, "grad_norm": 0.4130776524543762, "learning_rate": 2.84429535900811e-05, "loss": 0.0701, "step": 11524 }, { "epoch": 3.5233873433200857, "grad_norm": 0.3030373156070709, "learning_rate": 2.8442528979661164e-05, "loss": 0.0853, "step": 11525 }, { "epoch": 3.5236930602262304, "grad_norm": 0.48010289669036865, "learning_rate": 2.8442104369241222e-05, "loss": 0.1482, "step": 11526 }, { "epoch": 3.5239987771323755, "grad_norm": 0.40027084946632385, "learning_rate": 2.844167975882128e-05, "loss": 0.1158, "step": 11527 }, { "epoch": 3.52430449403852, "grad_norm": 0.43213796615600586, "learning_rate": 2.8441255148401343e-05, "loss": 0.1328, "step": 11528 }, { "epoch": 3.5246102109446653, "grad_norm": 0.6389030814170837, "learning_rate": 2.8440830537981402e-05, "loss": 0.1189, "step": 11529 }, { "epoch": 3.5249159278508104, "grad_norm": 0.7710555195808411, "learning_rate": 2.8440405927561464e-05, "loss": 0.1665, "step": 11530 }, { "epoch": 3.525221644756955, "grad_norm": 0.640352725982666, "learning_rate": 2.8439981317141523e-05, "loss": 0.163, "step": 11531 }, { "epoch": 3.5255273616630998, "grad_norm": 1.0999804735183716, "learning_rate": 2.8439556706721585e-05, "loss": 0.1864, "step": 11532 }, { "epoch": 3.525833078569245, "grad_norm": 1.8030943870544434, "learning_rate": 2.8439132096301643e-05, "loss": 0.1916, "step": 11533 }, { "epoch": 3.52613879547539, "grad_norm": 0.7956291437149048, "learning_rate": 2.8438707485881705e-05, "loss": 0.2005, "step": 11534 }, { "epoch": 3.5264445123815347, "grad_norm": 1.059709906578064, "learning_rate": 2.8438282875461764e-05, "loss": 0.1624, "step": 11535 }, { "epoch": 3.5267502292876793, "grad_norm": 1.0205696821212769, "learning_rate": 2.8437858265041826e-05, "loss": 0.2071, "step": 11536 }, { "epoch": 3.5270559461938245, "grad_norm": 0.6790058016777039, "learning_rate": 2.8437433654621885e-05, "loss": 0.2174, "step": 11537 }, { "epoch": 3.5273616630999696, "grad_norm": 1.7033050060272217, "learning_rate": 2.8437009044201947e-05, "loss": 0.2793, "step": 11538 }, { "epoch": 3.5276673800061142, "grad_norm": 0.4140586853027344, "learning_rate": 2.8436584433782006e-05, "loss": 0.152, "step": 11539 }, { "epoch": 3.5279730969122594, "grad_norm": 0.3229818642139435, "learning_rate": 2.8436159823362064e-05, "loss": 0.071, "step": 11540 }, { "epoch": 3.528278813818404, "grad_norm": 0.4780507981777191, "learning_rate": 2.8435735212942127e-05, "loss": 0.0868, "step": 11541 }, { "epoch": 3.528584530724549, "grad_norm": 0.6339280605316162, "learning_rate": 2.8435310602522185e-05, "loss": 0.0589, "step": 11542 }, { "epoch": 3.5288902476306943, "grad_norm": 0.2628802955150604, "learning_rate": 2.8434885992102247e-05, "loss": 0.0657, "step": 11543 }, { "epoch": 3.529195964536839, "grad_norm": 0.32245561480522156, "learning_rate": 2.8434461381682306e-05, "loss": 0.0581, "step": 11544 }, { "epoch": 3.5295016814429836, "grad_norm": 0.3815990090370178, "learning_rate": 2.8434036771262368e-05, "loss": 0.0756, "step": 11545 }, { "epoch": 3.5298073983491287, "grad_norm": 0.8275772929191589, "learning_rate": 2.8433612160842427e-05, "loss": 0.0583, "step": 11546 }, { "epoch": 3.530113115255274, "grad_norm": 0.26662030816078186, "learning_rate": 2.843318755042249e-05, "loss": 0.0857, "step": 11547 }, { "epoch": 3.5304188321614185, "grad_norm": 0.28198570013046265, "learning_rate": 2.8432762940002548e-05, "loss": 0.0488, "step": 11548 }, { "epoch": 3.530724549067563, "grad_norm": 0.5089575052261353, "learning_rate": 2.843233832958261e-05, "loss": 0.1081, "step": 11549 }, { "epoch": 3.5310302659737083, "grad_norm": 0.2844114601612091, "learning_rate": 2.843191371916267e-05, "loss": 0.0866, "step": 11550 }, { "epoch": 3.5313359828798534, "grad_norm": 0.5490170121192932, "learning_rate": 2.843148910874273e-05, "loss": 0.1202, "step": 11551 }, { "epoch": 3.531641699785998, "grad_norm": 0.5534006953239441, "learning_rate": 2.843106449832279e-05, "loss": 0.1272, "step": 11552 }, { "epoch": 3.531947416692143, "grad_norm": 0.5542840957641602, "learning_rate": 2.8430639887902848e-05, "loss": 0.1201, "step": 11553 }, { "epoch": 3.532253133598288, "grad_norm": 0.9074380993843079, "learning_rate": 2.843021527748291e-05, "loss": 0.1526, "step": 11554 }, { "epoch": 3.532558850504433, "grad_norm": 0.5023823976516724, "learning_rate": 2.842979066706297e-05, "loss": 0.1383, "step": 11555 }, { "epoch": 3.5328645674105776, "grad_norm": 1.7264856100082397, "learning_rate": 2.842936605664303e-05, "loss": 0.1588, "step": 11556 }, { "epoch": 3.5331702843167228, "grad_norm": 0.7313159704208374, "learning_rate": 2.842894144622309e-05, "loss": 0.1763, "step": 11557 }, { "epoch": 3.5334760012228674, "grad_norm": 0.4500439465045929, "learning_rate": 2.842851683580315e-05, "loss": 0.1855, "step": 11558 }, { "epoch": 3.5337817181290125, "grad_norm": 0.9771196842193604, "learning_rate": 2.842809222538321e-05, "loss": 0.2095, "step": 11559 }, { "epoch": 3.5340874350351577, "grad_norm": 1.1302992105484009, "learning_rate": 2.8427667614963272e-05, "loss": 0.1924, "step": 11560 }, { "epoch": 3.5343931519413023, "grad_norm": 1.0376131534576416, "learning_rate": 2.842724300454333e-05, "loss": 0.1563, "step": 11561 }, { "epoch": 3.534698868847447, "grad_norm": 3.332500696182251, "learning_rate": 2.8426818394123393e-05, "loss": 0.216, "step": 11562 }, { "epoch": 3.535004585753592, "grad_norm": 1.3901996612548828, "learning_rate": 2.8426393783703452e-05, "loss": 0.2341, "step": 11563 }, { "epoch": 3.5353103026597372, "grad_norm": 0.6331530809402466, "learning_rate": 2.8425969173283514e-05, "loss": 0.1484, "step": 11564 }, { "epoch": 3.535616019565882, "grad_norm": 0.36887872219085693, "learning_rate": 2.8425544562863573e-05, "loss": 0.088, "step": 11565 }, { "epoch": 3.535921736472027, "grad_norm": 0.47864270210266113, "learning_rate": 2.842511995244363e-05, "loss": 0.0742, "step": 11566 }, { "epoch": 3.5362274533781717, "grad_norm": 0.19780413806438446, "learning_rate": 2.8424695342023693e-05, "loss": 0.0501, "step": 11567 }, { "epoch": 3.536533170284317, "grad_norm": 0.5565905570983887, "learning_rate": 2.8424270731603752e-05, "loss": 0.0613, "step": 11568 }, { "epoch": 3.5368388871904615, "grad_norm": 0.327531099319458, "learning_rate": 2.8423846121183814e-05, "loss": 0.0872, "step": 11569 }, { "epoch": 3.5371446040966066, "grad_norm": 0.21474432945251465, "learning_rate": 2.8423421510763873e-05, "loss": 0.0428, "step": 11570 }, { "epoch": 3.5374503210027513, "grad_norm": 0.25374218821525574, "learning_rate": 2.8422996900343935e-05, "loss": 0.0476, "step": 11571 }, { "epoch": 3.5377560379088964, "grad_norm": 0.2844635546207428, "learning_rate": 2.8422572289923994e-05, "loss": 0.0836, "step": 11572 }, { "epoch": 3.5380617548150415, "grad_norm": 0.5565991997718811, "learning_rate": 2.8422147679504056e-05, "loss": 0.072, "step": 11573 }, { "epoch": 3.538367471721186, "grad_norm": 0.3755410313606262, "learning_rate": 2.8421723069084114e-05, "loss": 0.0952, "step": 11574 }, { "epoch": 3.538673188627331, "grad_norm": 0.37353792786598206, "learning_rate": 2.8421298458664177e-05, "loss": 0.1149, "step": 11575 }, { "epoch": 3.538978905533476, "grad_norm": 0.3638252019882202, "learning_rate": 2.8420873848244235e-05, "loss": 0.097, "step": 11576 }, { "epoch": 3.539284622439621, "grad_norm": 0.7192012667655945, "learning_rate": 2.8420449237824294e-05, "loss": 0.1314, "step": 11577 }, { "epoch": 3.5395903393457657, "grad_norm": 0.399249404668808, "learning_rate": 2.8420024627404356e-05, "loss": 0.1264, "step": 11578 }, { "epoch": 3.539896056251911, "grad_norm": 0.5784982442855835, "learning_rate": 2.8419600016984415e-05, "loss": 0.1383, "step": 11579 }, { "epoch": 3.5402017731580555, "grad_norm": 0.4093351662158966, "learning_rate": 2.8419175406564477e-05, "loss": 0.147, "step": 11580 }, { "epoch": 3.5405074900642006, "grad_norm": 1.2135051488876343, "learning_rate": 2.8418750796144536e-05, "loss": 0.158, "step": 11581 }, { "epoch": 3.5408132069703453, "grad_norm": 0.568932056427002, "learning_rate": 2.8418326185724598e-05, "loss": 0.1796, "step": 11582 }, { "epoch": 3.5411189238764904, "grad_norm": 0.7059070467948914, "learning_rate": 2.8417901575304656e-05, "loss": 0.1878, "step": 11583 }, { "epoch": 3.541424640782635, "grad_norm": 0.7659462094306946, "learning_rate": 2.841747696488472e-05, "loss": 0.1993, "step": 11584 }, { "epoch": 3.54173035768878, "grad_norm": 1.0457464456558228, "learning_rate": 2.8417052354464777e-05, "loss": 0.1893, "step": 11585 }, { "epoch": 3.5420360745949253, "grad_norm": 0.8749816417694092, "learning_rate": 2.841662774404484e-05, "loss": 0.1702, "step": 11586 }, { "epoch": 3.54234179150107, "grad_norm": 1.15398371219635, "learning_rate": 2.8416203133624898e-05, "loss": 0.2242, "step": 11587 }, { "epoch": 3.5426475084072147, "grad_norm": 2.236189603805542, "learning_rate": 2.841577852320496e-05, "loss": 0.2837, "step": 11588 }, { "epoch": 3.5429532253133598, "grad_norm": 0.3813992738723755, "learning_rate": 2.8415353912785022e-05, "loss": 0.1458, "step": 11589 }, { "epoch": 3.543258942219505, "grad_norm": 0.46310845017433167, "learning_rate": 2.841492930236508e-05, "loss": 0.0783, "step": 11590 }, { "epoch": 3.5435646591256496, "grad_norm": 1.0048072338104248, "learning_rate": 2.8414504691945143e-05, "loss": 0.1109, "step": 11591 }, { "epoch": 3.5438703760317947, "grad_norm": 0.2660067677497864, "learning_rate": 2.84140800815252e-05, "loss": 0.0456, "step": 11592 }, { "epoch": 3.5441760929379393, "grad_norm": 0.324781209230423, "learning_rate": 2.8413655471105264e-05, "loss": 0.0846, "step": 11593 }, { "epoch": 3.5444818098440845, "grad_norm": 0.2947094440460205, "learning_rate": 2.8413230860685322e-05, "loss": 0.0568, "step": 11594 }, { "epoch": 3.544787526750229, "grad_norm": 0.23147529363632202, "learning_rate": 2.8412806250265384e-05, "loss": 0.0378, "step": 11595 }, { "epoch": 3.5450932436563742, "grad_norm": 0.41921573877334595, "learning_rate": 2.8412381639845443e-05, "loss": 0.0586, "step": 11596 }, { "epoch": 3.545398960562519, "grad_norm": 0.3211387097835541, "learning_rate": 2.8411957029425505e-05, "loss": 0.0601, "step": 11597 }, { "epoch": 3.545704677468664, "grad_norm": 0.678422212600708, "learning_rate": 2.8411532419005564e-05, "loss": 0.072, "step": 11598 }, { "epoch": 3.546010394374809, "grad_norm": 0.46211937069892883, "learning_rate": 2.8411107808585626e-05, "loss": 0.1249, "step": 11599 }, { "epoch": 3.546316111280954, "grad_norm": 0.640656054019928, "learning_rate": 2.8410683198165685e-05, "loss": 0.0909, "step": 11600 }, { "epoch": 3.5466218281870985, "grad_norm": 0.4672847390174866, "learning_rate": 2.8410258587745747e-05, "loss": 0.1205, "step": 11601 }, { "epoch": 3.5469275450932436, "grad_norm": 2.6840438842773438, "learning_rate": 2.8409833977325805e-05, "loss": 0.128, "step": 11602 }, { "epoch": 3.5472332619993887, "grad_norm": 0.6033338904380798, "learning_rate": 2.8409409366905864e-05, "loss": 0.1219, "step": 11603 }, { "epoch": 3.5475389789055334, "grad_norm": 0.4765825569629669, "learning_rate": 2.8408984756485926e-05, "loss": 0.1446, "step": 11604 }, { "epoch": 3.5478446958116785, "grad_norm": 0.47841647267341614, "learning_rate": 2.8408560146065985e-05, "loss": 0.1375, "step": 11605 }, { "epoch": 3.548150412717823, "grad_norm": 0.6063072681427002, "learning_rate": 2.8408135535646047e-05, "loss": 0.154, "step": 11606 }, { "epoch": 3.5484561296239683, "grad_norm": 0.9406918883323669, "learning_rate": 2.8407710925226106e-05, "loss": 0.1873, "step": 11607 }, { "epoch": 3.548761846530113, "grad_norm": 0.6821566224098206, "learning_rate": 2.8407286314806168e-05, "loss": 0.1993, "step": 11608 }, { "epoch": 3.549067563436258, "grad_norm": 0.5806258916854858, "learning_rate": 2.8406861704386227e-05, "loss": 0.1528, "step": 11609 }, { "epoch": 3.5493732803424027, "grad_norm": 0.8073420524597168, "learning_rate": 2.840643709396629e-05, "loss": 0.1692, "step": 11610 }, { "epoch": 3.549678997248548, "grad_norm": 1.0767936706542969, "learning_rate": 2.8406012483546347e-05, "loss": 0.215, "step": 11611 }, { "epoch": 3.549984714154693, "grad_norm": 1.4746923446655273, "learning_rate": 2.840558787312641e-05, "loss": 0.2152, "step": 11612 }, { "epoch": 3.5502904310608376, "grad_norm": 2.01702880859375, "learning_rate": 2.8405163262706468e-05, "loss": 0.2823, "step": 11613 }, { "epoch": 3.5505961479669823, "grad_norm": 0.25539761781692505, "learning_rate": 2.840473865228653e-05, "loss": 0.1673, "step": 11614 }, { "epoch": 3.5509018648731274, "grad_norm": 0.515259325504303, "learning_rate": 2.840431404186659e-05, "loss": 0.1083, "step": 11615 }, { "epoch": 3.5512075817792725, "grad_norm": 0.28102707862854004, "learning_rate": 2.8403889431446648e-05, "loss": 0.0729, "step": 11616 }, { "epoch": 3.551513298685417, "grad_norm": 0.4108816981315613, "learning_rate": 2.840346482102671e-05, "loss": 0.0513, "step": 11617 }, { "epoch": 3.5518190155915623, "grad_norm": 0.2238505482673645, "learning_rate": 2.840304021060677e-05, "loss": 0.0483, "step": 11618 }, { "epoch": 3.552124732497707, "grad_norm": 0.298963725566864, "learning_rate": 2.840261560018683e-05, "loss": 0.0462, "step": 11619 }, { "epoch": 3.552430449403852, "grad_norm": 0.682094395160675, "learning_rate": 2.840219098976689e-05, "loss": 0.0651, "step": 11620 }, { "epoch": 3.552736166309997, "grad_norm": 1.181045651435852, "learning_rate": 2.840176637934695e-05, "loss": 0.0533, "step": 11621 }, { "epoch": 3.553041883216142, "grad_norm": 0.27104729413986206, "learning_rate": 2.840134176892701e-05, "loss": 0.0657, "step": 11622 }, { "epoch": 3.5533476001222866, "grad_norm": 1.5319441556930542, "learning_rate": 2.8400917158507072e-05, "loss": 0.0761, "step": 11623 }, { "epoch": 3.5536533170284317, "grad_norm": 0.3496967554092407, "learning_rate": 2.840049254808713e-05, "loss": 0.1199, "step": 11624 }, { "epoch": 3.553959033934577, "grad_norm": 0.5452714562416077, "learning_rate": 2.8400067937667193e-05, "loss": 0.0834, "step": 11625 }, { "epoch": 3.5542647508407215, "grad_norm": 0.270303338766098, "learning_rate": 2.839964332724725e-05, "loss": 0.0987, "step": 11626 }, { "epoch": 3.554570467746866, "grad_norm": 0.4135800302028656, "learning_rate": 2.8399218716827314e-05, "loss": 0.108, "step": 11627 }, { "epoch": 3.5548761846530113, "grad_norm": 0.5310267210006714, "learning_rate": 2.8398794106407372e-05, "loss": 0.1407, "step": 11628 }, { "epoch": 3.5551819015591564, "grad_norm": 0.4938777685165405, "learning_rate": 2.839836949598743e-05, "loss": 0.1596, "step": 11629 }, { "epoch": 3.555487618465301, "grad_norm": 1.8012338876724243, "learning_rate": 2.8397944885567493e-05, "loss": 0.145, "step": 11630 }, { "epoch": 3.555793335371446, "grad_norm": 0.6352095007896423, "learning_rate": 2.8397520275147552e-05, "loss": 0.1657, "step": 11631 }, { "epoch": 3.556099052277591, "grad_norm": 0.9520487785339355, "learning_rate": 2.8397095664727614e-05, "loss": 0.197, "step": 11632 }, { "epoch": 3.556404769183736, "grad_norm": 5.520075798034668, "learning_rate": 2.8396671054307673e-05, "loss": 0.1763, "step": 11633 }, { "epoch": 3.5567104860898806, "grad_norm": 1.286476492881775, "learning_rate": 2.8396246443887735e-05, "loss": 0.2117, "step": 11634 }, { "epoch": 3.5570162029960257, "grad_norm": 0.6994674801826477, "learning_rate": 2.8395821833467793e-05, "loss": 0.2244, "step": 11635 }, { "epoch": 3.5573219199021704, "grad_norm": 1.090482473373413, "learning_rate": 2.8395397223047856e-05, "loss": 0.1863, "step": 11636 }, { "epoch": 3.5576276368083155, "grad_norm": 1.2555532455444336, "learning_rate": 2.8394972612627914e-05, "loss": 0.2121, "step": 11637 }, { "epoch": 3.5579333537144606, "grad_norm": 1.9046069383621216, "learning_rate": 2.8394548002207976e-05, "loss": 0.2519, "step": 11638 }, { "epoch": 3.5582390706206053, "grad_norm": 0.5318306684494019, "learning_rate": 2.8394123391788035e-05, "loss": 0.15, "step": 11639 }, { "epoch": 3.55854478752675, "grad_norm": 0.3486272990703583, "learning_rate": 2.8393698781368097e-05, "loss": 0.0928, "step": 11640 }, { "epoch": 3.558850504432895, "grad_norm": 0.2701771557331085, "learning_rate": 2.8393274170948156e-05, "loss": 0.0749, "step": 11641 }, { "epoch": 3.55915622133904, "grad_norm": 0.3228515386581421, "learning_rate": 2.8392849560528214e-05, "loss": 0.0774, "step": 11642 }, { "epoch": 3.559461938245185, "grad_norm": 0.8724523186683655, "learning_rate": 2.8392424950108277e-05, "loss": 0.0787, "step": 11643 }, { "epoch": 3.55976765515133, "grad_norm": 0.9084330201148987, "learning_rate": 2.8392000339688335e-05, "loss": 0.0626, "step": 11644 }, { "epoch": 3.5600733720574746, "grad_norm": 0.22866421937942505, "learning_rate": 2.8391575729268397e-05, "loss": 0.0391, "step": 11645 }, { "epoch": 3.5603790889636198, "grad_norm": 0.28577765822410583, "learning_rate": 2.8391151118848456e-05, "loss": 0.0516, "step": 11646 }, { "epoch": 3.5606848058697644, "grad_norm": 0.5192993879318237, "learning_rate": 2.8390726508428518e-05, "loss": 0.1275, "step": 11647 }, { "epoch": 3.5609905227759096, "grad_norm": 0.33329087495803833, "learning_rate": 2.8390301898008577e-05, "loss": 0.0634, "step": 11648 }, { "epoch": 3.561296239682054, "grad_norm": 0.6497235298156738, "learning_rate": 2.838987728758864e-05, "loss": 0.0614, "step": 11649 }, { "epoch": 3.5616019565881993, "grad_norm": 0.41588231921195984, "learning_rate": 2.8389452677168698e-05, "loss": 0.0903, "step": 11650 }, { "epoch": 3.5619076734943445, "grad_norm": 0.276182621717453, "learning_rate": 2.838902806674876e-05, "loss": 0.0838, "step": 11651 }, { "epoch": 3.562213390400489, "grad_norm": 0.6442866921424866, "learning_rate": 2.838860345632882e-05, "loss": 0.1187, "step": 11652 }, { "epoch": 3.562519107306634, "grad_norm": 0.7814022302627563, "learning_rate": 2.838817884590888e-05, "loss": 0.1208, "step": 11653 }, { "epoch": 3.562824824212779, "grad_norm": 0.5732600688934326, "learning_rate": 2.838775423548894e-05, "loss": 0.1648, "step": 11654 }, { "epoch": 3.563130541118924, "grad_norm": 0.5714125037193298, "learning_rate": 2.8387329625068998e-05, "loss": 0.1636, "step": 11655 }, { "epoch": 3.5634362580250687, "grad_norm": 2.167201519012451, "learning_rate": 2.838690501464906e-05, "loss": 0.1632, "step": 11656 }, { "epoch": 3.563741974931214, "grad_norm": 0.8507255911827087, "learning_rate": 2.838648040422912e-05, "loss": 0.1823, "step": 11657 }, { "epoch": 3.5640476918373585, "grad_norm": 0.8246555924415588, "learning_rate": 2.838605579380918e-05, "loss": 0.1744, "step": 11658 }, { "epoch": 3.5643534087435036, "grad_norm": 0.9439881443977356, "learning_rate": 2.838563118338924e-05, "loss": 0.159, "step": 11659 }, { "epoch": 3.5646591256496483, "grad_norm": 1.2449336051940918, "learning_rate": 2.83852065729693e-05, "loss": 0.2678, "step": 11660 }, { "epoch": 3.5649648425557934, "grad_norm": 0.9609862565994263, "learning_rate": 2.838478196254936e-05, "loss": 0.1779, "step": 11661 }, { "epoch": 3.565270559461938, "grad_norm": 1.342763900756836, "learning_rate": 2.8384357352129422e-05, "loss": 0.2387, "step": 11662 }, { "epoch": 3.565576276368083, "grad_norm": 1.5780553817749023, "learning_rate": 2.838393274170948e-05, "loss": 0.2857, "step": 11663 }, { "epoch": 3.5658819932742283, "grad_norm": 0.606106698513031, "learning_rate": 2.8383508131289543e-05, "loss": 0.1765, "step": 11664 }, { "epoch": 3.566187710180373, "grad_norm": 0.20238758623600006, "learning_rate": 2.8383083520869602e-05, "loss": 0.0856, "step": 11665 }, { "epoch": 3.5664934270865176, "grad_norm": 0.1805974245071411, "learning_rate": 2.8382658910449664e-05, "loss": 0.0701, "step": 11666 }, { "epoch": 3.5667991439926627, "grad_norm": 0.4178202748298645, "learning_rate": 2.8382234300029723e-05, "loss": 0.0552, "step": 11667 }, { "epoch": 3.567104860898808, "grad_norm": 0.3794001340866089, "learning_rate": 2.838180968960978e-05, "loss": 0.0555, "step": 11668 }, { "epoch": 3.5674105778049525, "grad_norm": 0.6603590250015259, "learning_rate": 2.8381385079189843e-05, "loss": 0.0712, "step": 11669 }, { "epoch": 3.5677162947110976, "grad_norm": 0.23926708102226257, "learning_rate": 2.8380960468769902e-05, "loss": 0.0497, "step": 11670 }, { "epoch": 3.5680220116172423, "grad_norm": 0.48903489112854004, "learning_rate": 2.8380535858349964e-05, "loss": 0.0547, "step": 11671 }, { "epoch": 3.5683277285233874, "grad_norm": 0.3253152668476105, "learning_rate": 2.8380111247930023e-05, "loss": 0.0809, "step": 11672 }, { "epoch": 3.568633445429532, "grad_norm": 0.6168380379676819, "learning_rate": 2.8379686637510085e-05, "loss": 0.0561, "step": 11673 }, { "epoch": 3.568939162335677, "grad_norm": 0.49521321058273315, "learning_rate": 2.8379262027090144e-05, "loss": 0.1029, "step": 11674 }, { "epoch": 3.569244879241822, "grad_norm": 0.835713267326355, "learning_rate": 2.8378837416670206e-05, "loss": 0.1091, "step": 11675 }, { "epoch": 3.569550596147967, "grad_norm": 0.33132320642471313, "learning_rate": 2.8378412806250264e-05, "loss": 0.0957, "step": 11676 }, { "epoch": 3.569856313054112, "grad_norm": 0.6139249205589294, "learning_rate": 2.8377988195830327e-05, "loss": 0.1354, "step": 11677 }, { "epoch": 3.5701620299602568, "grad_norm": 0.6477430462837219, "learning_rate": 2.8377563585410385e-05, "loss": 0.1296, "step": 11678 }, { "epoch": 3.5704677468664014, "grad_norm": 0.5189515352249146, "learning_rate": 2.8377138974990447e-05, "loss": 0.1244, "step": 11679 }, { "epoch": 3.5707734637725466, "grad_norm": 0.626003623008728, "learning_rate": 2.8376714364570506e-05, "loss": 0.1489, "step": 11680 }, { "epoch": 3.5710791806786917, "grad_norm": 0.9480247497558594, "learning_rate": 2.8376289754150565e-05, "loss": 0.1949, "step": 11681 }, { "epoch": 3.5713848975848363, "grad_norm": 0.5300143361091614, "learning_rate": 2.8375865143730627e-05, "loss": 0.1912, "step": 11682 }, { "epoch": 3.5716906144909815, "grad_norm": 1.6775410175323486, "learning_rate": 2.8375440533310686e-05, "loss": 0.1785, "step": 11683 }, { "epoch": 3.571996331397126, "grad_norm": 0.6460542678833008, "learning_rate": 2.8375015922890748e-05, "loss": 0.2032, "step": 11684 }, { "epoch": 3.5723020483032712, "grad_norm": 0.8204830288887024, "learning_rate": 2.8374591312470806e-05, "loss": 0.2006, "step": 11685 }, { "epoch": 3.572607765209416, "grad_norm": 0.5102282762527466, "learning_rate": 2.837416670205087e-05, "loss": 0.2087, "step": 11686 }, { "epoch": 3.572913482115561, "grad_norm": 0.7959402799606323, "learning_rate": 2.8373742091630927e-05, "loss": 0.2282, "step": 11687 }, { "epoch": 3.5732191990217057, "grad_norm": 1.096182942390442, "learning_rate": 2.837331748121099e-05, "loss": 0.2668, "step": 11688 }, { "epoch": 3.573524915927851, "grad_norm": 0.6326525211334229, "learning_rate": 2.8372892870791048e-05, "loss": 0.1607, "step": 11689 }, { "epoch": 3.573830632833996, "grad_norm": 0.2521153688430786, "learning_rate": 2.837246826037111e-05, "loss": 0.0983, "step": 11690 }, { "epoch": 3.5741363497401406, "grad_norm": 0.21493850648403168, "learning_rate": 2.837204364995117e-05, "loss": 0.0847, "step": 11691 }, { "epoch": 3.5744420666462853, "grad_norm": 0.3454105854034424, "learning_rate": 2.837161903953123e-05, "loss": 0.0723, "step": 11692 }, { "epoch": 3.5747477835524304, "grad_norm": 0.2186533808708191, "learning_rate": 2.8371194429111293e-05, "loss": 0.0581, "step": 11693 }, { "epoch": 3.5750535004585755, "grad_norm": 0.2498197853565216, "learning_rate": 2.837076981869135e-05, "loss": 0.0502, "step": 11694 }, { "epoch": 3.57535921736472, "grad_norm": 0.35339871048927307, "learning_rate": 2.8370345208271414e-05, "loss": 0.0599, "step": 11695 }, { "epoch": 3.5756649342708653, "grad_norm": 0.267624169588089, "learning_rate": 2.8369920597851472e-05, "loss": 0.0711, "step": 11696 }, { "epoch": 3.57597065117701, "grad_norm": 0.3471984267234802, "learning_rate": 2.8369495987431534e-05, "loss": 0.0893, "step": 11697 }, { "epoch": 3.576276368083155, "grad_norm": 0.35348477959632874, "learning_rate": 2.8369071377011593e-05, "loss": 0.0708, "step": 11698 }, { "epoch": 3.5765820849892997, "grad_norm": 1.1440945863723755, "learning_rate": 2.8368646766591655e-05, "loss": 0.1062, "step": 11699 }, { "epoch": 3.576887801895445, "grad_norm": 0.2518911063671112, "learning_rate": 2.8368222156171714e-05, "loss": 0.0918, "step": 11700 }, { "epoch": 3.5771935188015895, "grad_norm": 0.38847821950912476, "learning_rate": 2.8367797545751776e-05, "loss": 0.1084, "step": 11701 }, { "epoch": 3.5774992357077346, "grad_norm": 0.2563537061214447, "learning_rate": 2.8367372935331835e-05, "loss": 0.1277, "step": 11702 }, { "epoch": 3.5778049526138798, "grad_norm": 0.4627496898174286, "learning_rate": 2.8366948324911897e-05, "loss": 0.1378, "step": 11703 }, { "epoch": 3.5781106695200244, "grad_norm": 0.4263947904109955, "learning_rate": 2.8366523714491956e-05, "loss": 0.1477, "step": 11704 }, { "epoch": 3.578416386426169, "grad_norm": 0.5060939788818359, "learning_rate": 2.8366099104072014e-05, "loss": 0.1812, "step": 11705 }, { "epoch": 3.578722103332314, "grad_norm": 0.9655070304870605, "learning_rate": 2.8365674493652076e-05, "loss": 0.1876, "step": 11706 }, { "epoch": 3.5790278202384593, "grad_norm": 0.7038581371307373, "learning_rate": 2.8365249883232135e-05, "loss": 0.1973, "step": 11707 }, { "epoch": 3.579333537144604, "grad_norm": 0.8331713080406189, "learning_rate": 2.8364825272812197e-05, "loss": 0.2121, "step": 11708 }, { "epoch": 3.579639254050749, "grad_norm": 0.8518937230110168, "learning_rate": 2.8364400662392256e-05, "loss": 0.2219, "step": 11709 }, { "epoch": 3.579944970956894, "grad_norm": 0.645575225353241, "learning_rate": 2.8363976051972318e-05, "loss": 0.2097, "step": 11710 }, { "epoch": 3.580250687863039, "grad_norm": 0.8484359383583069, "learning_rate": 2.8363551441552377e-05, "loss": 0.1941, "step": 11711 }, { "epoch": 3.5805564047691836, "grad_norm": 0.7489464282989502, "learning_rate": 2.836312683113244e-05, "loss": 0.2106, "step": 11712 }, { "epoch": 3.5808621216753287, "grad_norm": 1.037770390510559, "learning_rate": 2.8362702220712497e-05, "loss": 0.2788, "step": 11713 }, { "epoch": 3.5811678385814734, "grad_norm": 0.25767821073532104, "learning_rate": 2.836227761029256e-05, "loss": 0.1498, "step": 11714 }, { "epoch": 3.5814735554876185, "grad_norm": 1.5222755670547485, "learning_rate": 2.8361852999872618e-05, "loss": 0.0761, "step": 11715 }, { "epoch": 3.5817792723937636, "grad_norm": 0.4082672894001007, "learning_rate": 2.836142838945268e-05, "loss": 0.0839, "step": 11716 }, { "epoch": 3.5820849892999083, "grad_norm": 0.37242844700813293, "learning_rate": 2.836100377903274e-05, "loss": 0.0821, "step": 11717 }, { "epoch": 3.582390706206053, "grad_norm": 0.2506983280181885, "learning_rate": 2.8360579168612798e-05, "loss": 0.07, "step": 11718 }, { "epoch": 3.582696423112198, "grad_norm": 0.29395100474357605, "learning_rate": 2.836015455819286e-05, "loss": 0.0838, "step": 11719 }, { "epoch": 3.583002140018343, "grad_norm": 0.24214395880699158, "learning_rate": 2.835972994777292e-05, "loss": 0.0725, "step": 11720 }, { "epoch": 3.583307856924488, "grad_norm": 0.2703808844089508, "learning_rate": 2.835930533735298e-05, "loss": 0.0653, "step": 11721 }, { "epoch": 3.583613573830633, "grad_norm": 0.2234671413898468, "learning_rate": 2.835888072693304e-05, "loss": 0.0681, "step": 11722 }, { "epoch": 3.5839192907367776, "grad_norm": 0.24812379479408264, "learning_rate": 2.83584561165131e-05, "loss": 0.0665, "step": 11723 }, { "epoch": 3.5842250076429227, "grad_norm": 0.43398743867874146, "learning_rate": 2.835803150609316e-05, "loss": 0.1288, "step": 11724 }, { "epoch": 3.5845307245490674, "grad_norm": 0.32900938391685486, "learning_rate": 2.8357606895673222e-05, "loss": 0.0699, "step": 11725 }, { "epoch": 3.5848364414552125, "grad_norm": 0.22131651639938354, "learning_rate": 2.835718228525328e-05, "loss": 0.0937, "step": 11726 }, { "epoch": 3.585142158361357, "grad_norm": 0.47590190172195435, "learning_rate": 2.8356757674833343e-05, "loss": 0.1201, "step": 11727 }, { "epoch": 3.5854478752675023, "grad_norm": 0.404462993144989, "learning_rate": 2.83563330644134e-05, "loss": 0.1022, "step": 11728 }, { "epoch": 3.5857535921736474, "grad_norm": 0.6286687850952148, "learning_rate": 2.8355908453993464e-05, "loss": 0.136, "step": 11729 }, { "epoch": 3.586059309079792, "grad_norm": 0.8662146925926208, "learning_rate": 2.8355483843573522e-05, "loss": 0.1483, "step": 11730 }, { "epoch": 3.5863650259859368, "grad_norm": 0.8975753784179688, "learning_rate": 2.835505923315358e-05, "loss": 0.1966, "step": 11731 }, { "epoch": 3.586670742892082, "grad_norm": 0.7335655689239502, "learning_rate": 2.8354634622733643e-05, "loss": 0.1793, "step": 11732 }, { "epoch": 3.586976459798227, "grad_norm": 0.6103354692459106, "learning_rate": 2.8354210012313702e-05, "loss": 0.1901, "step": 11733 }, { "epoch": 3.5872821767043717, "grad_norm": 0.6258367300033569, "learning_rate": 2.8353785401893764e-05, "loss": 0.1875, "step": 11734 }, { "epoch": 3.5875878936105168, "grad_norm": 1.8256609439849854, "learning_rate": 2.8353360791473823e-05, "loss": 0.2678, "step": 11735 }, { "epoch": 3.5878936105166614, "grad_norm": 0.7031739354133606, "learning_rate": 2.8352936181053885e-05, "loss": 0.2269, "step": 11736 }, { "epoch": 3.5881993274228066, "grad_norm": 0.9871903657913208, "learning_rate": 2.8352511570633943e-05, "loss": 0.1792, "step": 11737 }, { "epoch": 3.5885050443289512, "grad_norm": 1.003570795059204, "learning_rate": 2.8352086960214006e-05, "loss": 0.264, "step": 11738 }, { "epoch": 3.5888107612350963, "grad_norm": 1.2770717144012451, "learning_rate": 2.8351662349794064e-05, "loss": 0.1569, "step": 11739 }, { "epoch": 3.589116478141241, "grad_norm": 0.4013620913028717, "learning_rate": 2.8351237739374126e-05, "loss": 0.0876, "step": 11740 }, { "epoch": 3.589422195047386, "grad_norm": 0.28413209319114685, "learning_rate": 2.8350813128954185e-05, "loss": 0.0769, "step": 11741 }, { "epoch": 3.5897279119535312, "grad_norm": 0.2972313463687897, "learning_rate": 2.8350388518534247e-05, "loss": 0.0683, "step": 11742 }, { "epoch": 3.590033628859676, "grad_norm": 0.2651325464248657, "learning_rate": 2.8349963908114306e-05, "loss": 0.0716, "step": 11743 }, { "epoch": 3.5903393457658206, "grad_norm": 0.19471749663352966, "learning_rate": 2.8349539297694365e-05, "loss": 0.0543, "step": 11744 }, { "epoch": 3.5906450626719657, "grad_norm": 0.22795620560646057, "learning_rate": 2.8349114687274427e-05, "loss": 0.0697, "step": 11745 }, { "epoch": 3.590950779578111, "grad_norm": 0.240395650267601, "learning_rate": 2.8348690076854485e-05, "loss": 0.0572, "step": 11746 }, { "epoch": 3.5912564964842555, "grad_norm": 0.3435830771923065, "learning_rate": 2.8348265466434547e-05, "loss": 0.0712, "step": 11747 }, { "epoch": 3.5915622133904006, "grad_norm": 0.2818073630332947, "learning_rate": 2.8347840856014606e-05, "loss": 0.0817, "step": 11748 }, { "epoch": 3.5918679302965453, "grad_norm": 0.45901039242744446, "learning_rate": 2.8347416245594668e-05, "loss": 0.0683, "step": 11749 }, { "epoch": 3.5921736472026904, "grad_norm": 0.37557363510131836, "learning_rate": 2.8346991635174727e-05, "loss": 0.094, "step": 11750 }, { "epoch": 3.592479364108835, "grad_norm": 0.47588273882865906, "learning_rate": 2.834656702475479e-05, "loss": 0.1245, "step": 11751 }, { "epoch": 3.59278508101498, "grad_norm": 0.39239197969436646, "learning_rate": 2.8346142414334848e-05, "loss": 0.0886, "step": 11752 }, { "epoch": 3.593090797921125, "grad_norm": 0.6141964793205261, "learning_rate": 2.834571780391491e-05, "loss": 0.142, "step": 11753 }, { "epoch": 3.59339651482727, "grad_norm": 0.6001500487327576, "learning_rate": 2.834529319349497e-05, "loss": 0.1898, "step": 11754 }, { "epoch": 3.593702231733415, "grad_norm": 0.6181190609931946, "learning_rate": 2.834486858307503e-05, "loss": 0.1786, "step": 11755 }, { "epoch": 3.5940079486395597, "grad_norm": 1.5686442852020264, "learning_rate": 2.834444397265509e-05, "loss": 0.1578, "step": 11756 }, { "epoch": 3.5943136655457044, "grad_norm": 0.7555628418922424, "learning_rate": 2.8344019362235148e-05, "loss": 0.19, "step": 11757 }, { "epoch": 3.5946193824518495, "grad_norm": 0.704319953918457, "learning_rate": 2.834359475181521e-05, "loss": 0.2106, "step": 11758 }, { "epoch": 3.5949250993579946, "grad_norm": 0.8975332975387573, "learning_rate": 2.834317014139527e-05, "loss": 0.2251, "step": 11759 }, { "epoch": 3.5952308162641393, "grad_norm": 1.1731657981872559, "learning_rate": 2.834274553097533e-05, "loss": 0.1937, "step": 11760 }, { "epoch": 3.5955365331702844, "grad_norm": 0.9381893873214722, "learning_rate": 2.834232092055539e-05, "loss": 0.2019, "step": 11761 }, { "epoch": 3.595842250076429, "grad_norm": 4.522054195404053, "learning_rate": 2.834189631013545e-05, "loss": 0.2276, "step": 11762 }, { "epoch": 3.596147966982574, "grad_norm": 1.1806584596633911, "learning_rate": 2.834147169971551e-05, "loss": 0.2442, "step": 11763 }, { "epoch": 3.596453683888719, "grad_norm": 0.4213605523109436, "learning_rate": 2.8341047089295572e-05, "loss": 0.1436, "step": 11764 }, { "epoch": 3.596759400794864, "grad_norm": 0.8128756880760193, "learning_rate": 2.834062247887563e-05, "loss": 0.0909, "step": 11765 }, { "epoch": 3.5970651177010087, "grad_norm": 0.33513790369033813, "learning_rate": 2.8340197868455693e-05, "loss": 0.0941, "step": 11766 }, { "epoch": 3.597370834607154, "grad_norm": 0.3210385739803314, "learning_rate": 2.8339773258035752e-05, "loss": 0.0897, "step": 11767 }, { "epoch": 3.597676551513299, "grad_norm": 0.32529187202453613, "learning_rate": 2.8339348647615814e-05, "loss": 0.0536, "step": 11768 }, { "epoch": 3.5979822684194436, "grad_norm": 0.44311121106147766, "learning_rate": 2.8338924037195873e-05, "loss": 0.0579, "step": 11769 }, { "epoch": 3.5982879853255882, "grad_norm": 0.4259544909000397, "learning_rate": 2.833849942677593e-05, "loss": 0.067, "step": 11770 }, { "epoch": 3.5985937022317334, "grad_norm": 0.48747342824935913, "learning_rate": 2.8338074816355993e-05, "loss": 0.0983, "step": 11771 }, { "epoch": 3.5988994191378785, "grad_norm": 0.42608121037483215, "learning_rate": 2.8337650205936052e-05, "loss": 0.0833, "step": 11772 }, { "epoch": 3.599205136044023, "grad_norm": 0.360583633184433, "learning_rate": 2.8337225595516114e-05, "loss": 0.0862, "step": 11773 }, { "epoch": 3.5995108529501683, "grad_norm": 0.4813787639141083, "learning_rate": 2.8336800985096173e-05, "loss": 0.0975, "step": 11774 }, { "epoch": 3.599816569856313, "grad_norm": 0.3514440357685089, "learning_rate": 2.8336376374676235e-05, "loss": 0.0806, "step": 11775 }, { "epoch": 3.600122286762458, "grad_norm": 0.3553240895271301, "learning_rate": 2.8335951764256294e-05, "loss": 0.1156, "step": 11776 }, { "epoch": 3.6004280036686027, "grad_norm": 0.37313753366470337, "learning_rate": 2.8335527153836356e-05, "loss": 0.1447, "step": 11777 }, { "epoch": 3.600733720574748, "grad_norm": 0.5706823468208313, "learning_rate": 2.8335102543416415e-05, "loss": 0.1196, "step": 11778 }, { "epoch": 3.6010394374808925, "grad_norm": 0.47223323583602905, "learning_rate": 2.8334677932996477e-05, "loss": 0.14, "step": 11779 }, { "epoch": 3.6013451543870376, "grad_norm": 0.6221165657043457, "learning_rate": 2.8334253322576535e-05, "loss": 0.1898, "step": 11780 }, { "epoch": 3.6016508712931827, "grad_norm": 0.6243786811828613, "learning_rate": 2.8333828712156597e-05, "loss": 0.1704, "step": 11781 }, { "epoch": 3.6019565881993274, "grad_norm": 3.352388858795166, "learning_rate": 2.8333404101736656e-05, "loss": 0.1627, "step": 11782 }, { "epoch": 3.602262305105472, "grad_norm": 0.6272567510604858, "learning_rate": 2.8332979491316715e-05, "loss": 0.1707, "step": 11783 }, { "epoch": 3.602568022011617, "grad_norm": 0.614588737487793, "learning_rate": 2.8332554880896777e-05, "loss": 0.1674, "step": 11784 }, { "epoch": 3.6028737389177623, "grad_norm": 2.817972183227539, "learning_rate": 2.8332130270476836e-05, "loss": 0.1867, "step": 11785 }, { "epoch": 3.603179455823907, "grad_norm": 1.475066065788269, "learning_rate": 2.8331705660056898e-05, "loss": 0.2242, "step": 11786 }, { "epoch": 3.603485172730052, "grad_norm": 3.2115471363067627, "learning_rate": 2.8331281049636956e-05, "loss": 0.2568, "step": 11787 }, { "epoch": 3.6037908896361968, "grad_norm": 1.5505328178405762, "learning_rate": 2.833085643921702e-05, "loss": 0.3607, "step": 11788 }, { "epoch": 3.604096606542342, "grad_norm": 0.2845970392227173, "learning_rate": 2.8330431828797077e-05, "loss": 0.1422, "step": 11789 }, { "epoch": 3.6044023234484865, "grad_norm": 0.5124988555908203, "learning_rate": 2.833000721837714e-05, "loss": 0.0917, "step": 11790 }, { "epoch": 3.6047080403546317, "grad_norm": 0.3382508456707001, "learning_rate": 2.8329582607957198e-05, "loss": 0.0806, "step": 11791 }, { "epoch": 3.6050137572607763, "grad_norm": 0.2346203774213791, "learning_rate": 2.832915799753726e-05, "loss": 0.0627, "step": 11792 }, { "epoch": 3.6053194741669214, "grad_norm": 0.2435457557439804, "learning_rate": 2.832873338711732e-05, "loss": 0.0546, "step": 11793 }, { "epoch": 3.6056251910730666, "grad_norm": 0.23458489775657654, "learning_rate": 2.832830877669738e-05, "loss": 0.0599, "step": 11794 }, { "epoch": 3.6059309079792112, "grad_norm": 0.1946381777524948, "learning_rate": 2.8327884166277443e-05, "loss": 0.0647, "step": 11795 }, { "epoch": 3.606236624885356, "grad_norm": 0.29896920919418335, "learning_rate": 2.83274595558575e-05, "loss": 0.0526, "step": 11796 }, { "epoch": 3.606542341791501, "grad_norm": 0.48729196190834045, "learning_rate": 2.8327034945437564e-05, "loss": 0.0902, "step": 11797 }, { "epoch": 3.606848058697646, "grad_norm": 0.597442626953125, "learning_rate": 2.8326610335017622e-05, "loss": 0.0902, "step": 11798 }, { "epoch": 3.607153775603791, "grad_norm": 0.4484374523162842, "learning_rate": 2.8326185724597684e-05, "loss": 0.0944, "step": 11799 }, { "epoch": 3.607459492509936, "grad_norm": 0.5087063312530518, "learning_rate": 2.8325761114177743e-05, "loss": 0.0642, "step": 11800 }, { "epoch": 3.6077652094160806, "grad_norm": 0.2482924461364746, "learning_rate": 2.8325336503757805e-05, "loss": 0.1025, "step": 11801 }, { "epoch": 3.6080709263222257, "grad_norm": 0.40980562567710876, "learning_rate": 2.8324911893337864e-05, "loss": 0.1158, "step": 11802 }, { "epoch": 3.6083766432283704, "grad_norm": 0.35672327876091003, "learning_rate": 2.8324487282917926e-05, "loss": 0.1098, "step": 11803 }, { "epoch": 3.6086823601345155, "grad_norm": 0.44848132133483887, "learning_rate": 2.8324062672497985e-05, "loss": 0.112, "step": 11804 }, { "epoch": 3.60898807704066, "grad_norm": 0.7811030745506287, "learning_rate": 2.8323638062078047e-05, "loss": 0.1856, "step": 11805 }, { "epoch": 3.6092937939468053, "grad_norm": 0.5563563108444214, "learning_rate": 2.8323213451658106e-05, "loss": 0.1649, "step": 11806 }, { "epoch": 3.6095995108529504, "grad_norm": 0.5300674438476562, "learning_rate": 2.8322788841238168e-05, "loss": 0.1814, "step": 11807 }, { "epoch": 3.609905227759095, "grad_norm": 0.793218195438385, "learning_rate": 2.8322364230818226e-05, "loss": 0.1724, "step": 11808 }, { "epoch": 3.6102109446652397, "grad_norm": 0.5493916273117065, "learning_rate": 2.8321939620398285e-05, "loss": 0.1821, "step": 11809 }, { "epoch": 3.610516661571385, "grad_norm": 1.1759570837020874, "learning_rate": 2.8321515009978347e-05, "loss": 0.2253, "step": 11810 }, { "epoch": 3.61082237847753, "grad_norm": 0.8095409274101257, "learning_rate": 2.8321090399558406e-05, "loss": 0.2097, "step": 11811 }, { "epoch": 3.6111280953836746, "grad_norm": 0.603877604007721, "learning_rate": 2.8320665789138468e-05, "loss": 0.1956, "step": 11812 }, { "epoch": 3.6114338122898197, "grad_norm": 1.1549371480941772, "learning_rate": 2.8320241178718527e-05, "loss": 0.2963, "step": 11813 }, { "epoch": 3.6117395291959644, "grad_norm": 0.2915317714214325, "learning_rate": 2.831981656829859e-05, "loss": 0.1685, "step": 11814 }, { "epoch": 3.6120452461021095, "grad_norm": 0.18560990691184998, "learning_rate": 2.8319391957878647e-05, "loss": 0.0656, "step": 11815 }, { "epoch": 3.612350963008254, "grad_norm": 0.23090894520282745, "learning_rate": 2.831896734745871e-05, "loss": 0.0794, "step": 11816 }, { "epoch": 3.6126566799143993, "grad_norm": 0.2937065362930298, "learning_rate": 2.8318542737038768e-05, "loss": 0.0689, "step": 11817 }, { "epoch": 3.612962396820544, "grad_norm": 0.24488480389118195, "learning_rate": 2.831811812661883e-05, "loss": 0.0579, "step": 11818 }, { "epoch": 3.613268113726689, "grad_norm": 0.15870939195156097, "learning_rate": 2.831769351619889e-05, "loss": 0.0581, "step": 11819 }, { "epoch": 3.613573830632834, "grad_norm": 0.30639928579330444, "learning_rate": 2.8317268905778948e-05, "loss": 0.0521, "step": 11820 }, { "epoch": 3.613879547538979, "grad_norm": 0.3682774007320404, "learning_rate": 2.831684429535901e-05, "loss": 0.0678, "step": 11821 }, { "epoch": 3.6141852644451236, "grad_norm": 0.1878669112920761, "learning_rate": 2.831641968493907e-05, "loss": 0.0642, "step": 11822 }, { "epoch": 3.6144909813512687, "grad_norm": 0.3402140140533447, "learning_rate": 2.831599507451913e-05, "loss": 0.0587, "step": 11823 }, { "epoch": 3.614796698257414, "grad_norm": 0.3549704849720001, "learning_rate": 2.831557046409919e-05, "loss": 0.0952, "step": 11824 }, { "epoch": 3.6151024151635585, "grad_norm": 0.3391115665435791, "learning_rate": 2.831514585367925e-05, "loss": 0.0878, "step": 11825 }, { "epoch": 3.6154081320697036, "grad_norm": 0.2593439817428589, "learning_rate": 2.831472124325931e-05, "loss": 0.079, "step": 11826 }, { "epoch": 3.6157138489758482, "grad_norm": 0.460860937833786, "learning_rate": 2.8314296632839372e-05, "loss": 0.1324, "step": 11827 }, { "epoch": 3.6160195658819934, "grad_norm": 0.3076055645942688, "learning_rate": 2.831387202241943e-05, "loss": 0.115, "step": 11828 }, { "epoch": 3.616325282788138, "grad_norm": 0.34216636419296265, "learning_rate": 2.8313447411999493e-05, "loss": 0.1484, "step": 11829 }, { "epoch": 3.616630999694283, "grad_norm": 0.5176770091056824, "learning_rate": 2.831302280157955e-05, "loss": 0.1447, "step": 11830 }, { "epoch": 3.616936716600428, "grad_norm": 0.8467203974723816, "learning_rate": 2.8312598191159614e-05, "loss": 0.1845, "step": 11831 }, { "epoch": 3.617242433506573, "grad_norm": 0.6569538712501526, "learning_rate": 2.8312173580739672e-05, "loss": 0.1836, "step": 11832 }, { "epoch": 3.617548150412718, "grad_norm": 0.7074881792068481, "learning_rate": 2.831174897031973e-05, "loss": 0.1982, "step": 11833 }, { "epoch": 3.6178538673188627, "grad_norm": 0.9201061725616455, "learning_rate": 2.8311324359899793e-05, "loss": 0.165, "step": 11834 }, { "epoch": 3.6181595842250074, "grad_norm": 0.9862549901008606, "learning_rate": 2.8310899749479852e-05, "loss": 0.178, "step": 11835 }, { "epoch": 3.6184653011311525, "grad_norm": 1.086486577987671, "learning_rate": 2.8310475139059914e-05, "loss": 0.2063, "step": 11836 }, { "epoch": 3.6187710180372976, "grad_norm": 0.8128204345703125, "learning_rate": 2.8310050528639973e-05, "loss": 0.1866, "step": 11837 }, { "epoch": 3.6190767349434423, "grad_norm": 1.418892741203308, "learning_rate": 2.8309625918220035e-05, "loss": 0.2788, "step": 11838 }, { "epoch": 3.6193824518495874, "grad_norm": 0.40972211956977844, "learning_rate": 2.8309201307800093e-05, "loss": 0.1303, "step": 11839 }, { "epoch": 3.619688168755732, "grad_norm": 1.2899149656295776, "learning_rate": 2.8308776697380156e-05, "loss": 0.0678, "step": 11840 }, { "epoch": 3.619993885661877, "grad_norm": 1.1042253971099854, "learning_rate": 2.8308352086960214e-05, "loss": 0.0791, "step": 11841 }, { "epoch": 3.620299602568022, "grad_norm": 0.2848855257034302, "learning_rate": 2.8307927476540276e-05, "loss": 0.0627, "step": 11842 }, { "epoch": 3.620605319474167, "grad_norm": 0.840053379535675, "learning_rate": 2.8307502866120335e-05, "loss": 0.0517, "step": 11843 }, { "epoch": 3.6209110363803116, "grad_norm": 0.2192358821630478, "learning_rate": 2.8307078255700397e-05, "loss": 0.0441, "step": 11844 }, { "epoch": 3.6212167532864568, "grad_norm": 0.356291264295578, "learning_rate": 2.8306653645280456e-05, "loss": 0.0784, "step": 11845 }, { "epoch": 3.621522470192602, "grad_norm": 0.4403637647628784, "learning_rate": 2.8306229034860515e-05, "loss": 0.071, "step": 11846 }, { "epoch": 3.6218281870987465, "grad_norm": 0.3787027895450592, "learning_rate": 2.8305804424440577e-05, "loss": 0.0519, "step": 11847 }, { "epoch": 3.622133904004891, "grad_norm": 0.3120979070663452, "learning_rate": 2.8305379814020635e-05, "loss": 0.0631, "step": 11848 }, { "epoch": 3.6224396209110363, "grad_norm": 0.35446107387542725, "learning_rate": 2.8304955203600697e-05, "loss": 0.1353, "step": 11849 }, { "epoch": 3.6227453378171814, "grad_norm": 0.3796779215335846, "learning_rate": 2.8304530593180756e-05, "loss": 0.0856, "step": 11850 }, { "epoch": 3.623051054723326, "grad_norm": 0.2249758392572403, "learning_rate": 2.8304105982760818e-05, "loss": 0.0941, "step": 11851 }, { "epoch": 3.6233567716294712, "grad_norm": 0.28351303935050964, "learning_rate": 2.8303681372340877e-05, "loss": 0.1379, "step": 11852 }, { "epoch": 3.623662488535616, "grad_norm": 0.5886611342430115, "learning_rate": 2.830325676192094e-05, "loss": 0.1356, "step": 11853 }, { "epoch": 3.623968205441761, "grad_norm": 0.7195833325386047, "learning_rate": 2.8302832151500998e-05, "loss": 0.1764, "step": 11854 }, { "epoch": 3.6242739223479057, "grad_norm": 0.564130425453186, "learning_rate": 2.830240754108106e-05, "loss": 0.1588, "step": 11855 }, { "epoch": 3.624579639254051, "grad_norm": 0.5940136313438416, "learning_rate": 2.830198293066112e-05, "loss": 0.1549, "step": 11856 }, { "epoch": 3.6248853561601955, "grad_norm": 0.39409518241882324, "learning_rate": 2.830155832024118e-05, "loss": 0.1474, "step": 11857 }, { "epoch": 3.6251910730663406, "grad_norm": 0.6379450559616089, "learning_rate": 2.830113370982124e-05, "loss": 0.1878, "step": 11858 }, { "epoch": 3.6254967899724857, "grad_norm": 0.6521804332733154, "learning_rate": 2.8300709099401298e-05, "loss": 0.2046, "step": 11859 }, { "epoch": 3.6258025068786304, "grad_norm": 0.5331445336341858, "learning_rate": 2.830028448898136e-05, "loss": 0.1812, "step": 11860 }, { "epoch": 3.626108223784775, "grad_norm": 1.2426602840423584, "learning_rate": 2.829985987856142e-05, "loss": 0.2612, "step": 11861 }, { "epoch": 3.62641394069092, "grad_norm": 0.8613545298576355, "learning_rate": 2.829943526814148e-05, "loss": 0.2464, "step": 11862 }, { "epoch": 3.6267196575970653, "grad_norm": 1.9958122968673706, "learning_rate": 2.829901065772154e-05, "loss": 0.2913, "step": 11863 }, { "epoch": 3.62702537450321, "grad_norm": 0.3943147659301758, "learning_rate": 2.82985860473016e-05, "loss": 0.1297, "step": 11864 }, { "epoch": 3.627331091409355, "grad_norm": 0.20072075724601746, "learning_rate": 2.829816143688166e-05, "loss": 0.0993, "step": 11865 }, { "epoch": 3.6276368083154997, "grad_norm": 0.31539222598075867, "learning_rate": 2.8297736826461722e-05, "loss": 0.1012, "step": 11866 }, { "epoch": 3.627942525221645, "grad_norm": 0.25249236822128296, "learning_rate": 2.829731221604178e-05, "loss": 0.0556, "step": 11867 }, { "epoch": 3.6282482421277895, "grad_norm": 0.1778499335050583, "learning_rate": 2.8296887605621843e-05, "loss": 0.0527, "step": 11868 }, { "epoch": 3.6285539590339346, "grad_norm": 0.19797830283641815, "learning_rate": 2.8296462995201902e-05, "loss": 0.0582, "step": 11869 }, { "epoch": 3.6288596759400793, "grad_norm": 0.9754933714866638, "learning_rate": 2.8296038384781964e-05, "loss": 0.0567, "step": 11870 }, { "epoch": 3.6291653928462244, "grad_norm": 0.2639389634132385, "learning_rate": 2.8295613774362023e-05, "loss": 0.0604, "step": 11871 }, { "epoch": 3.6294711097523695, "grad_norm": 0.2764546871185303, "learning_rate": 2.829518916394208e-05, "loss": 0.0872, "step": 11872 }, { "epoch": 3.629776826658514, "grad_norm": 0.4844982922077179, "learning_rate": 2.8294764553522143e-05, "loss": 0.0656, "step": 11873 }, { "epoch": 3.630082543564659, "grad_norm": 0.41692227125167847, "learning_rate": 2.8294339943102202e-05, "loss": 0.0972, "step": 11874 }, { "epoch": 3.630388260470804, "grad_norm": 0.3153003454208374, "learning_rate": 2.8293915332682264e-05, "loss": 0.097, "step": 11875 }, { "epoch": 3.630693977376949, "grad_norm": 0.31045061349868774, "learning_rate": 2.8293490722262323e-05, "loss": 0.1172, "step": 11876 }, { "epoch": 3.6309996942830938, "grad_norm": 0.4595772922039032, "learning_rate": 2.8293066111842385e-05, "loss": 0.1016, "step": 11877 }, { "epoch": 3.631305411189239, "grad_norm": 0.36308732628822327, "learning_rate": 2.8292641501422444e-05, "loss": 0.1364, "step": 11878 }, { "epoch": 3.6316111280953836, "grad_norm": 0.7252886891365051, "learning_rate": 2.8292216891002506e-05, "loss": 0.1482, "step": 11879 }, { "epoch": 3.6319168450015287, "grad_norm": 0.49741190671920776, "learning_rate": 2.8291792280582565e-05, "loss": 0.1615, "step": 11880 }, { "epoch": 3.6322225619076733, "grad_norm": 0.4849351942539215, "learning_rate": 2.8291367670162627e-05, "loss": 0.1489, "step": 11881 }, { "epoch": 3.6325282788138185, "grad_norm": 0.5441967248916626, "learning_rate": 2.8290943059742685e-05, "loss": 0.1738, "step": 11882 }, { "epoch": 3.632833995719963, "grad_norm": 0.8647991418838501, "learning_rate": 2.8290518449322747e-05, "loss": 0.1579, "step": 11883 }, { "epoch": 3.6331397126261082, "grad_norm": 0.5764318108558655, "learning_rate": 2.8290093838902806e-05, "loss": 0.1925, "step": 11884 }, { "epoch": 3.6334454295322534, "grad_norm": 0.6718711853027344, "learning_rate": 2.8289669228482865e-05, "loss": 0.1858, "step": 11885 }, { "epoch": 3.633751146438398, "grad_norm": 0.7766542434692383, "learning_rate": 2.8289244618062927e-05, "loss": 0.2024, "step": 11886 }, { "epoch": 3.6340568633445427, "grad_norm": 0.7827467322349548, "learning_rate": 2.8288820007642986e-05, "loss": 0.1783, "step": 11887 }, { "epoch": 3.634362580250688, "grad_norm": 1.0433706045150757, "learning_rate": 2.8288395397223048e-05, "loss": 0.2725, "step": 11888 }, { "epoch": 3.634668297156833, "grad_norm": 0.5693661570549011, "learning_rate": 2.8287970786803106e-05, "loss": 0.1593, "step": 11889 }, { "epoch": 3.6349740140629776, "grad_norm": 0.3470436632633209, "learning_rate": 2.828754617638317e-05, "loss": 0.1113, "step": 11890 }, { "epoch": 3.6352797309691227, "grad_norm": 0.3419566750526428, "learning_rate": 2.8287121565963227e-05, "loss": 0.0679, "step": 11891 }, { "epoch": 3.6355854478752674, "grad_norm": 0.4354007840156555, "learning_rate": 2.828669695554329e-05, "loss": 0.0494, "step": 11892 }, { "epoch": 3.6358911647814125, "grad_norm": 0.2459566444158554, "learning_rate": 2.8286272345123348e-05, "loss": 0.0549, "step": 11893 }, { "epoch": 3.636196881687557, "grad_norm": 0.2599288523197174, "learning_rate": 2.828584773470341e-05, "loss": 0.0483, "step": 11894 }, { "epoch": 3.6365025985937023, "grad_norm": 0.2675650417804718, "learning_rate": 2.828542312428347e-05, "loss": 0.0515, "step": 11895 }, { "epoch": 3.636808315499847, "grad_norm": 0.8202938437461853, "learning_rate": 2.828499851386353e-05, "loss": 0.0959, "step": 11896 }, { "epoch": 3.637114032405992, "grad_norm": 0.3784049451351166, "learning_rate": 2.8284573903443593e-05, "loss": 0.0616, "step": 11897 }, { "epoch": 3.637419749312137, "grad_norm": 0.295743465423584, "learning_rate": 2.828414929302365e-05, "loss": 0.0665, "step": 11898 }, { "epoch": 3.637725466218282, "grad_norm": 0.27304309606552124, "learning_rate": 2.8283724682603714e-05, "loss": 0.0714, "step": 11899 }, { "epoch": 3.6380311831244265, "grad_norm": 0.523847222328186, "learning_rate": 2.8283300072183772e-05, "loss": 0.1073, "step": 11900 }, { "epoch": 3.6383369000305716, "grad_norm": 0.26696890592575073, "learning_rate": 2.8282875461763835e-05, "loss": 0.1112, "step": 11901 }, { "epoch": 3.6386426169367168, "grad_norm": 0.26565709710121155, "learning_rate": 2.8282450851343893e-05, "loss": 0.0933, "step": 11902 }, { "epoch": 3.6389483338428614, "grad_norm": 0.40301448106765747, "learning_rate": 2.8282026240923955e-05, "loss": 0.1212, "step": 11903 }, { "epoch": 3.6392540507490065, "grad_norm": 0.5433918237686157, "learning_rate": 2.8281601630504014e-05, "loss": 0.1524, "step": 11904 }, { "epoch": 3.639559767655151, "grad_norm": 0.7293044924736023, "learning_rate": 2.8281177020084076e-05, "loss": 0.1444, "step": 11905 }, { "epoch": 3.6398654845612963, "grad_norm": 0.6483283042907715, "learning_rate": 2.8280752409664135e-05, "loss": 0.1744, "step": 11906 }, { "epoch": 3.640171201467441, "grad_norm": 0.7163029909133911, "learning_rate": 2.8280327799244197e-05, "loss": 0.1846, "step": 11907 }, { "epoch": 3.640476918373586, "grad_norm": 1.1382486820220947, "learning_rate": 2.8279903188824256e-05, "loss": 0.1716, "step": 11908 }, { "epoch": 3.640782635279731, "grad_norm": 1.0966808795928955, "learning_rate": 2.8279478578404318e-05, "loss": 0.1574, "step": 11909 }, { "epoch": 3.641088352185876, "grad_norm": 0.555380642414093, "learning_rate": 2.8279053967984376e-05, "loss": 0.1959, "step": 11910 }, { "epoch": 3.641394069092021, "grad_norm": 0.7282954454421997, "learning_rate": 2.8278629357564435e-05, "loss": 0.2063, "step": 11911 }, { "epoch": 3.6416997859981657, "grad_norm": 0.8728958368301392, "learning_rate": 2.8278204747144497e-05, "loss": 0.1896, "step": 11912 }, { "epoch": 3.6420055029043104, "grad_norm": 0.7409018874168396, "learning_rate": 2.8277780136724556e-05, "loss": 0.2187, "step": 11913 }, { "epoch": 3.6423112198104555, "grad_norm": 0.42394164204597473, "learning_rate": 2.8277355526304618e-05, "loss": 0.1368, "step": 11914 }, { "epoch": 3.6426169367166006, "grad_norm": 0.3460516929626465, "learning_rate": 2.8276930915884677e-05, "loss": 0.1232, "step": 11915 }, { "epoch": 3.6429226536227453, "grad_norm": 0.3719013035297394, "learning_rate": 2.827650630546474e-05, "loss": 0.0801, "step": 11916 }, { "epoch": 3.6432283705288904, "grad_norm": 0.27545908093452454, "learning_rate": 2.8276081695044797e-05, "loss": 0.0624, "step": 11917 }, { "epoch": 3.643534087435035, "grad_norm": 0.19549356400966644, "learning_rate": 2.827565708462486e-05, "loss": 0.0494, "step": 11918 }, { "epoch": 3.64383980434118, "grad_norm": 0.2865433990955353, "learning_rate": 2.8275232474204918e-05, "loss": 0.0957, "step": 11919 }, { "epoch": 3.644145521247325, "grad_norm": 0.9130196571350098, "learning_rate": 2.827480786378498e-05, "loss": 0.08, "step": 11920 }, { "epoch": 3.64445123815347, "grad_norm": 0.21596695482730865, "learning_rate": 2.827438325336504e-05, "loss": 0.041, "step": 11921 }, { "epoch": 3.6447569550596146, "grad_norm": 0.28996333479881287, "learning_rate": 2.82739586429451e-05, "loss": 0.0811, "step": 11922 }, { "epoch": 3.6450626719657597, "grad_norm": 0.3829897940158844, "learning_rate": 2.827353403252516e-05, "loss": 0.0625, "step": 11923 }, { "epoch": 3.645368388871905, "grad_norm": 0.291230171918869, "learning_rate": 2.827310942210522e-05, "loss": 0.1117, "step": 11924 }, { "epoch": 3.6456741057780495, "grad_norm": 0.4493353068828583, "learning_rate": 2.827268481168528e-05, "loss": 0.0946, "step": 11925 }, { "epoch": 3.645979822684194, "grad_norm": 0.4543725550174713, "learning_rate": 2.827226020126534e-05, "loss": 0.1039, "step": 11926 }, { "epoch": 3.6462855395903393, "grad_norm": 0.40504026412963867, "learning_rate": 2.82718355908454e-05, "loss": 0.0897, "step": 11927 }, { "epoch": 3.6465912564964844, "grad_norm": 1.3106497526168823, "learning_rate": 2.827141098042546e-05, "loss": 0.1528, "step": 11928 }, { "epoch": 3.646896973402629, "grad_norm": 0.8654216527938843, "learning_rate": 2.8270986370005522e-05, "loss": 0.1321, "step": 11929 }, { "epoch": 3.647202690308774, "grad_norm": 0.6887930035591125, "learning_rate": 2.827056175958558e-05, "loss": 0.1463, "step": 11930 }, { "epoch": 3.647508407214919, "grad_norm": 0.7851628065109253, "learning_rate": 2.8270137149165643e-05, "loss": 0.1795, "step": 11931 }, { "epoch": 3.647814124121064, "grad_norm": 1.4753180742263794, "learning_rate": 2.82697125387457e-05, "loss": 0.1736, "step": 11932 }, { "epoch": 3.6481198410272087, "grad_norm": 1.0199941396713257, "learning_rate": 2.8269287928325764e-05, "loss": 0.1685, "step": 11933 }, { "epoch": 3.6484255579333538, "grad_norm": 0.9823446273803711, "learning_rate": 2.8268863317905822e-05, "loss": 0.1939, "step": 11934 }, { "epoch": 3.6487312748394984, "grad_norm": 0.8424067497253418, "learning_rate": 2.826843870748588e-05, "loss": 0.1871, "step": 11935 }, { "epoch": 3.6490369917456436, "grad_norm": 3.5172178745269775, "learning_rate": 2.8268014097065943e-05, "loss": 0.1821, "step": 11936 }, { "epoch": 3.6493427086517887, "grad_norm": 1.0149189233779907, "learning_rate": 2.8267589486646002e-05, "loss": 0.2247, "step": 11937 }, { "epoch": 3.6496484255579333, "grad_norm": 1.1313700675964355, "learning_rate": 2.8267164876226064e-05, "loss": 0.266, "step": 11938 }, { "epoch": 3.649954142464078, "grad_norm": 0.8226147890090942, "learning_rate": 2.8266740265806123e-05, "loss": 0.1492, "step": 11939 }, { "epoch": 3.650259859370223, "grad_norm": 0.2629293203353882, "learning_rate": 2.8266315655386185e-05, "loss": 0.0979, "step": 11940 }, { "epoch": 3.6505655762763682, "grad_norm": 0.29179292917251587, "learning_rate": 2.8265891044966244e-05, "loss": 0.0655, "step": 11941 }, { "epoch": 3.650871293182513, "grad_norm": 0.546625018119812, "learning_rate": 2.8265466434546306e-05, "loss": 0.066, "step": 11942 }, { "epoch": 3.651177010088658, "grad_norm": 0.30035799741744995, "learning_rate": 2.8265041824126364e-05, "loss": 0.053, "step": 11943 }, { "epoch": 3.6514827269948027, "grad_norm": 0.21405397355556488, "learning_rate": 2.8264617213706426e-05, "loss": 0.0607, "step": 11944 }, { "epoch": 3.651788443900948, "grad_norm": 0.32638850808143616, "learning_rate": 2.8264192603286485e-05, "loss": 0.0661, "step": 11945 }, { "epoch": 3.6520941608070925, "grad_norm": 0.33142006397247314, "learning_rate": 2.8263767992866547e-05, "loss": 0.0611, "step": 11946 }, { "epoch": 3.6523998777132376, "grad_norm": 0.35431045293807983, "learning_rate": 2.8263343382446606e-05, "loss": 0.0879, "step": 11947 }, { "epoch": 3.6527055946193823, "grad_norm": 0.3443698287010193, "learning_rate": 2.8262918772026665e-05, "loss": 0.0726, "step": 11948 }, { "epoch": 3.6530113115255274, "grad_norm": 0.32301855087280273, "learning_rate": 2.8262494161606727e-05, "loss": 0.1003, "step": 11949 }, { "epoch": 3.6533170284316725, "grad_norm": 0.2930426597595215, "learning_rate": 2.8262069551186785e-05, "loss": 0.0991, "step": 11950 }, { "epoch": 3.653622745337817, "grad_norm": 0.3499718904495239, "learning_rate": 2.8261644940766847e-05, "loss": 0.0854, "step": 11951 }, { "epoch": 3.653928462243962, "grad_norm": 0.5135340094566345, "learning_rate": 2.8261220330346906e-05, "loss": 0.1471, "step": 11952 }, { "epoch": 3.654234179150107, "grad_norm": 0.5223749279975891, "learning_rate": 2.8260795719926968e-05, "loss": 0.1242, "step": 11953 }, { "epoch": 3.654539896056252, "grad_norm": 0.5707592368125916, "learning_rate": 2.8260371109507027e-05, "loss": 0.1881, "step": 11954 }, { "epoch": 3.6548456129623967, "grad_norm": 0.45949888229370117, "learning_rate": 2.825994649908709e-05, "loss": 0.1706, "step": 11955 }, { "epoch": 3.655151329868542, "grad_norm": 0.5113751292228699, "learning_rate": 2.8259521888667148e-05, "loss": 0.1678, "step": 11956 }, { "epoch": 3.6554570467746865, "grad_norm": 0.5510910749435425, "learning_rate": 2.825909727824721e-05, "loss": 0.1765, "step": 11957 }, { "epoch": 3.6557627636808316, "grad_norm": 0.5246974229812622, "learning_rate": 2.825867266782727e-05, "loss": 0.2124, "step": 11958 }, { "epoch": 3.6560684805869763, "grad_norm": 1.031203269958496, "learning_rate": 2.825824805740733e-05, "loss": 0.2123, "step": 11959 }, { "epoch": 3.6563741974931214, "grad_norm": 0.58543860912323, "learning_rate": 2.825782344698739e-05, "loss": 0.2031, "step": 11960 }, { "epoch": 3.656679914399266, "grad_norm": 2.0182247161865234, "learning_rate": 2.8257398836567448e-05, "loss": 0.2088, "step": 11961 }, { "epoch": 3.656985631305411, "grad_norm": 1.3058240413665771, "learning_rate": 2.825697422614751e-05, "loss": 0.2252, "step": 11962 }, { "epoch": 3.6572913482115563, "grad_norm": 1.1558605432510376, "learning_rate": 2.825654961572757e-05, "loss": 0.2807, "step": 11963 }, { "epoch": 3.657597065117701, "grad_norm": 0.40271344780921936, "learning_rate": 2.825612500530763e-05, "loss": 0.1339, "step": 11964 }, { "epoch": 3.6579027820238457, "grad_norm": 0.4470112919807434, "learning_rate": 2.825570039488769e-05, "loss": 0.0889, "step": 11965 }, { "epoch": 3.658208498929991, "grad_norm": 0.353147029876709, "learning_rate": 2.825527578446775e-05, "loss": 0.0732, "step": 11966 }, { "epoch": 3.658514215836136, "grad_norm": 0.27411630749702454, "learning_rate": 2.825485117404781e-05, "loss": 0.0556, "step": 11967 }, { "epoch": 3.6588199327422806, "grad_norm": 0.2623741328716278, "learning_rate": 2.8254426563627872e-05, "loss": 0.0895, "step": 11968 }, { "epoch": 3.6591256496484257, "grad_norm": 0.3266567885875702, "learning_rate": 2.825400195320793e-05, "loss": 0.0762, "step": 11969 }, { "epoch": 3.6594313665545704, "grad_norm": 0.34803199768066406, "learning_rate": 2.8253577342787993e-05, "loss": 0.0499, "step": 11970 }, { "epoch": 3.6597370834607155, "grad_norm": 0.7812053561210632, "learning_rate": 2.8253152732368052e-05, "loss": 0.0752, "step": 11971 }, { "epoch": 3.66004280036686, "grad_norm": 0.2711164355278015, "learning_rate": 2.8252728121948114e-05, "loss": 0.0689, "step": 11972 }, { "epoch": 3.6603485172730053, "grad_norm": 0.7918285727500916, "learning_rate": 2.8252303511528173e-05, "loss": 0.0533, "step": 11973 }, { "epoch": 3.66065423417915, "grad_norm": 0.47233960032463074, "learning_rate": 2.825187890110823e-05, "loss": 0.0875, "step": 11974 }, { "epoch": 3.660959951085295, "grad_norm": 0.2258080393075943, "learning_rate": 2.8251454290688294e-05, "loss": 0.0839, "step": 11975 }, { "epoch": 3.66126566799144, "grad_norm": 0.3886774182319641, "learning_rate": 2.8251029680268352e-05, "loss": 0.087, "step": 11976 }, { "epoch": 3.661571384897585, "grad_norm": 0.5262261033058167, "learning_rate": 2.8250605069848414e-05, "loss": 0.1214, "step": 11977 }, { "epoch": 3.6618771018037295, "grad_norm": 0.5174117088317871, "learning_rate": 2.8250180459428473e-05, "loss": 0.1175, "step": 11978 }, { "epoch": 3.6621828187098746, "grad_norm": 0.40949687361717224, "learning_rate": 2.8249755849008535e-05, "loss": 0.1641, "step": 11979 }, { "epoch": 3.6624885356160197, "grad_norm": 0.5645827054977417, "learning_rate": 2.8249331238588594e-05, "loss": 0.1501, "step": 11980 }, { "epoch": 3.6627942525221644, "grad_norm": 0.676817774772644, "learning_rate": 2.8248906628168656e-05, "loss": 0.166, "step": 11981 }, { "epoch": 3.6630999694283095, "grad_norm": 0.5432229042053223, "learning_rate": 2.8248482017748715e-05, "loss": 0.1674, "step": 11982 }, { "epoch": 3.663405686334454, "grad_norm": 0.5186040997505188, "learning_rate": 2.8248057407328777e-05, "loss": 0.1685, "step": 11983 }, { "epoch": 3.6637114032405993, "grad_norm": 0.6970278024673462, "learning_rate": 2.8247632796908835e-05, "loss": 0.2008, "step": 11984 }, { "epoch": 3.664017120146744, "grad_norm": 6.348059177398682, "learning_rate": 2.8247208186488897e-05, "loss": 0.2131, "step": 11985 }, { "epoch": 3.664322837052889, "grad_norm": 0.8249282836914062, "learning_rate": 2.8246783576068956e-05, "loss": 0.193, "step": 11986 }, { "epoch": 3.6646285539590338, "grad_norm": 1.3885554075241089, "learning_rate": 2.8246358965649015e-05, "loss": 0.172, "step": 11987 }, { "epoch": 3.664934270865179, "grad_norm": 1.1010934114456177, "learning_rate": 2.8245934355229077e-05, "loss": 0.2549, "step": 11988 }, { "epoch": 3.665239987771324, "grad_norm": 0.36208802461624146, "learning_rate": 2.8245509744809136e-05, "loss": 0.1437, "step": 11989 }, { "epoch": 3.6655457046774687, "grad_norm": 0.35959386825561523, "learning_rate": 2.8245085134389198e-05, "loss": 0.1134, "step": 11990 }, { "epoch": 3.6658514215836133, "grad_norm": 0.3354943096637726, "learning_rate": 2.8244660523969256e-05, "loss": 0.0716, "step": 11991 }, { "epoch": 3.6661571384897584, "grad_norm": 0.40667563676834106, "learning_rate": 2.824423591354932e-05, "loss": 0.0661, "step": 11992 }, { "epoch": 3.6664628553959036, "grad_norm": 0.23361313343048096, "learning_rate": 2.8243811303129377e-05, "loss": 0.0543, "step": 11993 }, { "epoch": 3.6667685723020482, "grad_norm": 0.13685326278209686, "learning_rate": 2.824338669270944e-05, "loss": 0.054, "step": 11994 }, { "epoch": 3.6670742892081933, "grad_norm": 0.4498595893383026, "learning_rate": 2.8242962082289498e-05, "loss": 0.0788, "step": 11995 }, { "epoch": 3.667380006114338, "grad_norm": 0.37972524762153625, "learning_rate": 2.824253747186956e-05, "loss": 0.0816, "step": 11996 }, { "epoch": 3.667685723020483, "grad_norm": 0.39554455876350403, "learning_rate": 2.824211286144962e-05, "loss": 0.08, "step": 11997 }, { "epoch": 3.667991439926628, "grad_norm": 0.27266043424606323, "learning_rate": 2.824168825102968e-05, "loss": 0.0749, "step": 11998 }, { "epoch": 3.668297156832773, "grad_norm": 0.3923933506011963, "learning_rate": 2.8241263640609743e-05, "loss": 0.1064, "step": 11999 }, { "epoch": 3.6686028737389176, "grad_norm": 1.4726812839508057, "learning_rate": 2.82408390301898e-05, "loss": 0.0859, "step": 12000 }, { "epoch": 3.6686028737389176, "eval_cer": 0.1892354860179154, "eval_loss": 0.23735006153583527, "eval_runtime": 18.9875, "eval_samples_per_second": 238.999, "eval_steps_per_second": 0.79, "eval_wer": 0.33256054300159216, "step": 12000 }, { "epoch": 3.6689085906450627, "grad_norm": 0.30162861943244934, "learning_rate": 2.8240414419769864e-05, "loss": 0.1003, "step": 12001 }, { "epoch": 3.669214307551208, "grad_norm": 0.33009031414985657, "learning_rate": 2.8239989809349922e-05, "loss": 0.1059, "step": 12002 }, { "epoch": 3.6695200244573525, "grad_norm": 0.6069833040237427, "learning_rate": 2.8239565198929985e-05, "loss": 0.1483, "step": 12003 }, { "epoch": 3.669825741363497, "grad_norm": 0.38457947969436646, "learning_rate": 2.8239140588510043e-05, "loss": 0.1538, "step": 12004 }, { "epoch": 3.6701314582696423, "grad_norm": 0.6652607321739197, "learning_rate": 2.8238715978090105e-05, "loss": 0.152, "step": 12005 }, { "epoch": 3.6704371751757874, "grad_norm": 0.8202066421508789, "learning_rate": 2.8238291367670164e-05, "loss": 0.2029, "step": 12006 }, { "epoch": 3.670742892081932, "grad_norm": 0.6052011847496033, "learning_rate": 2.8237866757250226e-05, "loss": 0.1641, "step": 12007 }, { "epoch": 3.671048608988077, "grad_norm": 0.46525266766548157, "learning_rate": 2.8237442146830285e-05, "loss": 0.2032, "step": 12008 }, { "epoch": 3.671354325894222, "grad_norm": 0.7314935326576233, "learning_rate": 2.8237017536410347e-05, "loss": 0.2526, "step": 12009 }, { "epoch": 3.671660042800367, "grad_norm": 0.7591322660446167, "learning_rate": 2.8236592925990406e-05, "loss": 0.1936, "step": 12010 }, { "epoch": 3.6719657597065116, "grad_norm": 2.0682785511016846, "learning_rate": 2.8236168315570468e-05, "loss": 0.2174, "step": 12011 }, { "epoch": 3.6722714766126567, "grad_norm": 1.585244059562683, "learning_rate": 2.8235743705150526e-05, "loss": 0.2102, "step": 12012 }, { "epoch": 3.6725771935188014, "grad_norm": 1.6153252124786377, "learning_rate": 2.8235319094730585e-05, "loss": 0.267, "step": 12013 }, { "epoch": 3.6728829104249465, "grad_norm": 0.4876508116722107, "learning_rate": 2.8234894484310647e-05, "loss": 0.1484, "step": 12014 }, { "epoch": 3.6731886273310916, "grad_norm": 0.26835665106773376, "learning_rate": 2.8234469873890706e-05, "loss": 0.1235, "step": 12015 }, { "epoch": 3.6734943442372363, "grad_norm": 0.3133852481842041, "learning_rate": 2.8234045263470768e-05, "loss": 0.0696, "step": 12016 }, { "epoch": 3.673800061143381, "grad_norm": 0.1616891324520111, "learning_rate": 2.8233620653050827e-05, "loss": 0.0468, "step": 12017 }, { "epoch": 3.674105778049526, "grad_norm": 0.757885754108429, "learning_rate": 2.823319604263089e-05, "loss": 0.0607, "step": 12018 }, { "epoch": 3.674411494955671, "grad_norm": 0.36823827028274536, "learning_rate": 2.8232771432210947e-05, "loss": 0.1034, "step": 12019 }, { "epoch": 3.674717211861816, "grad_norm": 0.21234993636608124, "learning_rate": 2.823234682179101e-05, "loss": 0.0509, "step": 12020 }, { "epoch": 3.675022928767961, "grad_norm": 0.3347912132740021, "learning_rate": 2.8231922211371068e-05, "loss": 0.0459, "step": 12021 }, { "epoch": 3.6753286456741057, "grad_norm": 4.138100624084473, "learning_rate": 2.823149760095113e-05, "loss": 0.0811, "step": 12022 }, { "epoch": 3.675634362580251, "grad_norm": 0.5946982502937317, "learning_rate": 2.823107299053119e-05, "loss": 0.0542, "step": 12023 }, { "epoch": 3.6759400794863955, "grad_norm": 0.5198044180870056, "learning_rate": 2.823064838011125e-05, "loss": 0.1126, "step": 12024 }, { "epoch": 3.6762457963925406, "grad_norm": 0.2680531442165375, "learning_rate": 2.823022376969131e-05, "loss": 0.0813, "step": 12025 }, { "epoch": 3.6765515132986852, "grad_norm": 0.6768645644187927, "learning_rate": 2.822979915927137e-05, "loss": 0.1377, "step": 12026 }, { "epoch": 3.6768572302048304, "grad_norm": 0.32915419340133667, "learning_rate": 2.822937454885143e-05, "loss": 0.1278, "step": 12027 }, { "epoch": 3.6771629471109755, "grad_norm": 0.3419412076473236, "learning_rate": 2.822894993843149e-05, "loss": 0.1149, "step": 12028 }, { "epoch": 3.67746866401712, "grad_norm": 0.44523292779922485, "learning_rate": 2.822852532801155e-05, "loss": 0.1284, "step": 12029 }, { "epoch": 3.677774380923265, "grad_norm": 0.8937473297119141, "learning_rate": 2.822810071759161e-05, "loss": 0.1939, "step": 12030 }, { "epoch": 3.67808009782941, "grad_norm": 0.6277521848678589, "learning_rate": 2.8227676107171672e-05, "loss": 0.1938, "step": 12031 }, { "epoch": 3.678385814735555, "grad_norm": 0.9641230702400208, "learning_rate": 2.822725149675173e-05, "loss": 0.1881, "step": 12032 }, { "epoch": 3.6786915316416997, "grad_norm": 1.0644123554229736, "learning_rate": 2.8226826886331793e-05, "loss": 0.1957, "step": 12033 }, { "epoch": 3.678997248547845, "grad_norm": 1.6015177965164185, "learning_rate": 2.822640227591185e-05, "loss": 0.3038, "step": 12034 }, { "epoch": 3.6793029654539895, "grad_norm": 0.783247172832489, "learning_rate": 2.8225977665491914e-05, "loss": 0.2005, "step": 12035 }, { "epoch": 3.6796086823601346, "grad_norm": 0.8882373571395874, "learning_rate": 2.8225553055071972e-05, "loss": 0.2524, "step": 12036 }, { "epoch": 3.6799143992662793, "grad_norm": 0.9530139565467834, "learning_rate": 2.8225128444652035e-05, "loss": 0.1843, "step": 12037 }, { "epoch": 3.6802201161724244, "grad_norm": 2.159282922744751, "learning_rate": 2.8224703834232093e-05, "loss": 0.2946, "step": 12038 }, { "epoch": 3.680525833078569, "grad_norm": 0.7108483910560608, "learning_rate": 2.8224279223812152e-05, "loss": 0.1715, "step": 12039 }, { "epoch": 3.680831549984714, "grad_norm": 0.6257935762405396, "learning_rate": 2.8223854613392214e-05, "loss": 0.0776, "step": 12040 }, { "epoch": 3.6811372668908593, "grad_norm": 0.3885909616947174, "learning_rate": 2.8223430002972273e-05, "loss": 0.0999, "step": 12041 }, { "epoch": 3.681442983797004, "grad_norm": 0.2675653398036957, "learning_rate": 2.8223005392552335e-05, "loss": 0.0588, "step": 12042 }, { "epoch": 3.6817487007031486, "grad_norm": 0.24460680782794952, "learning_rate": 2.8222580782132394e-05, "loss": 0.0642, "step": 12043 }, { "epoch": 3.6820544176092938, "grad_norm": 0.1569153219461441, "learning_rate": 2.8222156171712456e-05, "loss": 0.0445, "step": 12044 }, { "epoch": 3.682360134515439, "grad_norm": 0.285241037607193, "learning_rate": 2.8221731561292514e-05, "loss": 0.071, "step": 12045 }, { "epoch": 3.6826658514215835, "grad_norm": 0.2760595977306366, "learning_rate": 2.8221306950872576e-05, "loss": 0.0591, "step": 12046 }, { "epoch": 3.6829715683277287, "grad_norm": 0.3598121106624603, "learning_rate": 2.8220882340452635e-05, "loss": 0.0828, "step": 12047 }, { "epoch": 3.6832772852338733, "grad_norm": 0.36339646577835083, "learning_rate": 2.8220457730032697e-05, "loss": 0.0722, "step": 12048 }, { "epoch": 3.6835830021400184, "grad_norm": 0.3260767459869385, "learning_rate": 2.8220033119612756e-05, "loss": 0.1186, "step": 12049 }, { "epoch": 3.683888719046163, "grad_norm": 0.2732526659965515, "learning_rate": 2.8219608509192815e-05, "loss": 0.0751, "step": 12050 }, { "epoch": 3.684194435952308, "grad_norm": 0.429093599319458, "learning_rate": 2.8219183898772877e-05, "loss": 0.1034, "step": 12051 }, { "epoch": 3.684500152858453, "grad_norm": 0.32360491156578064, "learning_rate": 2.8218759288352935e-05, "loss": 0.1138, "step": 12052 }, { "epoch": 3.684805869764598, "grad_norm": 1.077656626701355, "learning_rate": 2.8218334677932997e-05, "loss": 0.1274, "step": 12053 }, { "epoch": 3.685111586670743, "grad_norm": 0.7752101421356201, "learning_rate": 2.8217910067513056e-05, "loss": 0.1849, "step": 12054 }, { "epoch": 3.685417303576888, "grad_norm": 0.42795491218566895, "learning_rate": 2.8217485457093118e-05, "loss": 0.1597, "step": 12055 }, { "epoch": 3.6857230204830325, "grad_norm": 0.6543185114860535, "learning_rate": 2.8217060846673177e-05, "loss": 0.1692, "step": 12056 }, { "epoch": 3.6860287373891776, "grad_norm": 0.5095911026000977, "learning_rate": 2.821663623625324e-05, "loss": 0.1776, "step": 12057 }, { "epoch": 3.6863344542953227, "grad_norm": 1.2739832401275635, "learning_rate": 2.8216211625833298e-05, "loss": 0.1678, "step": 12058 }, { "epoch": 3.6866401712014674, "grad_norm": 0.8544392585754395, "learning_rate": 2.821578701541336e-05, "loss": 0.1788, "step": 12059 }, { "epoch": 3.6869458881076125, "grad_norm": 2.643709897994995, "learning_rate": 2.821536240499342e-05, "loss": 0.2062, "step": 12060 }, { "epoch": 3.687251605013757, "grad_norm": 1.1053943634033203, "learning_rate": 2.821493779457348e-05, "loss": 0.1752, "step": 12061 }, { "epoch": 3.6875573219199023, "grad_norm": 0.9081541895866394, "learning_rate": 2.821451318415354e-05, "loss": 0.2314, "step": 12062 }, { "epoch": 3.687863038826047, "grad_norm": 1.838461995124817, "learning_rate": 2.8214088573733598e-05, "loss": 0.2741, "step": 12063 }, { "epoch": 3.688168755732192, "grad_norm": 0.5558493733406067, "learning_rate": 2.821366396331366e-05, "loss": 0.1397, "step": 12064 }, { "epoch": 3.6884744726383367, "grad_norm": 0.30870750546455383, "learning_rate": 2.821323935289372e-05, "loss": 0.0778, "step": 12065 }, { "epoch": 3.688780189544482, "grad_norm": 0.384689599275589, "learning_rate": 2.821281474247378e-05, "loss": 0.0566, "step": 12066 }, { "epoch": 3.689085906450627, "grad_norm": 0.3132034242153168, "learning_rate": 2.821239013205384e-05, "loss": 0.0773, "step": 12067 }, { "epoch": 3.6893916233567716, "grad_norm": 0.215736523270607, "learning_rate": 2.82119655216339e-05, "loss": 0.0562, "step": 12068 }, { "epoch": 3.6896973402629163, "grad_norm": 0.5045027732849121, "learning_rate": 2.821154091121396e-05, "loss": 0.0577, "step": 12069 }, { "epoch": 3.6900030571690614, "grad_norm": 0.31986773014068604, "learning_rate": 2.8211116300794022e-05, "loss": 0.0548, "step": 12070 }, { "epoch": 3.6903087740752065, "grad_norm": 0.2540302276611328, "learning_rate": 2.821069169037408e-05, "loss": 0.0581, "step": 12071 }, { "epoch": 3.690614490981351, "grad_norm": 0.37901216745376587, "learning_rate": 2.8210267079954143e-05, "loss": 0.0746, "step": 12072 }, { "epoch": 3.6909202078874963, "grad_norm": 0.7212011814117432, "learning_rate": 2.8209842469534202e-05, "loss": 0.0688, "step": 12073 }, { "epoch": 3.691225924793641, "grad_norm": 0.7191643714904785, "learning_rate": 2.8209417859114264e-05, "loss": 0.0808, "step": 12074 }, { "epoch": 3.691531641699786, "grad_norm": 0.586270272731781, "learning_rate": 2.8208993248694323e-05, "loss": 0.1115, "step": 12075 }, { "epoch": 3.6918373586059308, "grad_norm": 0.3587043285369873, "learning_rate": 2.820856863827438e-05, "loss": 0.1314, "step": 12076 }, { "epoch": 3.692143075512076, "grad_norm": 0.4079412817955017, "learning_rate": 2.8208144027854444e-05, "loss": 0.1129, "step": 12077 }, { "epoch": 3.6924487924182205, "grad_norm": 1.5374773740768433, "learning_rate": 2.8207719417434502e-05, "loss": 0.1427, "step": 12078 }, { "epoch": 3.6927545093243657, "grad_norm": 0.7580461502075195, "learning_rate": 2.8207294807014564e-05, "loss": 0.1475, "step": 12079 }, { "epoch": 3.693060226230511, "grad_norm": 0.5761195421218872, "learning_rate": 2.8206870196594623e-05, "loss": 0.1652, "step": 12080 }, { "epoch": 3.6933659431366554, "grad_norm": 0.6495999097824097, "learning_rate": 2.8206445586174685e-05, "loss": 0.2103, "step": 12081 }, { "epoch": 3.6936716600428, "grad_norm": 0.7713256478309631, "learning_rate": 2.8206020975754744e-05, "loss": 0.1826, "step": 12082 }, { "epoch": 3.6939773769489452, "grad_norm": 0.9481729865074158, "learning_rate": 2.8205596365334806e-05, "loss": 0.2008, "step": 12083 }, { "epoch": 3.6942830938550903, "grad_norm": 1.0538285970687866, "learning_rate": 2.8205171754914865e-05, "loss": 0.1933, "step": 12084 }, { "epoch": 3.694588810761235, "grad_norm": 0.7417480945587158, "learning_rate": 2.8204747144494927e-05, "loss": 0.1833, "step": 12085 }, { "epoch": 3.69489452766738, "grad_norm": 0.7142953276634216, "learning_rate": 2.8204322534074985e-05, "loss": 0.1807, "step": 12086 }, { "epoch": 3.695200244573525, "grad_norm": 3.701155662536621, "learning_rate": 2.8203897923655047e-05, "loss": 0.2072, "step": 12087 }, { "epoch": 3.69550596147967, "grad_norm": 1.0026227235794067, "learning_rate": 2.8203473313235106e-05, "loss": 0.2581, "step": 12088 }, { "epoch": 3.6958116783858146, "grad_norm": 0.4979563355445862, "learning_rate": 2.8203048702815165e-05, "loss": 0.1338, "step": 12089 }, { "epoch": 3.6961173952919597, "grad_norm": 0.43651899695396423, "learning_rate": 2.8202624092395227e-05, "loss": 0.0799, "step": 12090 }, { "epoch": 3.6964231121981044, "grad_norm": 0.428185373544693, "learning_rate": 2.8202199481975286e-05, "loss": 0.0823, "step": 12091 }, { "epoch": 3.6967288291042495, "grad_norm": 0.3377753496170044, "learning_rate": 2.8201774871555348e-05, "loss": 0.066, "step": 12092 }, { "epoch": 3.6970345460103946, "grad_norm": 0.26806655526161194, "learning_rate": 2.8201350261135406e-05, "loss": 0.0618, "step": 12093 }, { "epoch": 3.6973402629165393, "grad_norm": 0.2806757688522339, "learning_rate": 2.820092565071547e-05, "loss": 0.0565, "step": 12094 }, { "epoch": 3.697645979822684, "grad_norm": 0.2086852788925171, "learning_rate": 2.8200501040295527e-05, "loss": 0.051, "step": 12095 }, { "epoch": 3.697951696728829, "grad_norm": 0.8166077733039856, "learning_rate": 2.820007642987559e-05, "loss": 0.0606, "step": 12096 }, { "epoch": 3.698257413634974, "grad_norm": 0.28645652532577515, "learning_rate": 2.8199651819455648e-05, "loss": 0.0928, "step": 12097 }, { "epoch": 3.698563130541119, "grad_norm": 0.7048086524009705, "learning_rate": 2.819922720903571e-05, "loss": 0.0725, "step": 12098 }, { "epoch": 3.698868847447264, "grad_norm": 0.9460377097129822, "learning_rate": 2.819880259861577e-05, "loss": 0.0645, "step": 12099 }, { "epoch": 3.6991745643534086, "grad_norm": 0.34024712443351746, "learning_rate": 2.819837798819583e-05, "loss": 0.0954, "step": 12100 }, { "epoch": 3.6994802812595537, "grad_norm": 0.3352932035923004, "learning_rate": 2.8197953377775893e-05, "loss": 0.0987, "step": 12101 }, { "epoch": 3.6997859981656984, "grad_norm": 0.27072983980178833, "learning_rate": 2.8197528767355952e-05, "loss": 0.1, "step": 12102 }, { "epoch": 3.7000917150718435, "grad_norm": 0.7786687016487122, "learning_rate": 2.8197104156936014e-05, "loss": 0.161, "step": 12103 }, { "epoch": 3.700397431977988, "grad_norm": 0.6363269686698914, "learning_rate": 2.8196679546516072e-05, "loss": 0.15, "step": 12104 }, { "epoch": 3.7007031488841333, "grad_norm": 0.8042523860931396, "learning_rate": 2.8196254936096135e-05, "loss": 0.1885, "step": 12105 }, { "epoch": 3.7010088657902784, "grad_norm": 0.5880594849586487, "learning_rate": 2.8195830325676193e-05, "loss": 0.1635, "step": 12106 }, { "epoch": 3.701314582696423, "grad_norm": 0.8195476531982422, "learning_rate": 2.8195405715256255e-05, "loss": 0.219, "step": 12107 }, { "epoch": 3.7016202996025678, "grad_norm": 0.6792598366737366, "learning_rate": 2.8194981104836314e-05, "loss": 0.1779, "step": 12108 }, { "epoch": 3.701926016508713, "grad_norm": 1.2919092178344727, "learning_rate": 2.8194556494416376e-05, "loss": 0.1826, "step": 12109 }, { "epoch": 3.702231733414858, "grad_norm": 1.4620620012283325, "learning_rate": 2.8194131883996435e-05, "loss": 0.2004, "step": 12110 }, { "epoch": 3.7025374503210027, "grad_norm": 0.7329964637756348, "learning_rate": 2.8193707273576497e-05, "loss": 0.2379, "step": 12111 }, { "epoch": 3.702843167227148, "grad_norm": 0.8668164610862732, "learning_rate": 2.8193282663156556e-05, "loss": 0.1852, "step": 12112 }, { "epoch": 3.7031488841332925, "grad_norm": 3.8335678577423096, "learning_rate": 2.8192858052736618e-05, "loss": 0.2588, "step": 12113 }, { "epoch": 3.7034546010394376, "grad_norm": 0.3751384913921356, "learning_rate": 2.8192433442316676e-05, "loss": 0.1408, "step": 12114 }, { "epoch": 3.7037603179455822, "grad_norm": 0.18363697826862335, "learning_rate": 2.8192008831896735e-05, "loss": 0.0701, "step": 12115 }, { "epoch": 3.7040660348517274, "grad_norm": 0.30244120955467224, "learning_rate": 2.8191584221476797e-05, "loss": 0.1157, "step": 12116 }, { "epoch": 3.704371751757872, "grad_norm": 0.3364492952823639, "learning_rate": 2.8191159611056856e-05, "loss": 0.0648, "step": 12117 }, { "epoch": 3.704677468664017, "grad_norm": 0.3851816952228546, "learning_rate": 2.8190735000636918e-05, "loss": 0.0617, "step": 12118 }, { "epoch": 3.7049831855701623, "grad_norm": 0.24666696786880493, "learning_rate": 2.8190310390216977e-05, "loss": 0.0542, "step": 12119 }, { "epoch": 3.705288902476307, "grad_norm": 0.19368377327919006, "learning_rate": 2.818988577979704e-05, "loss": 0.0556, "step": 12120 }, { "epoch": 3.7055946193824516, "grad_norm": 0.9330170154571533, "learning_rate": 2.8189461169377097e-05, "loss": 0.0593, "step": 12121 }, { "epoch": 3.7059003362885967, "grad_norm": 0.572128176689148, "learning_rate": 2.818903655895716e-05, "loss": 0.1026, "step": 12122 }, { "epoch": 3.706206053194742, "grad_norm": 0.3542979955673218, "learning_rate": 2.8188611948537218e-05, "loss": 0.0968, "step": 12123 }, { "epoch": 3.7065117701008865, "grad_norm": 0.6591923832893372, "learning_rate": 2.818818733811728e-05, "loss": 0.0853, "step": 12124 }, { "epoch": 3.7068174870070316, "grad_norm": 0.48752546310424805, "learning_rate": 2.818776272769734e-05, "loss": 0.0858, "step": 12125 }, { "epoch": 3.7071232039131763, "grad_norm": 0.4888482689857483, "learning_rate": 2.81873381172774e-05, "loss": 0.0923, "step": 12126 }, { "epoch": 3.7074289208193214, "grad_norm": 0.5643006563186646, "learning_rate": 2.818691350685746e-05, "loss": 0.1265, "step": 12127 }, { "epoch": 3.707734637725466, "grad_norm": 4.395726203918457, "learning_rate": 2.818648889643752e-05, "loss": 0.1434, "step": 12128 }, { "epoch": 3.708040354631611, "grad_norm": 0.6526485681533813, "learning_rate": 2.818606428601758e-05, "loss": 0.1366, "step": 12129 }, { "epoch": 3.708346071537756, "grad_norm": 0.8227155804634094, "learning_rate": 2.818563967559764e-05, "loss": 0.1624, "step": 12130 }, { "epoch": 3.708651788443901, "grad_norm": 0.40990501642227173, "learning_rate": 2.81852150651777e-05, "loss": 0.1597, "step": 12131 }, { "epoch": 3.708957505350046, "grad_norm": 0.469353586435318, "learning_rate": 2.818479045475776e-05, "loss": 0.1482, "step": 12132 }, { "epoch": 3.7092632222561908, "grad_norm": 0.789216935634613, "learning_rate": 2.8184365844337822e-05, "loss": 0.1828, "step": 12133 }, { "epoch": 3.7095689391623354, "grad_norm": 0.6251068711280823, "learning_rate": 2.818394123391788e-05, "loss": 0.1692, "step": 12134 }, { "epoch": 3.7098746560684805, "grad_norm": 0.6499804854393005, "learning_rate": 2.8183516623497943e-05, "loss": 0.1751, "step": 12135 }, { "epoch": 3.7101803729746257, "grad_norm": 1.0827223062515259, "learning_rate": 2.8183092013078002e-05, "loss": 0.2679, "step": 12136 }, { "epoch": 3.7104860898807703, "grad_norm": 2.0309183597564697, "learning_rate": 2.8182667402658064e-05, "loss": 0.2197, "step": 12137 }, { "epoch": 3.7107918067869154, "grad_norm": 1.6232401132583618, "learning_rate": 2.8182242792238122e-05, "loss": 0.2815, "step": 12138 }, { "epoch": 3.71109752369306, "grad_norm": 0.6464628577232361, "learning_rate": 2.8181818181818185e-05, "loss": 0.1459, "step": 12139 }, { "epoch": 3.7114032405992052, "grad_norm": 0.33790621161460876, "learning_rate": 2.8181393571398243e-05, "loss": 0.0783, "step": 12140 }, { "epoch": 3.71170895750535, "grad_norm": 0.2499135583639145, "learning_rate": 2.8180968960978302e-05, "loss": 0.0896, "step": 12141 }, { "epoch": 3.712014674411495, "grad_norm": 0.31241723895072937, "learning_rate": 2.8180544350558364e-05, "loss": 0.0728, "step": 12142 }, { "epoch": 3.7123203913176397, "grad_norm": 0.4997214078903198, "learning_rate": 2.8180119740138423e-05, "loss": 0.0754, "step": 12143 }, { "epoch": 3.712626108223785, "grad_norm": 0.2369890809059143, "learning_rate": 2.8179695129718485e-05, "loss": 0.0707, "step": 12144 }, { "epoch": 3.71293182512993, "grad_norm": 0.727214515209198, "learning_rate": 2.8179270519298544e-05, "loss": 0.0673, "step": 12145 }, { "epoch": 3.7132375420360746, "grad_norm": 0.7773690819740295, "learning_rate": 2.8178845908878606e-05, "loss": 0.0736, "step": 12146 }, { "epoch": 3.7135432589422193, "grad_norm": 0.45519429445266724, "learning_rate": 2.8178421298458664e-05, "loss": 0.1197, "step": 12147 }, { "epoch": 3.7138489758483644, "grad_norm": 0.724873423576355, "learning_rate": 2.8177996688038726e-05, "loss": 0.0674, "step": 12148 }, { "epoch": 3.7141546927545095, "grad_norm": 0.45441997051239014, "learning_rate": 2.8177572077618785e-05, "loss": 0.1292, "step": 12149 }, { "epoch": 3.714460409660654, "grad_norm": 0.3731149435043335, "learning_rate": 2.8177147467198847e-05, "loss": 0.0772, "step": 12150 }, { "epoch": 3.7147661265667993, "grad_norm": 0.4723815619945526, "learning_rate": 2.8176722856778906e-05, "loss": 0.0996, "step": 12151 }, { "epoch": 3.715071843472944, "grad_norm": 0.6526547074317932, "learning_rate": 2.8176298246358968e-05, "loss": 0.1105, "step": 12152 }, { "epoch": 3.715377560379089, "grad_norm": 0.5269560813903809, "learning_rate": 2.8175873635939027e-05, "loss": 0.1251, "step": 12153 }, { "epoch": 3.7156832772852337, "grad_norm": 1.3701939582824707, "learning_rate": 2.8175449025519085e-05, "loss": 0.13, "step": 12154 }, { "epoch": 3.715988994191379, "grad_norm": 0.9154121279716492, "learning_rate": 2.8175024415099148e-05, "loss": 0.1915, "step": 12155 }, { "epoch": 3.7162947110975235, "grad_norm": 0.9507400393486023, "learning_rate": 2.8174599804679206e-05, "loss": 0.178, "step": 12156 }, { "epoch": 3.7166004280036686, "grad_norm": 1.4272516965866089, "learning_rate": 2.8174175194259268e-05, "loss": 0.1872, "step": 12157 }, { "epoch": 3.7169061449098137, "grad_norm": 0.8014790415763855, "learning_rate": 2.8173750583839327e-05, "loss": 0.1422, "step": 12158 }, { "epoch": 3.7172118618159584, "grad_norm": 1.9106662273406982, "learning_rate": 2.817332597341939e-05, "loss": 0.1959, "step": 12159 }, { "epoch": 3.717517578722103, "grad_norm": 4.368098735809326, "learning_rate": 2.8172901362999448e-05, "loss": 0.2071, "step": 12160 }, { "epoch": 3.717823295628248, "grad_norm": 1.1558758020401, "learning_rate": 2.817247675257951e-05, "loss": 0.2283, "step": 12161 }, { "epoch": 3.7181290125343933, "grad_norm": 4.152878761291504, "learning_rate": 2.817205214215957e-05, "loss": 0.1964, "step": 12162 }, { "epoch": 3.718434729440538, "grad_norm": 1.214240312576294, "learning_rate": 2.817162753173963e-05, "loss": 0.2455, "step": 12163 }, { "epoch": 3.718740446346683, "grad_norm": 0.5591237545013428, "learning_rate": 2.817120292131969e-05, "loss": 0.1319, "step": 12164 }, { "epoch": 3.7190461632528278, "grad_norm": 0.6175518035888672, "learning_rate": 2.8170778310899748e-05, "loss": 0.0724, "step": 12165 }, { "epoch": 3.719351880158973, "grad_norm": 0.3873523473739624, "learning_rate": 2.817035370047981e-05, "loss": 0.077, "step": 12166 }, { "epoch": 3.7196575970651176, "grad_norm": 0.6352532505989075, "learning_rate": 2.816992909005987e-05, "loss": 0.0919, "step": 12167 }, { "epoch": 3.7199633139712627, "grad_norm": 0.4659424424171448, "learning_rate": 2.816950447963993e-05, "loss": 0.0579, "step": 12168 }, { "epoch": 3.7202690308774073, "grad_norm": 0.21096496284008026, "learning_rate": 2.816907986921999e-05, "loss": 0.0358, "step": 12169 }, { "epoch": 3.7205747477835525, "grad_norm": 0.572263240814209, "learning_rate": 2.8168655258800052e-05, "loss": 0.062, "step": 12170 }, { "epoch": 3.7208804646896976, "grad_norm": 0.309483140707016, "learning_rate": 2.816823064838011e-05, "loss": 0.0728, "step": 12171 }, { "epoch": 3.7211861815958422, "grad_norm": 0.5543188452720642, "learning_rate": 2.8167806037960173e-05, "loss": 0.0703, "step": 12172 }, { "epoch": 3.721491898501987, "grad_norm": 0.469788521528244, "learning_rate": 2.816738142754023e-05, "loss": 0.074, "step": 12173 }, { "epoch": 3.721797615408132, "grad_norm": 1.0449360609054565, "learning_rate": 2.8166956817120293e-05, "loss": 0.0962, "step": 12174 }, { "epoch": 3.722103332314277, "grad_norm": 0.8292784094810486, "learning_rate": 2.8166532206700352e-05, "loss": 0.0771, "step": 12175 }, { "epoch": 3.722409049220422, "grad_norm": 0.5902529358863831, "learning_rate": 2.8166107596280414e-05, "loss": 0.0895, "step": 12176 }, { "epoch": 3.722714766126567, "grad_norm": 0.7138229608535767, "learning_rate": 2.8165682985860473e-05, "loss": 0.1501, "step": 12177 }, { "epoch": 3.7230204830327116, "grad_norm": 1.0475444793701172, "learning_rate": 2.816525837544053e-05, "loss": 0.1312, "step": 12178 }, { "epoch": 3.7233261999388567, "grad_norm": 0.6669849157333374, "learning_rate": 2.8164833765020594e-05, "loss": 0.1461, "step": 12179 }, { "epoch": 3.7236319168450014, "grad_norm": 1.412272572517395, "learning_rate": 2.8164409154600652e-05, "loss": 0.1428, "step": 12180 }, { "epoch": 3.7239376337511465, "grad_norm": 1.3975147008895874, "learning_rate": 2.8163984544180714e-05, "loss": 0.1685, "step": 12181 }, { "epoch": 3.724243350657291, "grad_norm": 0.6232331991195679, "learning_rate": 2.8163559933760773e-05, "loss": 0.1739, "step": 12182 }, { "epoch": 3.7245490675634363, "grad_norm": 0.6990103125572205, "learning_rate": 2.8163135323340835e-05, "loss": 0.2094, "step": 12183 }, { "epoch": 3.7248547844695814, "grad_norm": 1.1468578577041626, "learning_rate": 2.8162710712920894e-05, "loss": 0.1865, "step": 12184 }, { "epoch": 3.725160501375726, "grad_norm": 1.1241129636764526, "learning_rate": 2.8162286102500956e-05, "loss": 0.1645, "step": 12185 }, { "epoch": 3.7254662182818707, "grad_norm": 0.8806542158126831, "learning_rate": 2.8161861492081015e-05, "loss": 0.1749, "step": 12186 }, { "epoch": 3.725771935188016, "grad_norm": 2.654649496078491, "learning_rate": 2.8161436881661077e-05, "loss": 0.2404, "step": 12187 }, { "epoch": 3.726077652094161, "grad_norm": 1.181640386581421, "learning_rate": 2.8161012271241135e-05, "loss": 0.1933, "step": 12188 }, { "epoch": 3.7263833690003056, "grad_norm": 0.6183341145515442, "learning_rate": 2.8160587660821198e-05, "loss": 0.1348, "step": 12189 }, { "epoch": 3.7266890859064508, "grad_norm": 0.4136729836463928, "learning_rate": 2.8160163050401256e-05, "loss": 0.0825, "step": 12190 }, { "epoch": 3.7269948028125954, "grad_norm": 0.3077714145183563, "learning_rate": 2.8159738439981315e-05, "loss": 0.06, "step": 12191 }, { "epoch": 3.7273005197187405, "grad_norm": 0.29856252670288086, "learning_rate": 2.8159313829561377e-05, "loss": 0.07, "step": 12192 }, { "epoch": 3.727606236624885, "grad_norm": 0.2239868938922882, "learning_rate": 2.8158889219141436e-05, "loss": 0.062, "step": 12193 }, { "epoch": 3.7279119535310303, "grad_norm": 0.30136963725090027, "learning_rate": 2.8158464608721498e-05, "loss": 0.0612, "step": 12194 }, { "epoch": 3.728217670437175, "grad_norm": 0.330839067697525, "learning_rate": 2.8158039998301556e-05, "loss": 0.0865, "step": 12195 }, { "epoch": 3.72852338734332, "grad_norm": 0.4506281614303589, "learning_rate": 2.815761538788162e-05, "loss": 0.0663, "step": 12196 }, { "epoch": 3.7288291042494652, "grad_norm": 0.5013958811759949, "learning_rate": 2.8157190777461677e-05, "loss": 0.096, "step": 12197 }, { "epoch": 3.72913482115561, "grad_norm": 0.28636181354522705, "learning_rate": 2.815676616704174e-05, "loss": 0.0862, "step": 12198 }, { "epoch": 3.7294405380617546, "grad_norm": 0.401771605014801, "learning_rate": 2.8156341556621798e-05, "loss": 0.0904, "step": 12199 }, { "epoch": 3.7297462549678997, "grad_norm": 0.6400455832481384, "learning_rate": 2.815591694620186e-05, "loss": 0.0996, "step": 12200 }, { "epoch": 3.730051971874045, "grad_norm": 0.8213857412338257, "learning_rate": 2.815549233578192e-05, "loss": 0.0925, "step": 12201 }, { "epoch": 3.7303576887801895, "grad_norm": 0.7134050726890564, "learning_rate": 2.815506772536198e-05, "loss": 0.1218, "step": 12202 }, { "epoch": 3.7306634056863346, "grad_norm": 1.5304718017578125, "learning_rate": 2.8154643114942043e-05, "loss": 0.1286, "step": 12203 }, { "epoch": 3.7309691225924793, "grad_norm": 0.49882763624191284, "learning_rate": 2.8154218504522102e-05, "loss": 0.1359, "step": 12204 }, { "epoch": 3.7312748394986244, "grad_norm": 0.7837218046188354, "learning_rate": 2.8153793894102164e-05, "loss": 0.1672, "step": 12205 }, { "epoch": 3.731580556404769, "grad_norm": 0.5777390003204346, "learning_rate": 2.8153369283682223e-05, "loss": 0.1546, "step": 12206 }, { "epoch": 3.731886273310914, "grad_norm": 1.0442451238632202, "learning_rate": 2.8152944673262285e-05, "loss": 0.2118, "step": 12207 }, { "epoch": 3.732191990217059, "grad_norm": 19.48548126220703, "learning_rate": 2.8152520062842343e-05, "loss": 0.2012, "step": 12208 }, { "epoch": 3.732497707123204, "grad_norm": 1.4159963130950928, "learning_rate": 2.8152095452422405e-05, "loss": 0.208, "step": 12209 }, { "epoch": 3.732803424029349, "grad_norm": 1.3350037336349487, "learning_rate": 2.8151670842002464e-05, "loss": 0.1858, "step": 12210 }, { "epoch": 3.7331091409354937, "grad_norm": 1.1488580703735352, "learning_rate": 2.8151246231582526e-05, "loss": 0.229, "step": 12211 }, { "epoch": 3.7334148578416384, "grad_norm": 1.9699434041976929, "learning_rate": 2.8150821621162585e-05, "loss": 0.2035, "step": 12212 }, { "epoch": 3.7337205747477835, "grad_norm": 2.5671133995056152, "learning_rate": 2.8150397010742647e-05, "loss": 0.3497, "step": 12213 }, { "epoch": 3.7340262916539286, "grad_norm": 0.5518456697463989, "learning_rate": 2.8149972400322706e-05, "loss": 0.1617, "step": 12214 }, { "epoch": 3.7343320085600733, "grad_norm": 0.28887030482292175, "learning_rate": 2.8149547789902768e-05, "loss": 0.0933, "step": 12215 }, { "epoch": 3.7346377254662184, "grad_norm": 0.37304237484931946, "learning_rate": 2.8149123179482826e-05, "loss": 0.0687, "step": 12216 }, { "epoch": 3.734943442372363, "grad_norm": 0.15185242891311646, "learning_rate": 2.8148698569062885e-05, "loss": 0.0439, "step": 12217 }, { "epoch": 3.735249159278508, "grad_norm": 0.2086067795753479, "learning_rate": 2.8148273958642947e-05, "loss": 0.0529, "step": 12218 }, { "epoch": 3.735554876184653, "grad_norm": 0.30341219902038574, "learning_rate": 2.8147849348223006e-05, "loss": 0.0504, "step": 12219 }, { "epoch": 3.735860593090798, "grad_norm": 0.28236639499664307, "learning_rate": 2.8147424737803068e-05, "loss": 0.0458, "step": 12220 }, { "epoch": 3.7361663099969427, "grad_norm": 0.4908217191696167, "learning_rate": 2.8147000127383127e-05, "loss": 0.0559, "step": 12221 }, { "epoch": 3.7364720269030878, "grad_norm": 0.57806795835495, "learning_rate": 2.814657551696319e-05, "loss": 0.1194, "step": 12222 }, { "epoch": 3.736777743809233, "grad_norm": 0.22829490900039673, "learning_rate": 2.8146150906543248e-05, "loss": 0.0696, "step": 12223 }, { "epoch": 3.7370834607153776, "grad_norm": 1.2656680345535278, "learning_rate": 2.814572629612331e-05, "loss": 0.1052, "step": 12224 }, { "epoch": 3.7373891776215222, "grad_norm": 0.35924702882766724, "learning_rate": 2.814530168570337e-05, "loss": 0.0765, "step": 12225 }, { "epoch": 3.7376948945276673, "grad_norm": 0.37659087777137756, "learning_rate": 2.814487707528343e-05, "loss": 0.0889, "step": 12226 }, { "epoch": 3.7380006114338125, "grad_norm": 0.4647407829761505, "learning_rate": 2.814445246486349e-05, "loss": 0.1137, "step": 12227 }, { "epoch": 3.738306328339957, "grad_norm": 0.5568381547927856, "learning_rate": 2.814402785444355e-05, "loss": 0.1476, "step": 12228 }, { "epoch": 3.7386120452461022, "grad_norm": 0.5223169922828674, "learning_rate": 2.814360324402361e-05, "loss": 0.1326, "step": 12229 }, { "epoch": 3.738917762152247, "grad_norm": 0.6964367032051086, "learning_rate": 2.814317863360367e-05, "loss": 0.158, "step": 12230 }, { "epoch": 3.739223479058392, "grad_norm": 0.7660449147224426, "learning_rate": 2.814275402318373e-05, "loss": 0.1442, "step": 12231 }, { "epoch": 3.7395291959645367, "grad_norm": 1.186630129814148, "learning_rate": 2.814232941276379e-05, "loss": 0.1764, "step": 12232 }, { "epoch": 3.739834912870682, "grad_norm": 2.713330030441284, "learning_rate": 2.814190480234385e-05, "loss": 0.1691, "step": 12233 }, { "epoch": 3.7401406297768265, "grad_norm": 4.667356491088867, "learning_rate": 2.814148019192391e-05, "loss": 0.2276, "step": 12234 }, { "epoch": 3.7404463466829716, "grad_norm": 1.3183948993682861, "learning_rate": 2.8141055581503972e-05, "loss": 0.182, "step": 12235 }, { "epoch": 3.7407520635891167, "grad_norm": 1.2263801097869873, "learning_rate": 2.814063097108403e-05, "loss": 0.2683, "step": 12236 }, { "epoch": 3.7410577804952614, "grad_norm": 1.3862416744232178, "learning_rate": 2.8140206360664093e-05, "loss": 0.2056, "step": 12237 }, { "epoch": 3.741363497401406, "grad_norm": 1.4808170795440674, "learning_rate": 2.8139781750244152e-05, "loss": 0.2462, "step": 12238 }, { "epoch": 3.741669214307551, "grad_norm": 0.4257934093475342, "learning_rate": 2.8139357139824214e-05, "loss": 0.1478, "step": 12239 }, { "epoch": 3.7419749312136963, "grad_norm": 0.4928136467933655, "learning_rate": 2.8138932529404273e-05, "loss": 0.0722, "step": 12240 }, { "epoch": 3.742280648119841, "grad_norm": 0.2858369052410126, "learning_rate": 2.8138507918984335e-05, "loss": 0.074, "step": 12241 }, { "epoch": 3.742586365025986, "grad_norm": 0.6616882681846619, "learning_rate": 2.8138083308564393e-05, "loss": 0.0688, "step": 12242 }, { "epoch": 3.7428920819321307, "grad_norm": 0.3016863763332367, "learning_rate": 2.8137658698144452e-05, "loss": 0.0607, "step": 12243 }, { "epoch": 3.743197798838276, "grad_norm": 0.3037016987800598, "learning_rate": 2.8137234087724514e-05, "loss": 0.0586, "step": 12244 }, { "epoch": 3.7435035157444205, "grad_norm": 0.5066967010498047, "learning_rate": 2.8136809477304573e-05, "loss": 0.0594, "step": 12245 }, { "epoch": 3.7438092326505656, "grad_norm": 0.4575718343257904, "learning_rate": 2.8136384866884635e-05, "loss": 0.0841, "step": 12246 }, { "epoch": 3.7441149495567103, "grad_norm": 0.28400105237960815, "learning_rate": 2.8135960256464694e-05, "loss": 0.0662, "step": 12247 }, { "epoch": 3.7444206664628554, "grad_norm": 0.23416449129581451, "learning_rate": 2.8135535646044756e-05, "loss": 0.0609, "step": 12248 }, { "epoch": 3.7447263833690005, "grad_norm": 0.7189582586288452, "learning_rate": 2.8135111035624814e-05, "loss": 0.1328, "step": 12249 }, { "epoch": 3.745032100275145, "grad_norm": 1.010986566543579, "learning_rate": 2.8134686425204876e-05, "loss": 0.0729, "step": 12250 }, { "epoch": 3.74533781718129, "grad_norm": 0.5499368906021118, "learning_rate": 2.8134261814784935e-05, "loss": 0.0761, "step": 12251 }, { "epoch": 3.745643534087435, "grad_norm": 0.5039981007575989, "learning_rate": 2.8133837204364997e-05, "loss": 0.1815, "step": 12252 }, { "epoch": 3.74594925099358, "grad_norm": 0.8035646677017212, "learning_rate": 2.8133412593945056e-05, "loss": 0.1193, "step": 12253 }, { "epoch": 3.746254967899725, "grad_norm": 0.799053966999054, "learning_rate": 2.8132987983525118e-05, "loss": 0.168, "step": 12254 }, { "epoch": 3.74656068480587, "grad_norm": 0.7801622152328491, "learning_rate": 2.8132563373105177e-05, "loss": 0.1685, "step": 12255 }, { "epoch": 3.7468664017120146, "grad_norm": 0.44199544191360474, "learning_rate": 2.8132138762685235e-05, "loss": 0.2033, "step": 12256 }, { "epoch": 3.7471721186181597, "grad_norm": 7.585861682891846, "learning_rate": 2.8131714152265298e-05, "loss": 0.2082, "step": 12257 }, { "epoch": 3.7474778355243044, "grad_norm": 1.317372441291809, "learning_rate": 2.8131289541845356e-05, "loss": 0.2022, "step": 12258 }, { "epoch": 3.7477835524304495, "grad_norm": 1.2263109683990479, "learning_rate": 2.813086493142542e-05, "loss": 0.1766, "step": 12259 }, { "epoch": 3.748089269336594, "grad_norm": 0.9392316937446594, "learning_rate": 2.8130440321005477e-05, "loss": 0.2036, "step": 12260 }, { "epoch": 3.7483949862427393, "grad_norm": 1.2456028461456299, "learning_rate": 2.813001571058554e-05, "loss": 0.1673, "step": 12261 }, { "epoch": 3.7487007031488844, "grad_norm": 1.3599578142166138, "learning_rate": 2.8129591100165598e-05, "loss": 0.2169, "step": 12262 }, { "epoch": 3.749006420055029, "grad_norm": 1.4408283233642578, "learning_rate": 2.812916648974566e-05, "loss": 0.2372, "step": 12263 }, { "epoch": 3.7493121369611737, "grad_norm": 0.8896666765213013, "learning_rate": 2.812874187932572e-05, "loss": 0.1327, "step": 12264 }, { "epoch": 3.749617853867319, "grad_norm": 0.39975547790527344, "learning_rate": 2.812831726890578e-05, "loss": 0.0747, "step": 12265 }, { "epoch": 3.749923570773464, "grad_norm": 0.32913240790367126, "learning_rate": 2.812789265848584e-05, "loss": 0.1088, "step": 12266 }, { "epoch": 3.7502292876796086, "grad_norm": 0.4067794978618622, "learning_rate": 2.81274680480659e-05, "loss": 0.0539, "step": 12267 }, { "epoch": 3.7505350045857537, "grad_norm": 0.490201860666275, "learning_rate": 2.812704343764596e-05, "loss": 0.0858, "step": 12268 }, { "epoch": 3.7508407214918984, "grad_norm": 0.26230037212371826, "learning_rate": 2.812661882722602e-05, "loss": 0.0542, "step": 12269 }, { "epoch": 3.7511464383980435, "grad_norm": 0.5442776679992676, "learning_rate": 2.812619421680608e-05, "loss": 0.055, "step": 12270 }, { "epoch": 3.751452155304188, "grad_norm": 0.5065364837646484, "learning_rate": 2.812576960638614e-05, "loss": 0.0623, "step": 12271 }, { "epoch": 3.7517578722103333, "grad_norm": 0.44638004899024963, "learning_rate": 2.8125344995966202e-05, "loss": 0.0832, "step": 12272 }, { "epoch": 3.752063589116478, "grad_norm": 0.385575532913208, "learning_rate": 2.812492038554626e-05, "loss": 0.0872, "step": 12273 }, { "epoch": 3.752369306022623, "grad_norm": 0.773652195930481, "learning_rate": 2.8124495775126323e-05, "loss": 0.078, "step": 12274 }, { "epoch": 3.752675022928768, "grad_norm": 0.5178918838500977, "learning_rate": 2.812407116470638e-05, "loss": 0.0805, "step": 12275 }, { "epoch": 3.752980739834913, "grad_norm": 0.3317956030368805, "learning_rate": 2.8123646554286443e-05, "loss": 0.0997, "step": 12276 }, { "epoch": 3.7532864567410575, "grad_norm": 0.9412323832511902, "learning_rate": 2.8123221943866502e-05, "loss": 0.138, "step": 12277 }, { "epoch": 3.7535921736472027, "grad_norm": 0.9097295999526978, "learning_rate": 2.8122797333446564e-05, "loss": 0.1414, "step": 12278 }, { "epoch": 3.7538978905533478, "grad_norm": 0.46672654151916504, "learning_rate": 2.8122372723026623e-05, "loss": 0.1648, "step": 12279 }, { "epoch": 3.7542036074594924, "grad_norm": 1.0524120330810547, "learning_rate": 2.8121948112606685e-05, "loss": 0.2031, "step": 12280 }, { "epoch": 3.7545093243656376, "grad_norm": 2.0074315071105957, "learning_rate": 2.8121523502186744e-05, "loss": 0.1502, "step": 12281 }, { "epoch": 3.7548150412717822, "grad_norm": 0.7241616249084473, "learning_rate": 2.8121098891766802e-05, "loss": 0.1797, "step": 12282 }, { "epoch": 3.7551207581779273, "grad_norm": 1.008439064025879, "learning_rate": 2.8120674281346864e-05, "loss": 0.1806, "step": 12283 }, { "epoch": 3.755426475084072, "grad_norm": 0.6657471656799316, "learning_rate": 2.8120249670926923e-05, "loss": 0.1709, "step": 12284 }, { "epoch": 3.755732191990217, "grad_norm": 1.6943691968917847, "learning_rate": 2.8119825060506985e-05, "loss": 0.2055, "step": 12285 }, { "epoch": 3.756037908896362, "grad_norm": 1.392191767692566, "learning_rate": 2.8119400450087044e-05, "loss": 0.2129, "step": 12286 }, { "epoch": 3.756343625802507, "grad_norm": 2.8516340255737305, "learning_rate": 2.8118975839667106e-05, "loss": 0.2326, "step": 12287 }, { "epoch": 3.756649342708652, "grad_norm": 2.023428201675415, "learning_rate": 2.8118551229247165e-05, "loss": 0.2578, "step": 12288 }, { "epoch": 3.7569550596147967, "grad_norm": 2.0605077743530273, "learning_rate": 2.8118126618827227e-05, "loss": 0.1777, "step": 12289 }, { "epoch": 3.7572607765209414, "grad_norm": 0.2228725403547287, "learning_rate": 2.8117702008407285e-05, "loss": 0.0745, "step": 12290 }, { "epoch": 3.7575664934270865, "grad_norm": 0.2228710651397705, "learning_rate": 2.8117277397987348e-05, "loss": 0.0591, "step": 12291 }, { "epoch": 3.7578722103332316, "grad_norm": 0.3065859377384186, "learning_rate": 2.8116852787567406e-05, "loss": 0.0776, "step": 12292 }, { "epoch": 3.7581779272393763, "grad_norm": 0.6911678910255432, "learning_rate": 2.8116428177147465e-05, "loss": 0.0526, "step": 12293 }, { "epoch": 3.7584836441455214, "grad_norm": 0.48828667402267456, "learning_rate": 2.8116003566727527e-05, "loss": 0.0503, "step": 12294 }, { "epoch": 3.758789361051666, "grad_norm": 0.3852267265319824, "learning_rate": 2.8115578956307586e-05, "loss": 0.067, "step": 12295 }, { "epoch": 3.759095077957811, "grad_norm": 0.32718828320503235, "learning_rate": 2.8115154345887648e-05, "loss": 0.0917, "step": 12296 }, { "epoch": 3.759400794863956, "grad_norm": 0.2173108011484146, "learning_rate": 2.8114729735467707e-05, "loss": 0.1062, "step": 12297 }, { "epoch": 3.759706511770101, "grad_norm": 0.25570404529571533, "learning_rate": 2.811430512504777e-05, "loss": 0.0777, "step": 12298 }, { "epoch": 3.7600122286762456, "grad_norm": 0.24534524977207184, "learning_rate": 2.8113880514627827e-05, "loss": 0.0771, "step": 12299 }, { "epoch": 3.7603179455823907, "grad_norm": 0.2651313841342926, "learning_rate": 2.811345590420789e-05, "loss": 0.0532, "step": 12300 }, { "epoch": 3.760623662488536, "grad_norm": 0.34832167625427246, "learning_rate": 2.8113031293787948e-05, "loss": 0.0877, "step": 12301 }, { "epoch": 3.7609293793946805, "grad_norm": 0.6746938228607178, "learning_rate": 2.811260668336801e-05, "loss": 0.1261, "step": 12302 }, { "epoch": 3.761235096300825, "grad_norm": 0.974989116191864, "learning_rate": 2.811218207294807e-05, "loss": 0.1676, "step": 12303 }, { "epoch": 3.7615408132069703, "grad_norm": 0.46498894691467285, "learning_rate": 2.811175746252813e-05, "loss": 0.1521, "step": 12304 }, { "epoch": 3.7618465301131154, "grad_norm": 0.9093959331512451, "learning_rate": 2.8111332852108193e-05, "loss": 0.1885, "step": 12305 }, { "epoch": 3.76215224701926, "grad_norm": 0.4090564250946045, "learning_rate": 2.8110908241688252e-05, "loss": 0.1848, "step": 12306 }, { "epoch": 3.762457963925405, "grad_norm": 0.40995246171951294, "learning_rate": 2.8110483631268314e-05, "loss": 0.1442, "step": 12307 }, { "epoch": 3.76276368083155, "grad_norm": 0.8992806077003479, "learning_rate": 2.8110059020848373e-05, "loss": 0.1736, "step": 12308 }, { "epoch": 3.763069397737695, "grad_norm": 0.8713218569755554, "learning_rate": 2.8109634410428435e-05, "loss": 0.2071, "step": 12309 }, { "epoch": 3.7633751146438397, "grad_norm": 0.7891702055931091, "learning_rate": 2.8109209800008493e-05, "loss": 0.1784, "step": 12310 }, { "epoch": 3.763680831549985, "grad_norm": 0.8857466578483582, "learning_rate": 2.8108785189588555e-05, "loss": 0.2267, "step": 12311 }, { "epoch": 3.7639865484561295, "grad_norm": 1.3312770128250122, "learning_rate": 2.8108360579168614e-05, "loss": 0.2486, "step": 12312 }, { "epoch": 3.7642922653622746, "grad_norm": 1.0752320289611816, "learning_rate": 2.8107935968748676e-05, "loss": 0.2499, "step": 12313 }, { "epoch": 3.7645979822684197, "grad_norm": 0.5050508379936218, "learning_rate": 2.8107511358328735e-05, "loss": 0.1608, "step": 12314 }, { "epoch": 3.7649036991745644, "grad_norm": 0.2645193934440613, "learning_rate": 2.8107086747908797e-05, "loss": 0.0802, "step": 12315 }, { "epoch": 3.765209416080709, "grad_norm": 0.8045337200164795, "learning_rate": 2.8106662137488856e-05, "loss": 0.1006, "step": 12316 }, { "epoch": 3.765515132986854, "grad_norm": 0.3031395673751831, "learning_rate": 2.8106237527068918e-05, "loss": 0.0892, "step": 12317 }, { "epoch": 3.7658208498929993, "grad_norm": 0.30917659401893616, "learning_rate": 2.8105812916648976e-05, "loss": 0.0738, "step": 12318 }, { "epoch": 3.766126566799144, "grad_norm": 0.7233683466911316, "learning_rate": 2.8105388306229035e-05, "loss": 0.0588, "step": 12319 }, { "epoch": 3.766432283705289, "grad_norm": 0.39801478385925293, "learning_rate": 2.8104963695809097e-05, "loss": 0.0614, "step": 12320 }, { "epoch": 3.7667380006114337, "grad_norm": 0.4693896770477295, "learning_rate": 2.8104539085389156e-05, "loss": 0.0727, "step": 12321 }, { "epoch": 3.767043717517579, "grad_norm": 0.44981345534324646, "learning_rate": 2.8104114474969218e-05, "loss": 0.0826, "step": 12322 }, { "epoch": 3.7673494344237235, "grad_norm": 0.3182886838912964, "learning_rate": 2.8103689864549277e-05, "loss": 0.0674, "step": 12323 }, { "epoch": 3.7676551513298686, "grad_norm": 0.4671022891998291, "learning_rate": 2.810326525412934e-05, "loss": 0.0862, "step": 12324 }, { "epoch": 3.7679608682360133, "grad_norm": 0.41310185194015503, "learning_rate": 2.8102840643709398e-05, "loss": 0.0944, "step": 12325 }, { "epoch": 3.7682665851421584, "grad_norm": 0.25075677037239075, "learning_rate": 2.810241603328946e-05, "loss": 0.0831, "step": 12326 }, { "epoch": 3.7685723020483035, "grad_norm": 0.48422330617904663, "learning_rate": 2.810199142286952e-05, "loss": 0.1285, "step": 12327 }, { "epoch": 3.768878018954448, "grad_norm": 0.64329594373703, "learning_rate": 2.810156681244958e-05, "loss": 0.1299, "step": 12328 }, { "epoch": 3.769183735860593, "grad_norm": 0.6714340448379517, "learning_rate": 2.810114220202964e-05, "loss": 0.2126, "step": 12329 }, { "epoch": 3.769489452766738, "grad_norm": 0.5756922960281372, "learning_rate": 2.81007175916097e-05, "loss": 0.1665, "step": 12330 }, { "epoch": 3.769795169672883, "grad_norm": 0.7431420683860779, "learning_rate": 2.810029298118976e-05, "loss": 0.188, "step": 12331 }, { "epoch": 3.7701008865790278, "grad_norm": 0.5142768025398254, "learning_rate": 2.809986837076982e-05, "loss": 0.1637, "step": 12332 }, { "epoch": 3.770406603485173, "grad_norm": 1.0556930303573608, "learning_rate": 2.809944376034988e-05, "loss": 0.1894, "step": 12333 }, { "epoch": 3.7707123203913175, "grad_norm": 0.650965690612793, "learning_rate": 2.809901914992994e-05, "loss": 0.1791, "step": 12334 }, { "epoch": 3.7710180372974627, "grad_norm": 0.9178462624549866, "learning_rate": 2.809859453951e-05, "loss": 0.1924, "step": 12335 }, { "epoch": 3.7713237542036073, "grad_norm": 1.732070803642273, "learning_rate": 2.809816992909006e-05, "loss": 0.2291, "step": 12336 }, { "epoch": 3.7716294711097524, "grad_norm": 1.092232584953308, "learning_rate": 2.8097745318670122e-05, "loss": 0.2183, "step": 12337 }, { "epoch": 3.771935188015897, "grad_norm": 1.7080106735229492, "learning_rate": 2.809732070825018e-05, "loss": 0.2788, "step": 12338 }, { "epoch": 3.7722409049220422, "grad_norm": 0.8863224983215332, "learning_rate": 2.8096896097830243e-05, "loss": 0.167, "step": 12339 }, { "epoch": 3.7725466218281873, "grad_norm": 0.3173794150352478, "learning_rate": 2.8096471487410302e-05, "loss": 0.0877, "step": 12340 }, { "epoch": 3.772852338734332, "grad_norm": 0.24164195358753204, "learning_rate": 2.8096046876990364e-05, "loss": 0.0668, "step": 12341 }, { "epoch": 3.7731580556404767, "grad_norm": 0.3994818925857544, "learning_rate": 2.8095622266570423e-05, "loss": 0.0571, "step": 12342 }, { "epoch": 3.773463772546622, "grad_norm": 0.35951852798461914, "learning_rate": 2.8095197656150485e-05, "loss": 0.0698, "step": 12343 }, { "epoch": 3.773769489452767, "grad_norm": 0.24742157757282257, "learning_rate": 2.8094773045730543e-05, "loss": 0.0735, "step": 12344 }, { "epoch": 3.7740752063589116, "grad_norm": 0.2524430453777313, "learning_rate": 2.8094348435310602e-05, "loss": 0.0554, "step": 12345 }, { "epoch": 3.7743809232650567, "grad_norm": 0.26777851581573486, "learning_rate": 2.8093923824890664e-05, "loss": 0.054, "step": 12346 }, { "epoch": 3.7746866401712014, "grad_norm": 0.2628082036972046, "learning_rate": 2.8093499214470723e-05, "loss": 0.0673, "step": 12347 }, { "epoch": 3.7749923570773465, "grad_norm": 0.3572167754173279, "learning_rate": 2.8093074604050785e-05, "loss": 0.0713, "step": 12348 }, { "epoch": 3.775298073983491, "grad_norm": 0.25641393661499023, "learning_rate": 2.8092649993630844e-05, "loss": 0.0609, "step": 12349 }, { "epoch": 3.7756037908896363, "grad_norm": 0.4714646339416504, "learning_rate": 2.8092225383210906e-05, "loss": 0.1031, "step": 12350 }, { "epoch": 3.775909507795781, "grad_norm": 0.3458315432071686, "learning_rate": 2.8091800772790964e-05, "loss": 0.1036, "step": 12351 }, { "epoch": 3.776215224701926, "grad_norm": 0.4322526752948761, "learning_rate": 2.8091376162371026e-05, "loss": 0.1264, "step": 12352 }, { "epoch": 3.776520941608071, "grad_norm": 0.5219970941543579, "learning_rate": 2.8090951551951085e-05, "loss": 0.1299, "step": 12353 }, { "epoch": 3.776826658514216, "grad_norm": 0.6853410601615906, "learning_rate": 2.8090526941531147e-05, "loss": 0.1558, "step": 12354 }, { "epoch": 3.7771323754203605, "grad_norm": 0.6415146589279175, "learning_rate": 2.8090102331111206e-05, "loss": 0.1831, "step": 12355 }, { "epoch": 3.7774380923265056, "grad_norm": 0.6792811155319214, "learning_rate": 2.8089677720691268e-05, "loss": 0.1682, "step": 12356 }, { "epoch": 3.7777438092326507, "grad_norm": 0.7751348614692688, "learning_rate": 2.8089253110271327e-05, "loss": 0.1985, "step": 12357 }, { "epoch": 3.7780495261387954, "grad_norm": 0.674325704574585, "learning_rate": 2.8088828499851385e-05, "loss": 0.2003, "step": 12358 }, { "epoch": 3.7783552430449405, "grad_norm": 0.527586817741394, "learning_rate": 2.8088403889431448e-05, "loss": 0.1453, "step": 12359 }, { "epoch": 3.778660959951085, "grad_norm": 0.9603501558303833, "learning_rate": 2.8087979279011506e-05, "loss": 0.1673, "step": 12360 }, { "epoch": 3.7789666768572303, "grad_norm": 2.3503658771514893, "learning_rate": 2.808755466859157e-05, "loss": 0.2034, "step": 12361 }, { "epoch": 3.779272393763375, "grad_norm": 0.8514179587364197, "learning_rate": 2.8087130058171627e-05, "loss": 0.2835, "step": 12362 }, { "epoch": 3.77957811066952, "grad_norm": 1.3266047239303589, "learning_rate": 2.808670544775169e-05, "loss": 0.2733, "step": 12363 }, { "epoch": 3.7798838275756648, "grad_norm": 0.5678476095199585, "learning_rate": 2.8086280837331748e-05, "loss": 0.1562, "step": 12364 }, { "epoch": 3.78018954448181, "grad_norm": 0.25830110907554626, "learning_rate": 2.808585622691181e-05, "loss": 0.0737, "step": 12365 }, { "epoch": 3.780495261387955, "grad_norm": 0.2312336564064026, "learning_rate": 2.808543161649187e-05, "loss": 0.0818, "step": 12366 }, { "epoch": 3.7808009782940997, "grad_norm": 0.4119085669517517, "learning_rate": 2.808500700607193e-05, "loss": 0.0733, "step": 12367 }, { "epoch": 3.7811066952002443, "grad_norm": 0.37759414315223694, "learning_rate": 2.808458239565199e-05, "loss": 0.0723, "step": 12368 }, { "epoch": 3.7814124121063895, "grad_norm": 0.18851913511753082, "learning_rate": 2.808415778523205e-05, "loss": 0.0554, "step": 12369 }, { "epoch": 3.7817181290125346, "grad_norm": 0.2378050535917282, "learning_rate": 2.808373317481211e-05, "loss": 0.0902, "step": 12370 }, { "epoch": 3.7820238459186792, "grad_norm": 0.3793354332447052, "learning_rate": 2.808330856439217e-05, "loss": 0.0587, "step": 12371 }, { "epoch": 3.7823295628248244, "grad_norm": 0.6015732288360596, "learning_rate": 2.808288395397223e-05, "loss": 0.0699, "step": 12372 }, { "epoch": 3.782635279730969, "grad_norm": 0.24390143156051636, "learning_rate": 2.808245934355229e-05, "loss": 0.0662, "step": 12373 }, { "epoch": 3.782940996637114, "grad_norm": 0.2633736729621887, "learning_rate": 2.8082034733132352e-05, "loss": 0.0966, "step": 12374 }, { "epoch": 3.783246713543259, "grad_norm": 0.33753344416618347, "learning_rate": 2.808161012271241e-05, "loss": 0.1247, "step": 12375 }, { "epoch": 3.783552430449404, "grad_norm": 0.27813705801963806, "learning_rate": 2.8081185512292473e-05, "loss": 0.0664, "step": 12376 }, { "epoch": 3.7838581473555486, "grad_norm": 0.6431933641433716, "learning_rate": 2.808076090187253e-05, "loss": 0.1151, "step": 12377 }, { "epoch": 3.7841638642616937, "grad_norm": 0.44995835423469543, "learning_rate": 2.8080336291452593e-05, "loss": 0.1431, "step": 12378 }, { "epoch": 3.784469581167839, "grad_norm": 0.638580322265625, "learning_rate": 2.8079911681032652e-05, "loss": 0.1624, "step": 12379 }, { "epoch": 3.7847752980739835, "grad_norm": 0.8659681677818298, "learning_rate": 2.8079487070612714e-05, "loss": 0.1592, "step": 12380 }, { "epoch": 3.785081014980128, "grad_norm": 0.6461348533630371, "learning_rate": 2.8079062460192773e-05, "loss": 0.166, "step": 12381 }, { "epoch": 3.7853867318862733, "grad_norm": 1.2064989805221558, "learning_rate": 2.8078637849772835e-05, "loss": 0.1836, "step": 12382 }, { "epoch": 3.7856924487924184, "grad_norm": 0.9994871616363525, "learning_rate": 2.8078213239352894e-05, "loss": 0.1697, "step": 12383 }, { "epoch": 3.785998165698563, "grad_norm": 0.8210234642028809, "learning_rate": 2.8077788628932952e-05, "loss": 0.2017, "step": 12384 }, { "epoch": 3.786303882604708, "grad_norm": 1.034287691116333, "learning_rate": 2.8077364018513014e-05, "loss": 0.2046, "step": 12385 }, { "epoch": 3.786609599510853, "grad_norm": 1.4748069047927856, "learning_rate": 2.8076939408093073e-05, "loss": 0.182, "step": 12386 }, { "epoch": 3.786915316416998, "grad_norm": 1.9795206785202026, "learning_rate": 2.8076514797673135e-05, "loss": 0.171, "step": 12387 }, { "epoch": 3.7872210333231426, "grad_norm": 2.434643268585205, "learning_rate": 2.8076090187253194e-05, "loss": 0.2916, "step": 12388 }, { "epoch": 3.7875267502292878, "grad_norm": 0.7861005663871765, "learning_rate": 2.8075665576833256e-05, "loss": 0.1488, "step": 12389 }, { "epoch": 3.7878324671354324, "grad_norm": 0.8329139351844788, "learning_rate": 2.8075240966413315e-05, "loss": 0.0816, "step": 12390 }, { "epoch": 3.7881381840415775, "grad_norm": 1.0474094152450562, "learning_rate": 2.8074816355993377e-05, "loss": 0.0599, "step": 12391 }, { "epoch": 3.7884439009477227, "grad_norm": 0.28338590264320374, "learning_rate": 2.8074391745573435e-05, "loss": 0.1038, "step": 12392 }, { "epoch": 3.7887496178538673, "grad_norm": 0.8563012480735779, "learning_rate": 2.8073967135153498e-05, "loss": 0.0593, "step": 12393 }, { "epoch": 3.789055334760012, "grad_norm": 0.27740272879600525, "learning_rate": 2.8073542524733556e-05, "loss": 0.069, "step": 12394 }, { "epoch": 3.789361051666157, "grad_norm": 0.3924362361431122, "learning_rate": 2.807311791431362e-05, "loss": 0.0746, "step": 12395 }, { "epoch": 3.7896667685723022, "grad_norm": 0.40116220712661743, "learning_rate": 2.8072693303893677e-05, "loss": 0.0975, "step": 12396 }, { "epoch": 3.789972485478447, "grad_norm": 0.36154210567474365, "learning_rate": 2.8072268693473736e-05, "loss": 0.069, "step": 12397 }, { "epoch": 3.790278202384592, "grad_norm": 0.24797381460666656, "learning_rate": 2.8071844083053798e-05, "loss": 0.051, "step": 12398 }, { "epoch": 3.7905839192907367, "grad_norm": 0.3712245225906372, "learning_rate": 2.8071419472633857e-05, "loss": 0.0961, "step": 12399 }, { "epoch": 3.790889636196882, "grad_norm": 0.6671234369277954, "learning_rate": 2.807099486221392e-05, "loss": 0.0916, "step": 12400 }, { "epoch": 3.7911953531030265, "grad_norm": 0.37003034353256226, "learning_rate": 2.8070570251793977e-05, "loss": 0.1227, "step": 12401 }, { "epoch": 3.7915010700091716, "grad_norm": 0.49513179063796997, "learning_rate": 2.807014564137404e-05, "loss": 0.1072, "step": 12402 }, { "epoch": 3.7918067869153163, "grad_norm": 0.5591790676116943, "learning_rate": 2.8069721030954098e-05, "loss": 0.1289, "step": 12403 }, { "epoch": 3.7921125038214614, "grad_norm": 0.3859288692474365, "learning_rate": 2.806929642053416e-05, "loss": 0.1126, "step": 12404 }, { "epoch": 3.7924182207276065, "grad_norm": 0.7995820045471191, "learning_rate": 2.806887181011422e-05, "loss": 0.1442, "step": 12405 }, { "epoch": 3.792723937633751, "grad_norm": 0.9248815178871155, "learning_rate": 2.806844719969428e-05, "loss": 0.2026, "step": 12406 }, { "epoch": 3.793029654539896, "grad_norm": 0.5661753416061401, "learning_rate": 2.8068022589274343e-05, "loss": 0.1642, "step": 12407 }, { "epoch": 3.793335371446041, "grad_norm": 1.5883691310882568, "learning_rate": 2.8067597978854402e-05, "loss": 0.1784, "step": 12408 }, { "epoch": 3.793641088352186, "grad_norm": 0.6325412392616272, "learning_rate": 2.8067173368434464e-05, "loss": 0.189, "step": 12409 }, { "epoch": 3.7939468052583307, "grad_norm": 5.93735408782959, "learning_rate": 2.8066748758014523e-05, "loss": 0.2195, "step": 12410 }, { "epoch": 3.794252522164476, "grad_norm": 1.204443097114563, "learning_rate": 2.8066324147594585e-05, "loss": 0.2731, "step": 12411 }, { "epoch": 3.7945582390706205, "grad_norm": 1.1439346075057983, "learning_rate": 2.8065899537174643e-05, "loss": 0.3064, "step": 12412 }, { "epoch": 3.7948639559767656, "grad_norm": 1.522173523902893, "learning_rate": 2.8065474926754705e-05, "loss": 0.2966, "step": 12413 }, { "epoch": 3.7951696728829103, "grad_norm": 0.4014332890510559, "learning_rate": 2.8065050316334764e-05, "loss": 0.1498, "step": 12414 }, { "epoch": 3.7954753897890554, "grad_norm": 0.27997058629989624, "learning_rate": 2.8064625705914826e-05, "loss": 0.0868, "step": 12415 }, { "epoch": 3.7957811066952, "grad_norm": 0.36205944418907166, "learning_rate": 2.8064201095494885e-05, "loss": 0.0793, "step": 12416 }, { "epoch": 3.796086823601345, "grad_norm": 0.24166327714920044, "learning_rate": 2.8063776485074947e-05, "loss": 0.0586, "step": 12417 }, { "epoch": 3.7963925405074903, "grad_norm": 0.18260420858860016, "learning_rate": 2.8063351874655006e-05, "loss": 0.0634, "step": 12418 }, { "epoch": 3.796698257413635, "grad_norm": 0.2275693267583847, "learning_rate": 2.8062927264235068e-05, "loss": 0.0434, "step": 12419 }, { "epoch": 3.7970039743197797, "grad_norm": 0.2723166346549988, "learning_rate": 2.8062502653815127e-05, "loss": 0.0602, "step": 12420 }, { "epoch": 3.7973096912259248, "grad_norm": 0.22989904880523682, "learning_rate": 2.8062078043395185e-05, "loss": 0.0887, "step": 12421 }, { "epoch": 3.79761540813207, "grad_norm": 0.48757249116897583, "learning_rate": 2.8061653432975247e-05, "loss": 0.1027, "step": 12422 }, { "epoch": 3.7979211250382146, "grad_norm": 0.3211931586265564, "learning_rate": 2.8061228822555306e-05, "loss": 0.0863, "step": 12423 }, { "epoch": 3.7982268419443597, "grad_norm": 0.5690388083457947, "learning_rate": 2.8060804212135368e-05, "loss": 0.0992, "step": 12424 }, { "epoch": 3.7985325588505043, "grad_norm": 0.25824373960494995, "learning_rate": 2.8060379601715427e-05, "loss": 0.0765, "step": 12425 }, { "epoch": 3.7988382757566495, "grad_norm": 0.36754631996154785, "learning_rate": 2.805995499129549e-05, "loss": 0.1083, "step": 12426 }, { "epoch": 3.799143992662794, "grad_norm": 0.7297321557998657, "learning_rate": 2.8059530380875548e-05, "loss": 0.173, "step": 12427 }, { "epoch": 3.7994497095689392, "grad_norm": 0.35494324564933777, "learning_rate": 2.805910577045561e-05, "loss": 0.1236, "step": 12428 }, { "epoch": 3.799755426475084, "grad_norm": 0.41168907284736633, "learning_rate": 2.805868116003567e-05, "loss": 0.1357, "step": 12429 }, { "epoch": 3.800061143381229, "grad_norm": 0.5613402128219604, "learning_rate": 2.805825654961573e-05, "loss": 0.1532, "step": 12430 }, { "epoch": 3.800366860287374, "grad_norm": 0.5448596477508545, "learning_rate": 2.805783193919579e-05, "loss": 0.1795, "step": 12431 }, { "epoch": 3.800672577193519, "grad_norm": 0.647794783115387, "learning_rate": 2.805740732877585e-05, "loss": 0.1764, "step": 12432 }, { "epoch": 3.8009782940996635, "grad_norm": 0.5295434594154358, "learning_rate": 2.805698271835591e-05, "loss": 0.1528, "step": 12433 }, { "epoch": 3.8012840110058086, "grad_norm": 0.6888216137886047, "learning_rate": 2.805655810793597e-05, "loss": 0.1919, "step": 12434 }, { "epoch": 3.8015897279119537, "grad_norm": 0.7166035771369934, "learning_rate": 2.805613349751603e-05, "loss": 0.2195, "step": 12435 }, { "epoch": 3.8018954448180984, "grad_norm": 2.694045066833496, "learning_rate": 2.805570888709609e-05, "loss": 0.199, "step": 12436 }, { "epoch": 3.8022011617242435, "grad_norm": 1.24227774143219, "learning_rate": 2.805528427667615e-05, "loss": 0.1764, "step": 12437 }, { "epoch": 3.802506878630388, "grad_norm": 1.7213869094848633, "learning_rate": 2.805485966625621e-05, "loss": 0.284, "step": 12438 }, { "epoch": 3.8028125955365333, "grad_norm": 0.5194447040557861, "learning_rate": 2.8054435055836272e-05, "loss": 0.1641, "step": 12439 }, { "epoch": 3.803118312442678, "grad_norm": 0.2529175579547882, "learning_rate": 2.805401044541633e-05, "loss": 0.1012, "step": 12440 }, { "epoch": 3.803424029348823, "grad_norm": 0.577483594417572, "learning_rate": 2.8053585834996393e-05, "loss": 0.073, "step": 12441 }, { "epoch": 3.8037297462549677, "grad_norm": 0.3008013069629669, "learning_rate": 2.8053161224576452e-05, "loss": 0.0513, "step": 12442 }, { "epoch": 3.804035463161113, "grad_norm": 0.1449783593416214, "learning_rate": 2.8052736614156514e-05, "loss": 0.0636, "step": 12443 }, { "epoch": 3.804341180067258, "grad_norm": 0.20821447670459747, "learning_rate": 2.8052312003736573e-05, "loss": 0.0608, "step": 12444 }, { "epoch": 3.8046468969734026, "grad_norm": 0.35097306966781616, "learning_rate": 2.8051887393316635e-05, "loss": 0.0633, "step": 12445 }, { "epoch": 3.8049526138795473, "grad_norm": 0.31117650866508484, "learning_rate": 2.8051462782896693e-05, "loss": 0.0713, "step": 12446 }, { "epoch": 3.8052583307856924, "grad_norm": 0.7909262776374817, "learning_rate": 2.8051038172476752e-05, "loss": 0.0622, "step": 12447 }, { "epoch": 3.8055640476918375, "grad_norm": 0.2996687889099121, "learning_rate": 2.8050613562056814e-05, "loss": 0.1049, "step": 12448 }, { "epoch": 3.805869764597982, "grad_norm": 0.28973740339279175, "learning_rate": 2.8050188951636873e-05, "loss": 0.0923, "step": 12449 }, { "epoch": 3.8061754815041273, "grad_norm": 0.6349725723266602, "learning_rate": 2.8049764341216935e-05, "loss": 0.0679, "step": 12450 }, { "epoch": 3.806481198410272, "grad_norm": 0.2754019796848297, "learning_rate": 2.8049339730796994e-05, "loss": 0.0993, "step": 12451 }, { "epoch": 3.806786915316417, "grad_norm": 0.44330376386642456, "learning_rate": 2.8048915120377056e-05, "loss": 0.137, "step": 12452 }, { "epoch": 3.807092632222562, "grad_norm": 0.5015212297439575, "learning_rate": 2.8048490509957114e-05, "loss": 0.1579, "step": 12453 }, { "epoch": 3.807398349128707, "grad_norm": 0.4138302505016327, "learning_rate": 2.8048065899537177e-05, "loss": 0.1323, "step": 12454 }, { "epoch": 3.8077040660348516, "grad_norm": 0.8856955766677856, "learning_rate": 2.8047641289117235e-05, "loss": 0.1762, "step": 12455 }, { "epoch": 3.8080097829409967, "grad_norm": 1.1740435361862183, "learning_rate": 2.8047216678697297e-05, "loss": 0.1924, "step": 12456 }, { "epoch": 3.808315499847142, "grad_norm": 0.6348282694816589, "learning_rate": 2.8046792068277356e-05, "loss": 0.1908, "step": 12457 }, { "epoch": 3.8086212167532865, "grad_norm": 0.5664768815040588, "learning_rate": 2.8046367457857418e-05, "loss": 0.1856, "step": 12458 }, { "epoch": 3.808926933659431, "grad_norm": 0.7392383813858032, "learning_rate": 2.8045942847437477e-05, "loss": 0.206, "step": 12459 }, { "epoch": 3.8092326505655763, "grad_norm": 0.5977333784103394, "learning_rate": 2.8045518237017536e-05, "loss": 0.1897, "step": 12460 }, { "epoch": 3.8095383674717214, "grad_norm": 0.8911076784133911, "learning_rate": 2.8045093626597598e-05, "loss": 0.2306, "step": 12461 }, { "epoch": 3.809844084377866, "grad_norm": 1.0742074251174927, "learning_rate": 2.8044669016177656e-05, "loss": 0.2024, "step": 12462 }, { "epoch": 3.810149801284011, "grad_norm": 1.0750017166137695, "learning_rate": 2.804424440575772e-05, "loss": 0.2367, "step": 12463 }, { "epoch": 3.810455518190156, "grad_norm": 0.4453577399253845, "learning_rate": 2.8043819795337777e-05, "loss": 0.159, "step": 12464 }, { "epoch": 3.810761235096301, "grad_norm": 0.29093530774116516, "learning_rate": 2.804339518491784e-05, "loss": 0.0826, "step": 12465 }, { "epoch": 3.8110669520024456, "grad_norm": 0.41168367862701416, "learning_rate": 2.8042970574497898e-05, "loss": 0.0959, "step": 12466 }, { "epoch": 3.8113726689085907, "grad_norm": 0.20536379516124725, "learning_rate": 2.804254596407796e-05, "loss": 0.0621, "step": 12467 }, { "epoch": 3.8116783858147354, "grad_norm": 0.34002557396888733, "learning_rate": 2.804212135365802e-05, "loss": 0.067, "step": 12468 }, { "epoch": 3.8119841027208805, "grad_norm": 0.2336040586233139, "learning_rate": 2.804169674323808e-05, "loss": 0.0589, "step": 12469 }, { "epoch": 3.8122898196270256, "grad_norm": 0.23405753076076508, "learning_rate": 2.804127213281814e-05, "loss": 0.0861, "step": 12470 }, { "epoch": 3.8125955365331703, "grad_norm": 0.2919888496398926, "learning_rate": 2.80408475223982e-05, "loss": 0.0569, "step": 12471 }, { "epoch": 3.812901253439315, "grad_norm": 0.2967952787876129, "learning_rate": 2.804042291197826e-05, "loss": 0.0802, "step": 12472 }, { "epoch": 3.81320697034546, "grad_norm": 0.3244171142578125, "learning_rate": 2.803999830155832e-05, "loss": 0.0657, "step": 12473 }, { "epoch": 3.813512687251605, "grad_norm": 0.452791690826416, "learning_rate": 2.803957369113838e-05, "loss": 0.0892, "step": 12474 }, { "epoch": 3.81381840415775, "grad_norm": 0.2347192019224167, "learning_rate": 2.803914908071844e-05, "loss": 0.0775, "step": 12475 }, { "epoch": 3.814124121063895, "grad_norm": 0.4014931321144104, "learning_rate": 2.8038724470298502e-05, "loss": 0.1082, "step": 12476 }, { "epoch": 3.8144298379700396, "grad_norm": 0.5877811908721924, "learning_rate": 2.803829985987856e-05, "loss": 0.1558, "step": 12477 }, { "epoch": 3.8147355548761848, "grad_norm": 0.3306513726711273, "learning_rate": 2.8037875249458623e-05, "loss": 0.1048, "step": 12478 }, { "epoch": 3.8150412717823294, "grad_norm": 0.37583211064338684, "learning_rate": 2.803745063903868e-05, "loss": 0.131, "step": 12479 }, { "epoch": 3.8153469886884745, "grad_norm": 2.0054688453674316, "learning_rate": 2.8037026028618743e-05, "loss": 0.1676, "step": 12480 }, { "epoch": 3.815652705594619, "grad_norm": 0.6776883006095886, "learning_rate": 2.8036601418198802e-05, "loss": 0.1487, "step": 12481 }, { "epoch": 3.8159584225007643, "grad_norm": 0.6409081220626831, "learning_rate": 2.8036176807778864e-05, "loss": 0.2034, "step": 12482 }, { "epoch": 3.8162641394069095, "grad_norm": 0.6417822241783142, "learning_rate": 2.8035752197358923e-05, "loss": 0.2023, "step": 12483 }, { "epoch": 3.816569856313054, "grad_norm": 0.8259570598602295, "learning_rate": 2.8035327586938985e-05, "loss": 0.171, "step": 12484 }, { "epoch": 3.816875573219199, "grad_norm": 1.801918625831604, "learning_rate": 2.8034902976519044e-05, "loss": 0.2064, "step": 12485 }, { "epoch": 3.817181290125344, "grad_norm": 0.7945690751075745, "learning_rate": 2.8034478366099102e-05, "loss": 0.2115, "step": 12486 }, { "epoch": 3.817487007031489, "grad_norm": 0.6287004947662354, "learning_rate": 2.8034053755679164e-05, "loss": 0.2034, "step": 12487 }, { "epoch": 3.8177927239376337, "grad_norm": 1.5596656799316406, "learning_rate": 2.8033629145259223e-05, "loss": 0.2542, "step": 12488 }, { "epoch": 3.818098440843779, "grad_norm": 0.3610144853591919, "learning_rate": 2.8033204534839285e-05, "loss": 0.1421, "step": 12489 }, { "epoch": 3.8184041577499235, "grad_norm": 0.6256545186042786, "learning_rate": 2.8032779924419344e-05, "loss": 0.0987, "step": 12490 }, { "epoch": 3.8187098746560686, "grad_norm": 0.2708989977836609, "learning_rate": 2.8032355313999406e-05, "loss": 0.0723, "step": 12491 }, { "epoch": 3.8190155915622133, "grad_norm": 0.21033187210559845, "learning_rate": 2.8031930703579465e-05, "loss": 0.0534, "step": 12492 }, { "epoch": 3.8193213084683584, "grad_norm": 1.305126428604126, "learning_rate": 2.8031506093159527e-05, "loss": 0.0655, "step": 12493 }, { "epoch": 3.819627025374503, "grad_norm": 0.160169780254364, "learning_rate": 2.8031081482739586e-05, "loss": 0.0396, "step": 12494 }, { "epoch": 3.819932742280648, "grad_norm": 0.4137905538082123, "learning_rate": 2.8030656872319648e-05, "loss": 0.0528, "step": 12495 }, { "epoch": 3.8202384591867933, "grad_norm": 0.5144973993301392, "learning_rate": 2.8030232261899706e-05, "loss": 0.0727, "step": 12496 }, { "epoch": 3.820544176092938, "grad_norm": 0.4618794023990631, "learning_rate": 2.802980765147977e-05, "loss": 0.0807, "step": 12497 }, { "epoch": 3.8208498929990826, "grad_norm": 0.22604918479919434, "learning_rate": 2.8029383041059827e-05, "loss": 0.0603, "step": 12498 }, { "epoch": 3.8211556099052277, "grad_norm": 0.3566006124019623, "learning_rate": 2.8028958430639886e-05, "loss": 0.0934, "step": 12499 }, { "epoch": 3.821461326811373, "grad_norm": 0.26871711015701294, "learning_rate": 2.8028533820219948e-05, "loss": 0.0684, "step": 12500 }, { "epoch": 3.8217670437175175, "grad_norm": 0.4687936007976532, "learning_rate": 2.8028109209800007e-05, "loss": 0.1015, "step": 12501 }, { "epoch": 3.8220727606236626, "grad_norm": 0.4281185269355774, "learning_rate": 2.802768459938007e-05, "loss": 0.1153, "step": 12502 }, { "epoch": 3.8223784775298073, "grad_norm": 0.5110422968864441, "learning_rate": 2.8027259988960127e-05, "loss": 0.1065, "step": 12503 }, { "epoch": 3.8226841944359524, "grad_norm": 0.9024673700332642, "learning_rate": 2.802683537854019e-05, "loss": 0.1363, "step": 12504 }, { "epoch": 3.822989911342097, "grad_norm": 0.44869598746299744, "learning_rate": 2.8026410768120248e-05, "loss": 0.1739, "step": 12505 }, { "epoch": 3.823295628248242, "grad_norm": 0.6270476579666138, "learning_rate": 2.802598615770031e-05, "loss": 0.1547, "step": 12506 }, { "epoch": 3.823601345154387, "grad_norm": 0.5017854571342468, "learning_rate": 2.802556154728037e-05, "loss": 0.1736, "step": 12507 }, { "epoch": 3.823907062060532, "grad_norm": 0.5863416790962219, "learning_rate": 2.802513693686043e-05, "loss": 0.1761, "step": 12508 }, { "epoch": 3.824212778966677, "grad_norm": 1.6251177787780762, "learning_rate": 2.802471232644049e-05, "loss": 0.1782, "step": 12509 }, { "epoch": 3.8245184958728218, "grad_norm": 2.333381414413452, "learning_rate": 2.8024287716020552e-05, "loss": 0.1845, "step": 12510 }, { "epoch": 3.8248242127789664, "grad_norm": 1.0324411392211914, "learning_rate": 2.8023863105600614e-05, "loss": 0.1805, "step": 12511 }, { "epoch": 3.8251299296851116, "grad_norm": 1.2396160364151, "learning_rate": 2.8023438495180673e-05, "loss": 0.2176, "step": 12512 }, { "epoch": 3.8254356465912567, "grad_norm": 0.9181387424468994, "learning_rate": 2.8023013884760735e-05, "loss": 0.2263, "step": 12513 }, { "epoch": 3.8257413634974013, "grad_norm": 1.300673007965088, "learning_rate": 2.8022589274340793e-05, "loss": 0.129, "step": 12514 }, { "epoch": 3.8260470804035465, "grad_norm": 0.4328397214412689, "learning_rate": 2.8022164663920855e-05, "loss": 0.0959, "step": 12515 }, { "epoch": 3.826352797309691, "grad_norm": 0.23776859045028687, "learning_rate": 2.8021740053500914e-05, "loss": 0.0813, "step": 12516 }, { "epoch": 3.8266585142158362, "grad_norm": 0.5769002437591553, "learning_rate": 2.8021315443080976e-05, "loss": 0.0676, "step": 12517 }, { "epoch": 3.826964231121981, "grad_norm": 0.2694486379623413, "learning_rate": 2.8020890832661035e-05, "loss": 0.0597, "step": 12518 }, { "epoch": 3.827269948028126, "grad_norm": 0.23653270304203033, "learning_rate": 2.8020466222241097e-05, "loss": 0.0483, "step": 12519 }, { "epoch": 3.8275756649342707, "grad_norm": 0.31736814975738525, "learning_rate": 2.8020041611821156e-05, "loss": 0.0751, "step": 12520 }, { "epoch": 3.827881381840416, "grad_norm": 0.2450147420167923, "learning_rate": 2.8019617001401218e-05, "loss": 0.0597, "step": 12521 }, { "epoch": 3.828187098746561, "grad_norm": 0.35669171810150146, "learning_rate": 2.8019192390981277e-05, "loss": 0.0883, "step": 12522 }, { "epoch": 3.8284928156527056, "grad_norm": 0.2692742943763733, "learning_rate": 2.8018767780561335e-05, "loss": 0.0609, "step": 12523 }, { "epoch": 3.8287985325588503, "grad_norm": 0.7210924625396729, "learning_rate": 2.8018343170141397e-05, "loss": 0.0812, "step": 12524 }, { "epoch": 3.8291042494649954, "grad_norm": 0.26828286051750183, "learning_rate": 2.8017918559721456e-05, "loss": 0.0558, "step": 12525 }, { "epoch": 3.8294099663711405, "grad_norm": 0.4483063220977783, "learning_rate": 2.8017493949301518e-05, "loss": 0.1074, "step": 12526 }, { "epoch": 3.829715683277285, "grad_norm": 0.26308587193489075, "learning_rate": 2.8017069338881577e-05, "loss": 0.1, "step": 12527 }, { "epoch": 3.8300214001834303, "grad_norm": 0.5697556734085083, "learning_rate": 2.801664472846164e-05, "loss": 0.1352, "step": 12528 }, { "epoch": 3.830327117089575, "grad_norm": 0.4635988175868988, "learning_rate": 2.8016220118041698e-05, "loss": 0.1274, "step": 12529 }, { "epoch": 3.83063283399572, "grad_norm": 0.4893168807029724, "learning_rate": 2.801579550762176e-05, "loss": 0.1753, "step": 12530 }, { "epoch": 3.8309385509018647, "grad_norm": 0.48305267095565796, "learning_rate": 2.801537089720182e-05, "loss": 0.1439, "step": 12531 }, { "epoch": 3.83124426780801, "grad_norm": 0.6316514611244202, "learning_rate": 2.801494628678188e-05, "loss": 0.2114, "step": 12532 }, { "epoch": 3.8315499847141545, "grad_norm": 1.4565715789794922, "learning_rate": 2.801452167636194e-05, "loss": 0.1659, "step": 12533 }, { "epoch": 3.8318557016202996, "grad_norm": 1.0363441705703735, "learning_rate": 2.8014097065942e-05, "loss": 0.1726, "step": 12534 }, { "epoch": 3.8321614185264448, "grad_norm": 1.0533645153045654, "learning_rate": 2.801367245552206e-05, "loss": 0.1549, "step": 12535 }, { "epoch": 3.8324671354325894, "grad_norm": 1.7414759397506714, "learning_rate": 2.801324784510212e-05, "loss": 0.1964, "step": 12536 }, { "epoch": 3.832772852338734, "grad_norm": 1.8061569929122925, "learning_rate": 2.801282323468218e-05, "loss": 0.2153, "step": 12537 }, { "epoch": 3.833078569244879, "grad_norm": 1.8790183067321777, "learning_rate": 2.801239862426224e-05, "loss": 0.2554, "step": 12538 }, { "epoch": 3.8333842861510243, "grad_norm": 0.4284478425979614, "learning_rate": 2.80119740138423e-05, "loss": 0.1394, "step": 12539 }, { "epoch": 3.833690003057169, "grad_norm": 0.29985204339027405, "learning_rate": 2.801154940342236e-05, "loss": 0.0935, "step": 12540 }, { "epoch": 3.833995719963314, "grad_norm": 0.30905768275260925, "learning_rate": 2.8011124793002422e-05, "loss": 0.0635, "step": 12541 }, { "epoch": 3.834301436869459, "grad_norm": 0.2637423574924469, "learning_rate": 2.801070018258248e-05, "loss": 0.0574, "step": 12542 }, { "epoch": 3.834607153775604, "grad_norm": 0.326328843832016, "learning_rate": 2.8010275572162543e-05, "loss": 0.0588, "step": 12543 }, { "epoch": 3.8349128706817486, "grad_norm": 0.19063879549503326, "learning_rate": 2.8009850961742602e-05, "loss": 0.0405, "step": 12544 }, { "epoch": 3.8352185875878937, "grad_norm": 0.2287713587284088, "learning_rate": 2.8009426351322664e-05, "loss": 0.0744, "step": 12545 }, { "epoch": 3.8355243044940384, "grad_norm": 0.452837198972702, "learning_rate": 2.8009001740902723e-05, "loss": 0.096, "step": 12546 }, { "epoch": 3.8358300214001835, "grad_norm": 0.2401670664548874, "learning_rate": 2.8008577130482785e-05, "loss": 0.0727, "step": 12547 }, { "epoch": 3.8361357383063286, "grad_norm": 0.3243342936038971, "learning_rate": 2.8008152520062843e-05, "loss": 0.055, "step": 12548 }, { "epoch": 3.8364414552124733, "grad_norm": 0.7515826225280762, "learning_rate": 2.8007727909642902e-05, "loss": 0.1005, "step": 12549 }, { "epoch": 3.836747172118618, "grad_norm": 0.36215806007385254, "learning_rate": 2.8007303299222964e-05, "loss": 0.0722, "step": 12550 }, { "epoch": 3.837052889024763, "grad_norm": 0.8725994229316711, "learning_rate": 2.8006878688803023e-05, "loss": 0.0983, "step": 12551 }, { "epoch": 3.837358605930908, "grad_norm": 0.5560895800590515, "learning_rate": 2.8006454078383085e-05, "loss": 0.114, "step": 12552 }, { "epoch": 3.837664322837053, "grad_norm": 0.7655998468399048, "learning_rate": 2.8006029467963144e-05, "loss": 0.1194, "step": 12553 }, { "epoch": 3.837970039743198, "grad_norm": 1.0297931432724, "learning_rate": 2.8005604857543206e-05, "loss": 0.1449, "step": 12554 }, { "epoch": 3.8382757566493426, "grad_norm": 0.7554863691329956, "learning_rate": 2.8005180247123264e-05, "loss": 0.1435, "step": 12555 }, { "epoch": 3.8385814735554877, "grad_norm": 0.515485942363739, "learning_rate": 2.8004755636703327e-05, "loss": 0.1929, "step": 12556 }, { "epoch": 3.8388871904616324, "grad_norm": 1.1851871013641357, "learning_rate": 2.8004331026283385e-05, "loss": 0.1654, "step": 12557 }, { "epoch": 3.8391929073677775, "grad_norm": 0.63059401512146, "learning_rate": 2.8003906415863447e-05, "loss": 0.1571, "step": 12558 }, { "epoch": 3.839498624273922, "grad_norm": 1.18667733669281, "learning_rate": 2.8003481805443506e-05, "loss": 0.1941, "step": 12559 }, { "epoch": 3.8398043411800673, "grad_norm": 0.7809451818466187, "learning_rate": 2.8003057195023568e-05, "loss": 0.1935, "step": 12560 }, { "epoch": 3.8401100580862124, "grad_norm": 0.8438601493835449, "learning_rate": 2.8002632584603627e-05, "loss": 0.2072, "step": 12561 }, { "epoch": 3.840415774992357, "grad_norm": 2.006354808807373, "learning_rate": 2.8002207974183686e-05, "loss": 0.2336, "step": 12562 }, { "epoch": 3.8407214918985018, "grad_norm": 3.0057387351989746, "learning_rate": 2.8001783363763748e-05, "loss": 0.1999, "step": 12563 }, { "epoch": 3.841027208804647, "grad_norm": 0.9902729988098145, "learning_rate": 2.8001358753343806e-05, "loss": 0.1183, "step": 12564 }, { "epoch": 3.841332925710792, "grad_norm": 0.32469627261161804, "learning_rate": 2.800093414292387e-05, "loss": 0.0827, "step": 12565 }, { "epoch": 3.8416386426169367, "grad_norm": 0.2715981900691986, "learning_rate": 2.8000509532503927e-05, "loss": 0.0908, "step": 12566 }, { "epoch": 3.8419443595230818, "grad_norm": 0.6270651817321777, "learning_rate": 2.800008492208399e-05, "loss": 0.0851, "step": 12567 }, { "epoch": 3.8422500764292264, "grad_norm": 0.3507998585700989, "learning_rate": 2.7999660311664048e-05, "loss": 0.0537, "step": 12568 }, { "epoch": 3.8425557933353716, "grad_norm": 0.574167788028717, "learning_rate": 2.799923570124411e-05, "loss": 0.0517, "step": 12569 }, { "epoch": 3.8428615102415162, "grad_norm": 0.2529152035713196, "learning_rate": 2.799881109082417e-05, "loss": 0.0498, "step": 12570 }, { "epoch": 3.8431672271476613, "grad_norm": 0.422122061252594, "learning_rate": 2.799838648040423e-05, "loss": 0.0678, "step": 12571 }, { "epoch": 3.843472944053806, "grad_norm": 0.32984450459480286, "learning_rate": 2.799796186998429e-05, "loss": 0.0823, "step": 12572 }, { "epoch": 3.843778660959951, "grad_norm": 0.36713382601737976, "learning_rate": 2.799753725956435e-05, "loss": 0.1141, "step": 12573 }, { "epoch": 3.8440843778660962, "grad_norm": 0.31342530250549316, "learning_rate": 2.799711264914441e-05, "loss": 0.1025, "step": 12574 }, { "epoch": 3.844390094772241, "grad_norm": 0.31111860275268555, "learning_rate": 2.799668803872447e-05, "loss": 0.1069, "step": 12575 }, { "epoch": 3.8446958116783856, "grad_norm": 0.5451996326446533, "learning_rate": 2.799626342830453e-05, "loss": 0.1069, "step": 12576 }, { "epoch": 3.8450015285845307, "grad_norm": 0.4682769179344177, "learning_rate": 2.799583881788459e-05, "loss": 0.1148, "step": 12577 }, { "epoch": 3.845307245490676, "grad_norm": 0.5644345879554749, "learning_rate": 2.7995414207464652e-05, "loss": 0.1655, "step": 12578 }, { "epoch": 3.8456129623968205, "grad_norm": 0.493988037109375, "learning_rate": 2.799498959704471e-05, "loss": 0.1556, "step": 12579 }, { "epoch": 3.8459186793029656, "grad_norm": 1.160312533378601, "learning_rate": 2.7994564986624773e-05, "loss": 0.1499, "step": 12580 }, { "epoch": 3.8462243962091103, "grad_norm": 0.6152557134628296, "learning_rate": 2.799414037620483e-05, "loss": 0.1706, "step": 12581 }, { "epoch": 3.8465301131152554, "grad_norm": 0.9207544922828674, "learning_rate": 2.7993715765784893e-05, "loss": 0.165, "step": 12582 }, { "epoch": 3.8468358300214, "grad_norm": 0.6837823390960693, "learning_rate": 2.7993291155364952e-05, "loss": 0.1756, "step": 12583 }, { "epoch": 3.847141546927545, "grad_norm": 1.2024978399276733, "learning_rate": 2.7992866544945014e-05, "loss": 0.1899, "step": 12584 }, { "epoch": 3.84744726383369, "grad_norm": 1.5102390050888062, "learning_rate": 2.7992441934525073e-05, "loss": 0.2255, "step": 12585 }, { "epoch": 3.847752980739835, "grad_norm": 1.9135948419570923, "learning_rate": 2.7992017324105135e-05, "loss": 0.1948, "step": 12586 }, { "epoch": 3.84805869764598, "grad_norm": 1.4196580648422241, "learning_rate": 2.7991592713685194e-05, "loss": 0.2104, "step": 12587 }, { "epoch": 3.8483644145521247, "grad_norm": 2.9157817363739014, "learning_rate": 2.7991168103265252e-05, "loss": 0.2784, "step": 12588 }, { "epoch": 3.8486701314582694, "grad_norm": 0.36820700764656067, "learning_rate": 2.7990743492845314e-05, "loss": 0.1589, "step": 12589 }, { "epoch": 3.8489758483644145, "grad_norm": 0.41592395305633545, "learning_rate": 2.7990318882425373e-05, "loss": 0.0765, "step": 12590 }, { "epoch": 3.8492815652705596, "grad_norm": 0.2291109561920166, "learning_rate": 2.7989894272005435e-05, "loss": 0.0757, "step": 12591 }, { "epoch": 3.8495872821767043, "grad_norm": 0.3152850866317749, "learning_rate": 2.7989469661585494e-05, "loss": 0.0654, "step": 12592 }, { "epoch": 3.8498929990828494, "grad_norm": 0.2040436714887619, "learning_rate": 2.7989045051165556e-05, "loss": 0.0549, "step": 12593 }, { "epoch": 3.850198715988994, "grad_norm": 0.29800984263420105, "learning_rate": 2.7988620440745615e-05, "loss": 0.0687, "step": 12594 }, { "epoch": 3.850504432895139, "grad_norm": 0.4038722515106201, "learning_rate": 2.7988195830325677e-05, "loss": 0.0572, "step": 12595 }, { "epoch": 3.850810149801284, "grad_norm": 0.31257036328315735, "learning_rate": 2.7987771219905736e-05, "loss": 0.0793, "step": 12596 }, { "epoch": 3.851115866707429, "grad_norm": 0.27754515409469604, "learning_rate": 2.7987346609485798e-05, "loss": 0.0495, "step": 12597 }, { "epoch": 3.8514215836135737, "grad_norm": 0.3895576298236847, "learning_rate": 2.7986921999065856e-05, "loss": 0.0984, "step": 12598 }, { "epoch": 3.851727300519719, "grad_norm": 0.2481229305267334, "learning_rate": 2.798649738864592e-05, "loss": 0.0668, "step": 12599 }, { "epoch": 3.852033017425864, "grad_norm": 0.30456846952438354, "learning_rate": 2.7986072778225977e-05, "loss": 0.0772, "step": 12600 }, { "epoch": 3.8523387343320086, "grad_norm": 0.31840184330940247, "learning_rate": 2.7985648167806036e-05, "loss": 0.1176, "step": 12601 }, { "epoch": 3.8526444512381532, "grad_norm": 0.4634588956832886, "learning_rate": 2.7985223557386098e-05, "loss": 0.0955, "step": 12602 }, { "epoch": 3.8529501681442984, "grad_norm": 0.5182746052742004, "learning_rate": 2.7984798946966157e-05, "loss": 0.1318, "step": 12603 }, { "epoch": 3.8532558850504435, "grad_norm": 0.5427331328392029, "learning_rate": 2.798437433654622e-05, "loss": 0.1287, "step": 12604 }, { "epoch": 3.853561601956588, "grad_norm": 1.8184666633605957, "learning_rate": 2.7983949726126277e-05, "loss": 0.1676, "step": 12605 }, { "epoch": 3.8538673188627333, "grad_norm": 0.5097066164016724, "learning_rate": 2.798352511570634e-05, "loss": 0.1504, "step": 12606 }, { "epoch": 3.854173035768878, "grad_norm": 0.7392906546592712, "learning_rate": 2.7983100505286398e-05, "loss": 0.2091, "step": 12607 }, { "epoch": 3.854478752675023, "grad_norm": 1.1194647550582886, "learning_rate": 2.798267589486646e-05, "loss": 0.1812, "step": 12608 }, { "epoch": 3.8547844695811677, "grad_norm": 0.6930502653121948, "learning_rate": 2.798225128444652e-05, "loss": 0.1696, "step": 12609 }, { "epoch": 3.855090186487313, "grad_norm": 0.6558812856674194, "learning_rate": 2.798182667402658e-05, "loss": 0.1638, "step": 12610 }, { "epoch": 3.8553959033934575, "grad_norm": 0.641684889793396, "learning_rate": 2.798140206360664e-05, "loss": 0.1961, "step": 12611 }, { "epoch": 3.8557016202996026, "grad_norm": 0.7383394241333008, "learning_rate": 2.7980977453186702e-05, "loss": 0.2064, "step": 12612 }, { "epoch": 3.8560073372057477, "grad_norm": 1.2290722131729126, "learning_rate": 2.7980552842766764e-05, "loss": 0.2561, "step": 12613 }, { "epoch": 3.8563130541118924, "grad_norm": 0.3651379942893982, "learning_rate": 2.7980128232346823e-05, "loss": 0.1289, "step": 12614 }, { "epoch": 3.856618771018037, "grad_norm": 0.5255253314971924, "learning_rate": 2.7979703621926885e-05, "loss": 0.1152, "step": 12615 }, { "epoch": 3.856924487924182, "grad_norm": 0.3543580174446106, "learning_rate": 2.7979279011506943e-05, "loss": 0.0849, "step": 12616 }, { "epoch": 3.8572302048303273, "grad_norm": 0.26084062457084656, "learning_rate": 2.7978854401087006e-05, "loss": 0.078, "step": 12617 }, { "epoch": 3.857535921736472, "grad_norm": 0.2819722592830658, "learning_rate": 2.7978429790667064e-05, "loss": 0.045, "step": 12618 }, { "epoch": 3.857841638642617, "grad_norm": 0.5434988737106323, "learning_rate": 2.7978005180247126e-05, "loss": 0.0635, "step": 12619 }, { "epoch": 3.8581473555487618, "grad_norm": 0.35328054428100586, "learning_rate": 2.7977580569827185e-05, "loss": 0.0659, "step": 12620 }, { "epoch": 3.858453072454907, "grad_norm": 0.38560211658477783, "learning_rate": 2.7977155959407247e-05, "loss": 0.071, "step": 12621 }, { "epoch": 3.8587587893610515, "grad_norm": 0.3137490451335907, "learning_rate": 2.7976731348987306e-05, "loss": 0.0596, "step": 12622 }, { "epoch": 3.8590645062671967, "grad_norm": 0.2651955187320709, "learning_rate": 2.7976306738567368e-05, "loss": 0.047, "step": 12623 }, { "epoch": 3.8593702231733413, "grad_norm": 0.40356528759002686, "learning_rate": 2.7975882128147427e-05, "loss": 0.0988, "step": 12624 }, { "epoch": 3.8596759400794864, "grad_norm": 0.30936551094055176, "learning_rate": 2.797545751772749e-05, "loss": 0.0822, "step": 12625 }, { "epoch": 3.8599816569856316, "grad_norm": 0.6283523440361023, "learning_rate": 2.7975032907307547e-05, "loss": 0.0737, "step": 12626 }, { "epoch": 3.8602873738917762, "grad_norm": 0.4646620750427246, "learning_rate": 2.7974608296887606e-05, "loss": 0.1166, "step": 12627 }, { "epoch": 3.860593090797921, "grad_norm": 0.41724637150764465, "learning_rate": 2.7974183686467668e-05, "loss": 0.1221, "step": 12628 }, { "epoch": 3.860898807704066, "grad_norm": 1.8181302547454834, "learning_rate": 2.7973759076047727e-05, "loss": 0.1287, "step": 12629 }, { "epoch": 3.861204524610211, "grad_norm": 0.49251216650009155, "learning_rate": 2.797333446562779e-05, "loss": 0.1679, "step": 12630 }, { "epoch": 3.861510241516356, "grad_norm": 0.5658133029937744, "learning_rate": 2.7972909855207848e-05, "loss": 0.1862, "step": 12631 }, { "epoch": 3.861815958422501, "grad_norm": 1.3713531494140625, "learning_rate": 2.797248524478791e-05, "loss": 0.19, "step": 12632 }, { "epoch": 3.8621216753286456, "grad_norm": 1.6456631422042847, "learning_rate": 2.797206063436797e-05, "loss": 0.1727, "step": 12633 }, { "epoch": 3.8624273922347907, "grad_norm": 0.6230593919754028, "learning_rate": 2.797163602394803e-05, "loss": 0.1963, "step": 12634 }, { "epoch": 3.8627331091409354, "grad_norm": 1.5822346210479736, "learning_rate": 2.797121141352809e-05, "loss": 0.1714, "step": 12635 }, { "epoch": 3.8630388260470805, "grad_norm": 1.1170471906661987, "learning_rate": 2.797078680310815e-05, "loss": 0.2031, "step": 12636 }, { "epoch": 3.863344542953225, "grad_norm": 1.5468894243240356, "learning_rate": 2.797036219268821e-05, "loss": 0.2157, "step": 12637 }, { "epoch": 3.8636502598593703, "grad_norm": 1.6510419845581055, "learning_rate": 2.796993758226827e-05, "loss": 0.2426, "step": 12638 }, { "epoch": 3.8639559767655154, "grad_norm": 0.680927038192749, "learning_rate": 2.796951297184833e-05, "loss": 0.1791, "step": 12639 }, { "epoch": 3.86426169367166, "grad_norm": 0.8228338956832886, "learning_rate": 2.796908836142839e-05, "loss": 0.0891, "step": 12640 }, { "epoch": 3.8645674105778047, "grad_norm": 0.29219764471054077, "learning_rate": 2.796866375100845e-05, "loss": 0.0743, "step": 12641 }, { "epoch": 3.86487312748395, "grad_norm": 0.21323026716709137, "learning_rate": 2.796823914058851e-05, "loss": 0.0559, "step": 12642 }, { "epoch": 3.865178844390095, "grad_norm": 0.7985320091247559, "learning_rate": 2.7967814530168572e-05, "loss": 0.0467, "step": 12643 }, { "epoch": 3.8654845612962396, "grad_norm": 0.23243404924869537, "learning_rate": 2.796738991974863e-05, "loss": 0.0586, "step": 12644 }, { "epoch": 3.8657902782023847, "grad_norm": 0.42685237526893616, "learning_rate": 2.7966965309328693e-05, "loss": 0.0857, "step": 12645 }, { "epoch": 3.8660959951085294, "grad_norm": 0.340203195810318, "learning_rate": 2.7966540698908752e-05, "loss": 0.0692, "step": 12646 }, { "epoch": 3.8664017120146745, "grad_norm": 0.47396591305732727, "learning_rate": 2.7966116088488814e-05, "loss": 0.0632, "step": 12647 }, { "epoch": 3.866707428920819, "grad_norm": 0.49132105708122253, "learning_rate": 2.7965691478068873e-05, "loss": 0.0678, "step": 12648 }, { "epoch": 3.8670131458269643, "grad_norm": 0.3315005302429199, "learning_rate": 2.7965266867648935e-05, "loss": 0.0954, "step": 12649 }, { "epoch": 3.867318862733109, "grad_norm": 0.3473191559314728, "learning_rate": 2.7964842257228993e-05, "loss": 0.0844, "step": 12650 }, { "epoch": 3.867624579639254, "grad_norm": 0.5359255075454712, "learning_rate": 2.7964417646809052e-05, "loss": 0.1032, "step": 12651 }, { "epoch": 3.867930296545399, "grad_norm": 0.4376867711544037, "learning_rate": 2.7963993036389114e-05, "loss": 0.1276, "step": 12652 }, { "epoch": 3.868236013451544, "grad_norm": 1.5840872526168823, "learning_rate": 2.7963568425969173e-05, "loss": 0.1192, "step": 12653 }, { "epoch": 3.8685417303576886, "grad_norm": 0.5089139938354492, "learning_rate": 2.7963143815549235e-05, "loss": 0.121, "step": 12654 }, { "epoch": 3.8688474472638337, "grad_norm": 0.6942740082740784, "learning_rate": 2.7962719205129294e-05, "loss": 0.157, "step": 12655 }, { "epoch": 3.869153164169979, "grad_norm": 1.296273112297058, "learning_rate": 2.7962294594709356e-05, "loss": 0.2074, "step": 12656 }, { "epoch": 3.8694588810761235, "grad_norm": 0.9968715310096741, "learning_rate": 2.7961869984289414e-05, "loss": 0.16, "step": 12657 }, { "epoch": 3.8697645979822686, "grad_norm": 0.6678184866905212, "learning_rate": 2.7961445373869477e-05, "loss": 0.1823, "step": 12658 }, { "epoch": 3.8700703148884132, "grad_norm": 0.5490761995315552, "learning_rate": 2.7961020763449535e-05, "loss": 0.1778, "step": 12659 }, { "epoch": 3.8703760317945584, "grad_norm": 1.0841007232666016, "learning_rate": 2.7960596153029597e-05, "loss": 0.2242, "step": 12660 }, { "epoch": 3.870681748700703, "grad_norm": 0.8468596339225769, "learning_rate": 2.7960171542609656e-05, "loss": 0.1793, "step": 12661 }, { "epoch": 3.870987465606848, "grad_norm": 1.0797983407974243, "learning_rate": 2.7959746932189718e-05, "loss": 0.2341, "step": 12662 }, { "epoch": 3.871293182512993, "grad_norm": 1.8849048614501953, "learning_rate": 2.7959322321769777e-05, "loss": 0.25, "step": 12663 }, { "epoch": 3.871598899419138, "grad_norm": 0.4510466754436493, "learning_rate": 2.7958897711349836e-05, "loss": 0.1434, "step": 12664 }, { "epoch": 3.871904616325283, "grad_norm": 0.46719905734062195, "learning_rate": 2.7958473100929898e-05, "loss": 0.0792, "step": 12665 }, { "epoch": 3.8722103332314277, "grad_norm": 0.4482925534248352, "learning_rate": 2.7958048490509956e-05, "loss": 0.0901, "step": 12666 }, { "epoch": 3.8725160501375724, "grad_norm": 0.4874393045902252, "learning_rate": 2.795762388009002e-05, "loss": 0.0846, "step": 12667 }, { "epoch": 3.8728217670437175, "grad_norm": 0.19272422790527344, "learning_rate": 2.7957199269670077e-05, "loss": 0.05, "step": 12668 }, { "epoch": 3.8731274839498626, "grad_norm": 0.4689985513687134, "learning_rate": 2.795677465925014e-05, "loss": 0.0745, "step": 12669 }, { "epoch": 3.8734332008560073, "grad_norm": 0.5978392958641052, "learning_rate": 2.7956350048830198e-05, "loss": 0.1064, "step": 12670 }, { "epoch": 3.8737389177621524, "grad_norm": 0.7187896966934204, "learning_rate": 2.795592543841026e-05, "loss": 0.0632, "step": 12671 }, { "epoch": 3.874044634668297, "grad_norm": 0.6463556885719299, "learning_rate": 2.795550082799032e-05, "loss": 0.0974, "step": 12672 }, { "epoch": 3.874350351574442, "grad_norm": 0.362118661403656, "learning_rate": 2.795507621757038e-05, "loss": 0.0734, "step": 12673 }, { "epoch": 3.874656068480587, "grad_norm": 0.4804350435733795, "learning_rate": 2.795465160715044e-05, "loss": 0.0866, "step": 12674 }, { "epoch": 3.874961785386732, "grad_norm": 0.33483991026878357, "learning_rate": 2.79542269967305e-05, "loss": 0.1038, "step": 12675 }, { "epoch": 3.8752675022928766, "grad_norm": 0.4869299829006195, "learning_rate": 2.795380238631056e-05, "loss": 0.1078, "step": 12676 }, { "epoch": 3.8755732191990218, "grad_norm": 0.820496141910553, "learning_rate": 2.795337777589062e-05, "loss": 0.1435, "step": 12677 }, { "epoch": 3.875878936105167, "grad_norm": 0.5240147709846497, "learning_rate": 2.795295316547068e-05, "loss": 0.1648, "step": 12678 }, { "epoch": 3.8761846530113115, "grad_norm": 0.513158917427063, "learning_rate": 2.795252855505074e-05, "loss": 0.1296, "step": 12679 }, { "epoch": 3.876490369917456, "grad_norm": 0.4675637483596802, "learning_rate": 2.7952103944630802e-05, "loss": 0.1782, "step": 12680 }, { "epoch": 3.8767960868236013, "grad_norm": 0.5051162838935852, "learning_rate": 2.795167933421086e-05, "loss": 0.1584, "step": 12681 }, { "epoch": 3.8771018037297464, "grad_norm": 0.7653627395629883, "learning_rate": 2.7951254723790923e-05, "loss": 0.1839, "step": 12682 }, { "epoch": 3.877407520635891, "grad_norm": 1.355432391166687, "learning_rate": 2.795083011337098e-05, "loss": 0.1682, "step": 12683 }, { "epoch": 3.8777132375420362, "grad_norm": 0.6204313635826111, "learning_rate": 2.7950405502951043e-05, "loss": 0.1919, "step": 12684 }, { "epoch": 3.878018954448181, "grad_norm": 1.1736327409744263, "learning_rate": 2.7949980892531102e-05, "loss": 0.2111, "step": 12685 }, { "epoch": 3.878324671354326, "grad_norm": NaN, "learning_rate": 2.7949980892531102e-05, "loss": 0.1788, "step": 12686 }, { "epoch": 3.8786303882604707, "grad_norm": 3.1464972496032715, "learning_rate": 2.7949556282111164e-05, "loss": 0.2354, "step": 12687 }, { "epoch": 3.878936105166616, "grad_norm": 1.7629034519195557, "learning_rate": 2.7949131671691223e-05, "loss": 0.2322, "step": 12688 }, { "epoch": 3.8792418220727605, "grad_norm": 0.685493528842926, "learning_rate": 2.7948707061271285e-05, "loss": 0.1455, "step": 12689 }, { "epoch": 3.8795475389789056, "grad_norm": 0.44662249088287354, "learning_rate": 2.7948282450851344e-05, "loss": 0.1013, "step": 12690 }, { "epoch": 3.8798532558850507, "grad_norm": 0.2385488897562027, "learning_rate": 2.7947857840431402e-05, "loss": 0.06, "step": 12691 }, { "epoch": 3.8801589727911954, "grad_norm": 0.22552873194217682, "learning_rate": 2.7947433230011465e-05, "loss": 0.0666, "step": 12692 }, { "epoch": 3.88046468969734, "grad_norm": 0.7397958636283875, "learning_rate": 2.7947008619591523e-05, "loss": 0.0574, "step": 12693 }, { "epoch": 3.880770406603485, "grad_norm": 0.34910714626312256, "learning_rate": 2.7946584009171585e-05, "loss": 0.0968, "step": 12694 }, { "epoch": 3.8810761235096303, "grad_norm": 0.2212829887866974, "learning_rate": 2.7946159398751644e-05, "loss": 0.0497, "step": 12695 }, { "epoch": 3.881381840415775, "grad_norm": 0.4381672739982605, "learning_rate": 2.7945734788331706e-05, "loss": 0.0857, "step": 12696 }, { "epoch": 3.88168755732192, "grad_norm": 0.27672502398490906, "learning_rate": 2.7945310177911765e-05, "loss": 0.0676, "step": 12697 }, { "epoch": 3.8819932742280647, "grad_norm": 0.3445660173892975, "learning_rate": 2.7944885567491827e-05, "loss": 0.0852, "step": 12698 }, { "epoch": 3.88229899113421, "grad_norm": 0.5745885372161865, "learning_rate": 2.7944460957071886e-05, "loss": 0.1128, "step": 12699 }, { "epoch": 3.8826047080403545, "grad_norm": 0.5489261746406555, "learning_rate": 2.7944036346651948e-05, "loss": 0.1107, "step": 12700 }, { "epoch": 3.8829104249464996, "grad_norm": 0.26507091522216797, "learning_rate": 2.7943611736232006e-05, "loss": 0.1097, "step": 12701 }, { "epoch": 3.8832161418526443, "grad_norm": 0.35123124718666077, "learning_rate": 2.794318712581207e-05, "loss": 0.1124, "step": 12702 }, { "epoch": 3.8835218587587894, "grad_norm": 0.4571612775325775, "learning_rate": 2.7942762515392127e-05, "loss": 0.1558, "step": 12703 }, { "epoch": 3.8838275756649345, "grad_norm": 0.4518432021141052, "learning_rate": 2.7942337904972186e-05, "loss": 0.1188, "step": 12704 }, { "epoch": 3.884133292571079, "grad_norm": 0.5251731872558594, "learning_rate": 2.7941913294552248e-05, "loss": 0.1631, "step": 12705 }, { "epoch": 3.884439009477224, "grad_norm": 0.993657648563385, "learning_rate": 2.7941488684132307e-05, "loss": 0.1353, "step": 12706 }, { "epoch": 3.884744726383369, "grad_norm": 0.6964758038520813, "learning_rate": 2.794106407371237e-05, "loss": 0.1471, "step": 12707 }, { "epoch": 3.885050443289514, "grad_norm": 1.079908013343811, "learning_rate": 2.7940639463292427e-05, "loss": 0.1828, "step": 12708 }, { "epoch": 3.8853561601956588, "grad_norm": 1.3146628141403198, "learning_rate": 2.794021485287249e-05, "loss": 0.1643, "step": 12709 }, { "epoch": 3.885661877101804, "grad_norm": 0.7540606260299683, "learning_rate": 2.7939790242452548e-05, "loss": 0.2038, "step": 12710 }, { "epoch": 3.8859675940079486, "grad_norm": 0.8865712881088257, "learning_rate": 2.793936563203261e-05, "loss": 0.1938, "step": 12711 }, { "epoch": 3.8862733109140937, "grad_norm": 0.9490522146224976, "learning_rate": 2.793894102161267e-05, "loss": 0.2534, "step": 12712 }, { "epoch": 3.8865790278202383, "grad_norm": 1.8484176397323608, "learning_rate": 2.793851641119273e-05, "loss": 0.2327, "step": 12713 }, { "epoch": 3.8868847447263835, "grad_norm": 1.310009241104126, "learning_rate": 2.793809180077279e-05, "loss": 0.1201, "step": 12714 }, { "epoch": 3.887190461632528, "grad_norm": 0.43328914046287537, "learning_rate": 2.7937667190352852e-05, "loss": 0.0821, "step": 12715 }, { "epoch": 3.8874961785386732, "grad_norm": 0.17895495891571045, "learning_rate": 2.7937242579932914e-05, "loss": 0.0676, "step": 12716 }, { "epoch": 3.8878018954448184, "grad_norm": 0.2049020677804947, "learning_rate": 2.7936817969512973e-05, "loss": 0.0784, "step": 12717 }, { "epoch": 3.888107612350963, "grad_norm": 0.18927831947803497, "learning_rate": 2.7936393359093035e-05, "loss": 0.0587, "step": 12718 }, { "epoch": 3.8884133292571077, "grad_norm": 0.17274145781993866, "learning_rate": 2.7935968748673093e-05, "loss": 0.0478, "step": 12719 }, { "epoch": 3.888719046163253, "grad_norm": 0.2876453995704651, "learning_rate": 2.7935544138253156e-05, "loss": 0.0771, "step": 12720 }, { "epoch": 3.889024763069398, "grad_norm": 0.2086889147758484, "learning_rate": 2.7935119527833214e-05, "loss": 0.0569, "step": 12721 }, { "epoch": 3.8893304799755426, "grad_norm": 0.2826918363571167, "learning_rate": 2.7934694917413276e-05, "loss": 0.0715, "step": 12722 }, { "epoch": 3.8896361968816877, "grad_norm": 0.24911901354789734, "learning_rate": 2.7934270306993335e-05, "loss": 0.0801, "step": 12723 }, { "epoch": 3.8899419137878324, "grad_norm": 0.6786152124404907, "learning_rate": 2.7933845696573397e-05, "loss": 0.1418, "step": 12724 }, { "epoch": 3.8902476306939775, "grad_norm": 0.3590351939201355, "learning_rate": 2.7933421086153456e-05, "loss": 0.085, "step": 12725 }, { "epoch": 3.890553347600122, "grad_norm": 1.3081969022750854, "learning_rate": 2.7932996475733518e-05, "loss": 0.0882, "step": 12726 }, { "epoch": 3.8908590645062673, "grad_norm": 1.2886579036712646, "learning_rate": 2.7932571865313577e-05, "loss": 0.1684, "step": 12727 }, { "epoch": 3.891164781412412, "grad_norm": 0.3913992941379547, "learning_rate": 2.793214725489364e-05, "loss": 0.1416, "step": 12728 }, { "epoch": 3.891470498318557, "grad_norm": 0.858242392539978, "learning_rate": 2.7931722644473697e-05, "loss": 0.1333, "step": 12729 }, { "epoch": 3.891776215224702, "grad_norm": 0.4540872275829315, "learning_rate": 2.7931298034053756e-05, "loss": 0.1485, "step": 12730 }, { "epoch": 3.892081932130847, "grad_norm": 0.9360054731369019, "learning_rate": 2.7930873423633818e-05, "loss": 0.2027, "step": 12731 }, { "epoch": 3.8923876490369915, "grad_norm": 0.8387813568115234, "learning_rate": 2.7930448813213877e-05, "loss": 0.1892, "step": 12732 }, { "epoch": 3.8926933659431366, "grad_norm": 0.9267975091934204, "learning_rate": 2.793002420279394e-05, "loss": 0.2165, "step": 12733 }, { "epoch": 3.8929990828492818, "grad_norm": 0.6864867806434631, "learning_rate": 2.7929599592373998e-05, "loss": 0.1873, "step": 12734 }, { "epoch": 3.8933047997554264, "grad_norm": 0.7088063955307007, "learning_rate": 2.792917498195406e-05, "loss": 0.2129, "step": 12735 }, { "epoch": 3.8936105166615715, "grad_norm": 0.9340026378631592, "learning_rate": 2.792875037153412e-05, "loss": 0.2092, "step": 12736 }, { "epoch": 3.893916233567716, "grad_norm": 1.1072534322738647, "learning_rate": 2.792832576111418e-05, "loss": 0.273, "step": 12737 }, { "epoch": 3.8942219504738613, "grad_norm": 1.8074456453323364, "learning_rate": 2.792790115069424e-05, "loss": 0.2527, "step": 12738 }, { "epoch": 3.894527667380006, "grad_norm": 1.218425989151001, "learning_rate": 2.79274765402743e-05, "loss": 0.1557, "step": 12739 }, { "epoch": 3.894833384286151, "grad_norm": 0.29345351457595825, "learning_rate": 2.792705192985436e-05, "loss": 0.0838, "step": 12740 }, { "epoch": 3.895139101192296, "grad_norm": 0.26436176896095276, "learning_rate": 2.7926627319434422e-05, "loss": 0.0585, "step": 12741 }, { "epoch": 3.895444818098441, "grad_norm": 0.1694384515285492, "learning_rate": 2.792620270901448e-05, "loss": 0.0504, "step": 12742 }, { "epoch": 3.895750535004586, "grad_norm": 0.16130980849266052, "learning_rate": 2.792577809859454e-05, "loss": 0.0495, "step": 12743 }, { "epoch": 3.8960562519107307, "grad_norm": 0.22639405727386475, "learning_rate": 2.79253534881746e-05, "loss": 0.0827, "step": 12744 }, { "epoch": 3.8963619688168754, "grad_norm": 0.188930481672287, "learning_rate": 2.792492887775466e-05, "loss": 0.043, "step": 12745 }, { "epoch": 3.8966676857230205, "grad_norm": 0.6341919898986816, "learning_rate": 2.7924504267334722e-05, "loss": 0.0637, "step": 12746 }, { "epoch": 3.8969734026291656, "grad_norm": 0.38212040066719055, "learning_rate": 2.792407965691478e-05, "loss": 0.0697, "step": 12747 }, { "epoch": 3.8972791195353103, "grad_norm": 0.2371101826429367, "learning_rate": 2.7923655046494843e-05, "loss": 0.0665, "step": 12748 }, { "epoch": 3.8975848364414554, "grad_norm": 0.4689578413963318, "learning_rate": 2.7923230436074902e-05, "loss": 0.1083, "step": 12749 }, { "epoch": 3.8978905533476, "grad_norm": 0.38014277815818787, "learning_rate": 2.7922805825654964e-05, "loss": 0.1003, "step": 12750 }, { "epoch": 3.898196270253745, "grad_norm": 0.31048277020454407, "learning_rate": 2.7922381215235023e-05, "loss": 0.1082, "step": 12751 }, { "epoch": 3.89850198715989, "grad_norm": 0.365779310464859, "learning_rate": 2.7921956604815085e-05, "loss": 0.0924, "step": 12752 }, { "epoch": 3.898807704066035, "grad_norm": 0.5072965025901794, "learning_rate": 2.7921531994395143e-05, "loss": 0.1411, "step": 12753 }, { "epoch": 3.8991134209721796, "grad_norm": 0.42600077390670776, "learning_rate": 2.7921107383975202e-05, "loss": 0.1211, "step": 12754 }, { "epoch": 3.8994191378783247, "grad_norm": 1.0028384923934937, "learning_rate": 2.7920682773555264e-05, "loss": 0.2088, "step": 12755 }, { "epoch": 3.89972485478447, "grad_norm": 0.8246336579322815, "learning_rate": 2.7920258163135323e-05, "loss": 0.1536, "step": 12756 }, { "epoch": 3.9000305716906145, "grad_norm": 0.6138439774513245, "learning_rate": 2.7919833552715385e-05, "loss": 0.1769, "step": 12757 }, { "epoch": 3.900336288596759, "grad_norm": 0.4753778576850891, "learning_rate": 2.7919408942295444e-05, "loss": 0.1809, "step": 12758 }, { "epoch": 3.9006420055029043, "grad_norm": 1.102365493774414, "learning_rate": 2.7918984331875506e-05, "loss": 0.1779, "step": 12759 }, { "epoch": 3.9009477224090494, "grad_norm": 0.756068766117096, "learning_rate": 2.7918559721455565e-05, "loss": 0.169, "step": 12760 }, { "epoch": 3.901253439315194, "grad_norm": 2.273047924041748, "learning_rate": 2.7918135111035627e-05, "loss": 0.1939, "step": 12761 }, { "epoch": 3.901559156221339, "grad_norm": 0.8746192455291748, "learning_rate": 2.7917710500615685e-05, "loss": 0.2398, "step": 12762 }, { "epoch": 3.901864873127484, "grad_norm": 1.293111801147461, "learning_rate": 2.7917285890195747e-05, "loss": 0.2455, "step": 12763 }, { "epoch": 3.902170590033629, "grad_norm": 0.44301196932792664, "learning_rate": 2.7916861279775806e-05, "loss": 0.1236, "step": 12764 }, { "epoch": 3.9024763069397737, "grad_norm": 0.3210761249065399, "learning_rate": 2.7916436669355868e-05, "loss": 0.0693, "step": 12765 }, { "epoch": 3.9027820238459188, "grad_norm": 0.30250489711761475, "learning_rate": 2.7916012058935927e-05, "loss": 0.0994, "step": 12766 }, { "epoch": 3.9030877407520634, "grad_norm": 0.19345293939113617, "learning_rate": 2.7915587448515986e-05, "loss": 0.0539, "step": 12767 }, { "epoch": 3.9033934576582086, "grad_norm": 0.21516959369182587, "learning_rate": 2.7915162838096048e-05, "loss": 0.0442, "step": 12768 }, { "epoch": 3.9036991745643537, "grad_norm": 1.0949440002441406, "learning_rate": 2.7914738227676106e-05, "loss": 0.0794, "step": 12769 }, { "epoch": 3.9040048914704983, "grad_norm": 0.2122853398323059, "learning_rate": 2.791431361725617e-05, "loss": 0.0403, "step": 12770 }, { "epoch": 3.904310608376643, "grad_norm": 0.2571423053741455, "learning_rate": 2.7913889006836227e-05, "loss": 0.0697, "step": 12771 }, { "epoch": 3.904616325282788, "grad_norm": 0.4223651885986328, "learning_rate": 2.791346439641629e-05, "loss": 0.0744, "step": 12772 }, { "epoch": 3.9049220421889332, "grad_norm": 0.692964494228363, "learning_rate": 2.7913039785996348e-05, "loss": 0.0683, "step": 12773 }, { "epoch": 3.905227759095078, "grad_norm": 0.2764548361301422, "learning_rate": 2.791261517557641e-05, "loss": 0.0699, "step": 12774 }, { "epoch": 3.905533476001223, "grad_norm": 0.33426833152770996, "learning_rate": 2.791219056515647e-05, "loss": 0.0759, "step": 12775 }, { "epoch": 3.9058391929073677, "grad_norm": 0.4209083914756775, "learning_rate": 2.791176595473653e-05, "loss": 0.0794, "step": 12776 }, { "epoch": 3.906144909813513, "grad_norm": 0.7264217734336853, "learning_rate": 2.791134134431659e-05, "loss": 0.1072, "step": 12777 }, { "epoch": 3.9064506267196575, "grad_norm": 0.40508294105529785, "learning_rate": 2.791091673389665e-05, "loss": 0.1637, "step": 12778 }, { "epoch": 3.9067563436258026, "grad_norm": 0.5945075750350952, "learning_rate": 2.791049212347671e-05, "loss": 0.1531, "step": 12779 }, { "epoch": 3.9070620605319473, "grad_norm": 0.5529379844665527, "learning_rate": 2.791006751305677e-05, "loss": 0.1666, "step": 12780 }, { "epoch": 3.9073677774380924, "grad_norm": 1.0533090829849243, "learning_rate": 2.790964290263683e-05, "loss": 0.1931, "step": 12781 }, { "epoch": 3.9076734943442375, "grad_norm": 1.1274863481521606, "learning_rate": 2.790921829221689e-05, "loss": 0.154, "step": 12782 }, { "epoch": 3.907979211250382, "grad_norm": 0.8706808090209961, "learning_rate": 2.7908793681796952e-05, "loss": 0.1805, "step": 12783 }, { "epoch": 3.908284928156527, "grad_norm": 1.9931648969650269, "learning_rate": 2.790836907137701e-05, "loss": 0.2172, "step": 12784 }, { "epoch": 3.908590645062672, "grad_norm": 0.9340153932571411, "learning_rate": 2.7907944460957073e-05, "loss": 0.1868, "step": 12785 }, { "epoch": 3.908896361968817, "grad_norm": 1.0103460550308228, "learning_rate": 2.790751985053713e-05, "loss": 0.2128, "step": 12786 }, { "epoch": 3.9092020788749617, "grad_norm": 1.066827416419983, "learning_rate": 2.7907095240117193e-05, "loss": 0.2243, "step": 12787 }, { "epoch": 3.909507795781107, "grad_norm": 2.8389601707458496, "learning_rate": 2.7906670629697252e-05, "loss": 0.2901, "step": 12788 }, { "epoch": 3.9098135126872515, "grad_norm": 0.7854180932044983, "learning_rate": 2.7906246019277314e-05, "loss": 0.137, "step": 12789 }, { "epoch": 3.9101192295933966, "grad_norm": 0.4641217887401581, "learning_rate": 2.7905821408857373e-05, "loss": 0.077, "step": 12790 }, { "epoch": 3.9104249464995413, "grad_norm": 0.2512850761413574, "learning_rate": 2.7905396798437435e-05, "loss": 0.0744, "step": 12791 }, { "epoch": 3.9107306634056864, "grad_norm": 0.25641295313835144, "learning_rate": 2.7904972188017494e-05, "loss": 0.0633, "step": 12792 }, { "epoch": 3.911036380311831, "grad_norm": 0.2716315686702728, "learning_rate": 2.7904547577597552e-05, "loss": 0.0621, "step": 12793 }, { "epoch": 3.911342097217976, "grad_norm": 0.18093951046466827, "learning_rate": 2.7904122967177615e-05, "loss": 0.0657, "step": 12794 }, { "epoch": 3.9116478141241213, "grad_norm": 0.19502438604831696, "learning_rate": 2.7903698356757673e-05, "loss": 0.0651, "step": 12795 }, { "epoch": 3.911953531030266, "grad_norm": 0.2358929067850113, "learning_rate": 2.7903273746337735e-05, "loss": 0.0542, "step": 12796 }, { "epoch": 3.9122592479364107, "grad_norm": 0.3667823076248169, "learning_rate": 2.7902849135917794e-05, "loss": 0.0752, "step": 12797 }, { "epoch": 3.912564964842556, "grad_norm": 0.31471842527389526, "learning_rate": 2.7902424525497856e-05, "loss": 0.0558, "step": 12798 }, { "epoch": 3.912870681748701, "grad_norm": 0.23207080364227295, "learning_rate": 2.7901999915077915e-05, "loss": 0.1005, "step": 12799 }, { "epoch": 3.9131763986548456, "grad_norm": 0.23847700655460358, "learning_rate": 2.7901575304657977e-05, "loss": 0.1023, "step": 12800 }, { "epoch": 3.9134821155609907, "grad_norm": 0.396746963262558, "learning_rate": 2.7901150694238036e-05, "loss": 0.1002, "step": 12801 }, { "epoch": 3.9137878324671354, "grad_norm": 0.3721924424171448, "learning_rate": 2.7900726083818098e-05, "loss": 0.1296, "step": 12802 }, { "epoch": 3.9140935493732805, "grad_norm": 0.37481170892715454, "learning_rate": 2.7900301473398156e-05, "loss": 0.1064, "step": 12803 }, { "epoch": 3.914399266279425, "grad_norm": 0.6592788696289062, "learning_rate": 2.789987686297822e-05, "loss": 0.1485, "step": 12804 }, { "epoch": 3.9147049831855703, "grad_norm": 2.3731799125671387, "learning_rate": 2.7899452252558277e-05, "loss": 0.1393, "step": 12805 }, { "epoch": 3.915010700091715, "grad_norm": 0.8018143177032471, "learning_rate": 2.7899027642138336e-05, "loss": 0.1889, "step": 12806 }, { "epoch": 3.91531641699786, "grad_norm": 0.5641263127326965, "learning_rate": 2.7898603031718398e-05, "loss": 0.1905, "step": 12807 }, { "epoch": 3.915622133904005, "grad_norm": 0.6855986714363098, "learning_rate": 2.7898178421298457e-05, "loss": 0.2104, "step": 12808 }, { "epoch": 3.91592785081015, "grad_norm": 0.5901069641113281, "learning_rate": 2.789775381087852e-05, "loss": 0.1669, "step": 12809 }, { "epoch": 3.9162335677162945, "grad_norm": 0.7875247001647949, "learning_rate": 2.7897329200458577e-05, "loss": 0.1778, "step": 12810 }, { "epoch": 3.9165392846224396, "grad_norm": 1.207879662513733, "learning_rate": 2.789690459003864e-05, "loss": 0.1983, "step": 12811 }, { "epoch": 3.9168450015285847, "grad_norm": 1.0021964311599731, "learning_rate": 2.7896479979618698e-05, "loss": 0.2407, "step": 12812 }, { "epoch": 3.9171507184347294, "grad_norm": 1.9491335153579712, "learning_rate": 2.789605536919876e-05, "loss": 0.2575, "step": 12813 }, { "epoch": 3.9174564353408745, "grad_norm": 0.5022057294845581, "learning_rate": 2.789563075877882e-05, "loss": 0.148, "step": 12814 }, { "epoch": 3.917762152247019, "grad_norm": 0.2948642671108246, "learning_rate": 2.789520614835888e-05, "loss": 0.072, "step": 12815 }, { "epoch": 3.9180678691531643, "grad_norm": 0.4798768162727356, "learning_rate": 2.789478153793894e-05, "loss": 0.0786, "step": 12816 }, { "epoch": 3.918373586059309, "grad_norm": 0.49679526686668396, "learning_rate": 2.7894356927519002e-05, "loss": 0.0663, "step": 12817 }, { "epoch": 3.918679302965454, "grad_norm": 0.32980450987815857, "learning_rate": 2.7893932317099064e-05, "loss": 0.048, "step": 12818 }, { "epoch": 3.9189850198715988, "grad_norm": 0.747016429901123, "learning_rate": 2.7893507706679123e-05, "loss": 0.0506, "step": 12819 }, { "epoch": 3.919290736777744, "grad_norm": 0.2783246636390686, "learning_rate": 2.7893083096259185e-05, "loss": 0.0626, "step": 12820 }, { "epoch": 3.919596453683889, "grad_norm": 0.20648419857025146, "learning_rate": 2.7892658485839243e-05, "loss": 0.0611, "step": 12821 }, { "epoch": 3.9199021705900337, "grad_norm": 0.22106339037418365, "learning_rate": 2.7892233875419306e-05, "loss": 0.0716, "step": 12822 }, { "epoch": 3.9202078874961783, "grad_norm": 0.17246340215206146, "learning_rate": 2.7891809264999364e-05, "loss": 0.0614, "step": 12823 }, { "epoch": 3.9205136044023234, "grad_norm": 0.33810973167419434, "learning_rate": 2.7891384654579426e-05, "loss": 0.1067, "step": 12824 }, { "epoch": 3.9208193213084686, "grad_norm": 0.3592131733894348, "learning_rate": 2.7890960044159485e-05, "loss": 0.071, "step": 12825 }, { "epoch": 3.9211250382146132, "grad_norm": 0.3666788637638092, "learning_rate": 2.7890535433739547e-05, "loss": 0.0824, "step": 12826 }, { "epoch": 3.9214307551207583, "grad_norm": 0.9718689918518066, "learning_rate": 2.7890110823319606e-05, "loss": 0.1474, "step": 12827 }, { "epoch": 3.921736472026903, "grad_norm": 0.33510085940361023, "learning_rate": 2.7889686212899668e-05, "loss": 0.1073, "step": 12828 }, { "epoch": 3.922042188933048, "grad_norm": 0.44898927211761475, "learning_rate": 2.7889261602479727e-05, "loss": 0.1215, "step": 12829 }, { "epoch": 3.922347905839193, "grad_norm": 0.5295097827911377, "learning_rate": 2.788883699205979e-05, "loss": 0.1603, "step": 12830 }, { "epoch": 3.922653622745338, "grad_norm": 0.602199375629425, "learning_rate": 2.7888412381639847e-05, "loss": 0.1491, "step": 12831 }, { "epoch": 3.9229593396514826, "grad_norm": 1.0715205669403076, "learning_rate": 2.7887987771219906e-05, "loss": 0.1708, "step": 12832 }, { "epoch": 3.9232650565576277, "grad_norm": 0.4915609061717987, "learning_rate": 2.7887563160799968e-05, "loss": 0.2093, "step": 12833 }, { "epoch": 3.923570773463773, "grad_norm": 0.5506158471107483, "learning_rate": 2.7887138550380027e-05, "loss": 0.2174, "step": 12834 }, { "epoch": 3.9238764903699175, "grad_norm": 0.6764092445373535, "learning_rate": 2.788671393996009e-05, "loss": 0.2022, "step": 12835 }, { "epoch": 3.924182207276062, "grad_norm": 1.448798656463623, "learning_rate": 2.7886289329540148e-05, "loss": 0.2026, "step": 12836 }, { "epoch": 3.9244879241822073, "grad_norm": 0.6571189761161804, "learning_rate": 2.788586471912021e-05, "loss": 0.168, "step": 12837 }, { "epoch": 3.9247936410883524, "grad_norm": 1.3480943441390991, "learning_rate": 2.788544010870027e-05, "loss": 0.2683, "step": 12838 }, { "epoch": 3.925099357994497, "grad_norm": 0.4012591540813446, "learning_rate": 2.788501549828033e-05, "loss": 0.1184, "step": 12839 }, { "epoch": 3.925405074900642, "grad_norm": 0.41694751381874084, "learning_rate": 2.788459088786039e-05, "loss": 0.1022, "step": 12840 }, { "epoch": 3.925710791806787, "grad_norm": 0.3899916708469391, "learning_rate": 2.788416627744045e-05, "loss": 0.0758, "step": 12841 }, { "epoch": 3.926016508712932, "grad_norm": 0.23494024574756622, "learning_rate": 2.788374166702051e-05, "loss": 0.0724, "step": 12842 }, { "epoch": 3.9263222256190766, "grad_norm": 0.3014655113220215, "learning_rate": 2.7883317056600572e-05, "loss": 0.0618, "step": 12843 }, { "epoch": 3.9266279425252217, "grad_norm": 0.6003100872039795, "learning_rate": 2.788289244618063e-05, "loss": 0.0406, "step": 12844 }, { "epoch": 3.9269336594313664, "grad_norm": 0.682386040687561, "learning_rate": 2.788246783576069e-05, "loss": 0.0597, "step": 12845 }, { "epoch": 3.9272393763375115, "grad_norm": 0.5480945706367493, "learning_rate": 2.788204322534075e-05, "loss": 0.0752, "step": 12846 }, { "epoch": 3.9275450932436566, "grad_norm": 0.4700543284416199, "learning_rate": 2.788161861492081e-05, "loss": 0.0945, "step": 12847 }, { "epoch": 3.9278508101498013, "grad_norm": 0.4244331121444702, "learning_rate": 2.7881194004500872e-05, "loss": 0.0792, "step": 12848 }, { "epoch": 3.928156527055946, "grad_norm": 0.36176005005836487, "learning_rate": 2.788076939408093e-05, "loss": 0.0818, "step": 12849 }, { "epoch": 3.928462243962091, "grad_norm": 0.6713166236877441, "learning_rate": 2.7880344783660993e-05, "loss": 0.087, "step": 12850 }, { "epoch": 3.928767960868236, "grad_norm": 0.5819852948188782, "learning_rate": 2.7879920173241052e-05, "loss": 0.1013, "step": 12851 }, { "epoch": 3.929073677774381, "grad_norm": 2.3580875396728516, "learning_rate": 2.7879495562821114e-05, "loss": 0.1184, "step": 12852 }, { "epoch": 3.929379394680526, "grad_norm": 0.5240365266799927, "learning_rate": 2.7879070952401173e-05, "loss": 0.1268, "step": 12853 }, { "epoch": 3.9296851115866707, "grad_norm": 1.2682782411575317, "learning_rate": 2.7878646341981235e-05, "loss": 0.1484, "step": 12854 }, { "epoch": 3.929990828492816, "grad_norm": 0.553189754486084, "learning_rate": 2.7878221731561293e-05, "loss": 0.1683, "step": 12855 }, { "epoch": 3.9302965453989605, "grad_norm": 0.5474544763565063, "learning_rate": 2.7877797121141356e-05, "loss": 0.1784, "step": 12856 }, { "epoch": 3.9306022623051056, "grad_norm": 1.0723536014556885, "learning_rate": 2.7877372510721414e-05, "loss": 0.2039, "step": 12857 }, { "epoch": 3.9309079792112502, "grad_norm": 0.81844562292099, "learning_rate": 2.7876947900301473e-05, "loss": 0.1913, "step": 12858 }, { "epoch": 3.9312136961173954, "grad_norm": 0.921522319316864, "learning_rate": 2.7876523289881535e-05, "loss": 0.2128, "step": 12859 }, { "epoch": 3.9315194130235405, "grad_norm": 1.2985124588012695, "learning_rate": 2.7876098679461594e-05, "loss": 0.1929, "step": 12860 }, { "epoch": 3.931825129929685, "grad_norm": 1.349522590637207, "learning_rate": 2.7875674069041656e-05, "loss": 0.233, "step": 12861 }, { "epoch": 3.93213084683583, "grad_norm": 1.8991618156433105, "learning_rate": 2.7875249458621715e-05, "loss": 0.2685, "step": 12862 }, { "epoch": 3.932436563741975, "grad_norm": 1.435217022895813, "learning_rate": 2.7874824848201777e-05, "loss": 0.2727, "step": 12863 }, { "epoch": 3.93274228064812, "grad_norm": 0.6399997472763062, "learning_rate": 2.7874400237781835e-05, "loss": 0.1569, "step": 12864 }, { "epoch": 3.9330479975542647, "grad_norm": 0.6699171662330627, "learning_rate": 2.7873975627361897e-05, "loss": 0.1069, "step": 12865 }, { "epoch": 3.93335371446041, "grad_norm": 0.198660209774971, "learning_rate": 2.7873551016941956e-05, "loss": 0.0504, "step": 12866 }, { "epoch": 3.9336594313665545, "grad_norm": 0.21376225352287292, "learning_rate": 2.7873126406522018e-05, "loss": 0.0674, "step": 12867 }, { "epoch": 3.9339651482726996, "grad_norm": 0.267231822013855, "learning_rate": 2.7872701796102077e-05, "loss": 0.0744, "step": 12868 }, { "epoch": 3.9342708651788443, "grad_norm": 0.17424820363521576, "learning_rate": 2.787227718568214e-05, "loss": 0.0707, "step": 12869 }, { "epoch": 3.9345765820849894, "grad_norm": 0.2198694497346878, "learning_rate": 2.7871852575262198e-05, "loss": 0.0479, "step": 12870 }, { "epoch": 3.934882298991134, "grad_norm": 0.21448355913162231, "learning_rate": 2.7871427964842256e-05, "loss": 0.0587, "step": 12871 }, { "epoch": 3.935188015897279, "grad_norm": 0.3380422592163086, "learning_rate": 2.787100335442232e-05, "loss": 0.0778, "step": 12872 }, { "epoch": 3.9354937328034243, "grad_norm": 0.312454491853714, "learning_rate": 2.7870578744002377e-05, "loss": 0.0695, "step": 12873 }, { "epoch": 3.935799449709569, "grad_norm": 0.3527754247188568, "learning_rate": 2.787015413358244e-05, "loss": 0.1134, "step": 12874 }, { "epoch": 3.9361051666157136, "grad_norm": 0.43993595242500305, "learning_rate": 2.7869729523162498e-05, "loss": 0.1254, "step": 12875 }, { "epoch": 3.9364108835218588, "grad_norm": 0.5072477459907532, "learning_rate": 2.786930491274256e-05, "loss": 0.1087, "step": 12876 }, { "epoch": 3.936716600428004, "grad_norm": 0.6820381283760071, "learning_rate": 2.786888030232262e-05, "loss": 0.1328, "step": 12877 }, { "epoch": 3.9370223173341485, "grad_norm": 0.47169771790504456, "learning_rate": 2.786845569190268e-05, "loss": 0.1749, "step": 12878 }, { "epoch": 3.9373280342402937, "grad_norm": 0.5099872946739197, "learning_rate": 2.786803108148274e-05, "loss": 0.1483, "step": 12879 }, { "epoch": 3.9376337511464383, "grad_norm": 0.9071097373962402, "learning_rate": 2.78676064710628e-05, "loss": 0.1815, "step": 12880 }, { "epoch": 3.9379394680525834, "grad_norm": 0.5463353991508484, "learning_rate": 2.786718186064286e-05, "loss": 0.1617, "step": 12881 }, { "epoch": 3.938245184958728, "grad_norm": 0.652990460395813, "learning_rate": 2.786675725022292e-05, "loss": 0.1774, "step": 12882 }, { "epoch": 3.938550901864873, "grad_norm": 1.051215648651123, "learning_rate": 2.786633263980298e-05, "loss": 0.1864, "step": 12883 }, { "epoch": 3.938856618771018, "grad_norm": 1.0634267330169678, "learning_rate": 2.786590802938304e-05, "loss": 0.2069, "step": 12884 }, { "epoch": 3.939162335677163, "grad_norm": 0.738839864730835, "learning_rate": 2.7865483418963102e-05, "loss": 0.1879, "step": 12885 }, { "epoch": 3.939468052583308, "grad_norm": 1.232881784439087, "learning_rate": 2.786505880854316e-05, "loss": 0.196, "step": 12886 }, { "epoch": 3.939773769489453, "grad_norm": 1.26139235496521, "learning_rate": 2.7864634198123223e-05, "loss": 0.2416, "step": 12887 }, { "epoch": 3.9400794863955975, "grad_norm": 1.078547477722168, "learning_rate": 2.786420958770328e-05, "loss": 0.2745, "step": 12888 }, { "epoch": 3.9403852033017426, "grad_norm": 0.3061031997203827, "learning_rate": 2.7863784977283343e-05, "loss": 0.1471, "step": 12889 }, { "epoch": 3.9406909202078877, "grad_norm": 0.3701903820037842, "learning_rate": 2.7863360366863402e-05, "loss": 0.0949, "step": 12890 }, { "epoch": 3.9409966371140324, "grad_norm": 0.36980706453323364, "learning_rate": 2.7862935756443464e-05, "loss": 0.0943, "step": 12891 }, { "epoch": 3.9413023540201775, "grad_norm": 0.2603070139884949, "learning_rate": 2.7862511146023523e-05, "loss": 0.0545, "step": 12892 }, { "epoch": 3.941608070926322, "grad_norm": 0.2418753206729889, "learning_rate": 2.7862086535603585e-05, "loss": 0.0591, "step": 12893 }, { "epoch": 3.9419137878324673, "grad_norm": 0.2944003641605377, "learning_rate": 2.7861661925183644e-05, "loss": 0.0493, "step": 12894 }, { "epoch": 3.942219504738612, "grad_norm": 0.47913146018981934, "learning_rate": 2.7861237314763702e-05, "loss": 0.0504, "step": 12895 }, { "epoch": 3.942525221644757, "grad_norm": 0.3792876601219177, "learning_rate": 2.7860812704343765e-05, "loss": 0.0528, "step": 12896 }, { "epoch": 3.9428309385509017, "grad_norm": 0.4120624363422394, "learning_rate": 2.7860388093923823e-05, "loss": 0.0851, "step": 12897 }, { "epoch": 3.943136655457047, "grad_norm": 0.49041685461997986, "learning_rate": 2.7859963483503885e-05, "loss": 0.0723, "step": 12898 }, { "epoch": 3.943442372363192, "grad_norm": 0.38143086433410645, "learning_rate": 2.7859538873083944e-05, "loss": 0.0903, "step": 12899 }, { "epoch": 3.9437480892693366, "grad_norm": 0.33200570940971375, "learning_rate": 2.7859114262664006e-05, "loss": 0.0776, "step": 12900 }, { "epoch": 3.9440538061754813, "grad_norm": 0.2797792851924896, "learning_rate": 2.7858689652244065e-05, "loss": 0.0869, "step": 12901 }, { "epoch": 3.9443595230816264, "grad_norm": 0.4527464210987091, "learning_rate": 2.7858265041824127e-05, "loss": 0.136, "step": 12902 }, { "epoch": 3.9446652399877715, "grad_norm": 0.4643622040748596, "learning_rate": 2.7857840431404186e-05, "loss": 0.1164, "step": 12903 }, { "epoch": 3.944970956893916, "grad_norm": 0.7257064580917358, "learning_rate": 2.7857415820984248e-05, "loss": 0.1414, "step": 12904 }, { "epoch": 3.9452766738000613, "grad_norm": 0.7935320734977722, "learning_rate": 2.7856991210564306e-05, "loss": 0.1698, "step": 12905 }, { "epoch": 3.945582390706206, "grad_norm": 0.46783432364463806, "learning_rate": 2.785656660014437e-05, "loss": 0.1872, "step": 12906 }, { "epoch": 3.945888107612351, "grad_norm": 0.6540877819061279, "learning_rate": 2.7856141989724427e-05, "loss": 0.1716, "step": 12907 }, { "epoch": 3.9461938245184958, "grad_norm": 0.6668471097946167, "learning_rate": 2.7855717379304486e-05, "loss": 0.1849, "step": 12908 }, { "epoch": 3.946499541424641, "grad_norm": 0.5857793688774109, "learning_rate": 2.7855292768884548e-05, "loss": 0.1602, "step": 12909 }, { "epoch": 3.9468052583307855, "grad_norm": 0.9351179599761963, "learning_rate": 2.7854868158464607e-05, "loss": 0.183, "step": 12910 }, { "epoch": 3.9471109752369307, "grad_norm": 0.7500543594360352, "learning_rate": 2.785444354804467e-05, "loss": 0.2301, "step": 12911 }, { "epoch": 3.947416692143076, "grad_norm": 1.2025268077850342, "learning_rate": 2.7854018937624727e-05, "loss": 0.2524, "step": 12912 }, { "epoch": 3.9477224090492204, "grad_norm": 1.125483751296997, "learning_rate": 2.785359432720479e-05, "loss": 0.2761, "step": 12913 }, { "epoch": 3.948028125955365, "grad_norm": 0.3111622929573059, "learning_rate": 2.7853169716784848e-05, "loss": 0.1473, "step": 12914 }, { "epoch": 3.9483338428615102, "grad_norm": 0.2309739738702774, "learning_rate": 2.785274510636491e-05, "loss": 0.0771, "step": 12915 }, { "epoch": 3.9486395597676553, "grad_norm": 0.3425169885158539, "learning_rate": 2.785232049594497e-05, "loss": 0.0915, "step": 12916 }, { "epoch": 3.9489452766738, "grad_norm": 0.4303465485572815, "learning_rate": 2.785189588552503e-05, "loss": 0.053, "step": 12917 }, { "epoch": 3.949250993579945, "grad_norm": 0.20913124084472656, "learning_rate": 2.785147127510509e-05, "loss": 0.0475, "step": 12918 }, { "epoch": 3.94955671048609, "grad_norm": 0.22092673182487488, "learning_rate": 2.7851046664685152e-05, "loss": 0.0744, "step": 12919 }, { "epoch": 3.949862427392235, "grad_norm": 0.18935316801071167, "learning_rate": 2.7850622054265214e-05, "loss": 0.0605, "step": 12920 }, { "epoch": 3.9501681442983796, "grad_norm": 0.21563056111335754, "learning_rate": 2.7850197443845273e-05, "loss": 0.0663, "step": 12921 }, { "epoch": 3.9504738612045247, "grad_norm": 0.2944338023662567, "learning_rate": 2.7849772833425335e-05, "loss": 0.096, "step": 12922 }, { "epoch": 3.9507795781106694, "grad_norm": 0.5362118482589722, "learning_rate": 2.7849348223005394e-05, "loss": 0.0835, "step": 12923 }, { "epoch": 3.9510852950168145, "grad_norm": 0.46064451336860657, "learning_rate": 2.7848923612585456e-05, "loss": 0.0884, "step": 12924 }, { "epoch": 3.9513910119229596, "grad_norm": 0.23316146433353424, "learning_rate": 2.7848499002165514e-05, "loss": 0.0748, "step": 12925 }, { "epoch": 3.9516967288291043, "grad_norm": 0.46562862396240234, "learning_rate": 2.7848074391745576e-05, "loss": 0.0976, "step": 12926 }, { "epoch": 3.952002445735249, "grad_norm": 0.35250261425971985, "learning_rate": 2.7847649781325635e-05, "loss": 0.1419, "step": 12927 }, { "epoch": 3.952308162641394, "grad_norm": 0.3646871745586395, "learning_rate": 2.7847225170905697e-05, "loss": 0.1358, "step": 12928 }, { "epoch": 3.952613879547539, "grad_norm": 0.5363051891326904, "learning_rate": 2.7846800560485756e-05, "loss": 0.1315, "step": 12929 }, { "epoch": 3.952919596453684, "grad_norm": 0.4267081916332245, "learning_rate": 2.7846375950065818e-05, "loss": 0.1637, "step": 12930 }, { "epoch": 3.953225313359829, "grad_norm": 0.39531266689300537, "learning_rate": 2.7845951339645877e-05, "loss": 0.196, "step": 12931 }, { "epoch": 3.9535310302659736, "grad_norm": 0.591958224773407, "learning_rate": 2.784552672922594e-05, "loss": 0.1495, "step": 12932 }, { "epoch": 3.9538367471721187, "grad_norm": 0.605510950088501, "learning_rate": 2.7845102118805997e-05, "loss": 0.1911, "step": 12933 }, { "epoch": 3.9541424640782634, "grad_norm": 0.7253336310386658, "learning_rate": 2.7844677508386056e-05, "loss": 0.1664, "step": 12934 }, { "epoch": 3.9544481809844085, "grad_norm": 0.7151932120323181, "learning_rate": 2.7844252897966118e-05, "loss": 0.1962, "step": 12935 }, { "epoch": 3.954753897890553, "grad_norm": 1.2851617336273193, "learning_rate": 2.7843828287546177e-05, "loss": 0.1892, "step": 12936 }, { "epoch": 3.9550596147966983, "grad_norm": 0.8114538192749023, "learning_rate": 2.784340367712624e-05, "loss": 0.2215, "step": 12937 }, { "epoch": 3.9553653317028434, "grad_norm": 1.330302119255066, "learning_rate": 2.7842979066706298e-05, "loss": 0.2782, "step": 12938 }, { "epoch": 3.955671048608988, "grad_norm": 0.8392424583435059, "learning_rate": 2.784255445628636e-05, "loss": 0.1636, "step": 12939 }, { "epoch": 3.9559767655151328, "grad_norm": 0.24898689985275269, "learning_rate": 2.784212984586642e-05, "loss": 0.0784, "step": 12940 }, { "epoch": 3.956282482421278, "grad_norm": 0.20165856182575226, "learning_rate": 2.784170523544648e-05, "loss": 0.0623, "step": 12941 }, { "epoch": 3.956588199327423, "grad_norm": 0.7929010987281799, "learning_rate": 2.784128062502654e-05, "loss": 0.0633, "step": 12942 }, { "epoch": 3.9568939162335677, "grad_norm": 0.24790024757385254, "learning_rate": 2.78408560146066e-05, "loss": 0.0527, "step": 12943 }, { "epoch": 3.957199633139713, "grad_norm": 0.2558095455169678, "learning_rate": 2.784043140418666e-05, "loss": 0.0882, "step": 12944 }, { "epoch": 3.9575053500458575, "grad_norm": 0.23334206640720367, "learning_rate": 2.7840006793766722e-05, "loss": 0.0517, "step": 12945 }, { "epoch": 3.9578110669520026, "grad_norm": 0.2631241977214813, "learning_rate": 2.783958218334678e-05, "loss": 0.0594, "step": 12946 }, { "epoch": 3.9581167838581472, "grad_norm": 0.578944981098175, "learning_rate": 2.783915757292684e-05, "loss": 0.0861, "step": 12947 }, { "epoch": 3.9584225007642924, "grad_norm": 0.32248103618621826, "learning_rate": 2.78387329625069e-05, "loss": 0.0783, "step": 12948 }, { "epoch": 3.958728217670437, "grad_norm": 0.6402571797370911, "learning_rate": 2.783830835208696e-05, "loss": 0.1017, "step": 12949 }, { "epoch": 3.959033934576582, "grad_norm": 0.2857830822467804, "learning_rate": 2.7837883741667022e-05, "loss": 0.0725, "step": 12950 }, { "epoch": 3.9593396514827273, "grad_norm": 0.595598042011261, "learning_rate": 2.783745913124708e-05, "loss": 0.1094, "step": 12951 }, { "epoch": 3.959645368388872, "grad_norm": 0.44345834851264954, "learning_rate": 2.7837034520827143e-05, "loss": 0.1337, "step": 12952 }, { "epoch": 3.9599510852950166, "grad_norm": 1.083530306816101, "learning_rate": 2.7836609910407202e-05, "loss": 0.1052, "step": 12953 }, { "epoch": 3.9602568022011617, "grad_norm": 0.3428784906864166, "learning_rate": 2.7836185299987264e-05, "loss": 0.1493, "step": 12954 }, { "epoch": 3.960562519107307, "grad_norm": 0.5745550394058228, "learning_rate": 2.7835760689567323e-05, "loss": 0.1537, "step": 12955 }, { "epoch": 3.9608682360134515, "grad_norm": 0.5295932292938232, "learning_rate": 2.7835336079147385e-05, "loss": 0.1709, "step": 12956 }, { "epoch": 3.9611739529195966, "grad_norm": 0.543388843536377, "learning_rate": 2.7834911468727444e-05, "loss": 0.173, "step": 12957 }, { "epoch": 3.9614796698257413, "grad_norm": 1.1161088943481445, "learning_rate": 2.7834486858307506e-05, "loss": 0.21, "step": 12958 }, { "epoch": 3.9617853867318864, "grad_norm": 0.908129870891571, "learning_rate": 2.7834062247887564e-05, "loss": 0.1755, "step": 12959 }, { "epoch": 3.962091103638031, "grad_norm": 0.984261691570282, "learning_rate": 2.7833637637467623e-05, "loss": 0.2162, "step": 12960 }, { "epoch": 3.962396820544176, "grad_norm": 0.9904894828796387, "learning_rate": 2.7833213027047685e-05, "loss": 0.2253, "step": 12961 }, { "epoch": 3.962702537450321, "grad_norm": 0.8201888203620911, "learning_rate": 2.7832788416627744e-05, "loss": 0.1929, "step": 12962 }, { "epoch": 3.963008254356466, "grad_norm": 1.6137237548828125, "learning_rate": 2.7832363806207806e-05, "loss": 0.2681, "step": 12963 }, { "epoch": 3.963313971262611, "grad_norm": 0.37956374883651733, "learning_rate": 2.7831939195787865e-05, "loss": 0.1685, "step": 12964 }, { "epoch": 3.9636196881687558, "grad_norm": 0.40239790081977844, "learning_rate": 2.7831514585367927e-05, "loss": 0.1019, "step": 12965 }, { "epoch": 3.9639254050749004, "grad_norm": 0.28550857305526733, "learning_rate": 2.7831089974947985e-05, "loss": 0.0547, "step": 12966 }, { "epoch": 3.9642311219810455, "grad_norm": 0.35805800557136536, "learning_rate": 2.7830665364528047e-05, "loss": 0.0576, "step": 12967 }, { "epoch": 3.9645368388871907, "grad_norm": 0.2699202597141266, "learning_rate": 2.7830240754108106e-05, "loss": 0.0793, "step": 12968 }, { "epoch": 3.9648425557933353, "grad_norm": 0.1932174265384674, "learning_rate": 2.7829816143688168e-05, "loss": 0.0502, "step": 12969 }, { "epoch": 3.9651482726994804, "grad_norm": 0.17429250478744507, "learning_rate": 2.7829391533268227e-05, "loss": 0.0532, "step": 12970 }, { "epoch": 3.965453989605625, "grad_norm": 0.33172717690467834, "learning_rate": 2.782896692284829e-05, "loss": 0.0624, "step": 12971 }, { "epoch": 3.9657597065117702, "grad_norm": 0.22129015624523163, "learning_rate": 2.7828542312428348e-05, "loss": 0.0537, "step": 12972 }, { "epoch": 3.966065423417915, "grad_norm": 0.1617840826511383, "learning_rate": 2.7828117702008406e-05, "loss": 0.0424, "step": 12973 }, { "epoch": 3.96637114032406, "grad_norm": 0.42860743403434753, "learning_rate": 2.782769309158847e-05, "loss": 0.1006, "step": 12974 }, { "epoch": 3.9666768572302047, "grad_norm": 0.3906301259994507, "learning_rate": 2.7827268481168527e-05, "loss": 0.0891, "step": 12975 }, { "epoch": 3.96698257413635, "grad_norm": 0.30319294333457947, "learning_rate": 2.782684387074859e-05, "loss": 0.0927, "step": 12976 }, { "epoch": 3.967288291042495, "grad_norm": 0.4714825451374054, "learning_rate": 2.7826419260328648e-05, "loss": 0.1365, "step": 12977 }, { "epoch": 3.9675940079486396, "grad_norm": 0.33611783385276794, "learning_rate": 2.782599464990871e-05, "loss": 0.1061, "step": 12978 }, { "epoch": 3.9678997248547843, "grad_norm": 0.49943050742149353, "learning_rate": 2.782557003948877e-05, "loss": 0.1515, "step": 12979 }, { "epoch": 3.9682054417609294, "grad_norm": 0.5730121731758118, "learning_rate": 2.782514542906883e-05, "loss": 0.1403, "step": 12980 }, { "epoch": 3.9685111586670745, "grad_norm": 0.7508202195167542, "learning_rate": 2.782472081864889e-05, "loss": 0.1668, "step": 12981 }, { "epoch": 3.968816875573219, "grad_norm": 0.6158155798912048, "learning_rate": 2.782429620822895e-05, "loss": 0.1566, "step": 12982 }, { "epoch": 3.969122592479364, "grad_norm": 0.786291778087616, "learning_rate": 2.782387159780901e-05, "loss": 0.2191, "step": 12983 }, { "epoch": 3.969428309385509, "grad_norm": 4.471213340759277, "learning_rate": 2.7823446987389072e-05, "loss": 0.1599, "step": 12984 }, { "epoch": 3.969734026291654, "grad_norm": 3.4205591678619385, "learning_rate": 2.782302237696913e-05, "loss": 0.1939, "step": 12985 }, { "epoch": 3.9700397431977987, "grad_norm": 0.7166623473167419, "learning_rate": 2.782259776654919e-05, "loss": 0.1783, "step": 12986 }, { "epoch": 3.970345460103944, "grad_norm": 0.8187294006347656, "learning_rate": 2.7822173156129252e-05, "loss": 0.2335, "step": 12987 }, { "epoch": 3.9706511770100885, "grad_norm": 1.172858476638794, "learning_rate": 2.782174854570931e-05, "loss": 0.2254, "step": 12988 }, { "epoch": 3.9709568939162336, "grad_norm": 0.2780904173851013, "learning_rate": 2.7821323935289373e-05, "loss": 0.1517, "step": 12989 }, { "epoch": 3.9712626108223787, "grad_norm": 0.49937790632247925, "learning_rate": 2.782089932486943e-05, "loss": 0.0816, "step": 12990 }, { "epoch": 3.9715683277285234, "grad_norm": 0.26140475273132324, "learning_rate": 2.7820474714449494e-05, "loss": 0.0954, "step": 12991 }, { "epoch": 3.971874044634668, "grad_norm": 0.20148010551929474, "learning_rate": 2.7820050104029552e-05, "loss": 0.0702, "step": 12992 }, { "epoch": 3.972179761540813, "grad_norm": 0.19045689702033997, "learning_rate": 2.7819625493609614e-05, "loss": 0.0556, "step": 12993 }, { "epoch": 3.9724854784469583, "grad_norm": 0.44812294840812683, "learning_rate": 2.7819200883189673e-05, "loss": 0.0549, "step": 12994 }, { "epoch": 3.972791195353103, "grad_norm": 0.16541166603565216, "learning_rate": 2.7818776272769735e-05, "loss": 0.0558, "step": 12995 }, { "epoch": 3.9730969122592477, "grad_norm": 0.38328465819358826, "learning_rate": 2.7818351662349794e-05, "loss": 0.0838, "step": 12996 }, { "epoch": 3.9734026291653928, "grad_norm": 0.6651934385299683, "learning_rate": 2.7817927051929853e-05, "loss": 0.0745, "step": 12997 }, { "epoch": 3.973708346071538, "grad_norm": 0.28808534145355225, "learning_rate": 2.7817502441509915e-05, "loss": 0.0662, "step": 12998 }, { "epoch": 3.9740140629776826, "grad_norm": 0.3395620286464691, "learning_rate": 2.7817077831089973e-05, "loss": 0.1079, "step": 12999 }, { "epoch": 3.9743197798838277, "grad_norm": 0.34848931431770325, "learning_rate": 2.7816653220670035e-05, "loss": 0.0925, "step": 13000 }, { "epoch": 3.9743197798838277, "eval_cer": 0.1890881913303438, "eval_loss": 0.23406705260276794, "eval_runtime": 19.0114, "eval_samples_per_second": 238.699, "eval_steps_per_second": 0.789, "eval_wer": 0.3320298315689506, "step": 13000 }, { "epoch": 3.9746254967899723, "grad_norm": 0.27866998314857483, "learning_rate": 2.7816228610250094e-05, "loss": 0.0841, "step": 13001 }, { "epoch": 3.9749312136961175, "grad_norm": 0.3999832570552826, "learning_rate": 2.7815803999830156e-05, "loss": 0.1651, "step": 13002 }, { "epoch": 3.9752369306022626, "grad_norm": 0.8763354420661926, "learning_rate": 2.7815379389410215e-05, "loss": 0.1193, "step": 13003 }, { "epoch": 3.9755426475084072, "grad_norm": 0.7047785520553589, "learning_rate": 2.7814954778990277e-05, "loss": 0.1456, "step": 13004 }, { "epoch": 3.975848364414552, "grad_norm": 0.6140563488006592, "learning_rate": 2.7814530168570336e-05, "loss": 0.1526, "step": 13005 }, { "epoch": 3.976154081320697, "grad_norm": 1.2771928310394287, "learning_rate": 2.7814105558150398e-05, "loss": 0.1761, "step": 13006 }, { "epoch": 3.976459798226842, "grad_norm": 0.5398784279823303, "learning_rate": 2.7813680947730456e-05, "loss": 0.1468, "step": 13007 }, { "epoch": 3.976765515132987, "grad_norm": 2.925163984298706, "learning_rate": 2.781325633731052e-05, "loss": 0.1706, "step": 13008 }, { "epoch": 3.9770712320391315, "grad_norm": 1.2124077081680298, "learning_rate": 2.7812831726890577e-05, "loss": 0.1825, "step": 13009 }, { "epoch": 3.9773769489452766, "grad_norm": 0.947055995464325, "learning_rate": 2.7812407116470636e-05, "loss": 0.1466, "step": 13010 }, { "epoch": 3.9776826658514217, "grad_norm": 0.6207373738288879, "learning_rate": 2.7811982506050698e-05, "loss": 0.1877, "step": 13011 }, { "epoch": 3.9779883827575664, "grad_norm": 1.1321676969528198, "learning_rate": 2.7811557895630757e-05, "loss": 0.2073, "step": 13012 }, { "epoch": 3.9782940996637115, "grad_norm": 3.094856023788452, "learning_rate": 2.781113328521082e-05, "loss": 0.3198, "step": 13013 }, { "epoch": 3.978599816569856, "grad_norm": 0.4718695282936096, "learning_rate": 2.7810708674790878e-05, "loss": 0.1512, "step": 13014 }, { "epoch": 3.9789055334760013, "grad_norm": 0.34490135312080383, "learning_rate": 2.781028406437094e-05, "loss": 0.1302, "step": 13015 }, { "epoch": 3.9792112503821464, "grad_norm": 0.3110799193382263, "learning_rate": 2.7809859453950998e-05, "loss": 0.0574, "step": 13016 }, { "epoch": 3.979516967288291, "grad_norm": 0.48090416193008423, "learning_rate": 2.780943484353106e-05, "loss": 0.0869, "step": 13017 }, { "epoch": 3.9798226841944357, "grad_norm": 0.24548965692520142, "learning_rate": 2.780901023311112e-05, "loss": 0.0482, "step": 13018 }, { "epoch": 3.980128401100581, "grad_norm": 0.2716936767101288, "learning_rate": 2.780858562269118e-05, "loss": 0.075, "step": 13019 }, { "epoch": 3.980434118006726, "grad_norm": 0.2524164915084839, "learning_rate": 2.780816101227124e-05, "loss": 0.0889, "step": 13020 }, { "epoch": 3.9807398349128706, "grad_norm": 0.19210582971572876, "learning_rate": 2.7807736401851302e-05, "loss": 0.058, "step": 13021 }, { "epoch": 3.9810455518190153, "grad_norm": 0.3779352009296417, "learning_rate": 2.7807311791431364e-05, "loss": 0.08, "step": 13022 }, { "epoch": 3.9813512687251604, "grad_norm": 0.2908056676387787, "learning_rate": 2.7806887181011423e-05, "loss": 0.0714, "step": 13023 }, { "epoch": 3.9816569856313055, "grad_norm": 0.3906140923500061, "learning_rate": 2.7806462570591485e-05, "loss": 0.1173, "step": 13024 }, { "epoch": 3.98196270253745, "grad_norm": 0.6357743144035339, "learning_rate": 2.7806037960171544e-05, "loss": 0.0815, "step": 13025 }, { "epoch": 3.9822684194435953, "grad_norm": 0.25729790329933167, "learning_rate": 2.7805613349751606e-05, "loss": 0.0821, "step": 13026 }, { "epoch": 3.98257413634974, "grad_norm": 1.4447846412658691, "learning_rate": 2.7805188739331664e-05, "loss": 0.1296, "step": 13027 }, { "epoch": 3.982879853255885, "grad_norm": 0.31299465894699097, "learning_rate": 2.7804764128911726e-05, "loss": 0.1019, "step": 13028 }, { "epoch": 3.9831855701620302, "grad_norm": 0.49619099497795105, "learning_rate": 2.7804339518491785e-05, "loss": 0.193, "step": 13029 }, { "epoch": 3.983491287068175, "grad_norm": 0.5840837955474854, "learning_rate": 2.7803914908071847e-05, "loss": 0.1773, "step": 13030 }, { "epoch": 3.9837970039743196, "grad_norm": 0.554854691028595, "learning_rate": 2.7803490297651906e-05, "loss": 0.1802, "step": 13031 }, { "epoch": 3.9841027208804647, "grad_norm": 0.7622833251953125, "learning_rate": 2.7803065687231968e-05, "loss": 0.1751, "step": 13032 }, { "epoch": 3.98440843778661, "grad_norm": 0.8360525369644165, "learning_rate": 2.7802641076812027e-05, "loss": 0.1692, "step": 13033 }, { "epoch": 3.9847141546927545, "grad_norm": 1.1400139331817627, "learning_rate": 2.780221646639209e-05, "loss": 0.2068, "step": 13034 }, { "epoch": 3.985019871598899, "grad_norm": 0.8291430473327637, "learning_rate": 2.7801791855972147e-05, "loss": 0.2232, "step": 13035 }, { "epoch": 3.9853255885050443, "grad_norm": 0.8613014221191406, "learning_rate": 2.7801367245552206e-05, "loss": 0.182, "step": 13036 }, { "epoch": 3.9856313054111894, "grad_norm": 1.261579155921936, "learning_rate": 2.7800942635132268e-05, "loss": 0.2082, "step": 13037 }, { "epoch": 3.985937022317334, "grad_norm": 2.3240444660186768, "learning_rate": 2.7800518024712327e-05, "loss": 0.2214, "step": 13038 }, { "epoch": 3.986242739223479, "grad_norm": 0.27571120858192444, "learning_rate": 2.780009341429239e-05, "loss": 0.1294, "step": 13039 }, { "epoch": 3.986548456129624, "grad_norm": 0.38062331080436707, "learning_rate": 2.7799668803872448e-05, "loss": 0.0783, "step": 13040 }, { "epoch": 3.986854173035769, "grad_norm": 0.4973098635673523, "learning_rate": 2.779924419345251e-05, "loss": 0.0994, "step": 13041 }, { "epoch": 3.987159889941914, "grad_norm": 0.2925884425640106, "learning_rate": 2.779881958303257e-05, "loss": 0.0449, "step": 13042 }, { "epoch": 3.9874656068480587, "grad_norm": 0.5972641706466675, "learning_rate": 2.779839497261263e-05, "loss": 0.0497, "step": 13043 }, { "epoch": 3.9877713237542034, "grad_norm": 0.342106431722641, "learning_rate": 2.779797036219269e-05, "loss": 0.0606, "step": 13044 }, { "epoch": 3.9880770406603485, "grad_norm": 0.2992664575576782, "learning_rate": 2.779754575177275e-05, "loss": 0.0685, "step": 13045 }, { "epoch": 3.9883827575664936, "grad_norm": 0.5882282853126526, "learning_rate": 2.779712114135281e-05, "loss": 0.0607, "step": 13046 }, { "epoch": 3.9886884744726383, "grad_norm": 0.31919240951538086, "learning_rate": 2.7796696530932872e-05, "loss": 0.0873, "step": 13047 }, { "epoch": 3.988994191378783, "grad_norm": 0.19517868757247925, "learning_rate": 2.779627192051293e-05, "loss": 0.0452, "step": 13048 }, { "epoch": 3.989299908284928, "grad_norm": 0.7056368589401245, "learning_rate": 2.779584731009299e-05, "loss": 0.0841, "step": 13049 }, { "epoch": 3.989605625191073, "grad_norm": 0.2908773422241211, "learning_rate": 2.7795422699673052e-05, "loss": 0.0878, "step": 13050 }, { "epoch": 3.989911342097218, "grad_norm": 0.3899102210998535, "learning_rate": 2.779499808925311e-05, "loss": 0.0969, "step": 13051 }, { "epoch": 3.990217059003363, "grad_norm": 0.8748526573181152, "learning_rate": 2.7794573478833172e-05, "loss": 0.1668, "step": 13052 }, { "epoch": 3.9905227759095077, "grad_norm": 0.5200314521789551, "learning_rate": 2.779414886841323e-05, "loss": 0.1032, "step": 13053 }, { "epoch": 3.9908284928156528, "grad_norm": 0.4098909795284271, "learning_rate": 2.7793724257993293e-05, "loss": 0.1278, "step": 13054 }, { "epoch": 3.991134209721798, "grad_norm": 0.5371338725090027, "learning_rate": 2.7793299647573352e-05, "loss": 0.1452, "step": 13055 }, { "epoch": 3.9914399266279426, "grad_norm": 0.5276004076004028, "learning_rate": 2.7792875037153414e-05, "loss": 0.1579, "step": 13056 }, { "epoch": 3.9917456435340872, "grad_norm": 0.8973274230957031, "learning_rate": 2.7792450426733473e-05, "loss": 0.172, "step": 13057 }, { "epoch": 3.9920513604402323, "grad_norm": 0.7399389743804932, "learning_rate": 2.7792025816313535e-05, "loss": 0.1622, "step": 13058 }, { "epoch": 3.9923570773463775, "grad_norm": 1.2844983339309692, "learning_rate": 2.7791601205893594e-05, "loss": 0.1816, "step": 13059 }, { "epoch": 3.992662794252522, "grad_norm": 0.7173266410827637, "learning_rate": 2.7791176595473656e-05, "loss": 0.207, "step": 13060 }, { "epoch": 3.992968511158667, "grad_norm": 1.0227144956588745, "learning_rate": 2.7790751985053714e-05, "loss": 0.186, "step": 13061 }, { "epoch": 3.993274228064812, "grad_norm": 1.1135280132293701, "learning_rate": 2.7790327374633773e-05, "loss": 0.2191, "step": 13062 }, { "epoch": 3.993579944970957, "grad_norm": 1.4549038410186768, "learning_rate": 2.7789902764213835e-05, "loss": 0.2508, "step": 13063 }, { "epoch": 3.9938856618771017, "grad_norm": 0.5037142634391785, "learning_rate": 2.7789478153793894e-05, "loss": 0.1329, "step": 13064 }, { "epoch": 3.994191378783247, "grad_norm": 0.23481446504592896, "learning_rate": 2.7789053543373956e-05, "loss": 0.078, "step": 13065 }, { "epoch": 3.9944970956893915, "grad_norm": 0.30090945959091187, "learning_rate": 2.7788628932954015e-05, "loss": 0.0787, "step": 13066 }, { "epoch": 3.9948028125955366, "grad_norm": 0.2992687225341797, "learning_rate": 2.7788204322534077e-05, "loss": 0.0729, "step": 13067 }, { "epoch": 3.9951085295016817, "grad_norm": 0.6511582732200623, "learning_rate": 2.7787779712114135e-05, "loss": 0.0888, "step": 13068 }, { "epoch": 3.9954142464078264, "grad_norm": 0.3258184790611267, "learning_rate": 2.7787355101694197e-05, "loss": 0.0599, "step": 13069 }, { "epoch": 3.995719963313971, "grad_norm": 0.3458271324634552, "learning_rate": 2.7786930491274256e-05, "loss": 0.0638, "step": 13070 }, { "epoch": 3.996025680220116, "grad_norm": 0.34370648860931396, "learning_rate": 2.7786505880854318e-05, "loss": 0.0728, "step": 13071 }, { "epoch": 3.9963313971262613, "grad_norm": 0.26616236567497253, "learning_rate": 2.7786081270434377e-05, "loss": 0.077, "step": 13072 }, { "epoch": 3.996637114032406, "grad_norm": 0.3094501495361328, "learning_rate": 2.778565666001444e-05, "loss": 0.0904, "step": 13073 }, { "epoch": 3.9969428309385506, "grad_norm": 0.8096712231636047, "learning_rate": 2.7785232049594498e-05, "loss": 0.1147, "step": 13074 }, { "epoch": 3.9972485478446957, "grad_norm": 0.39324840903282166, "learning_rate": 2.7784807439174556e-05, "loss": 0.1579, "step": 13075 }, { "epoch": 3.997554264750841, "grad_norm": 0.5214860439300537, "learning_rate": 2.778438282875462e-05, "loss": 0.1692, "step": 13076 }, { "epoch": 3.9978599816569855, "grad_norm": 0.6880795955657959, "learning_rate": 2.7783958218334677e-05, "loss": 0.1279, "step": 13077 }, { "epoch": 3.9981656985631306, "grad_norm": 0.6825464963912964, "learning_rate": 2.778353360791474e-05, "loss": 0.1728, "step": 13078 }, { "epoch": 3.9984714154692753, "grad_norm": 0.6482611298561096, "learning_rate": 2.7783108997494798e-05, "loss": 0.1721, "step": 13079 }, { "epoch": 3.9987771323754204, "grad_norm": 1.0701230764389038, "learning_rate": 2.778268438707486e-05, "loss": 0.1653, "step": 13080 }, { "epoch": 3.9990828492815655, "grad_norm": 0.7238001823425293, "learning_rate": 2.778225977665492e-05, "loss": 0.1765, "step": 13081 }, { "epoch": 3.99938856618771, "grad_norm": 0.8220131993293762, "learning_rate": 2.778183516623498e-05, "loss": 0.18, "step": 13082 }, { "epoch": 3.999694283093855, "grad_norm": 1.890055775642395, "learning_rate": 2.778141055581504e-05, "loss": 0.2375, "step": 13083 }, { "epoch": 4.0, "grad_norm": 0.7529042959213257, "learning_rate": 2.7780985945395102e-05, "loss": 0.2019, "step": 13084 }, { "epoch": 4.000305716906145, "grad_norm": 0.5561637282371521, "learning_rate": 2.778056133497516e-05, "loss": 0.1433, "step": 13085 }, { "epoch": 4.00061143381229, "grad_norm": 0.49350544810295105, "learning_rate": 2.7780136724555222e-05, "loss": 0.0753, "step": 13086 }, { "epoch": 4.0009171507184345, "grad_norm": 0.2820277512073517, "learning_rate": 2.777971211413528e-05, "loss": 0.0731, "step": 13087 }, { "epoch": 4.00122286762458, "grad_norm": 0.2706902027130127, "learning_rate": 2.777928750371534e-05, "loss": 0.0666, "step": 13088 }, { "epoch": 4.001528584530725, "grad_norm": 0.19382183253765106, "learning_rate": 2.7778862893295402e-05, "loss": 0.0651, "step": 13089 }, { "epoch": 4.00183430143687, "grad_norm": 0.5187515020370483, "learning_rate": 2.777843828287546e-05, "loss": 0.0636, "step": 13090 }, { "epoch": 4.002140018343014, "grad_norm": 0.24466392397880554, "learning_rate": 2.7778013672455523e-05, "loss": 0.0655, "step": 13091 }, { "epoch": 4.002445735249159, "grad_norm": 0.4994967579841614, "learning_rate": 2.777758906203558e-05, "loss": 0.0708, "step": 13092 }, { "epoch": 4.002751452155304, "grad_norm": 0.4118143916130066, "learning_rate": 2.7777164451615644e-05, "loss": 0.1139, "step": 13093 }, { "epoch": 4.003057169061449, "grad_norm": 0.4060824513435364, "learning_rate": 2.7776739841195702e-05, "loss": 0.0715, "step": 13094 }, { "epoch": 4.003362885967594, "grad_norm": 0.9333071708679199, "learning_rate": 2.7776315230775764e-05, "loss": 0.0762, "step": 13095 }, { "epoch": 4.003668602873739, "grad_norm": 0.34256935119628906, "learning_rate": 2.7775890620355823e-05, "loss": 0.0667, "step": 13096 }, { "epoch": 4.003974319779884, "grad_norm": 0.5853037238121033, "learning_rate": 2.7775466009935885e-05, "loss": 0.1074, "step": 13097 }, { "epoch": 4.004280036686029, "grad_norm": 0.6122083067893982, "learning_rate": 2.7775041399515944e-05, "loss": 0.1377, "step": 13098 }, { "epoch": 4.004585753592174, "grad_norm": 0.4913578927516937, "learning_rate": 2.7774616789096006e-05, "loss": 0.1179, "step": 13099 }, { "epoch": 4.004891470498318, "grad_norm": 17.43705177307129, "learning_rate": 2.7774192178676065e-05, "loss": 0.1317, "step": 13100 }, { "epoch": 4.005197187404463, "grad_norm": 0.8716374635696411, "learning_rate": 2.7773767568256123e-05, "loss": 0.1583, "step": 13101 }, { "epoch": 4.0055029043106085, "grad_norm": 1.127442479133606, "learning_rate": 2.7773342957836185e-05, "loss": 0.1569, "step": 13102 }, { "epoch": 4.005808621216754, "grad_norm": 0.7430824637413025, "learning_rate": 2.7772918347416244e-05, "loss": 0.1773, "step": 13103 }, { "epoch": 4.006114338122898, "grad_norm": 1.36094069480896, "learning_rate": 2.7772493736996306e-05, "loss": 0.1506, "step": 13104 }, { "epoch": 4.006420055029043, "grad_norm": 0.7315120697021484, "learning_rate": 2.7772069126576365e-05, "loss": 0.1853, "step": 13105 }, { "epoch": 4.006725771935188, "grad_norm": 2.918186664581299, "learning_rate": 2.7771644516156427e-05, "loss": 0.187, "step": 13106 }, { "epoch": 4.007031488841333, "grad_norm": 0.8365120887756348, "learning_rate": 2.7771219905736486e-05, "loss": 0.1945, "step": 13107 }, { "epoch": 4.007337205747477, "grad_norm": 1.1705658435821533, "learning_rate": 2.7770795295316548e-05, "loss": 0.197, "step": 13108 }, { "epoch": 4.0076429226536225, "grad_norm": 1.0838514566421509, "learning_rate": 2.7770370684896606e-05, "loss": 0.2478, "step": 13109 }, { "epoch": 4.007948639559768, "grad_norm": 0.48454856872558594, "learning_rate": 2.776994607447667e-05, "loss": 0.1466, "step": 13110 }, { "epoch": 4.008254356465913, "grad_norm": 0.3421136736869812, "learning_rate": 2.7769521464056727e-05, "loss": 0.0693, "step": 13111 }, { "epoch": 4.008560073372058, "grad_norm": 0.24013668298721313, "learning_rate": 2.7769096853636786e-05, "loss": 0.0908, "step": 13112 }, { "epoch": 4.008865790278202, "grad_norm": 0.24736765027046204, "learning_rate": 2.7768672243216848e-05, "loss": 0.0473, "step": 13113 }, { "epoch": 4.009171507184347, "grad_norm": 0.3492421805858612, "learning_rate": 2.7768247632796907e-05, "loss": 0.0558, "step": 13114 }, { "epoch": 4.009477224090492, "grad_norm": 0.5344838500022888, "learning_rate": 2.776782302237697e-05, "loss": 0.0531, "step": 13115 }, { "epoch": 4.0097829409966375, "grad_norm": 0.2605728805065155, "learning_rate": 2.7767398411957028e-05, "loss": 0.0438, "step": 13116 }, { "epoch": 4.010088657902782, "grad_norm": 0.5695974826812744, "learning_rate": 2.776697380153709e-05, "loss": 0.0635, "step": 13117 }, { "epoch": 4.010394374808927, "grad_norm": 0.4435711205005646, "learning_rate": 2.776654919111715e-05, "loss": 0.0941, "step": 13118 }, { "epoch": 4.010700091715072, "grad_norm": 0.26083335280418396, "learning_rate": 2.776612458069721e-05, "loss": 0.0516, "step": 13119 }, { "epoch": 4.011005808621217, "grad_norm": 0.29490453004837036, "learning_rate": 2.776569997027727e-05, "loss": 0.0955, "step": 13120 }, { "epoch": 4.011311525527361, "grad_norm": 0.3246840536594391, "learning_rate": 2.776527535985733e-05, "loss": 0.0739, "step": 13121 }, { "epoch": 4.011617242433506, "grad_norm": 0.5003387928009033, "learning_rate": 2.776485074943739e-05, "loss": 0.0868, "step": 13122 }, { "epoch": 4.0119229593396515, "grad_norm": 0.4233975410461426, "learning_rate": 2.7764426139017452e-05, "loss": 0.1114, "step": 13123 }, { "epoch": 4.012228676245797, "grad_norm": 0.8697778582572937, "learning_rate": 2.7764001528597514e-05, "loss": 0.1685, "step": 13124 }, { "epoch": 4.012534393151942, "grad_norm": 0.5533639788627625, "learning_rate": 2.7763576918177573e-05, "loss": 0.1214, "step": 13125 }, { "epoch": 4.012840110058086, "grad_norm": 0.5409693121910095, "learning_rate": 2.7763152307757635e-05, "loss": 0.1582, "step": 13126 }, { "epoch": 4.013145826964231, "grad_norm": 0.8201282024383545, "learning_rate": 2.7762727697337694e-05, "loss": 0.1742, "step": 13127 }, { "epoch": 4.013451543870376, "grad_norm": 0.8401950001716614, "learning_rate": 2.7762303086917756e-05, "loss": 0.1837, "step": 13128 }, { "epoch": 4.013757260776521, "grad_norm": 0.6156793832778931, "learning_rate": 2.7761878476497814e-05, "loss": 0.1927, "step": 13129 }, { "epoch": 4.0140629776826655, "grad_norm": 0.6012495160102844, "learning_rate": 2.7761453866077876e-05, "loss": 0.202, "step": 13130 }, { "epoch": 4.014368694588811, "grad_norm": 0.7582480311393738, "learning_rate": 2.7761029255657935e-05, "loss": 0.2123, "step": 13131 }, { "epoch": 4.014674411494956, "grad_norm": 1.7585748434066772, "learning_rate": 2.7760604645237997e-05, "loss": 0.2047, "step": 13132 }, { "epoch": 4.014980128401101, "grad_norm": 1.1534487009048462, "learning_rate": 2.7760180034818056e-05, "loss": 0.2334, "step": 13133 }, { "epoch": 4.015285845307245, "grad_norm": 1.845804214477539, "learning_rate": 2.7759755424398118e-05, "loss": 0.266, "step": 13134 }, { "epoch": 4.01559156221339, "grad_norm": 0.4766070246696472, "learning_rate": 2.7759330813978177e-05, "loss": 0.1547, "step": 13135 }, { "epoch": 4.015897279119535, "grad_norm": 0.570661723613739, "learning_rate": 2.775890620355824e-05, "loss": 0.0915, "step": 13136 }, { "epoch": 4.01620299602568, "grad_norm": 0.44751179218292236, "learning_rate": 2.7758481593138298e-05, "loss": 0.113, "step": 13137 }, { "epoch": 4.0165087129318255, "grad_norm": 0.3262402415275574, "learning_rate": 2.7758056982718356e-05, "loss": 0.071, "step": 13138 }, { "epoch": 4.01681442983797, "grad_norm": 0.20011764764785767, "learning_rate": 2.7757632372298418e-05, "loss": 0.047, "step": 13139 }, { "epoch": 4.017120146744115, "grad_norm": 0.3400900959968567, "learning_rate": 2.7757207761878477e-05, "loss": 0.0441, "step": 13140 }, { "epoch": 4.01742586365026, "grad_norm": 0.19586902856826782, "learning_rate": 2.775678315145854e-05, "loss": 0.0504, "step": 13141 }, { "epoch": 4.017731580556405, "grad_norm": 0.43616411089897156, "learning_rate": 2.7756358541038598e-05, "loss": 0.0628, "step": 13142 }, { "epoch": 4.018037297462549, "grad_norm": 0.2441832423210144, "learning_rate": 2.775593393061866e-05, "loss": 0.0561, "step": 13143 }, { "epoch": 4.0183430143686945, "grad_norm": 0.503880500793457, "learning_rate": 2.775550932019872e-05, "loss": 0.0702, "step": 13144 }, { "epoch": 4.01864873127484, "grad_norm": 0.5915712714195251, "learning_rate": 2.775508470977878e-05, "loss": 0.0984, "step": 13145 }, { "epoch": 4.018954448180985, "grad_norm": 0.5355332493782043, "learning_rate": 2.775466009935884e-05, "loss": 0.0998, "step": 13146 }, { "epoch": 4.019260165087129, "grad_norm": 0.5675991177558899, "learning_rate": 2.77542354889389e-05, "loss": 0.1239, "step": 13147 }, { "epoch": 4.019565881993274, "grad_norm": 0.5127055048942566, "learning_rate": 2.775381087851896e-05, "loss": 0.0918, "step": 13148 }, { "epoch": 4.019871598899419, "grad_norm": 0.5228986740112305, "learning_rate": 2.7753386268099022e-05, "loss": 0.1198, "step": 13149 }, { "epoch": 4.020177315805564, "grad_norm": 0.4891906976699829, "learning_rate": 2.775296165767908e-05, "loss": 0.1595, "step": 13150 }, { "epoch": 4.020483032711709, "grad_norm": 0.6112825870513916, "learning_rate": 2.775253704725914e-05, "loss": 0.1471, "step": 13151 }, { "epoch": 4.020788749617854, "grad_norm": 0.7379589080810547, "learning_rate": 2.7752112436839202e-05, "loss": 0.1528, "step": 13152 }, { "epoch": 4.021094466523999, "grad_norm": 0.5909841656684875, "learning_rate": 2.775168782641926e-05, "loss": 0.1796, "step": 13153 }, { "epoch": 4.021400183430144, "grad_norm": 0.5314594507217407, "learning_rate": 2.7751263215999323e-05, "loss": 0.1684, "step": 13154 }, { "epoch": 4.021705900336289, "grad_norm": 0.5492422580718994, "learning_rate": 2.775083860557938e-05, "loss": 0.1688, "step": 13155 }, { "epoch": 4.022011617242433, "grad_norm": 4.752148151397705, "learning_rate": 2.7750413995159443e-05, "loss": 0.1507, "step": 13156 }, { "epoch": 4.022317334148578, "grad_norm": 0.873337984085083, "learning_rate": 2.7749989384739502e-05, "loss": 0.186, "step": 13157 }, { "epoch": 4.022623051054723, "grad_norm": 1.3064409494400024, "learning_rate": 2.7749564774319564e-05, "loss": 0.1804, "step": 13158 }, { "epoch": 4.0229287679608685, "grad_norm": 2.019625425338745, "learning_rate": 2.7749140163899623e-05, "loss": 0.2162, "step": 13159 }, { "epoch": 4.023234484867013, "grad_norm": 0.3802054524421692, "learning_rate": 2.7748715553479685e-05, "loss": 0.1537, "step": 13160 }, { "epoch": 4.023540201773158, "grad_norm": 0.35138580203056335, "learning_rate": 2.7748290943059744e-05, "loss": 0.075, "step": 13161 }, { "epoch": 4.023845918679303, "grad_norm": 0.26006096601486206, "learning_rate": 2.7747866332639806e-05, "loss": 0.0593, "step": 13162 }, { "epoch": 4.024151635585448, "grad_norm": 0.5428351163864136, "learning_rate": 2.7747441722219864e-05, "loss": 0.0528, "step": 13163 }, { "epoch": 4.024457352491593, "grad_norm": 0.3297906816005707, "learning_rate": 2.7747017111799923e-05, "loss": 0.0626, "step": 13164 }, { "epoch": 4.024763069397737, "grad_norm": 0.245903879404068, "learning_rate": 2.7746592501379985e-05, "loss": 0.0591, "step": 13165 }, { "epoch": 4.0250687863038825, "grad_norm": 0.6710256934165955, "learning_rate": 2.7746167890960044e-05, "loss": 0.0651, "step": 13166 }, { "epoch": 4.025374503210028, "grad_norm": 0.45569875836372375, "learning_rate": 2.7745743280540106e-05, "loss": 0.0713, "step": 13167 }, { "epoch": 4.025680220116173, "grad_norm": 0.3700437545776367, "learning_rate": 2.7745318670120165e-05, "loss": 0.0786, "step": 13168 }, { "epoch": 4.025985937022317, "grad_norm": 0.3435668349266052, "learning_rate": 2.7744894059700227e-05, "loss": 0.0718, "step": 13169 }, { "epoch": 4.026291653928462, "grad_norm": 0.9352173209190369, "learning_rate": 2.7744469449280285e-05, "loss": 0.0891, "step": 13170 }, { "epoch": 4.026597370834607, "grad_norm": 1.3400559425354004, "learning_rate": 2.7744044838860348e-05, "loss": 0.0846, "step": 13171 }, { "epoch": 4.026903087740752, "grad_norm": 1.466352939605713, "learning_rate": 2.7743620228440406e-05, "loss": 0.0987, "step": 13172 }, { "epoch": 4.027208804646897, "grad_norm": 0.3611978590488434, "learning_rate": 2.774319561802047e-05, "loss": 0.1231, "step": 13173 }, { "epoch": 4.027514521553042, "grad_norm": 0.4267370104789734, "learning_rate": 2.7742771007600527e-05, "loss": 0.1297, "step": 13174 }, { "epoch": 4.027820238459187, "grad_norm": 0.5259841680526733, "learning_rate": 2.774234639718059e-05, "loss": 0.1529, "step": 13175 }, { "epoch": 4.028125955365332, "grad_norm": 0.8659301996231079, "learning_rate": 2.7741921786760648e-05, "loss": 0.1448, "step": 13176 }, { "epoch": 4.028431672271477, "grad_norm": 0.8450571894645691, "learning_rate": 2.7741497176340706e-05, "loss": 0.184, "step": 13177 }, { "epoch": 4.028737389177621, "grad_norm": 1.8106836080551147, "learning_rate": 2.774107256592077e-05, "loss": 0.1718, "step": 13178 }, { "epoch": 4.029043106083766, "grad_norm": 0.7797873616218567, "learning_rate": 2.7740647955500827e-05, "loss": 0.1512, "step": 13179 }, { "epoch": 4.0293488229899115, "grad_norm": 0.4887699782848358, "learning_rate": 2.774022334508089e-05, "loss": 0.1789, "step": 13180 }, { "epoch": 4.029654539896057, "grad_norm": 1.3531928062438965, "learning_rate": 2.7739798734660948e-05, "loss": 0.1722, "step": 13181 }, { "epoch": 4.029960256802201, "grad_norm": 0.7850187420845032, "learning_rate": 2.773937412424101e-05, "loss": 0.1948, "step": 13182 }, { "epoch": 4.030265973708346, "grad_norm": 1.2633479833602905, "learning_rate": 2.773894951382107e-05, "loss": 0.2392, "step": 13183 }, { "epoch": 4.030571690614491, "grad_norm": 0.9854757785797119, "learning_rate": 2.773852490340113e-05, "loss": 0.2574, "step": 13184 }, { "epoch": 4.030877407520636, "grad_norm": 0.5333064794540405, "learning_rate": 2.773810029298119e-05, "loss": 0.1399, "step": 13185 }, { "epoch": 4.03118312442678, "grad_norm": 0.30229488015174866, "learning_rate": 2.7737675682561252e-05, "loss": 0.0759, "step": 13186 }, { "epoch": 4.0314888413329255, "grad_norm": 0.5042138695716858, "learning_rate": 2.773725107214131e-05, "loss": 0.0707, "step": 13187 }, { "epoch": 4.031794558239071, "grad_norm": 0.3010985553264618, "learning_rate": 2.7736826461721373e-05, "loss": 0.0655, "step": 13188 }, { "epoch": 4.032100275145216, "grad_norm": 0.4442216455936432, "learning_rate": 2.773640185130143e-05, "loss": 0.0661, "step": 13189 }, { "epoch": 4.032405992051361, "grad_norm": 5.6682538986206055, "learning_rate": 2.773597724088149e-05, "loss": 0.0466, "step": 13190 }, { "epoch": 4.032711708957505, "grad_norm": 0.2679855525493622, "learning_rate": 2.7735552630461552e-05, "loss": 0.0692, "step": 13191 }, { "epoch": 4.03301742586365, "grad_norm": 0.30521583557128906, "learning_rate": 2.773512802004161e-05, "loss": 0.0537, "step": 13192 }, { "epoch": 4.033323142769795, "grad_norm": 0.21537108719348907, "learning_rate": 2.7734703409621673e-05, "loss": 0.0835, "step": 13193 }, { "epoch": 4.03362885967594, "grad_norm": 1.0049387216567993, "learning_rate": 2.773427879920173e-05, "loss": 0.0564, "step": 13194 }, { "epoch": 4.033934576582085, "grad_norm": 0.3992101848125458, "learning_rate": 2.7733854188781794e-05, "loss": 0.1032, "step": 13195 }, { "epoch": 4.03424029348823, "grad_norm": 0.33493342995643616, "learning_rate": 2.7733429578361852e-05, "loss": 0.0745, "step": 13196 }, { "epoch": 4.034546010394375, "grad_norm": 1.133471965789795, "learning_rate": 2.7733004967941914e-05, "loss": 0.1151, "step": 13197 }, { "epoch": 4.03485172730052, "grad_norm": 0.48921576142311096, "learning_rate": 2.7732580357521973e-05, "loss": 0.1165, "step": 13198 }, { "epoch": 4.035157444206664, "grad_norm": 1.1269880533218384, "learning_rate": 2.7732155747102035e-05, "loss": 0.1184, "step": 13199 }, { "epoch": 4.035463161112809, "grad_norm": 0.4284026026725769, "learning_rate": 2.7731731136682094e-05, "loss": 0.1095, "step": 13200 }, { "epoch": 4.0357688780189545, "grad_norm": 0.5981253981590271, "learning_rate": 2.7731306526262156e-05, "loss": 0.1421, "step": 13201 }, { "epoch": 4.0360745949251, "grad_norm": 0.9232502579689026, "learning_rate": 2.7730881915842215e-05, "loss": 0.1448, "step": 13202 }, { "epoch": 4.036380311831245, "grad_norm": 0.8058618903160095, "learning_rate": 2.7730457305422273e-05, "loss": 0.1672, "step": 13203 }, { "epoch": 4.036686028737389, "grad_norm": 0.9634830355644226, "learning_rate": 2.7730032695002335e-05, "loss": 0.1577, "step": 13204 }, { "epoch": 4.036991745643534, "grad_norm": 0.6847993731498718, "learning_rate": 2.7729608084582394e-05, "loss": 0.1859, "step": 13205 }, { "epoch": 4.037297462549679, "grad_norm": 0.9237799644470215, "learning_rate": 2.7729183474162456e-05, "loss": 0.1767, "step": 13206 }, { "epoch": 4.037603179455824, "grad_norm": 2.1278016567230225, "learning_rate": 2.7728758863742515e-05, "loss": 0.1687, "step": 13207 }, { "epoch": 4.0379088963619685, "grad_norm": 0.8240693211555481, "learning_rate": 2.7728334253322577e-05, "loss": 0.1775, "step": 13208 }, { "epoch": 4.038214613268114, "grad_norm": 1.9039785861968994, "learning_rate": 2.7727909642902636e-05, "loss": 0.2498, "step": 13209 }, { "epoch": 4.038520330174259, "grad_norm": 0.3197096586227417, "learning_rate": 2.7727485032482698e-05, "loss": 0.168, "step": 13210 }, { "epoch": 4.038826047080404, "grad_norm": 0.4108821153640747, "learning_rate": 2.7727060422062757e-05, "loss": 0.0875, "step": 13211 }, { "epoch": 4.039131763986548, "grad_norm": 0.31489744782447815, "learning_rate": 2.772663581164282e-05, "loss": 0.0738, "step": 13212 }, { "epoch": 4.039437480892693, "grad_norm": 0.25250086188316345, "learning_rate": 2.7726211201222877e-05, "loss": 0.0691, "step": 13213 }, { "epoch": 4.039743197798838, "grad_norm": 0.2522953152656555, "learning_rate": 2.772578659080294e-05, "loss": 0.0638, "step": 13214 }, { "epoch": 4.040048914704983, "grad_norm": 1.2273188829421997, "learning_rate": 2.7725361980382998e-05, "loss": 0.0646, "step": 13215 }, { "epoch": 4.0403546316111285, "grad_norm": 0.23266352713108063, "learning_rate": 2.7724937369963057e-05, "loss": 0.0515, "step": 13216 }, { "epoch": 4.040660348517273, "grad_norm": 0.2735312581062317, "learning_rate": 2.772451275954312e-05, "loss": 0.0533, "step": 13217 }, { "epoch": 4.040966065423418, "grad_norm": 1.1647151708602905, "learning_rate": 2.7724088149123178e-05, "loss": 0.0839, "step": 13218 }, { "epoch": 4.041271782329563, "grad_norm": 0.36253687739372253, "learning_rate": 2.772366353870324e-05, "loss": 0.0743, "step": 13219 }, { "epoch": 4.041577499235708, "grad_norm": 0.31601226329803467, "learning_rate": 2.77232389282833e-05, "loss": 0.082, "step": 13220 }, { "epoch": 4.041883216141852, "grad_norm": 0.24620763957500458, "learning_rate": 2.772281431786336e-05, "loss": 0.0773, "step": 13221 }, { "epoch": 4.042188933047997, "grad_norm": 0.9322500228881836, "learning_rate": 2.772238970744342e-05, "loss": 0.1163, "step": 13222 }, { "epoch": 4.0424946499541425, "grad_norm": 1.2387644052505493, "learning_rate": 2.772196509702348e-05, "loss": 0.1368, "step": 13223 }, { "epoch": 4.042800366860288, "grad_norm": 0.6523501873016357, "learning_rate": 2.772154048660354e-05, "loss": 0.1484, "step": 13224 }, { "epoch": 4.043106083766432, "grad_norm": 0.6517122387886047, "learning_rate": 2.7721115876183602e-05, "loss": 0.1531, "step": 13225 }, { "epoch": 4.043411800672577, "grad_norm": 0.6880832314491272, "learning_rate": 2.7720691265763664e-05, "loss": 0.1769, "step": 13226 }, { "epoch": 4.043717517578722, "grad_norm": 0.5819571614265442, "learning_rate": 2.7720266655343723e-05, "loss": 0.1642, "step": 13227 }, { "epoch": 4.044023234484867, "grad_norm": 1.0707420110702515, "learning_rate": 2.7719842044923785e-05, "loss": 0.2015, "step": 13228 }, { "epoch": 4.044328951391012, "grad_norm": 1.7314585447311401, "learning_rate": 2.7719417434503844e-05, "loss": 0.1898, "step": 13229 }, { "epoch": 4.044634668297157, "grad_norm": 0.9371612668037415, "learning_rate": 2.7718992824083906e-05, "loss": 0.1793, "step": 13230 }, { "epoch": 4.044940385203302, "grad_norm": 0.5682465434074402, "learning_rate": 2.7718568213663964e-05, "loss": 0.1691, "step": 13231 }, { "epoch": 4.045246102109447, "grad_norm": 1.218571662902832, "learning_rate": 2.7718143603244026e-05, "loss": 0.1931, "step": 13232 }, { "epoch": 4.045551819015592, "grad_norm": 1.1195220947265625, "learning_rate": 2.7717718992824085e-05, "loss": 0.2008, "step": 13233 }, { "epoch": 4.045857535921736, "grad_norm": 1.2325732707977295, "learning_rate": 2.7717294382404147e-05, "loss": 0.2644, "step": 13234 }, { "epoch": 4.046163252827881, "grad_norm": 0.36397120356559753, "learning_rate": 2.7716869771984206e-05, "loss": 0.1415, "step": 13235 }, { "epoch": 4.046468969734026, "grad_norm": 0.2702012062072754, "learning_rate": 2.7716445161564268e-05, "loss": 0.0745, "step": 13236 }, { "epoch": 4.0467746866401715, "grad_norm": 0.2908802330493927, "learning_rate": 2.7716020551144327e-05, "loss": 0.0754, "step": 13237 }, { "epoch": 4.047080403546316, "grad_norm": 0.28217169642448425, "learning_rate": 2.771559594072439e-05, "loss": 0.0635, "step": 13238 }, { "epoch": 4.047386120452461, "grad_norm": 0.23459851741790771, "learning_rate": 2.7715171330304448e-05, "loss": 0.0563, "step": 13239 }, { "epoch": 4.047691837358606, "grad_norm": 0.27578839659690857, "learning_rate": 2.7714746719884506e-05, "loss": 0.0562, "step": 13240 }, { "epoch": 4.047997554264751, "grad_norm": 0.48495253920555115, "learning_rate": 2.771432210946457e-05, "loss": 0.0873, "step": 13241 }, { "epoch": 4.048303271170896, "grad_norm": 0.26001307368278503, "learning_rate": 2.7713897499044627e-05, "loss": 0.0585, "step": 13242 }, { "epoch": 4.04860898807704, "grad_norm": 0.24727904796600342, "learning_rate": 2.771347288862469e-05, "loss": 0.0726, "step": 13243 }, { "epoch": 4.0489147049831855, "grad_norm": 0.21962237358093262, "learning_rate": 2.7713048278204748e-05, "loss": 0.0657, "step": 13244 }, { "epoch": 4.049220421889331, "grad_norm": 0.22305352985858917, "learning_rate": 2.771262366778481e-05, "loss": 0.065, "step": 13245 }, { "epoch": 4.049526138795476, "grad_norm": 0.5533862113952637, "learning_rate": 2.771219905736487e-05, "loss": 0.1079, "step": 13246 }, { "epoch": 4.04983185570162, "grad_norm": 0.36762064695358276, "learning_rate": 2.771177444694493e-05, "loss": 0.0963, "step": 13247 }, { "epoch": 4.050137572607765, "grad_norm": 0.8420436978340149, "learning_rate": 2.771134983652499e-05, "loss": 0.1108, "step": 13248 }, { "epoch": 4.05044328951391, "grad_norm": 0.7324002981185913, "learning_rate": 2.771092522610505e-05, "loss": 0.1926, "step": 13249 }, { "epoch": 4.050749006420055, "grad_norm": 1.2423475980758667, "learning_rate": 2.771050061568511e-05, "loss": 0.158, "step": 13250 }, { "epoch": 4.0510547233261995, "grad_norm": 0.5802162289619446, "learning_rate": 2.7710076005265172e-05, "loss": 0.1603, "step": 13251 }, { "epoch": 4.051360440232345, "grad_norm": 5.434298515319824, "learning_rate": 2.770965139484523e-05, "loss": 0.1861, "step": 13252 }, { "epoch": 4.05166615713849, "grad_norm": 0.6884622573852539, "learning_rate": 2.770922678442529e-05, "loss": 0.1734, "step": 13253 }, { "epoch": 4.051971874044635, "grad_norm": 0.7117375731468201, "learning_rate": 2.7708802174005352e-05, "loss": 0.1955, "step": 13254 }, { "epoch": 4.05227759095078, "grad_norm": 1.1411793231964111, "learning_rate": 2.770837756358541e-05, "loss": 0.1994, "step": 13255 }, { "epoch": 4.052583307856924, "grad_norm": 1.0504168272018433, "learning_rate": 2.7707952953165473e-05, "loss": 0.2277, "step": 13256 }, { "epoch": 4.052889024763069, "grad_norm": 0.9090582728385925, "learning_rate": 2.770752834274553e-05, "loss": 0.1845, "step": 13257 }, { "epoch": 4.0531947416692145, "grad_norm": 1.1021296977996826, "learning_rate": 2.7707103732325593e-05, "loss": 0.2205, "step": 13258 }, { "epoch": 4.05350045857536, "grad_norm": 3.9203855991363525, "learning_rate": 2.7706679121905652e-05, "loss": 0.3079, "step": 13259 }, { "epoch": 4.053806175481504, "grad_norm": 0.30544623732566833, "learning_rate": 2.7706254511485714e-05, "loss": 0.1393, "step": 13260 }, { "epoch": 4.054111892387649, "grad_norm": 0.26308855414390564, "learning_rate": 2.7705829901065773e-05, "loss": 0.0946, "step": 13261 }, { "epoch": 4.054417609293794, "grad_norm": 0.2525118291378021, "learning_rate": 2.7705405290645835e-05, "loss": 0.0782, "step": 13262 }, { "epoch": 4.054723326199939, "grad_norm": 0.248046413064003, "learning_rate": 2.7704980680225894e-05, "loss": 0.0609, "step": 13263 }, { "epoch": 4.055029043106083, "grad_norm": 0.17864291369915009, "learning_rate": 2.7704556069805956e-05, "loss": 0.054, "step": 13264 }, { "epoch": 4.0553347600122285, "grad_norm": 0.21694180369377136, "learning_rate": 2.7704131459386014e-05, "loss": 0.0616, "step": 13265 }, { "epoch": 4.055640476918374, "grad_norm": 0.972935676574707, "learning_rate": 2.7703706848966073e-05, "loss": 0.0642, "step": 13266 }, { "epoch": 4.055946193824519, "grad_norm": 0.2888561487197876, "learning_rate": 2.7703282238546135e-05, "loss": 0.0746, "step": 13267 }, { "epoch": 4.056251910730664, "grad_norm": 0.26751166582107544, "learning_rate": 2.7702857628126194e-05, "loss": 0.0655, "step": 13268 }, { "epoch": 4.056557627636808, "grad_norm": 0.2191745787858963, "learning_rate": 2.7702433017706256e-05, "loss": 0.0602, "step": 13269 }, { "epoch": 4.056863344542953, "grad_norm": 0.30992335081100464, "learning_rate": 2.7702008407286315e-05, "loss": 0.109, "step": 13270 }, { "epoch": 4.057169061449098, "grad_norm": 0.2382793426513672, "learning_rate": 2.7701583796866377e-05, "loss": 0.0827, "step": 13271 }, { "epoch": 4.057474778355243, "grad_norm": 0.3499319553375244, "learning_rate": 2.7701159186446435e-05, "loss": 0.1131, "step": 13272 }, { "epoch": 4.057780495261388, "grad_norm": 0.7869304418563843, "learning_rate": 2.7700734576026498e-05, "loss": 0.1288, "step": 13273 }, { "epoch": 4.058086212167533, "grad_norm": 0.6370041370391846, "learning_rate": 2.7700309965606556e-05, "loss": 0.1124, "step": 13274 }, { "epoch": 4.058391929073678, "grad_norm": 0.48254260420799255, "learning_rate": 2.769988535518662e-05, "loss": 0.1386, "step": 13275 }, { "epoch": 4.058697645979823, "grad_norm": 0.7314375042915344, "learning_rate": 2.7699460744766677e-05, "loss": 0.1398, "step": 13276 }, { "epoch": 4.059003362885967, "grad_norm": 0.5956602692604065, "learning_rate": 2.769903613434674e-05, "loss": 0.1772, "step": 13277 }, { "epoch": 4.059309079792112, "grad_norm": 0.7385565638542175, "learning_rate": 2.7698611523926798e-05, "loss": 0.164, "step": 13278 }, { "epoch": 4.059614796698257, "grad_norm": 1.064041256904602, "learning_rate": 2.7698186913506857e-05, "loss": 0.1702, "step": 13279 }, { "epoch": 4.0599205136044025, "grad_norm": 0.8519697785377502, "learning_rate": 2.769776230308692e-05, "loss": 0.1881, "step": 13280 }, { "epoch": 4.060226230510548, "grad_norm": 0.9634599685668945, "learning_rate": 2.7697337692666977e-05, "loss": 0.163, "step": 13281 }, { "epoch": 4.060531947416692, "grad_norm": 0.6991767287254333, "learning_rate": 2.769691308224704e-05, "loss": 0.1908, "step": 13282 }, { "epoch": 4.060837664322837, "grad_norm": 0.871757984161377, "learning_rate": 2.7696488471827098e-05, "loss": 0.2318, "step": 13283 }, { "epoch": 4.061143381228982, "grad_norm": 1.608367919921875, "learning_rate": 2.769606386140716e-05, "loss": 0.2815, "step": 13284 }, { "epoch": 4.061449098135127, "grad_norm": 0.3256901502609253, "learning_rate": 2.769563925098722e-05, "loss": 0.1233, "step": 13285 }, { "epoch": 4.0617548150412714, "grad_norm": 0.32326510548591614, "learning_rate": 2.769521464056728e-05, "loss": 0.0742, "step": 13286 }, { "epoch": 4.062060531947417, "grad_norm": 0.936809778213501, "learning_rate": 2.769479003014734e-05, "loss": 0.078, "step": 13287 }, { "epoch": 4.062366248853562, "grad_norm": 0.18620572984218597, "learning_rate": 2.7694365419727402e-05, "loss": 0.0495, "step": 13288 }, { "epoch": 4.062671965759707, "grad_norm": 1.2139346599578857, "learning_rate": 2.769394080930746e-05, "loss": 0.0584, "step": 13289 }, { "epoch": 4.062977682665851, "grad_norm": 0.2854005992412567, "learning_rate": 2.7693516198887523e-05, "loss": 0.0603, "step": 13290 }, { "epoch": 4.063283399571996, "grad_norm": 0.4340643584728241, "learning_rate": 2.769309158846758e-05, "loss": 0.0761, "step": 13291 }, { "epoch": 4.063589116478141, "grad_norm": 0.20346635580062866, "learning_rate": 2.769266697804764e-05, "loss": 0.0568, "step": 13292 }, { "epoch": 4.063894833384286, "grad_norm": 0.4495980441570282, "learning_rate": 2.7692242367627702e-05, "loss": 0.0946, "step": 13293 }, { "epoch": 4.0642005502904315, "grad_norm": 0.31495851278305054, "learning_rate": 2.769181775720776e-05, "loss": 0.0775, "step": 13294 }, { "epoch": 4.064506267196576, "grad_norm": 0.5041214227676392, "learning_rate": 2.7691393146787823e-05, "loss": 0.0865, "step": 13295 }, { "epoch": 4.064811984102721, "grad_norm": 0.5187363028526306, "learning_rate": 2.769096853636788e-05, "loss": 0.0946, "step": 13296 }, { "epoch": 4.065117701008866, "grad_norm": 0.3904270529747009, "learning_rate": 2.7690543925947944e-05, "loss": 0.0863, "step": 13297 }, { "epoch": 4.065423417915011, "grad_norm": 0.7110073566436768, "learning_rate": 2.7690119315528002e-05, "loss": 0.1535, "step": 13298 }, { "epoch": 4.065729134821155, "grad_norm": 0.35271570086479187, "learning_rate": 2.7689694705108064e-05, "loss": 0.1494, "step": 13299 }, { "epoch": 4.0660348517273, "grad_norm": 0.3602530062198639, "learning_rate": 2.7689270094688123e-05, "loss": 0.1343, "step": 13300 }, { "epoch": 4.0663405686334455, "grad_norm": 0.8032369613647461, "learning_rate": 2.7688845484268185e-05, "loss": 0.1586, "step": 13301 }, { "epoch": 4.066646285539591, "grad_norm": 0.44293680787086487, "learning_rate": 2.7688420873848244e-05, "loss": 0.1467, "step": 13302 }, { "epoch": 4.066952002445735, "grad_norm": 1.0054148435592651, "learning_rate": 2.7687996263428306e-05, "loss": 0.1697, "step": 13303 }, { "epoch": 4.06725771935188, "grad_norm": 1.3787002563476562, "learning_rate": 2.7687571653008365e-05, "loss": 0.1506, "step": 13304 }, { "epoch": 4.067563436258025, "grad_norm": 0.6149824857711792, "learning_rate": 2.7687147042588423e-05, "loss": 0.2225, "step": 13305 }, { "epoch": 4.06786915316417, "grad_norm": 0.8237692713737488, "learning_rate": 2.7686722432168485e-05, "loss": 0.1723, "step": 13306 }, { "epoch": 4.068174870070315, "grad_norm": 0.999464750289917, "learning_rate": 2.7686297821748544e-05, "loss": 0.2032, "step": 13307 }, { "epoch": 4.0684805869764595, "grad_norm": 1.4945695400238037, "learning_rate": 2.7685873211328606e-05, "loss": 0.2011, "step": 13308 }, { "epoch": 4.068786303882605, "grad_norm": 1.060049057006836, "learning_rate": 2.7685448600908665e-05, "loss": 0.2908, "step": 13309 }, { "epoch": 4.06909202078875, "grad_norm": 0.39099863171577454, "learning_rate": 2.7685023990488727e-05, "loss": 0.139, "step": 13310 }, { "epoch": 4.069397737694895, "grad_norm": 0.3086581230163574, "learning_rate": 2.7684599380068786e-05, "loss": 0.0886, "step": 13311 }, { "epoch": 4.069703454601039, "grad_norm": 1.0196938514709473, "learning_rate": 2.7684174769648848e-05, "loss": 0.0855, "step": 13312 }, { "epoch": 4.070009171507184, "grad_norm": 0.49017825722694397, "learning_rate": 2.7683750159228907e-05, "loss": 0.05, "step": 13313 }, { "epoch": 4.070314888413329, "grad_norm": 0.21760299801826477, "learning_rate": 2.768332554880897e-05, "loss": 0.0728, "step": 13314 }, { "epoch": 4.0706206053194744, "grad_norm": 0.49326661229133606, "learning_rate": 2.7682900938389027e-05, "loss": 0.0441, "step": 13315 }, { "epoch": 4.070926322225619, "grad_norm": 0.17504246532917023, "learning_rate": 2.768247632796909e-05, "loss": 0.0412, "step": 13316 }, { "epoch": 4.071232039131764, "grad_norm": 0.22925521433353424, "learning_rate": 2.7682051717549148e-05, "loss": 0.0552, "step": 13317 }, { "epoch": 4.071537756037909, "grad_norm": 0.22204285860061646, "learning_rate": 2.7681627107129207e-05, "loss": 0.0913, "step": 13318 }, { "epoch": 4.071843472944054, "grad_norm": 0.2704968750476837, "learning_rate": 2.768120249670927e-05, "loss": 0.0551, "step": 13319 }, { "epoch": 4.072149189850199, "grad_norm": 0.436727374792099, "learning_rate": 2.7680777886289328e-05, "loss": 0.0928, "step": 13320 }, { "epoch": 4.072454906756343, "grad_norm": 0.3756893575191498, "learning_rate": 2.768035327586939e-05, "loss": 0.0856, "step": 13321 }, { "epoch": 4.0727606236624885, "grad_norm": 0.4183407127857208, "learning_rate": 2.767992866544945e-05, "loss": 0.0942, "step": 13322 }, { "epoch": 4.073066340568634, "grad_norm": 1.3328133821487427, "learning_rate": 2.767950405502951e-05, "loss": 0.117, "step": 13323 }, { "epoch": 4.073372057474779, "grad_norm": 0.38207876682281494, "learning_rate": 2.767907944460957e-05, "loss": 0.1046, "step": 13324 }, { "epoch": 4.073677774380923, "grad_norm": 1.5391815900802612, "learning_rate": 2.767865483418963e-05, "loss": 0.1885, "step": 13325 }, { "epoch": 4.073983491287068, "grad_norm": 0.794202983379364, "learning_rate": 2.767823022376969e-05, "loss": 0.1599, "step": 13326 }, { "epoch": 4.074289208193213, "grad_norm": 0.5156667828559875, "learning_rate": 2.7677805613349752e-05, "loss": 0.1762, "step": 13327 }, { "epoch": 4.074594925099358, "grad_norm": 0.8615328073501587, "learning_rate": 2.767738100292981e-05, "loss": 0.2216, "step": 13328 }, { "epoch": 4.0749006420055025, "grad_norm": 1.3398884534835815, "learning_rate": 2.7676956392509873e-05, "loss": 0.2202, "step": 13329 }, { "epoch": 4.075206358911648, "grad_norm": 0.7466917634010315, "learning_rate": 2.7676531782089935e-05, "loss": 0.1925, "step": 13330 }, { "epoch": 4.075512075817793, "grad_norm": 0.873742401599884, "learning_rate": 2.7676107171669994e-05, "loss": 0.2022, "step": 13331 }, { "epoch": 4.075817792723938, "grad_norm": 0.6713085174560547, "learning_rate": 2.7675682561250056e-05, "loss": 0.2016, "step": 13332 }, { "epoch": 4.076123509630083, "grad_norm": 5.718562602996826, "learning_rate": 2.7675257950830114e-05, "loss": 0.2121, "step": 13333 }, { "epoch": 4.076429226536227, "grad_norm": 1.6939358711242676, "learning_rate": 2.7674833340410177e-05, "loss": 0.2718, "step": 13334 }, { "epoch": 4.076734943442372, "grad_norm": 0.37923428416252136, "learning_rate": 2.7674408729990235e-05, "loss": 0.1511, "step": 13335 }, { "epoch": 4.077040660348517, "grad_norm": 0.29890885949134827, "learning_rate": 2.7673984119570297e-05, "loss": 0.0844, "step": 13336 }, { "epoch": 4.0773463772546625, "grad_norm": 0.5412906408309937, "learning_rate": 2.7673559509150356e-05, "loss": 0.094, "step": 13337 }, { "epoch": 4.077652094160807, "grad_norm": 0.27878448367118835, "learning_rate": 2.7673134898730418e-05, "loss": 0.0471, "step": 13338 }, { "epoch": 4.077957811066952, "grad_norm": 0.7297509908676147, "learning_rate": 2.7672710288310477e-05, "loss": 0.0612, "step": 13339 }, { "epoch": 4.078263527973097, "grad_norm": 0.284709095954895, "learning_rate": 2.767228567789054e-05, "loss": 0.0577, "step": 13340 }, { "epoch": 4.078569244879242, "grad_norm": 0.42724883556365967, "learning_rate": 2.7671861067470598e-05, "loss": 0.0773, "step": 13341 }, { "epoch": 4.078874961785386, "grad_norm": 0.2504649758338928, "learning_rate": 2.767143645705066e-05, "loss": 0.0468, "step": 13342 }, { "epoch": 4.0791806786915314, "grad_norm": 0.6507853269577026, "learning_rate": 2.767101184663072e-05, "loss": 0.1055, "step": 13343 }, { "epoch": 4.079486395597677, "grad_norm": 0.22269797325134277, "learning_rate": 2.7670587236210777e-05, "loss": 0.0731, "step": 13344 }, { "epoch": 4.079792112503822, "grad_norm": 0.4406808018684387, "learning_rate": 2.767016262579084e-05, "loss": 0.0784, "step": 13345 }, { "epoch": 4.080097829409967, "grad_norm": 0.3152027130126953, "learning_rate": 2.7669738015370898e-05, "loss": 0.074, "step": 13346 }, { "epoch": 4.080403546316111, "grad_norm": 0.7587950825691223, "learning_rate": 2.766931340495096e-05, "loss": 0.134, "step": 13347 }, { "epoch": 4.080709263222256, "grad_norm": 0.6022463440895081, "learning_rate": 2.766888879453102e-05, "loss": 0.0994, "step": 13348 }, { "epoch": 4.081014980128401, "grad_norm": 0.5934556722640991, "learning_rate": 2.766846418411108e-05, "loss": 0.1405, "step": 13349 }, { "epoch": 4.081320697034546, "grad_norm": 0.5627138018608093, "learning_rate": 2.766803957369114e-05, "loss": 0.1314, "step": 13350 }, { "epoch": 4.081626413940691, "grad_norm": 0.42455270886421204, "learning_rate": 2.76676149632712e-05, "loss": 0.1618, "step": 13351 }, { "epoch": 4.081932130846836, "grad_norm": 0.8116849064826965, "learning_rate": 2.766719035285126e-05, "loss": 0.1812, "step": 13352 }, { "epoch": 4.082237847752981, "grad_norm": 0.8662304878234863, "learning_rate": 2.7666765742431322e-05, "loss": 0.1463, "step": 13353 }, { "epoch": 4.082543564659126, "grad_norm": 0.7022531032562256, "learning_rate": 2.766634113201138e-05, "loss": 0.191, "step": 13354 }, { "epoch": 4.08284928156527, "grad_norm": 3.0124266147613525, "learning_rate": 2.766591652159144e-05, "loss": 0.1984, "step": 13355 }, { "epoch": 4.083154998471415, "grad_norm": 0.7757308483123779, "learning_rate": 2.7665491911171502e-05, "loss": 0.222, "step": 13356 }, { "epoch": 4.08346071537756, "grad_norm": 0.7451691627502441, "learning_rate": 2.766506730075156e-05, "loss": 0.2153, "step": 13357 }, { "epoch": 4.0837664322837055, "grad_norm": 1.5831578969955444, "learning_rate": 2.7664642690331623e-05, "loss": 0.1957, "step": 13358 }, { "epoch": 4.084072149189851, "grad_norm": 2.6776318550109863, "learning_rate": 2.766421807991168e-05, "loss": 0.276, "step": 13359 }, { "epoch": 4.084377866095995, "grad_norm": 0.36884912848472595, "learning_rate": 2.7663793469491743e-05, "loss": 0.1787, "step": 13360 }, { "epoch": 4.08468358300214, "grad_norm": 0.36766737699508667, "learning_rate": 2.7663368859071802e-05, "loss": 0.0831, "step": 13361 }, { "epoch": 4.084989299908285, "grad_norm": 0.3524606227874756, "learning_rate": 2.7662944248651864e-05, "loss": 0.062, "step": 13362 }, { "epoch": 4.08529501681443, "grad_norm": 0.39536333084106445, "learning_rate": 2.7662519638231923e-05, "loss": 0.0852, "step": 13363 }, { "epoch": 4.085600733720574, "grad_norm": 0.44432905316352844, "learning_rate": 2.7662095027811985e-05, "loss": 0.0927, "step": 13364 }, { "epoch": 4.0859064506267195, "grad_norm": 0.2595023810863495, "learning_rate": 2.7661670417392044e-05, "loss": 0.0634, "step": 13365 }, { "epoch": 4.086212167532865, "grad_norm": 0.6353480815887451, "learning_rate": 2.7661245806972106e-05, "loss": 0.0683, "step": 13366 }, { "epoch": 4.08651788443901, "grad_norm": 0.499379426240921, "learning_rate": 2.7660821196552164e-05, "loss": 0.0373, "step": 13367 }, { "epoch": 4.086823601345154, "grad_norm": 0.31915393471717834, "learning_rate": 2.7660396586132223e-05, "loss": 0.0704, "step": 13368 }, { "epoch": 4.087129318251299, "grad_norm": 1.034541130065918, "learning_rate": 2.7659971975712285e-05, "loss": 0.0722, "step": 13369 }, { "epoch": 4.087435035157444, "grad_norm": 0.5514142513275146, "learning_rate": 2.7659547365292344e-05, "loss": 0.077, "step": 13370 }, { "epoch": 4.087740752063589, "grad_norm": 0.5231286883354187, "learning_rate": 2.7659122754872406e-05, "loss": 0.1065, "step": 13371 }, { "epoch": 4.0880464689697344, "grad_norm": 0.4818457365036011, "learning_rate": 2.7658698144452465e-05, "loss": 0.1174, "step": 13372 }, { "epoch": 4.088352185875879, "grad_norm": 0.45574235916137695, "learning_rate": 2.7658273534032527e-05, "loss": 0.1315, "step": 13373 }, { "epoch": 4.088657902782024, "grad_norm": 0.3599547743797302, "learning_rate": 2.7657848923612585e-05, "loss": 0.1117, "step": 13374 }, { "epoch": 4.088963619688169, "grad_norm": 0.5157399773597717, "learning_rate": 2.7657424313192648e-05, "loss": 0.146, "step": 13375 }, { "epoch": 4.089269336594314, "grad_norm": 0.6005337238311768, "learning_rate": 2.7656999702772706e-05, "loss": 0.1505, "step": 13376 }, { "epoch": 4.089575053500458, "grad_norm": 0.6334528923034668, "learning_rate": 2.765657509235277e-05, "loss": 0.1728, "step": 13377 }, { "epoch": 4.089880770406603, "grad_norm": 0.8277460336685181, "learning_rate": 2.7656150481932827e-05, "loss": 0.1576, "step": 13378 }, { "epoch": 4.0901864873127485, "grad_norm": 0.6884830594062805, "learning_rate": 2.765572587151289e-05, "loss": 0.1557, "step": 13379 }, { "epoch": 4.090492204218894, "grad_norm": 0.6673216223716736, "learning_rate": 2.7655301261092948e-05, "loss": 0.165, "step": 13380 }, { "epoch": 4.090797921125038, "grad_norm": 0.6315158009529114, "learning_rate": 2.7654876650673007e-05, "loss": 0.1954, "step": 13381 }, { "epoch": 4.091103638031183, "grad_norm": 0.8984721899032593, "learning_rate": 2.765445204025307e-05, "loss": 0.192, "step": 13382 }, { "epoch": 4.091409354937328, "grad_norm": 1.377252459526062, "learning_rate": 2.7654027429833127e-05, "loss": 0.2543, "step": 13383 }, { "epoch": 4.091715071843473, "grad_norm": 1.710036039352417, "learning_rate": 2.765360281941319e-05, "loss": 0.2307, "step": 13384 }, { "epoch": 4.092020788749618, "grad_norm": 0.3617965877056122, "learning_rate": 2.7653178208993248e-05, "loss": 0.1336, "step": 13385 }, { "epoch": 4.0923265056557625, "grad_norm": 0.2534744441509247, "learning_rate": 2.765275359857331e-05, "loss": 0.0815, "step": 13386 }, { "epoch": 4.092632222561908, "grad_norm": 0.2625056803226471, "learning_rate": 2.765232898815337e-05, "loss": 0.0714, "step": 13387 }, { "epoch": 4.092937939468053, "grad_norm": 0.27595090866088867, "learning_rate": 2.765190437773343e-05, "loss": 0.0628, "step": 13388 }, { "epoch": 4.093243656374198, "grad_norm": 0.26508739590644836, "learning_rate": 2.765147976731349e-05, "loss": 0.069, "step": 13389 }, { "epoch": 4.093549373280342, "grad_norm": 0.7136226892471313, "learning_rate": 2.7651055156893552e-05, "loss": 0.0636, "step": 13390 }, { "epoch": 4.093855090186487, "grad_norm": 0.2371574193239212, "learning_rate": 2.765063054647361e-05, "loss": 0.0612, "step": 13391 }, { "epoch": 4.094160807092632, "grad_norm": 0.37909889221191406, "learning_rate": 2.7650205936053673e-05, "loss": 0.0652, "step": 13392 }, { "epoch": 4.094466523998777, "grad_norm": 0.3401952087879181, "learning_rate": 2.764978132563373e-05, "loss": 0.0714, "step": 13393 }, { "epoch": 4.094772240904922, "grad_norm": 0.31582143902778625, "learning_rate": 2.764935671521379e-05, "loss": 0.0572, "step": 13394 }, { "epoch": 4.095077957811067, "grad_norm": 0.44678089022636414, "learning_rate": 2.7648932104793852e-05, "loss": 0.1038, "step": 13395 }, { "epoch": 4.095383674717212, "grad_norm": 1.1464321613311768, "learning_rate": 2.764850749437391e-05, "loss": 0.0775, "step": 13396 }, { "epoch": 4.095689391623357, "grad_norm": 0.4697396457195282, "learning_rate": 2.7648082883953973e-05, "loss": 0.0811, "step": 13397 }, { "epoch": 4.095995108529502, "grad_norm": 0.8363460898399353, "learning_rate": 2.764765827353403e-05, "loss": 0.1172, "step": 13398 }, { "epoch": 4.096300825435646, "grad_norm": 0.4350399971008301, "learning_rate": 2.7647233663114094e-05, "loss": 0.1074, "step": 13399 }, { "epoch": 4.0966065423417914, "grad_norm": 0.5419108867645264, "learning_rate": 2.7646809052694152e-05, "loss": 0.1234, "step": 13400 }, { "epoch": 4.096912259247937, "grad_norm": 0.927909791469574, "learning_rate": 2.7646384442274214e-05, "loss": 0.1779, "step": 13401 }, { "epoch": 4.097217976154082, "grad_norm": 0.6581662893295288, "learning_rate": 2.7645959831854273e-05, "loss": 0.1708, "step": 13402 }, { "epoch": 4.097523693060226, "grad_norm": 0.6185435652732849, "learning_rate": 2.7645535221434335e-05, "loss": 0.1577, "step": 13403 }, { "epoch": 4.097829409966371, "grad_norm": 0.9988043904304504, "learning_rate": 2.7645110611014394e-05, "loss": 0.1683, "step": 13404 }, { "epoch": 4.098135126872516, "grad_norm": 2.774578332901001, "learning_rate": 2.7644686000594456e-05, "loss": 0.2023, "step": 13405 }, { "epoch": 4.098440843778661, "grad_norm": 0.9952835440635681, "learning_rate": 2.7644261390174515e-05, "loss": 0.1967, "step": 13406 }, { "epoch": 4.0987465606848055, "grad_norm": 1.3824448585510254, "learning_rate": 2.7643836779754573e-05, "loss": 0.1823, "step": 13407 }, { "epoch": 4.099052277590951, "grad_norm": 1.7185909748077393, "learning_rate": 2.7643412169334635e-05, "loss": 0.2247, "step": 13408 }, { "epoch": 4.099357994497096, "grad_norm": 2.781747341156006, "learning_rate": 2.7642987558914694e-05, "loss": 0.1903, "step": 13409 }, { "epoch": 4.099663711403241, "grad_norm": 0.4145406186580658, "learning_rate": 2.7642562948494756e-05, "loss": 0.1356, "step": 13410 }, { "epoch": 4.099969428309386, "grad_norm": 0.23560862243175507, "learning_rate": 2.7642138338074815e-05, "loss": 0.0894, "step": 13411 }, { "epoch": 4.10027514521553, "grad_norm": 0.33029523491859436, "learning_rate": 2.7641713727654877e-05, "loss": 0.0786, "step": 13412 }, { "epoch": 4.100580862121675, "grad_norm": 0.20311781764030457, "learning_rate": 2.7641289117234936e-05, "loss": 0.0518, "step": 13413 }, { "epoch": 4.10088657902782, "grad_norm": 0.3655914068222046, "learning_rate": 2.7640864506814998e-05, "loss": 0.0596, "step": 13414 }, { "epoch": 4.1011922959339655, "grad_norm": 0.44152769446372986, "learning_rate": 2.7640439896395057e-05, "loss": 0.0686, "step": 13415 }, { "epoch": 4.10149801284011, "grad_norm": 0.2820490896701813, "learning_rate": 2.764001528597512e-05, "loss": 0.0633, "step": 13416 }, { "epoch": 4.101803729746255, "grad_norm": 0.18942564725875854, "learning_rate": 2.7639590675555177e-05, "loss": 0.0693, "step": 13417 }, { "epoch": 4.1021094466524, "grad_norm": 0.41294124722480774, "learning_rate": 2.763916606513524e-05, "loss": 0.0952, "step": 13418 }, { "epoch": 4.102415163558545, "grad_norm": 0.7048726081848145, "learning_rate": 2.7638741454715298e-05, "loss": 0.0856, "step": 13419 }, { "epoch": 4.102720880464689, "grad_norm": 0.9609678387641907, "learning_rate": 2.7638316844295357e-05, "loss": 0.1303, "step": 13420 }, { "epoch": 4.103026597370834, "grad_norm": 0.4411744773387909, "learning_rate": 2.763789223387542e-05, "loss": 0.1154, "step": 13421 }, { "epoch": 4.1033323142769795, "grad_norm": 0.3789236545562744, "learning_rate": 2.7637467623455478e-05, "loss": 0.0944, "step": 13422 }, { "epoch": 4.103638031183125, "grad_norm": 0.4466986656188965, "learning_rate": 2.763704301303554e-05, "loss": 0.1286, "step": 13423 }, { "epoch": 4.10394374808927, "grad_norm": 0.5159034132957458, "learning_rate": 2.76366184026156e-05, "loss": 0.1061, "step": 13424 }, { "epoch": 4.104249464995414, "grad_norm": 0.5575665831565857, "learning_rate": 2.763619379219566e-05, "loss": 0.1452, "step": 13425 }, { "epoch": 4.104555181901559, "grad_norm": 0.6690864562988281, "learning_rate": 2.763576918177572e-05, "loss": 0.142, "step": 13426 }, { "epoch": 4.104860898807704, "grad_norm": 0.6304564476013184, "learning_rate": 2.763534457135578e-05, "loss": 0.1548, "step": 13427 }, { "epoch": 4.105166615713849, "grad_norm": 0.7957323789596558, "learning_rate": 2.763491996093584e-05, "loss": 0.1813, "step": 13428 }, { "epoch": 4.105472332619994, "grad_norm": 0.6598344445228577, "learning_rate": 2.7634495350515902e-05, "loss": 0.1792, "step": 13429 }, { "epoch": 4.105778049526139, "grad_norm": 0.6299479007720947, "learning_rate": 2.763407074009596e-05, "loss": 0.1668, "step": 13430 }, { "epoch": 4.106083766432284, "grad_norm": 1.0268802642822266, "learning_rate": 2.7633646129676023e-05, "loss": 0.1507, "step": 13431 }, { "epoch": 4.106389483338429, "grad_norm": 1.2738488912582397, "learning_rate": 2.7633221519256085e-05, "loss": 0.2237, "step": 13432 }, { "epoch": 4.106695200244573, "grad_norm": 1.1543468236923218, "learning_rate": 2.7632796908836144e-05, "loss": 0.2225, "step": 13433 }, { "epoch": 4.107000917150718, "grad_norm": 1.523202896118164, "learning_rate": 2.7632372298416206e-05, "loss": 0.3185, "step": 13434 }, { "epoch": 4.107306634056863, "grad_norm": 0.8906022906303406, "learning_rate": 2.7631947687996264e-05, "loss": 0.1727, "step": 13435 }, { "epoch": 4.1076123509630085, "grad_norm": 0.2036709487438202, "learning_rate": 2.7631523077576327e-05, "loss": 0.0785, "step": 13436 }, { "epoch": 4.107918067869154, "grad_norm": 0.46415209770202637, "learning_rate": 2.7631098467156385e-05, "loss": 0.0985, "step": 13437 }, { "epoch": 4.108223784775298, "grad_norm": 0.6526029109954834, "learning_rate": 2.7630673856736447e-05, "loss": 0.089, "step": 13438 }, { "epoch": 4.108529501681443, "grad_norm": 0.2234593778848648, "learning_rate": 2.7630249246316506e-05, "loss": 0.0652, "step": 13439 }, { "epoch": 4.108835218587588, "grad_norm": 0.3694942593574524, "learning_rate": 2.7629824635896568e-05, "loss": 0.0587, "step": 13440 }, { "epoch": 4.109140935493733, "grad_norm": 0.7152299284934998, "learning_rate": 2.7629400025476627e-05, "loss": 0.0793, "step": 13441 }, { "epoch": 4.109446652399877, "grad_norm": 0.4256362318992615, "learning_rate": 2.762897541505669e-05, "loss": 0.0604, "step": 13442 }, { "epoch": 4.1097523693060225, "grad_norm": 0.40510478615760803, "learning_rate": 2.7628550804636748e-05, "loss": 0.0878, "step": 13443 }, { "epoch": 4.110058086212168, "grad_norm": 0.7683449387550354, "learning_rate": 2.762812619421681e-05, "loss": 0.0707, "step": 13444 }, { "epoch": 4.110363803118313, "grad_norm": 0.5027934312820435, "learning_rate": 2.762770158379687e-05, "loss": 0.0987, "step": 13445 }, { "epoch": 4.110669520024457, "grad_norm": 0.6219208836555481, "learning_rate": 2.7627276973376927e-05, "loss": 0.0933, "step": 13446 }, { "epoch": 4.110975236930602, "grad_norm": 0.48766809701919556, "learning_rate": 2.762685236295699e-05, "loss": 0.14, "step": 13447 }, { "epoch": 4.111280953836747, "grad_norm": 0.37170127034187317, "learning_rate": 2.7626427752537048e-05, "loss": 0.1183, "step": 13448 }, { "epoch": 4.111586670742892, "grad_norm": 1.0295515060424805, "learning_rate": 2.762600314211711e-05, "loss": 0.1782, "step": 13449 }, { "epoch": 4.111892387649037, "grad_norm": 1.6675890684127808, "learning_rate": 2.762557853169717e-05, "loss": 0.1778, "step": 13450 }, { "epoch": 4.112198104555182, "grad_norm": 0.6958795189857483, "learning_rate": 2.762515392127723e-05, "loss": 0.1771, "step": 13451 }, { "epoch": 4.112503821461327, "grad_norm": 0.9432625770568848, "learning_rate": 2.762472931085729e-05, "loss": 0.1939, "step": 13452 }, { "epoch": 4.112809538367472, "grad_norm": 0.6263432502746582, "learning_rate": 2.762430470043735e-05, "loss": 0.2276, "step": 13453 }, { "epoch": 4.113115255273617, "grad_norm": 0.8888636231422424, "learning_rate": 2.762388009001741e-05, "loss": 0.2023, "step": 13454 }, { "epoch": 4.113420972179761, "grad_norm": 1.1288386583328247, "learning_rate": 2.7623455479597472e-05, "loss": 0.2056, "step": 13455 }, { "epoch": 4.113726689085906, "grad_norm": 1.3009971380233765, "learning_rate": 2.762303086917753e-05, "loss": 0.1872, "step": 13456 }, { "epoch": 4.1140324059920514, "grad_norm": 1.346588373184204, "learning_rate": 2.7622606258757593e-05, "loss": 0.2023, "step": 13457 }, { "epoch": 4.114338122898197, "grad_norm": 1.5278897285461426, "learning_rate": 2.7622181648337652e-05, "loss": 0.2117, "step": 13458 }, { "epoch": 4.114643839804341, "grad_norm": 1.724365234375, "learning_rate": 2.762175703791771e-05, "loss": 0.2299, "step": 13459 }, { "epoch": 4.114949556710486, "grad_norm": 0.4421626031398773, "learning_rate": 2.7621332427497773e-05, "loss": 0.1391, "step": 13460 }, { "epoch": 4.115255273616631, "grad_norm": 0.23736105859279633, "learning_rate": 2.762090781707783e-05, "loss": 0.0747, "step": 13461 }, { "epoch": 4.115560990522776, "grad_norm": 0.28268179297447205, "learning_rate": 2.7620483206657893e-05, "loss": 0.0894, "step": 13462 }, { "epoch": 4.115866707428921, "grad_norm": 0.19742044806480408, "learning_rate": 2.7620058596237952e-05, "loss": 0.0616, "step": 13463 }, { "epoch": 4.1161724243350655, "grad_norm": 0.3517466187477112, "learning_rate": 2.7619633985818014e-05, "loss": 0.0532, "step": 13464 }, { "epoch": 4.116478141241211, "grad_norm": 0.22188733518123627, "learning_rate": 2.7619209375398073e-05, "loss": 0.0693, "step": 13465 }, { "epoch": 4.116783858147356, "grad_norm": 0.38915005326271057, "learning_rate": 2.7618784764978135e-05, "loss": 0.0601, "step": 13466 }, { "epoch": 4.117089575053501, "grad_norm": 0.3091684579849243, "learning_rate": 2.7618360154558194e-05, "loss": 0.0932, "step": 13467 }, { "epoch": 4.117395291959645, "grad_norm": 0.2889281213283539, "learning_rate": 2.7617935544138256e-05, "loss": 0.081, "step": 13468 }, { "epoch": 4.11770100886579, "grad_norm": 0.249647855758667, "learning_rate": 2.7617510933718314e-05, "loss": 0.096, "step": 13469 }, { "epoch": 4.118006725771935, "grad_norm": 0.2809579372406006, "learning_rate": 2.7617086323298373e-05, "loss": 0.0819, "step": 13470 }, { "epoch": 4.11831244267808, "grad_norm": 0.434131383895874, "learning_rate": 2.7616661712878435e-05, "loss": 0.066, "step": 13471 }, { "epoch": 4.118618159584225, "grad_norm": 0.433318167924881, "learning_rate": 2.7616237102458494e-05, "loss": 0.0735, "step": 13472 }, { "epoch": 4.11892387649037, "grad_norm": 0.5489268898963928, "learning_rate": 2.7615812492038556e-05, "loss": 0.1298, "step": 13473 }, { "epoch": 4.119229593396515, "grad_norm": 0.5363247394561768, "learning_rate": 2.7615387881618615e-05, "loss": 0.1569, "step": 13474 }, { "epoch": 4.11953531030266, "grad_norm": 0.51360684633255, "learning_rate": 2.7614963271198677e-05, "loss": 0.1402, "step": 13475 }, { "epoch": 4.119841027208805, "grad_norm": 0.6054378151893616, "learning_rate": 2.7614538660778736e-05, "loss": 0.1667, "step": 13476 }, { "epoch": 4.120146744114949, "grad_norm": 0.6717725992202759, "learning_rate": 2.7614114050358798e-05, "loss": 0.1511, "step": 13477 }, { "epoch": 4.120452461021094, "grad_norm": 0.7889077067375183, "learning_rate": 2.7613689439938856e-05, "loss": 0.2122, "step": 13478 }, { "epoch": 4.1207581779272395, "grad_norm": 0.9043264389038086, "learning_rate": 2.761326482951892e-05, "loss": 0.1674, "step": 13479 }, { "epoch": 4.121063894833385, "grad_norm": 0.7722340226173401, "learning_rate": 2.7612840219098977e-05, "loss": 0.1743, "step": 13480 }, { "epoch": 4.121369611739529, "grad_norm": 1.3228894472122192, "learning_rate": 2.761241560867904e-05, "loss": 0.1721, "step": 13481 }, { "epoch": 4.121675328645674, "grad_norm": 1.0842416286468506, "learning_rate": 2.7611990998259098e-05, "loss": 0.2119, "step": 13482 }, { "epoch": 4.121981045551819, "grad_norm": 1.0924742221832275, "learning_rate": 2.7611566387839157e-05, "loss": 0.2575, "step": 13483 }, { "epoch": 4.122286762457964, "grad_norm": 1.336381435394287, "learning_rate": 2.761114177741922e-05, "loss": 0.2254, "step": 13484 }, { "epoch": 4.122592479364108, "grad_norm": 0.30920660495758057, "learning_rate": 2.7610717166999277e-05, "loss": 0.1295, "step": 13485 }, { "epoch": 4.122898196270254, "grad_norm": 0.3160986006259918, "learning_rate": 2.761029255657934e-05, "loss": 0.0672, "step": 13486 }, { "epoch": 4.123203913176399, "grad_norm": 0.2555776536464691, "learning_rate": 2.7609867946159398e-05, "loss": 0.0608, "step": 13487 }, { "epoch": 4.123509630082544, "grad_norm": 0.19614997506141663, "learning_rate": 2.760944333573946e-05, "loss": 0.0681, "step": 13488 }, { "epoch": 4.123815346988689, "grad_norm": 0.28097623586654663, "learning_rate": 2.760901872531952e-05, "loss": 0.0458, "step": 13489 }, { "epoch": 4.124121063894833, "grad_norm": 0.19429142773151398, "learning_rate": 2.760859411489958e-05, "loss": 0.0409, "step": 13490 }, { "epoch": 4.124426780800978, "grad_norm": 0.366815984249115, "learning_rate": 2.760816950447964e-05, "loss": 0.0514, "step": 13491 }, { "epoch": 4.124732497707123, "grad_norm": 0.4811936914920807, "learning_rate": 2.7607744894059702e-05, "loss": 0.05, "step": 13492 }, { "epoch": 4.1250382146132685, "grad_norm": 0.2812938988208771, "learning_rate": 2.760732028363976e-05, "loss": 0.0561, "step": 13493 }, { "epoch": 4.125343931519413, "grad_norm": 0.3837212026119232, "learning_rate": 2.7606895673219823e-05, "loss": 0.0682, "step": 13494 }, { "epoch": 4.125649648425558, "grad_norm": 0.5128830075263977, "learning_rate": 2.760647106279988e-05, "loss": 0.0865, "step": 13495 }, { "epoch": 4.125955365331703, "grad_norm": 0.3740437924861908, "learning_rate": 2.760604645237994e-05, "loss": 0.1004, "step": 13496 }, { "epoch": 4.126261082237848, "grad_norm": 0.5224207639694214, "learning_rate": 2.7605621841960002e-05, "loss": 0.1038, "step": 13497 }, { "epoch": 4.126566799143992, "grad_norm": 0.36339232325553894, "learning_rate": 2.760519723154006e-05, "loss": 0.1322, "step": 13498 }, { "epoch": 4.126872516050137, "grad_norm": 0.4319113790988922, "learning_rate": 2.7604772621120123e-05, "loss": 0.13, "step": 13499 }, { "epoch": 4.1271782329562825, "grad_norm": 0.832981288433075, "learning_rate": 2.760434801070018e-05, "loss": 0.1256, "step": 13500 }, { "epoch": 4.127483949862428, "grad_norm": 0.9416308999061584, "learning_rate": 2.7603923400280244e-05, "loss": 0.1657, "step": 13501 }, { "epoch": 4.127789666768573, "grad_norm": 0.6938762664794922, "learning_rate": 2.7603498789860302e-05, "loss": 0.1682, "step": 13502 }, { "epoch": 4.128095383674717, "grad_norm": 1.2498770952224731, "learning_rate": 2.7603074179440364e-05, "loss": 0.1629, "step": 13503 }, { "epoch": 4.128401100580862, "grad_norm": 0.49280574917793274, "learning_rate": 2.7602649569020423e-05, "loss": 0.1577, "step": 13504 }, { "epoch": 4.128706817487007, "grad_norm": 1.0527762174606323, "learning_rate": 2.7602224958600485e-05, "loss": 0.1721, "step": 13505 }, { "epoch": 4.129012534393152, "grad_norm": 0.8104509115219116, "learning_rate": 2.7601800348180544e-05, "loss": 0.1665, "step": 13506 }, { "epoch": 4.1293182512992965, "grad_norm": 0.7312922477722168, "learning_rate": 2.7601375737760606e-05, "loss": 0.1798, "step": 13507 }, { "epoch": 4.129623968205442, "grad_norm": 1.0854089260101318, "learning_rate": 2.7600951127340665e-05, "loss": 0.1951, "step": 13508 }, { "epoch": 4.129929685111587, "grad_norm": 3.805873394012451, "learning_rate": 2.7600526516920723e-05, "loss": 0.2211, "step": 13509 }, { "epoch": 4.130235402017732, "grad_norm": 0.4957030415534973, "learning_rate": 2.7600101906500786e-05, "loss": 0.1551, "step": 13510 }, { "epoch": 4.130541118923876, "grad_norm": 0.2276565581560135, "learning_rate": 2.7599677296080844e-05, "loss": 0.0762, "step": 13511 }, { "epoch": 4.130846835830021, "grad_norm": 0.24009278416633606, "learning_rate": 2.7599252685660906e-05, "loss": 0.0659, "step": 13512 }, { "epoch": 4.131152552736166, "grad_norm": 0.6577039957046509, "learning_rate": 2.7598828075240965e-05, "loss": 0.0534, "step": 13513 }, { "epoch": 4.131458269642311, "grad_norm": 0.44031113386154175, "learning_rate": 2.7598403464821027e-05, "loss": 0.0504, "step": 13514 }, { "epoch": 4.131763986548457, "grad_norm": 0.20504431426525116, "learning_rate": 2.7597978854401086e-05, "loss": 0.0483, "step": 13515 }, { "epoch": 4.132069703454601, "grad_norm": 0.31223833560943604, "learning_rate": 2.7597554243981148e-05, "loss": 0.0754, "step": 13516 }, { "epoch": 4.132375420360746, "grad_norm": 0.3277880549430847, "learning_rate": 2.7597129633561207e-05, "loss": 0.0742, "step": 13517 }, { "epoch": 4.132681137266891, "grad_norm": 0.30768656730651855, "learning_rate": 2.759670502314127e-05, "loss": 0.0774, "step": 13518 }, { "epoch": 4.132986854173036, "grad_norm": 0.4139818847179413, "learning_rate": 2.7596280412721327e-05, "loss": 0.0779, "step": 13519 }, { "epoch": 4.13329257107918, "grad_norm": 0.25706589221954346, "learning_rate": 2.759585580230139e-05, "loss": 0.0925, "step": 13520 }, { "epoch": 4.1335982879853255, "grad_norm": 0.404143750667572, "learning_rate": 2.7595431191881448e-05, "loss": 0.0798, "step": 13521 }, { "epoch": 4.133904004891471, "grad_norm": 0.273002952337265, "learning_rate": 2.7595006581461507e-05, "loss": 0.1051, "step": 13522 }, { "epoch": 4.134209721797616, "grad_norm": 0.486341267824173, "learning_rate": 2.759458197104157e-05, "loss": 0.1166, "step": 13523 }, { "epoch": 4.13451543870376, "grad_norm": 0.4229927659034729, "learning_rate": 2.7594157360621628e-05, "loss": 0.1394, "step": 13524 }, { "epoch": 4.134821155609905, "grad_norm": 0.4220741391181946, "learning_rate": 2.759373275020169e-05, "loss": 0.1399, "step": 13525 }, { "epoch": 4.13512687251605, "grad_norm": 0.43162477016448975, "learning_rate": 2.759330813978175e-05, "loss": 0.151, "step": 13526 }, { "epoch": 4.135432589422195, "grad_norm": 0.4982406795024872, "learning_rate": 2.759288352936181e-05, "loss": 0.1421, "step": 13527 }, { "epoch": 4.13573830632834, "grad_norm": 0.4916478991508484, "learning_rate": 2.759245891894187e-05, "loss": 0.1519, "step": 13528 }, { "epoch": 4.136044023234485, "grad_norm": 1.2923979759216309, "learning_rate": 2.759203430852193e-05, "loss": 0.1796, "step": 13529 }, { "epoch": 4.13634974014063, "grad_norm": 1.0402971506118774, "learning_rate": 2.759160969810199e-05, "loss": 0.2147, "step": 13530 }, { "epoch": 4.136655457046775, "grad_norm": 0.5450578331947327, "learning_rate": 2.7591185087682052e-05, "loss": 0.1776, "step": 13531 }, { "epoch": 4.13696117395292, "grad_norm": 1.1370272636413574, "learning_rate": 2.759076047726211e-05, "loss": 0.1573, "step": 13532 }, { "epoch": 4.137266890859064, "grad_norm": 0.9290776252746582, "learning_rate": 2.7590335866842173e-05, "loss": 0.207, "step": 13533 }, { "epoch": 4.137572607765209, "grad_norm": 1.5972968339920044, "learning_rate": 2.7589911256422235e-05, "loss": 0.2275, "step": 13534 }, { "epoch": 4.137878324671354, "grad_norm": 0.6147744059562683, "learning_rate": 2.7589486646002294e-05, "loss": 0.1454, "step": 13535 }, { "epoch": 4.1381840415774995, "grad_norm": 0.2670133411884308, "learning_rate": 2.7589062035582356e-05, "loss": 0.0799, "step": 13536 }, { "epoch": 4.138489758483644, "grad_norm": 0.4106846749782562, "learning_rate": 2.7588637425162414e-05, "loss": 0.0704, "step": 13537 }, { "epoch": 4.138795475389789, "grad_norm": 0.21746967732906342, "learning_rate": 2.7588212814742477e-05, "loss": 0.0407, "step": 13538 }, { "epoch": 4.139101192295934, "grad_norm": 0.3691830337047577, "learning_rate": 2.7587788204322535e-05, "loss": 0.0507, "step": 13539 }, { "epoch": 4.139406909202079, "grad_norm": 0.30988118052482605, "learning_rate": 2.7587363593902597e-05, "loss": 0.0643, "step": 13540 }, { "epoch": 4.139712626108224, "grad_norm": 0.8032299876213074, "learning_rate": 2.7586938983482656e-05, "loss": 0.0673, "step": 13541 }, { "epoch": 4.140018343014368, "grad_norm": 0.5628619194030762, "learning_rate": 2.7586514373062718e-05, "loss": 0.0607, "step": 13542 }, { "epoch": 4.1403240599205136, "grad_norm": 0.40600425004959106, "learning_rate": 2.7586089762642777e-05, "loss": 0.0748, "step": 13543 }, { "epoch": 4.140629776826659, "grad_norm": 0.34027203917503357, "learning_rate": 2.758566515222284e-05, "loss": 0.078, "step": 13544 }, { "epoch": 4.140935493732804, "grad_norm": 0.26626336574554443, "learning_rate": 2.7585240541802898e-05, "loss": 0.0802, "step": 13545 }, { "epoch": 4.141241210638948, "grad_norm": 0.254874587059021, "learning_rate": 2.758481593138296e-05, "loss": 0.0888, "step": 13546 }, { "epoch": 4.141546927545093, "grad_norm": 0.38677746057510376, "learning_rate": 2.758439132096302e-05, "loss": 0.0909, "step": 13547 }, { "epoch": 4.141852644451238, "grad_norm": 0.2579224407672882, "learning_rate": 2.7583966710543077e-05, "loss": 0.1155, "step": 13548 }, { "epoch": 4.142158361357383, "grad_norm": 0.4525660276412964, "learning_rate": 2.758354210012314e-05, "loss": 0.1255, "step": 13549 }, { "epoch": 4.142464078263528, "grad_norm": 0.40845730900764465, "learning_rate": 2.7583117489703198e-05, "loss": 0.1535, "step": 13550 }, { "epoch": 4.142769795169673, "grad_norm": 0.5199663639068604, "learning_rate": 2.758269287928326e-05, "loss": 0.1733, "step": 13551 }, { "epoch": 4.143075512075818, "grad_norm": 0.5446755886077881, "learning_rate": 2.758226826886332e-05, "loss": 0.1699, "step": 13552 }, { "epoch": 4.143381228981963, "grad_norm": 0.6477108001708984, "learning_rate": 2.758184365844338e-05, "loss": 0.1675, "step": 13553 }, { "epoch": 4.143686945888108, "grad_norm": 0.683307409286499, "learning_rate": 2.758141904802344e-05, "loss": 0.2043, "step": 13554 }, { "epoch": 4.143992662794252, "grad_norm": 0.8036741614341736, "learning_rate": 2.75809944376035e-05, "loss": 0.1798, "step": 13555 }, { "epoch": 4.144298379700397, "grad_norm": 1.1697734594345093, "learning_rate": 2.758056982718356e-05, "loss": 0.2342, "step": 13556 }, { "epoch": 4.1446040966065425, "grad_norm": 0.7012492418289185, "learning_rate": 2.7580145216763622e-05, "loss": 0.2025, "step": 13557 }, { "epoch": 4.144909813512688, "grad_norm": 5.989058494567871, "learning_rate": 2.757972060634368e-05, "loss": 0.2228, "step": 13558 }, { "epoch": 4.145215530418832, "grad_norm": 1.200129508972168, "learning_rate": 2.7579295995923743e-05, "loss": 0.2077, "step": 13559 }, { "epoch": 4.145521247324977, "grad_norm": 0.5042074918746948, "learning_rate": 2.7578871385503802e-05, "loss": 0.1579, "step": 13560 }, { "epoch": 4.145826964231122, "grad_norm": 0.37576285004615784, "learning_rate": 2.757844677508386e-05, "loss": 0.0703, "step": 13561 }, { "epoch": 4.146132681137267, "grad_norm": 0.30053287744522095, "learning_rate": 2.7578022164663923e-05, "loss": 0.0586, "step": 13562 }, { "epoch": 4.146438398043411, "grad_norm": 0.24985966086387634, "learning_rate": 2.757759755424398e-05, "loss": 0.0406, "step": 13563 }, { "epoch": 4.1467441149495565, "grad_norm": 0.3449644148349762, "learning_rate": 2.7577172943824043e-05, "loss": 0.0619, "step": 13564 }, { "epoch": 4.147049831855702, "grad_norm": 0.17421963810920715, "learning_rate": 2.7576748333404102e-05, "loss": 0.05, "step": 13565 }, { "epoch": 4.147355548761847, "grad_norm": 0.3342324495315552, "learning_rate": 2.7576323722984164e-05, "loss": 0.0577, "step": 13566 }, { "epoch": 4.147661265667992, "grad_norm": 0.46623915433883667, "learning_rate": 2.7575899112564223e-05, "loss": 0.0789, "step": 13567 }, { "epoch": 4.147966982574136, "grad_norm": 0.2799883186817169, "learning_rate": 2.7575474502144285e-05, "loss": 0.0654, "step": 13568 }, { "epoch": 4.148272699480281, "grad_norm": 0.5394760966300964, "learning_rate": 2.7575049891724344e-05, "loss": 0.0677, "step": 13569 }, { "epoch": 4.148578416386426, "grad_norm": 0.5418713092803955, "learning_rate": 2.7574625281304406e-05, "loss": 0.0999, "step": 13570 }, { "epoch": 4.148884133292571, "grad_norm": 0.41552817821502686, "learning_rate": 2.7574200670884464e-05, "loss": 0.0987, "step": 13571 }, { "epoch": 4.149189850198716, "grad_norm": 0.6639132499694824, "learning_rate": 2.7573776060464527e-05, "loss": 0.089, "step": 13572 }, { "epoch": 4.149495567104861, "grad_norm": 0.8495844006538391, "learning_rate": 2.7573351450044585e-05, "loss": 0.102, "step": 13573 }, { "epoch": 4.149801284011006, "grad_norm": 0.40802040696144104, "learning_rate": 2.7572926839624644e-05, "loss": 0.0946, "step": 13574 }, { "epoch": 4.150107000917151, "grad_norm": 0.3492819368839264, "learning_rate": 2.7572502229204706e-05, "loss": 0.1497, "step": 13575 }, { "epoch": 4.150412717823295, "grad_norm": 0.4138652980327606, "learning_rate": 2.7572077618784765e-05, "loss": 0.1191, "step": 13576 }, { "epoch": 4.15071843472944, "grad_norm": 0.64557284116745, "learning_rate": 2.7571653008364827e-05, "loss": 0.1592, "step": 13577 }, { "epoch": 4.1510241516355855, "grad_norm": 0.5272688865661621, "learning_rate": 2.7571228397944886e-05, "loss": 0.1427, "step": 13578 }, { "epoch": 4.151329868541731, "grad_norm": 0.4682074189186096, "learning_rate": 2.7570803787524948e-05, "loss": 0.1575, "step": 13579 }, { "epoch": 4.151635585447876, "grad_norm": 1.3708806037902832, "learning_rate": 2.7570379177105006e-05, "loss": 0.1782, "step": 13580 }, { "epoch": 4.15194130235402, "grad_norm": 0.9650365710258484, "learning_rate": 2.756995456668507e-05, "loss": 0.1658, "step": 13581 }, { "epoch": 4.152247019260165, "grad_norm": 0.6926095485687256, "learning_rate": 2.7569529956265127e-05, "loss": 0.2244, "step": 13582 }, { "epoch": 4.15255273616631, "grad_norm": 0.9414966106414795, "learning_rate": 2.756910534584519e-05, "loss": 0.2034, "step": 13583 }, { "epoch": 4.152858453072455, "grad_norm": 1.6602129936218262, "learning_rate": 2.7568680735425248e-05, "loss": 0.2266, "step": 13584 }, { "epoch": 4.1531641699785995, "grad_norm": 0.40350502729415894, "learning_rate": 2.7568256125005307e-05, "loss": 0.1609, "step": 13585 }, { "epoch": 4.153469886884745, "grad_norm": 0.42082032561302185, "learning_rate": 2.756783151458537e-05, "loss": 0.0635, "step": 13586 }, { "epoch": 4.15377560379089, "grad_norm": 0.22125457227230072, "learning_rate": 2.7567406904165427e-05, "loss": 0.057, "step": 13587 }, { "epoch": 4.154081320697035, "grad_norm": 0.3215661644935608, "learning_rate": 2.756698229374549e-05, "loss": 0.0737, "step": 13588 }, { "epoch": 4.154387037603179, "grad_norm": 0.24371010065078735, "learning_rate": 2.7566557683325548e-05, "loss": 0.0556, "step": 13589 }, { "epoch": 4.154692754509324, "grad_norm": 0.23737861216068268, "learning_rate": 2.756613307290561e-05, "loss": 0.0603, "step": 13590 }, { "epoch": 4.154998471415469, "grad_norm": 0.24669504165649414, "learning_rate": 2.756570846248567e-05, "loss": 0.045, "step": 13591 }, { "epoch": 4.155304188321614, "grad_norm": 0.2248280644416809, "learning_rate": 2.756528385206573e-05, "loss": 0.0628, "step": 13592 }, { "epoch": 4.1556099052277595, "grad_norm": 0.38748598098754883, "learning_rate": 2.756485924164579e-05, "loss": 0.1065, "step": 13593 }, { "epoch": 4.155915622133904, "grad_norm": 0.4265848994255066, "learning_rate": 2.7564434631225852e-05, "loss": 0.0728, "step": 13594 }, { "epoch": 4.156221339040049, "grad_norm": 0.25884440541267395, "learning_rate": 2.756401002080591e-05, "loss": 0.0815, "step": 13595 }, { "epoch": 4.156527055946194, "grad_norm": 0.5558943748474121, "learning_rate": 2.7563585410385973e-05, "loss": 0.1243, "step": 13596 }, { "epoch": 4.156832772852339, "grad_norm": 0.5212362408638, "learning_rate": 2.756316079996603e-05, "loss": 0.0642, "step": 13597 }, { "epoch": 4.157138489758483, "grad_norm": 0.31959566473960876, "learning_rate": 2.756273618954609e-05, "loss": 0.1021, "step": 13598 }, { "epoch": 4.157444206664628, "grad_norm": 0.582105815410614, "learning_rate": 2.7562311579126152e-05, "loss": 0.1422, "step": 13599 }, { "epoch": 4.1577499235707736, "grad_norm": 0.44287171959877014, "learning_rate": 2.756188696870621e-05, "loss": 0.1257, "step": 13600 }, { "epoch": 4.158055640476919, "grad_norm": 0.7954109907150269, "learning_rate": 2.7561462358286273e-05, "loss": 0.1622, "step": 13601 }, { "epoch": 4.158361357383063, "grad_norm": 0.5178627371788025, "learning_rate": 2.756103774786633e-05, "loss": 0.1792, "step": 13602 }, { "epoch": 4.158667074289208, "grad_norm": 2.953568458557129, "learning_rate": 2.7560613137446394e-05, "loss": 0.19, "step": 13603 }, { "epoch": 4.158972791195353, "grad_norm": 0.5061753392219543, "learning_rate": 2.7560188527026452e-05, "loss": 0.1605, "step": 13604 }, { "epoch": 4.159278508101498, "grad_norm": 0.7890948057174683, "learning_rate": 2.7559763916606514e-05, "loss": 0.1699, "step": 13605 }, { "epoch": 4.159584225007643, "grad_norm": 3.4891161918640137, "learning_rate": 2.7559339306186573e-05, "loss": 0.1777, "step": 13606 }, { "epoch": 4.159889941913788, "grad_norm": 0.7543187737464905, "learning_rate": 2.7558914695766635e-05, "loss": 0.1717, "step": 13607 }, { "epoch": 4.160195658819933, "grad_norm": 0.8775588274002075, "learning_rate": 2.7558490085346694e-05, "loss": 0.2131, "step": 13608 }, { "epoch": 4.160501375726078, "grad_norm": 1.1491801738739014, "learning_rate": 2.7558065474926756e-05, "loss": 0.2541, "step": 13609 }, { "epoch": 4.160807092632223, "grad_norm": 0.2819606065750122, "learning_rate": 2.7557640864506815e-05, "loss": 0.1213, "step": 13610 }, { "epoch": 4.161112809538367, "grad_norm": 0.2075568288564682, "learning_rate": 2.7557216254086873e-05, "loss": 0.0695, "step": 13611 }, { "epoch": 4.161418526444512, "grad_norm": 0.518045961856842, "learning_rate": 2.7556791643666936e-05, "loss": 0.1027, "step": 13612 }, { "epoch": 4.161724243350657, "grad_norm": 0.22626973688602448, "learning_rate": 2.7556367033246994e-05, "loss": 0.0722, "step": 13613 }, { "epoch": 4.1620299602568025, "grad_norm": 0.36540326476097107, "learning_rate": 2.7555942422827056e-05, "loss": 0.0415, "step": 13614 }, { "epoch": 4.162335677162947, "grad_norm": 0.2553619146347046, "learning_rate": 2.7555517812407115e-05, "loss": 0.0608, "step": 13615 }, { "epoch": 4.162641394069092, "grad_norm": 0.26952242851257324, "learning_rate": 2.7555093201987177e-05, "loss": 0.0623, "step": 13616 }, { "epoch": 4.162947110975237, "grad_norm": 0.2477789968252182, "learning_rate": 2.7554668591567236e-05, "loss": 0.0821, "step": 13617 }, { "epoch": 4.163252827881382, "grad_norm": 0.2966485321521759, "learning_rate": 2.7554243981147298e-05, "loss": 0.0566, "step": 13618 }, { "epoch": 4.163558544787527, "grad_norm": 0.29740071296691895, "learning_rate": 2.7553819370727357e-05, "loss": 0.0647, "step": 13619 }, { "epoch": 4.163864261693671, "grad_norm": 0.36662808060646057, "learning_rate": 2.755339476030742e-05, "loss": 0.1307, "step": 13620 }, { "epoch": 4.1641699785998165, "grad_norm": 0.5040295124053955, "learning_rate": 2.7552970149887477e-05, "loss": 0.0688, "step": 13621 }, { "epoch": 4.164475695505962, "grad_norm": 0.36477625370025635, "learning_rate": 2.755254553946754e-05, "loss": 0.0798, "step": 13622 }, { "epoch": 4.164781412412107, "grad_norm": 0.5290716290473938, "learning_rate": 2.7552120929047598e-05, "loss": 0.1072, "step": 13623 }, { "epoch": 4.165087129318251, "grad_norm": 0.4328950047492981, "learning_rate": 2.7551696318627657e-05, "loss": 0.1003, "step": 13624 }, { "epoch": 4.165392846224396, "grad_norm": 0.6340389251708984, "learning_rate": 2.755127170820772e-05, "loss": 0.1645, "step": 13625 }, { "epoch": 4.165698563130541, "grad_norm": 0.7424623966217041, "learning_rate": 2.7550847097787778e-05, "loss": 0.1545, "step": 13626 }, { "epoch": 4.166004280036686, "grad_norm": 0.6656714677810669, "learning_rate": 2.755042248736784e-05, "loss": 0.1392, "step": 13627 }, { "epoch": 4.1663099969428306, "grad_norm": 0.9148986339569092, "learning_rate": 2.75499978769479e-05, "loss": 0.1661, "step": 13628 }, { "epoch": 4.166615713848976, "grad_norm": 0.5911037921905518, "learning_rate": 2.754957326652796e-05, "loss": 0.2169, "step": 13629 }, { "epoch": 4.166921430755121, "grad_norm": 0.9484231472015381, "learning_rate": 2.754914865610802e-05, "loss": 0.196, "step": 13630 }, { "epoch": 4.167227147661266, "grad_norm": 0.7973124384880066, "learning_rate": 2.754872404568808e-05, "loss": 0.1488, "step": 13631 }, { "epoch": 4.167532864567411, "grad_norm": 0.6668093204498291, "learning_rate": 2.754829943526814e-05, "loss": 0.1733, "step": 13632 }, { "epoch": 4.167838581473555, "grad_norm": 0.9055975079536438, "learning_rate": 2.7547874824848202e-05, "loss": 0.1965, "step": 13633 }, { "epoch": 4.1681442983797, "grad_norm": 2.2767136096954346, "learning_rate": 2.754745021442826e-05, "loss": 0.2257, "step": 13634 }, { "epoch": 4.1684500152858455, "grad_norm": 0.8835585713386536, "learning_rate": 2.7547025604008323e-05, "loss": 0.1424, "step": 13635 }, { "epoch": 4.168755732191991, "grad_norm": 0.7655414938926697, "learning_rate": 2.7546600993588385e-05, "loss": 0.0654, "step": 13636 }, { "epoch": 4.169061449098135, "grad_norm": 0.2782325744628906, "learning_rate": 2.7546176383168444e-05, "loss": 0.0795, "step": 13637 }, { "epoch": 4.16936716600428, "grad_norm": 0.29219892621040344, "learning_rate": 2.7545751772748506e-05, "loss": 0.0566, "step": 13638 }, { "epoch": 4.169672882910425, "grad_norm": 0.25291022658348083, "learning_rate": 2.7545327162328565e-05, "loss": 0.0526, "step": 13639 }, { "epoch": 4.16997859981657, "grad_norm": 0.34402185678482056, "learning_rate": 2.7544902551908627e-05, "loss": 0.0578, "step": 13640 }, { "epoch": 4.170284316722714, "grad_norm": 0.23822307586669922, "learning_rate": 2.7544477941488685e-05, "loss": 0.0532, "step": 13641 }, { "epoch": 4.1705900336288595, "grad_norm": 0.21404027938842773, "learning_rate": 2.7544053331068747e-05, "loss": 0.0598, "step": 13642 }, { "epoch": 4.170895750535005, "grad_norm": 0.6928902864456177, "learning_rate": 2.7543628720648806e-05, "loss": 0.0758, "step": 13643 }, { "epoch": 4.17120146744115, "grad_norm": 0.2386694848537445, "learning_rate": 2.7543204110228868e-05, "loss": 0.0793, "step": 13644 }, { "epoch": 4.171507184347295, "grad_norm": 0.5258467793464661, "learning_rate": 2.7542779499808927e-05, "loss": 0.1177, "step": 13645 }, { "epoch": 4.171812901253439, "grad_norm": 0.41446709632873535, "learning_rate": 2.754235488938899e-05, "loss": 0.0955, "step": 13646 }, { "epoch": 4.172118618159584, "grad_norm": 0.27436861395835876, "learning_rate": 2.7541930278969048e-05, "loss": 0.0738, "step": 13647 }, { "epoch": 4.172424335065729, "grad_norm": 0.36093226075172424, "learning_rate": 2.754150566854911e-05, "loss": 0.1318, "step": 13648 }, { "epoch": 4.172730051971874, "grad_norm": 0.3950074017047882, "learning_rate": 2.754108105812917e-05, "loss": 0.1651, "step": 13649 }, { "epoch": 4.173035768878019, "grad_norm": 1.016940951347351, "learning_rate": 2.7540656447709227e-05, "loss": 0.1471, "step": 13650 }, { "epoch": 4.173341485784164, "grad_norm": 0.6941959261894226, "learning_rate": 2.754023183728929e-05, "loss": 0.1648, "step": 13651 }, { "epoch": 4.173647202690309, "grad_norm": 0.5001826882362366, "learning_rate": 2.7539807226869348e-05, "loss": 0.1528, "step": 13652 }, { "epoch": 4.173952919596454, "grad_norm": 0.9830875396728516, "learning_rate": 2.753938261644941e-05, "loss": 0.173, "step": 13653 }, { "epoch": 4.174258636502598, "grad_norm": 1.0358870029449463, "learning_rate": 2.753895800602947e-05, "loss": 0.172, "step": 13654 }, { "epoch": 4.174564353408743, "grad_norm": 1.5038286447525024, "learning_rate": 2.753853339560953e-05, "loss": 0.1804, "step": 13655 }, { "epoch": 4.174870070314888, "grad_norm": 0.5542459487915039, "learning_rate": 2.753810878518959e-05, "loss": 0.1705, "step": 13656 }, { "epoch": 4.1751757872210336, "grad_norm": 1.3621718883514404, "learning_rate": 2.753768417476965e-05, "loss": 0.2123, "step": 13657 }, { "epoch": 4.175481504127179, "grad_norm": 0.774776816368103, "learning_rate": 2.753725956434971e-05, "loss": 0.1684, "step": 13658 }, { "epoch": 4.175787221033323, "grad_norm": 0.9957660436630249, "learning_rate": 2.7536834953929772e-05, "loss": 0.2211, "step": 13659 }, { "epoch": 4.176092937939468, "grad_norm": 0.24926500022411346, "learning_rate": 2.753641034350983e-05, "loss": 0.1591, "step": 13660 }, { "epoch": 4.176398654845613, "grad_norm": 0.2648554742336273, "learning_rate": 2.7535985733089893e-05, "loss": 0.0945, "step": 13661 }, { "epoch": 4.176704371751758, "grad_norm": 0.6556075811386108, "learning_rate": 2.7535561122669952e-05, "loss": 0.0699, "step": 13662 }, { "epoch": 4.1770100886579025, "grad_norm": 0.22760224342346191, "learning_rate": 2.753513651225001e-05, "loss": 0.0616, "step": 13663 }, { "epoch": 4.177315805564048, "grad_norm": 0.2605651021003723, "learning_rate": 2.7534711901830073e-05, "loss": 0.0697, "step": 13664 }, { "epoch": 4.177621522470193, "grad_norm": 0.5001703500747681, "learning_rate": 2.753428729141013e-05, "loss": 0.0853, "step": 13665 }, { "epoch": 4.177927239376338, "grad_norm": 0.3141132891178131, "learning_rate": 2.7533862680990193e-05, "loss": 0.0986, "step": 13666 }, { "epoch": 4.178232956282482, "grad_norm": 0.321996808052063, "learning_rate": 2.7533438070570252e-05, "loss": 0.05, "step": 13667 }, { "epoch": 4.178538673188627, "grad_norm": 0.33736541867256165, "learning_rate": 2.7533013460150314e-05, "loss": 0.101, "step": 13668 }, { "epoch": 4.178844390094772, "grad_norm": 0.5877682566642761, "learning_rate": 2.7532588849730373e-05, "loss": 0.0609, "step": 13669 }, { "epoch": 4.179150107000917, "grad_norm": 0.37330934405326843, "learning_rate": 2.7532164239310435e-05, "loss": 0.0726, "step": 13670 }, { "epoch": 4.1794558239070625, "grad_norm": 0.43524232506752014, "learning_rate": 2.7531739628890494e-05, "loss": 0.088, "step": 13671 }, { "epoch": 4.179761540813207, "grad_norm": 0.7664813995361328, "learning_rate": 2.7531315018470556e-05, "loss": 0.132, "step": 13672 }, { "epoch": 4.180067257719352, "grad_norm": 0.47295111417770386, "learning_rate": 2.7530890408050615e-05, "loss": 0.1108, "step": 13673 }, { "epoch": 4.180372974625497, "grad_norm": 0.43278253078460693, "learning_rate": 2.7530465797630677e-05, "loss": 0.0944, "step": 13674 }, { "epoch": 4.180678691531642, "grad_norm": 0.3325393795967102, "learning_rate": 2.7530041187210735e-05, "loss": 0.1254, "step": 13675 }, { "epoch": 4.180984408437786, "grad_norm": 0.5910556316375732, "learning_rate": 2.7529616576790794e-05, "loss": 0.1343, "step": 13676 }, { "epoch": 4.181290125343931, "grad_norm": 0.78619784116745, "learning_rate": 2.7529191966370856e-05, "loss": 0.1914, "step": 13677 }, { "epoch": 4.1815958422500765, "grad_norm": 0.4871615767478943, "learning_rate": 2.7528767355950915e-05, "loss": 0.1408, "step": 13678 }, { "epoch": 4.181901559156222, "grad_norm": 0.8681051731109619, "learning_rate": 2.7528342745530977e-05, "loss": 0.2019, "step": 13679 }, { "epoch": 4.182207276062366, "grad_norm": 0.9944502115249634, "learning_rate": 2.7527918135111036e-05, "loss": 0.1936, "step": 13680 }, { "epoch": 4.182512992968511, "grad_norm": 0.8964234590530396, "learning_rate": 2.7527493524691098e-05, "loss": 0.2242, "step": 13681 }, { "epoch": 4.182818709874656, "grad_norm": 0.7371673583984375, "learning_rate": 2.7527068914271156e-05, "loss": 0.2062, "step": 13682 }, { "epoch": 4.183124426780801, "grad_norm": 1.029115080833435, "learning_rate": 2.752664430385122e-05, "loss": 0.2236, "step": 13683 }, { "epoch": 4.183430143686946, "grad_norm": 1.9322454929351807, "learning_rate": 2.7526219693431277e-05, "loss": 0.2325, "step": 13684 }, { "epoch": 4.1837358605930905, "grad_norm": 0.344448447227478, "learning_rate": 2.752579508301134e-05, "loss": 0.1532, "step": 13685 }, { "epoch": 4.184041577499236, "grad_norm": 0.2625230550765991, "learning_rate": 2.7525370472591398e-05, "loss": 0.0736, "step": 13686 }, { "epoch": 4.184347294405381, "grad_norm": 0.6638964414596558, "learning_rate": 2.752494586217146e-05, "loss": 0.0749, "step": 13687 }, { "epoch": 4.184653011311526, "grad_norm": 0.23204657435417175, "learning_rate": 2.752452125175152e-05, "loss": 0.0475, "step": 13688 }, { "epoch": 4.18495872821767, "grad_norm": 0.31941676139831543, "learning_rate": 2.7524096641331577e-05, "loss": 0.0796, "step": 13689 }, { "epoch": 4.185264445123815, "grad_norm": 0.4776177406311035, "learning_rate": 2.752367203091164e-05, "loss": 0.0664, "step": 13690 }, { "epoch": 4.18557016202996, "grad_norm": 0.8182060718536377, "learning_rate": 2.7523247420491698e-05, "loss": 0.0666, "step": 13691 }, { "epoch": 4.1858758789361055, "grad_norm": 0.26128584146499634, "learning_rate": 2.752282281007176e-05, "loss": 0.0696, "step": 13692 }, { "epoch": 4.18618159584225, "grad_norm": 0.3828282356262207, "learning_rate": 2.752239819965182e-05, "loss": 0.0729, "step": 13693 }, { "epoch": 4.186487312748395, "grad_norm": 0.202840656042099, "learning_rate": 2.752197358923188e-05, "loss": 0.0552, "step": 13694 }, { "epoch": 4.18679302965454, "grad_norm": 0.27018314599990845, "learning_rate": 2.752154897881194e-05, "loss": 0.081, "step": 13695 }, { "epoch": 4.187098746560685, "grad_norm": 0.6800670027732849, "learning_rate": 2.7521124368392002e-05, "loss": 0.0892, "step": 13696 }, { "epoch": 4.18740446346683, "grad_norm": 0.5062950253486633, "learning_rate": 2.752069975797206e-05, "loss": 0.117, "step": 13697 }, { "epoch": 4.187710180372974, "grad_norm": 0.41888993978500366, "learning_rate": 2.7520275147552123e-05, "loss": 0.1087, "step": 13698 }, { "epoch": 4.1880158972791195, "grad_norm": 0.7992141842842102, "learning_rate": 2.751985053713218e-05, "loss": 0.1539, "step": 13699 }, { "epoch": 4.188321614185265, "grad_norm": 1.2052396535873413, "learning_rate": 2.751942592671224e-05, "loss": 0.1478, "step": 13700 }, { "epoch": 4.18862733109141, "grad_norm": 0.5735210180282593, "learning_rate": 2.7519001316292302e-05, "loss": 0.1475, "step": 13701 }, { "epoch": 4.188933047997554, "grad_norm": 0.576945960521698, "learning_rate": 2.751857670587236e-05, "loss": 0.1442, "step": 13702 }, { "epoch": 4.189238764903699, "grad_norm": 0.47089684009552, "learning_rate": 2.7518152095452423e-05, "loss": 0.1541, "step": 13703 }, { "epoch": 4.189544481809844, "grad_norm": 1.479016900062561, "learning_rate": 2.751772748503248e-05, "loss": 0.1919, "step": 13704 }, { "epoch": 4.189850198715989, "grad_norm": 1.299822211265564, "learning_rate": 2.7517302874612544e-05, "loss": 0.1879, "step": 13705 }, { "epoch": 4.1901559156221335, "grad_norm": 0.911543071269989, "learning_rate": 2.7516878264192602e-05, "loss": 0.167, "step": 13706 }, { "epoch": 4.190461632528279, "grad_norm": 1.4794093370437622, "learning_rate": 2.7516453653772665e-05, "loss": 0.2194, "step": 13707 }, { "epoch": 4.190767349434424, "grad_norm": 1.060352087020874, "learning_rate": 2.7516029043352723e-05, "loss": 0.2493, "step": 13708 }, { "epoch": 4.191073066340569, "grad_norm": 0.982745885848999, "learning_rate": 2.7515604432932785e-05, "loss": 0.2453, "step": 13709 }, { "epoch": 4.191378783246714, "grad_norm": 0.5419291257858276, "learning_rate": 2.7515179822512844e-05, "loss": 0.1707, "step": 13710 }, { "epoch": 4.191684500152858, "grad_norm": 0.23354953527450562, "learning_rate": 2.7514755212092906e-05, "loss": 0.0657, "step": 13711 }, { "epoch": 4.191990217059003, "grad_norm": 0.43485745787620544, "learning_rate": 2.7514330601672965e-05, "loss": 0.0942, "step": 13712 }, { "epoch": 4.192295933965148, "grad_norm": 0.3912777304649353, "learning_rate": 2.7513905991253023e-05, "loss": 0.0549, "step": 13713 }, { "epoch": 4.1926016508712936, "grad_norm": 0.2281094491481781, "learning_rate": 2.7513481380833086e-05, "loss": 0.0442, "step": 13714 }, { "epoch": 4.192907367777438, "grad_norm": 0.543449342250824, "learning_rate": 2.7513056770413144e-05, "loss": 0.075, "step": 13715 }, { "epoch": 4.193213084683583, "grad_norm": 0.19872912764549255, "learning_rate": 2.7512632159993206e-05, "loss": 0.0599, "step": 13716 }, { "epoch": 4.193518801589728, "grad_norm": 0.3730262219905853, "learning_rate": 2.7512207549573265e-05, "loss": 0.0671, "step": 13717 }, { "epoch": 4.193824518495873, "grad_norm": 0.9778617024421692, "learning_rate": 2.7511782939153327e-05, "loss": 0.0908, "step": 13718 }, { "epoch": 4.194130235402017, "grad_norm": 1.2817440032958984, "learning_rate": 2.7511358328733386e-05, "loss": 0.0825, "step": 13719 }, { "epoch": 4.1944359523081625, "grad_norm": 0.2737765312194824, "learning_rate": 2.7510933718313448e-05, "loss": 0.083, "step": 13720 }, { "epoch": 4.194741669214308, "grad_norm": 0.3809559643268585, "learning_rate": 2.7510509107893507e-05, "loss": 0.1088, "step": 13721 }, { "epoch": 4.195047386120453, "grad_norm": 0.44486045837402344, "learning_rate": 2.751008449747357e-05, "loss": 0.1082, "step": 13722 }, { "epoch": 4.195353103026598, "grad_norm": 0.5720708966255188, "learning_rate": 2.7509659887053627e-05, "loss": 0.1252, "step": 13723 }, { "epoch": 4.195658819932742, "grad_norm": 0.7673816680908203, "learning_rate": 2.750923527663369e-05, "loss": 0.1289, "step": 13724 }, { "epoch": 4.195964536838887, "grad_norm": 0.5045298337936401, "learning_rate": 2.7508810666213748e-05, "loss": 0.159, "step": 13725 }, { "epoch": 4.196270253745032, "grad_norm": 2.966996669769287, "learning_rate": 2.7508386055793807e-05, "loss": 0.1562, "step": 13726 }, { "epoch": 4.196575970651177, "grad_norm": 1.1346765756607056, "learning_rate": 2.750796144537387e-05, "loss": 0.2026, "step": 13727 }, { "epoch": 4.196881687557322, "grad_norm": 1.0485683679580688, "learning_rate": 2.7507536834953928e-05, "loss": 0.1605, "step": 13728 }, { "epoch": 4.197187404463467, "grad_norm": 0.6726252436637878, "learning_rate": 2.750711222453399e-05, "loss": 0.1565, "step": 13729 }, { "epoch": 4.197493121369612, "grad_norm": 0.7929691076278687, "learning_rate": 2.750668761411405e-05, "loss": 0.1801, "step": 13730 }, { "epoch": 4.197798838275757, "grad_norm": 0.9206365346908569, "learning_rate": 2.750626300369411e-05, "loss": 0.1837, "step": 13731 }, { "epoch": 4.198104555181901, "grad_norm": 1.3431751728057861, "learning_rate": 2.750583839327417e-05, "loss": 0.2049, "step": 13732 }, { "epoch": 4.198410272088046, "grad_norm": 1.125525712966919, "learning_rate": 2.750541378285423e-05, "loss": 0.2015, "step": 13733 }, { "epoch": 4.198715988994191, "grad_norm": 1.3718639612197876, "learning_rate": 2.750498917243429e-05, "loss": 0.2987, "step": 13734 }, { "epoch": 4.1990217059003365, "grad_norm": 0.3926018476486206, "learning_rate": 2.7504564562014352e-05, "loss": 0.1343, "step": 13735 }, { "epoch": 4.199327422806482, "grad_norm": 0.2784746587276459, "learning_rate": 2.750413995159441e-05, "loss": 0.0777, "step": 13736 }, { "epoch": 4.199633139712626, "grad_norm": 0.44061052799224854, "learning_rate": 2.7503715341174473e-05, "loss": 0.065, "step": 13737 }, { "epoch": 4.199938856618771, "grad_norm": 0.2717010974884033, "learning_rate": 2.7503290730754535e-05, "loss": 0.0577, "step": 13738 }, { "epoch": 4.200244573524916, "grad_norm": 0.2224300354719162, "learning_rate": 2.7502866120334594e-05, "loss": 0.0504, "step": 13739 }, { "epoch": 4.200550290431061, "grad_norm": 1.0398123264312744, "learning_rate": 2.7502441509914656e-05, "loss": 0.0893, "step": 13740 }, { "epoch": 4.200856007337205, "grad_norm": 0.3322748839855194, "learning_rate": 2.7502016899494715e-05, "loss": 0.0765, "step": 13741 }, { "epoch": 4.2011617242433505, "grad_norm": 0.22089943289756775, "learning_rate": 2.7501592289074777e-05, "loss": 0.0753, "step": 13742 }, { "epoch": 4.201467441149496, "grad_norm": 0.22454270720481873, "learning_rate": 2.7501167678654835e-05, "loss": 0.0793, "step": 13743 }, { "epoch": 4.201773158055641, "grad_norm": 0.243051677942276, "learning_rate": 2.7500743068234897e-05, "loss": 0.0532, "step": 13744 }, { "epoch": 4.202078874961785, "grad_norm": 0.5838326215744019, "learning_rate": 2.7500318457814956e-05, "loss": 0.0763, "step": 13745 }, { "epoch": 4.20238459186793, "grad_norm": 0.31176891922950745, "learning_rate": 2.7499893847395018e-05, "loss": 0.0675, "step": 13746 }, { "epoch": 4.202690308774075, "grad_norm": 0.5504496693611145, "learning_rate": 2.7499469236975077e-05, "loss": 0.0809, "step": 13747 }, { "epoch": 4.20299602568022, "grad_norm": 0.4782331585884094, "learning_rate": 2.749904462655514e-05, "loss": 0.119, "step": 13748 }, { "epoch": 4.2033017425863655, "grad_norm": 0.3925847113132477, "learning_rate": 2.7498620016135198e-05, "loss": 0.1152, "step": 13749 }, { "epoch": 4.20360745949251, "grad_norm": 0.6280595660209656, "learning_rate": 2.749819540571526e-05, "loss": 0.148, "step": 13750 }, { "epoch": 4.203913176398655, "grad_norm": 1.0692858695983887, "learning_rate": 2.749777079529532e-05, "loss": 0.1727, "step": 13751 }, { "epoch": 4.2042188933048, "grad_norm": 0.628257691860199, "learning_rate": 2.7497346184875377e-05, "loss": 0.1423, "step": 13752 }, { "epoch": 4.204524610210945, "grad_norm": 0.6037154197692871, "learning_rate": 2.749692157445544e-05, "loss": 0.1475, "step": 13753 }, { "epoch": 4.204830327117089, "grad_norm": 0.8773321509361267, "learning_rate": 2.7496496964035498e-05, "loss": 0.1857, "step": 13754 }, { "epoch": 4.205136044023234, "grad_norm": 0.6949993968009949, "learning_rate": 2.749607235361556e-05, "loss": 0.164, "step": 13755 }, { "epoch": 4.2054417609293795, "grad_norm": 28.119911193847656, "learning_rate": 2.749564774319562e-05, "loss": 0.2191, "step": 13756 }, { "epoch": 4.205747477835525, "grad_norm": 0.941990315914154, "learning_rate": 2.749522313277568e-05, "loss": 0.2052, "step": 13757 }, { "epoch": 4.206053194741669, "grad_norm": 1.1734488010406494, "learning_rate": 2.749479852235574e-05, "loss": 0.2102, "step": 13758 }, { "epoch": 4.206358911647814, "grad_norm": 0.7846343517303467, "learning_rate": 2.74943739119358e-05, "loss": 0.2313, "step": 13759 }, { "epoch": 4.206664628553959, "grad_norm": 0.36135348677635193, "learning_rate": 2.749394930151586e-05, "loss": 0.1412, "step": 13760 }, { "epoch": 4.206970345460104, "grad_norm": 0.22758997976779938, "learning_rate": 2.7493524691095922e-05, "loss": 0.0778, "step": 13761 }, { "epoch": 4.207276062366249, "grad_norm": 0.21047718822956085, "learning_rate": 2.749310008067598e-05, "loss": 0.0596, "step": 13762 }, { "epoch": 4.2075817792723935, "grad_norm": 0.24599620699882507, "learning_rate": 2.7492675470256043e-05, "loss": 0.0862, "step": 13763 }, { "epoch": 4.207887496178539, "grad_norm": 0.2903871238231659, "learning_rate": 2.7492250859836102e-05, "loss": 0.0731, "step": 13764 }, { "epoch": 4.208193213084684, "grad_norm": 0.4169386923313141, "learning_rate": 2.749182624941616e-05, "loss": 0.066, "step": 13765 }, { "epoch": 4.208498929990829, "grad_norm": 0.2549358606338501, "learning_rate": 2.7491401638996223e-05, "loss": 0.0634, "step": 13766 }, { "epoch": 4.208804646896973, "grad_norm": 0.9725632667541504, "learning_rate": 2.749097702857628e-05, "loss": 0.0626, "step": 13767 }, { "epoch": 4.209110363803118, "grad_norm": 0.414288729429245, "learning_rate": 2.7490552418156343e-05, "loss": 0.0658, "step": 13768 }, { "epoch": 4.209416080709263, "grad_norm": 0.37708425521850586, "learning_rate": 2.7490127807736402e-05, "loss": 0.0692, "step": 13769 }, { "epoch": 4.209721797615408, "grad_norm": 0.35756930708885193, "learning_rate": 2.7489703197316464e-05, "loss": 0.0775, "step": 13770 }, { "epoch": 4.210027514521553, "grad_norm": 0.2809881567955017, "learning_rate": 2.7489278586896523e-05, "loss": 0.0582, "step": 13771 }, { "epoch": 4.210333231427698, "grad_norm": 0.7827501893043518, "learning_rate": 2.7488853976476585e-05, "loss": 0.1118, "step": 13772 }, { "epoch": 4.210638948333843, "grad_norm": 0.30876198410987854, "learning_rate": 2.7488429366056644e-05, "loss": 0.0961, "step": 13773 }, { "epoch": 4.210944665239988, "grad_norm": 0.3686000108718872, "learning_rate": 2.7488004755636706e-05, "loss": 0.1127, "step": 13774 }, { "epoch": 4.211250382146133, "grad_norm": 0.5875163078308105, "learning_rate": 2.7487580145216765e-05, "loss": 0.1401, "step": 13775 }, { "epoch": 4.211556099052277, "grad_norm": 1.2649998664855957, "learning_rate": 2.7487155534796827e-05, "loss": 0.1299, "step": 13776 }, { "epoch": 4.2118618159584225, "grad_norm": 0.591430127620697, "learning_rate": 2.7486730924376885e-05, "loss": 0.1427, "step": 13777 }, { "epoch": 4.212167532864568, "grad_norm": 4.961968421936035, "learning_rate": 2.7486306313956944e-05, "loss": 0.1715, "step": 13778 }, { "epoch": 4.212473249770713, "grad_norm": 1.5561696290969849, "learning_rate": 2.7485881703537006e-05, "loss": 0.1885, "step": 13779 }, { "epoch": 4.212778966676857, "grad_norm": 0.7640284299850464, "learning_rate": 2.7485457093117065e-05, "loss": 0.1783, "step": 13780 }, { "epoch": 4.213084683583002, "grad_norm": 1.518105387687683, "learning_rate": 2.7485032482697127e-05, "loss": 0.2179, "step": 13781 }, { "epoch": 4.213390400489147, "grad_norm": 1.1905827522277832, "learning_rate": 2.7484607872277186e-05, "loss": 0.209, "step": 13782 }, { "epoch": 4.213696117395292, "grad_norm": 1.2299885749816895, "learning_rate": 2.7484183261857248e-05, "loss": 0.2115, "step": 13783 }, { "epoch": 4.2140018343014365, "grad_norm": 1.0036016702651978, "learning_rate": 2.7483758651437306e-05, "loss": 0.2585, "step": 13784 }, { "epoch": 4.214307551207582, "grad_norm": 0.515030026435852, "learning_rate": 2.748333404101737e-05, "loss": 0.1249, "step": 13785 }, { "epoch": 4.214613268113727, "grad_norm": 0.25420722365379333, "learning_rate": 2.7482909430597427e-05, "loss": 0.0793, "step": 13786 }, { "epoch": 4.214918985019872, "grad_norm": 0.37479323148727417, "learning_rate": 2.748248482017749e-05, "loss": 0.0671, "step": 13787 }, { "epoch": 4.215224701926017, "grad_norm": 0.21312826871871948, "learning_rate": 2.7482060209757548e-05, "loss": 0.0577, "step": 13788 }, { "epoch": 4.215530418832161, "grad_norm": 0.3124435245990753, "learning_rate": 2.748163559933761e-05, "loss": 0.0873, "step": 13789 }, { "epoch": 4.215836135738306, "grad_norm": 0.3738439977169037, "learning_rate": 2.748121098891767e-05, "loss": 0.0378, "step": 13790 }, { "epoch": 4.216141852644451, "grad_norm": 0.2369433492422104, "learning_rate": 2.7480786378497727e-05, "loss": 0.0418, "step": 13791 }, { "epoch": 4.2164475695505965, "grad_norm": 0.223827064037323, "learning_rate": 2.748036176807779e-05, "loss": 0.07, "step": 13792 }, { "epoch": 4.216753286456741, "grad_norm": 0.24607685208320618, "learning_rate": 2.7479937157657848e-05, "loss": 0.0661, "step": 13793 }, { "epoch": 4.217059003362886, "grad_norm": 0.4022452235221863, "learning_rate": 2.747951254723791e-05, "loss": 0.0555, "step": 13794 }, { "epoch": 4.217364720269031, "grad_norm": 0.3804295063018799, "learning_rate": 2.747908793681797e-05, "loss": 0.0698, "step": 13795 }, { "epoch": 4.217670437175176, "grad_norm": 0.4162552058696747, "learning_rate": 2.747866332639803e-05, "loss": 0.0865, "step": 13796 }, { "epoch": 4.21797615408132, "grad_norm": 0.4988885521888733, "learning_rate": 2.747823871597809e-05, "loss": 0.1447, "step": 13797 }, { "epoch": 4.218281870987465, "grad_norm": 0.3756488263607025, "learning_rate": 2.7477814105558152e-05, "loss": 0.0712, "step": 13798 }, { "epoch": 4.2185875878936105, "grad_norm": 0.4009400010108948, "learning_rate": 2.747738949513821e-05, "loss": 0.1145, "step": 13799 }, { "epoch": 4.218893304799756, "grad_norm": 0.47158563137054443, "learning_rate": 2.7476964884718273e-05, "loss": 0.1358, "step": 13800 }, { "epoch": 4.219199021705901, "grad_norm": 0.8809833526611328, "learning_rate": 2.747654027429833e-05, "loss": 0.1473, "step": 13801 }, { "epoch": 4.219504738612045, "grad_norm": 0.5143085718154907, "learning_rate": 2.7476115663878393e-05, "loss": 0.1306, "step": 13802 }, { "epoch": 4.21981045551819, "grad_norm": 1.02666175365448, "learning_rate": 2.7475691053458452e-05, "loss": 0.1787, "step": 13803 }, { "epoch": 4.220116172424335, "grad_norm": 0.5622339844703674, "learning_rate": 2.747526644303851e-05, "loss": 0.1633, "step": 13804 }, { "epoch": 4.22042188933048, "grad_norm": 2.072927474975586, "learning_rate": 2.7474841832618573e-05, "loss": 0.1615, "step": 13805 }, { "epoch": 4.220727606236625, "grad_norm": 0.8734936118125916, "learning_rate": 2.747441722219863e-05, "loss": 0.1963, "step": 13806 }, { "epoch": 4.22103332314277, "grad_norm": 0.9814215302467346, "learning_rate": 2.7473992611778694e-05, "loss": 0.1679, "step": 13807 }, { "epoch": 4.221339040048915, "grad_norm": 0.6813483834266663, "learning_rate": 2.7473568001358752e-05, "loss": 0.1866, "step": 13808 }, { "epoch": 4.22164475695506, "grad_norm": 3.0188639163970947, "learning_rate": 2.7473143390938815e-05, "loss": 0.2162, "step": 13809 }, { "epoch": 4.221950473861204, "grad_norm": 1.3024553060531616, "learning_rate": 2.7472718780518873e-05, "loss": 0.1584, "step": 13810 }, { "epoch": 4.222256190767349, "grad_norm": 0.23311316967010498, "learning_rate": 2.7472294170098935e-05, "loss": 0.0813, "step": 13811 }, { "epoch": 4.222561907673494, "grad_norm": 0.272676557302475, "learning_rate": 2.7471869559678994e-05, "loss": 0.0707, "step": 13812 }, { "epoch": 4.2228676245796395, "grad_norm": 0.23014971613883972, "learning_rate": 2.7471444949259056e-05, "loss": 0.0551, "step": 13813 }, { "epoch": 4.223173341485785, "grad_norm": 0.3350631296634674, "learning_rate": 2.7471020338839115e-05, "loss": 0.0732, "step": 13814 }, { "epoch": 4.223479058391929, "grad_norm": 0.2058342546224594, "learning_rate": 2.7470595728419174e-05, "loss": 0.0431, "step": 13815 }, { "epoch": 4.223784775298074, "grad_norm": 0.4205544888973236, "learning_rate": 2.7470171117999236e-05, "loss": 0.0833, "step": 13816 }, { "epoch": 4.224090492204219, "grad_norm": 0.6880265474319458, "learning_rate": 2.7469746507579294e-05, "loss": 0.0599, "step": 13817 }, { "epoch": 4.224396209110364, "grad_norm": 0.41358622908592224, "learning_rate": 2.7469321897159356e-05, "loss": 0.0622, "step": 13818 }, { "epoch": 4.224701926016508, "grad_norm": 0.23911304771900177, "learning_rate": 2.7468897286739415e-05, "loss": 0.0939, "step": 13819 }, { "epoch": 4.2250076429226535, "grad_norm": 0.3329553008079529, "learning_rate": 2.7468472676319477e-05, "loss": 0.1072, "step": 13820 }, { "epoch": 4.225313359828799, "grad_norm": 0.32736802101135254, "learning_rate": 2.7468048065899536e-05, "loss": 0.0792, "step": 13821 }, { "epoch": 4.225619076734944, "grad_norm": 0.5566268563270569, "learning_rate": 2.7467623455479598e-05, "loss": 0.0709, "step": 13822 }, { "epoch": 4.225924793641088, "grad_norm": 0.5525083541870117, "learning_rate": 2.7467198845059657e-05, "loss": 0.1423, "step": 13823 }, { "epoch": 4.226230510547233, "grad_norm": 0.3213498294353485, "learning_rate": 2.746677423463972e-05, "loss": 0.1264, "step": 13824 }, { "epoch": 4.226536227453378, "grad_norm": 0.995932936668396, "learning_rate": 2.7466349624219777e-05, "loss": 0.1183, "step": 13825 }, { "epoch": 4.226841944359523, "grad_norm": 0.5951395034790039, "learning_rate": 2.746592501379984e-05, "loss": 0.1668, "step": 13826 }, { "epoch": 4.227147661265668, "grad_norm": 1.3666330575942993, "learning_rate": 2.7465500403379898e-05, "loss": 0.1575, "step": 13827 }, { "epoch": 4.227453378171813, "grad_norm": 0.5791121125221252, "learning_rate": 2.7465075792959957e-05, "loss": 0.1932, "step": 13828 }, { "epoch": 4.227759095077958, "grad_norm": 0.5949839353561401, "learning_rate": 2.746465118254002e-05, "loss": 0.1802, "step": 13829 }, { "epoch": 4.228064811984103, "grad_norm": 1.147328495979309, "learning_rate": 2.7464226572120078e-05, "loss": 0.1482, "step": 13830 }, { "epoch": 4.228370528890248, "grad_norm": 0.9628361463546753, "learning_rate": 2.746380196170014e-05, "loss": 0.1913, "step": 13831 }, { "epoch": 4.228676245796392, "grad_norm": 0.7942927479743958, "learning_rate": 2.74633773512802e-05, "loss": 0.1843, "step": 13832 }, { "epoch": 4.228981962702537, "grad_norm": 4.821360111236572, "learning_rate": 2.746295274086026e-05, "loss": 0.2311, "step": 13833 }, { "epoch": 4.2292876796086825, "grad_norm": 2.561540365219116, "learning_rate": 2.746252813044032e-05, "loss": 0.2903, "step": 13834 }, { "epoch": 4.229593396514828, "grad_norm": 0.5575928688049316, "learning_rate": 2.746210352002038e-05, "loss": 0.1549, "step": 13835 }, { "epoch": 4.229899113420972, "grad_norm": 0.3951088786125183, "learning_rate": 2.746167890960044e-05, "loss": 0.0707, "step": 13836 }, { "epoch": 4.230204830327117, "grad_norm": 0.360467791557312, "learning_rate": 2.7461254299180502e-05, "loss": 0.079, "step": 13837 }, { "epoch": 4.230510547233262, "grad_norm": 0.46892303228378296, "learning_rate": 2.746082968876056e-05, "loss": 0.0653, "step": 13838 }, { "epoch": 4.230816264139407, "grad_norm": 0.31855252385139465, "learning_rate": 2.7460405078340623e-05, "loss": 0.0564, "step": 13839 }, { "epoch": 4.231121981045552, "grad_norm": 0.15014953911304474, "learning_rate": 2.7459980467920685e-05, "loss": 0.038, "step": 13840 }, { "epoch": 4.2314276979516965, "grad_norm": 0.24111637473106384, "learning_rate": 2.7459555857500744e-05, "loss": 0.055, "step": 13841 }, { "epoch": 4.231733414857842, "grad_norm": 0.40829768776893616, "learning_rate": 2.7459131247080806e-05, "loss": 0.061, "step": 13842 }, { "epoch": 4.232039131763987, "grad_norm": 0.8522893190383911, "learning_rate": 2.7458706636660865e-05, "loss": 0.0763, "step": 13843 }, { "epoch": 4.232344848670132, "grad_norm": 0.27972811460494995, "learning_rate": 2.7458282026240927e-05, "loss": 0.0657, "step": 13844 }, { "epoch": 4.232650565576276, "grad_norm": 1.052746295928955, "learning_rate": 2.7457857415820985e-05, "loss": 0.1006, "step": 13845 }, { "epoch": 4.232956282482421, "grad_norm": 0.8152413964271545, "learning_rate": 2.7457432805401047e-05, "loss": 0.0658, "step": 13846 }, { "epoch": 4.233261999388566, "grad_norm": 0.3387407958507538, "learning_rate": 2.7457008194981106e-05, "loss": 0.0841, "step": 13847 }, { "epoch": 4.233567716294711, "grad_norm": 1.1506966352462769, "learning_rate": 2.7456583584561168e-05, "loss": 0.1503, "step": 13848 }, { "epoch": 4.233873433200856, "grad_norm": 0.3250083923339844, "learning_rate": 2.7456158974141227e-05, "loss": 0.1231, "step": 13849 }, { "epoch": 4.234179150107001, "grad_norm": 0.5331533551216125, "learning_rate": 2.745573436372129e-05, "loss": 0.1397, "step": 13850 }, { "epoch": 4.234484867013146, "grad_norm": 0.597029983997345, "learning_rate": 2.7455309753301348e-05, "loss": 0.1542, "step": 13851 }, { "epoch": 4.234790583919291, "grad_norm": 0.45420506596565247, "learning_rate": 2.745488514288141e-05, "loss": 0.172, "step": 13852 }, { "epoch": 4.235096300825436, "grad_norm": 0.8446249961853027, "learning_rate": 2.745446053246147e-05, "loss": 0.1594, "step": 13853 }, { "epoch": 4.23540201773158, "grad_norm": 0.8918272256851196, "learning_rate": 2.7454035922041527e-05, "loss": 0.1531, "step": 13854 }, { "epoch": 4.235707734637725, "grad_norm": 0.8333079814910889, "learning_rate": 2.745361131162159e-05, "loss": 0.1979, "step": 13855 }, { "epoch": 4.2360134515438705, "grad_norm": 0.7814617156982422, "learning_rate": 2.7453186701201648e-05, "loss": 0.2094, "step": 13856 }, { "epoch": 4.236319168450016, "grad_norm": 0.8708589673042297, "learning_rate": 2.745276209078171e-05, "loss": 0.2107, "step": 13857 }, { "epoch": 4.23662488535616, "grad_norm": 3.674281597137451, "learning_rate": 2.745233748036177e-05, "loss": 0.219, "step": 13858 }, { "epoch": 4.236930602262305, "grad_norm": 27.526968002319336, "learning_rate": 2.745191286994183e-05, "loss": 0.1933, "step": 13859 }, { "epoch": 4.23723631916845, "grad_norm": 0.33537963032722473, "learning_rate": 2.745148825952189e-05, "loss": 0.1378, "step": 13860 }, { "epoch": 4.237542036074595, "grad_norm": 0.9364458918571472, "learning_rate": 2.745106364910195e-05, "loss": 0.0841, "step": 13861 }, { "epoch": 4.2378477529807395, "grad_norm": 0.5118234157562256, "learning_rate": 2.745063903868201e-05, "loss": 0.0679, "step": 13862 }, { "epoch": 4.238153469886885, "grad_norm": 0.4102688729763031, "learning_rate": 2.7450214428262072e-05, "loss": 0.0778, "step": 13863 }, { "epoch": 4.23845918679303, "grad_norm": 0.2754855453968048, "learning_rate": 2.744978981784213e-05, "loss": 0.0546, "step": 13864 }, { "epoch": 4.238764903699175, "grad_norm": 0.46609967947006226, "learning_rate": 2.7449365207422193e-05, "loss": 0.0456, "step": 13865 }, { "epoch": 4.23907062060532, "grad_norm": 0.15487615764141083, "learning_rate": 2.7448940597002252e-05, "loss": 0.0577, "step": 13866 }, { "epoch": 4.239376337511464, "grad_norm": 2.6681020259857178, "learning_rate": 2.744851598658231e-05, "loss": 0.0597, "step": 13867 }, { "epoch": 4.239682054417609, "grad_norm": 0.24369582533836365, "learning_rate": 2.7448091376162373e-05, "loss": 0.0625, "step": 13868 }, { "epoch": 4.239987771323754, "grad_norm": 0.6466782093048096, "learning_rate": 2.744766676574243e-05, "loss": 0.0778, "step": 13869 }, { "epoch": 4.2402934882298995, "grad_norm": 0.9712162017822266, "learning_rate": 2.7447242155322494e-05, "loss": 0.0837, "step": 13870 }, { "epoch": 4.240599205136044, "grad_norm": 0.6061424612998962, "learning_rate": 2.7446817544902552e-05, "loss": 0.0787, "step": 13871 }, { "epoch": 4.240904922042189, "grad_norm": 0.4333076775074005, "learning_rate": 2.7446392934482614e-05, "loss": 0.1089, "step": 13872 }, { "epoch": 4.241210638948334, "grad_norm": 3.30010986328125, "learning_rate": 2.7445968324062673e-05, "loss": 0.1151, "step": 13873 }, { "epoch": 4.241516355854479, "grad_norm": 0.32548514008522034, "learning_rate": 2.7445543713642735e-05, "loss": 0.1013, "step": 13874 }, { "epoch": 4.241822072760623, "grad_norm": 0.5630825757980347, "learning_rate": 2.7445119103222794e-05, "loss": 0.1603, "step": 13875 }, { "epoch": 4.242127789666768, "grad_norm": 0.5689238905906677, "learning_rate": 2.7444694492802856e-05, "loss": 0.1379, "step": 13876 }, { "epoch": 4.2424335065729135, "grad_norm": 0.7590375542640686, "learning_rate": 2.7444269882382915e-05, "loss": 0.1553, "step": 13877 }, { "epoch": 4.242739223479059, "grad_norm": 0.29537513852119446, "learning_rate": 2.7443845271962977e-05, "loss": 0.1279, "step": 13878 }, { "epoch": 4.243044940385204, "grad_norm": 1.2026236057281494, "learning_rate": 2.7443420661543035e-05, "loss": 0.1676, "step": 13879 }, { "epoch": 4.243350657291348, "grad_norm": 1.0668638944625854, "learning_rate": 2.7442996051123094e-05, "loss": 0.2041, "step": 13880 }, { "epoch": 4.243656374197493, "grad_norm": 1.1695964336395264, "learning_rate": 2.7442571440703156e-05, "loss": 0.175, "step": 13881 }, { "epoch": 4.243962091103638, "grad_norm": 0.9082977175712585, "learning_rate": 2.7442146830283215e-05, "loss": 0.1911, "step": 13882 }, { "epoch": 4.244267808009783, "grad_norm": 1.4864376783370972, "learning_rate": 2.7441722219863277e-05, "loss": 0.183, "step": 13883 }, { "epoch": 4.2445735249159275, "grad_norm": 1.1338937282562256, "learning_rate": 2.7441297609443336e-05, "loss": 0.2466, "step": 13884 }, { "epoch": 4.244879241822073, "grad_norm": 0.7551661729812622, "learning_rate": 2.7440872999023398e-05, "loss": 0.1646, "step": 13885 }, { "epoch": 4.245184958728218, "grad_norm": 0.49887213110923767, "learning_rate": 2.7440448388603456e-05, "loss": 0.0859, "step": 13886 }, { "epoch": 4.245490675634363, "grad_norm": 0.27633821964263916, "learning_rate": 2.744002377818352e-05, "loss": 0.057, "step": 13887 }, { "epoch": 4.245796392540507, "grad_norm": 0.3529573082923889, "learning_rate": 2.7439599167763577e-05, "loss": 0.0438, "step": 13888 }, { "epoch": 4.246102109446652, "grad_norm": 0.6697316765785217, "learning_rate": 2.743917455734364e-05, "loss": 0.0635, "step": 13889 }, { "epoch": 4.246407826352797, "grad_norm": 0.2857176661491394, "learning_rate": 2.7438749946923698e-05, "loss": 0.0658, "step": 13890 }, { "epoch": 4.2467135432589425, "grad_norm": 0.717122495174408, "learning_rate": 2.743832533650376e-05, "loss": 0.0604, "step": 13891 }, { "epoch": 4.247019260165088, "grad_norm": 1.2744301557540894, "learning_rate": 2.743790072608382e-05, "loss": 0.0723, "step": 13892 }, { "epoch": 4.247324977071232, "grad_norm": 0.4509321451187134, "learning_rate": 2.7437476115663877e-05, "loss": 0.0872, "step": 13893 }, { "epoch": 4.247630693977377, "grad_norm": 0.32986000180244446, "learning_rate": 2.743705150524394e-05, "loss": 0.0793, "step": 13894 }, { "epoch": 4.247936410883522, "grad_norm": 0.7163294553756714, "learning_rate": 2.7436626894823998e-05, "loss": 0.076, "step": 13895 }, { "epoch": 4.248242127789667, "grad_norm": 0.6689328551292419, "learning_rate": 2.743620228440406e-05, "loss": 0.0698, "step": 13896 }, { "epoch": 4.248547844695811, "grad_norm": 0.34999704360961914, "learning_rate": 2.743577767398412e-05, "loss": 0.1505, "step": 13897 }, { "epoch": 4.2488535616019565, "grad_norm": 1.7152372598648071, "learning_rate": 2.743535306356418e-05, "loss": 0.16, "step": 13898 }, { "epoch": 4.249159278508102, "grad_norm": 0.34272152185440063, "learning_rate": 2.743492845314424e-05, "loss": 0.1232, "step": 13899 }, { "epoch": 4.249464995414247, "grad_norm": 0.40798842906951904, "learning_rate": 2.7434503842724302e-05, "loss": 0.1517, "step": 13900 }, { "epoch": 4.249770712320391, "grad_norm": 0.5232248306274414, "learning_rate": 2.743407923230436e-05, "loss": 0.1519, "step": 13901 }, { "epoch": 4.250076429226536, "grad_norm": 1.3416000604629517, "learning_rate": 2.7433654621884423e-05, "loss": 0.1863, "step": 13902 }, { "epoch": 4.250382146132681, "grad_norm": 0.7117258310317993, "learning_rate": 2.743323001146448e-05, "loss": 0.1818, "step": 13903 }, { "epoch": 4.250687863038826, "grad_norm": 0.6423786282539368, "learning_rate": 2.7432805401044544e-05, "loss": 0.185, "step": 13904 }, { "epoch": 4.250993579944971, "grad_norm": 1.31887948513031, "learning_rate": 2.7432380790624602e-05, "loss": 0.2266, "step": 13905 }, { "epoch": 4.251299296851116, "grad_norm": 1.4882012605667114, "learning_rate": 2.743195618020466e-05, "loss": 0.1953, "step": 13906 }, { "epoch": 4.251605013757261, "grad_norm": 0.7844006419181824, "learning_rate": 2.7431531569784723e-05, "loss": 0.1914, "step": 13907 }, { "epoch": 4.251910730663406, "grad_norm": 0.863256573677063, "learning_rate": 2.7431106959364782e-05, "loss": 0.21, "step": 13908 }, { "epoch": 4.252216447569551, "grad_norm": 3.435513496398926, "learning_rate": 2.7430682348944844e-05, "loss": 0.2497, "step": 13909 }, { "epoch": 4.252522164475695, "grad_norm": 0.713378369808197, "learning_rate": 2.7430257738524902e-05, "loss": 0.1715, "step": 13910 }, { "epoch": 4.25282788138184, "grad_norm": 0.3617590665817261, "learning_rate": 2.7429833128104965e-05, "loss": 0.1036, "step": 13911 }, { "epoch": 4.253133598287985, "grad_norm": 0.23492959141731262, "learning_rate": 2.7429408517685023e-05, "loss": 0.0664, "step": 13912 }, { "epoch": 4.2534393151941305, "grad_norm": 0.245854914188385, "learning_rate": 2.7428983907265085e-05, "loss": 0.0614, "step": 13913 }, { "epoch": 4.253745032100275, "grad_norm": 0.16395042836666107, "learning_rate": 2.7428559296845144e-05, "loss": 0.0515, "step": 13914 }, { "epoch": 4.25405074900642, "grad_norm": 0.16950222849845886, "learning_rate": 2.7428134686425206e-05, "loss": 0.0531, "step": 13915 }, { "epoch": 4.254356465912565, "grad_norm": 0.17844419181346893, "learning_rate": 2.7427710076005265e-05, "loss": 0.0458, "step": 13916 }, { "epoch": 4.25466218281871, "grad_norm": 0.31174176931381226, "learning_rate": 2.7427285465585327e-05, "loss": 0.0612, "step": 13917 }, { "epoch": 4.254967899724855, "grad_norm": 0.2918841242790222, "learning_rate": 2.7426860855165386e-05, "loss": 0.0502, "step": 13918 }, { "epoch": 4.2552736166309995, "grad_norm": 0.870884895324707, "learning_rate": 2.7426436244745444e-05, "loss": 0.0667, "step": 13919 }, { "epoch": 4.255579333537145, "grad_norm": 0.32386547327041626, "learning_rate": 2.7426011634325506e-05, "loss": 0.06, "step": 13920 }, { "epoch": 4.25588505044329, "grad_norm": 0.44546282291412354, "learning_rate": 2.7425587023905565e-05, "loss": 0.0872, "step": 13921 }, { "epoch": 4.256190767349435, "grad_norm": 0.20441387593746185, "learning_rate": 2.7425162413485627e-05, "loss": 0.0735, "step": 13922 }, { "epoch": 4.256496484255579, "grad_norm": 0.5989581346511841, "learning_rate": 2.7424737803065686e-05, "loss": 0.0948, "step": 13923 }, { "epoch": 4.256802201161724, "grad_norm": 1.673283576965332, "learning_rate": 2.7424313192645748e-05, "loss": 0.1113, "step": 13924 }, { "epoch": 4.257107918067869, "grad_norm": 0.550168514251709, "learning_rate": 2.7423888582225807e-05, "loss": 0.1552, "step": 13925 }, { "epoch": 4.257413634974014, "grad_norm": 0.8943251371383667, "learning_rate": 2.742346397180587e-05, "loss": 0.1429, "step": 13926 }, { "epoch": 4.257719351880159, "grad_norm": 0.7802530527114868, "learning_rate": 2.7423039361385927e-05, "loss": 0.1442, "step": 13927 }, { "epoch": 4.258025068786304, "grad_norm": 0.9085826873779297, "learning_rate": 2.742261475096599e-05, "loss": 0.1781, "step": 13928 }, { "epoch": 4.258330785692449, "grad_norm": 0.8080001473426819, "learning_rate": 2.7422190140546048e-05, "loss": 0.1752, "step": 13929 }, { "epoch": 4.258636502598594, "grad_norm": 0.7055867910385132, "learning_rate": 2.742176553012611e-05, "loss": 0.1966, "step": 13930 }, { "epoch": 4.258942219504739, "grad_norm": 0.849475085735321, "learning_rate": 2.742134091970617e-05, "loss": 0.1746, "step": 13931 }, { "epoch": 4.259247936410883, "grad_norm": 2.3084330558776855, "learning_rate": 2.7420916309286228e-05, "loss": 0.1974, "step": 13932 }, { "epoch": 4.259553653317028, "grad_norm": 1.1998445987701416, "learning_rate": 2.742049169886629e-05, "loss": 0.2378, "step": 13933 }, { "epoch": 4.2598593702231735, "grad_norm": 1.9239394664764404, "learning_rate": 2.742006708844635e-05, "loss": 0.2212, "step": 13934 }, { "epoch": 4.260165087129319, "grad_norm": 0.9196375012397766, "learning_rate": 2.741964247802641e-05, "loss": 0.1499, "step": 13935 }, { "epoch": 4.260470804035463, "grad_norm": 0.4420221447944641, "learning_rate": 2.741921786760647e-05, "loss": 0.0751, "step": 13936 }, { "epoch": 4.260776520941608, "grad_norm": 0.3165205419063568, "learning_rate": 2.741879325718653e-05, "loss": 0.0559, "step": 13937 }, { "epoch": 4.261082237847753, "grad_norm": 0.36954256892204285, "learning_rate": 2.741836864676659e-05, "loss": 0.0822, "step": 13938 }, { "epoch": 4.261387954753898, "grad_norm": 0.3371262848377228, "learning_rate": 2.7417944036346652e-05, "loss": 0.0694, "step": 13939 }, { "epoch": 4.261693671660042, "grad_norm": 0.3790472447872162, "learning_rate": 2.741751942592671e-05, "loss": 0.0501, "step": 13940 }, { "epoch": 4.2619993885661875, "grad_norm": 0.22270093858242035, "learning_rate": 2.7417094815506773e-05, "loss": 0.0469, "step": 13941 }, { "epoch": 4.262305105472333, "grad_norm": 0.470651775598526, "learning_rate": 2.7416670205086835e-05, "loss": 0.0948, "step": 13942 }, { "epoch": 4.262610822378478, "grad_norm": 0.22374284267425537, "learning_rate": 2.7416245594666894e-05, "loss": 0.0505, "step": 13943 }, { "epoch": 4.262916539284623, "grad_norm": 0.21114778518676758, "learning_rate": 2.7415820984246956e-05, "loss": 0.0684, "step": 13944 }, { "epoch": 4.263222256190767, "grad_norm": 0.32750168442726135, "learning_rate": 2.7415396373827015e-05, "loss": 0.0926, "step": 13945 }, { "epoch": 4.263527973096912, "grad_norm": 0.20836496353149414, "learning_rate": 2.7414971763407077e-05, "loss": 0.0693, "step": 13946 }, { "epoch": 4.263833690003057, "grad_norm": 0.374235600233078, "learning_rate": 2.7414547152987135e-05, "loss": 0.0947, "step": 13947 }, { "epoch": 4.2641394069092025, "grad_norm": 0.41492149233818054, "learning_rate": 2.7414122542567197e-05, "loss": 0.1315, "step": 13948 }, { "epoch": 4.264445123815347, "grad_norm": 0.4578406810760498, "learning_rate": 2.7413697932147256e-05, "loss": 0.1321, "step": 13949 }, { "epoch": 4.264750840721492, "grad_norm": 0.5130382180213928, "learning_rate": 2.7413273321727318e-05, "loss": 0.1402, "step": 13950 }, { "epoch": 4.265056557627637, "grad_norm": 0.6719509959220886, "learning_rate": 2.7412848711307377e-05, "loss": 0.1675, "step": 13951 }, { "epoch": 4.265362274533782, "grad_norm": 0.6780668497085571, "learning_rate": 2.741242410088744e-05, "loss": 0.169, "step": 13952 }, { "epoch": 4.265667991439926, "grad_norm": 0.4518777132034302, "learning_rate": 2.7411999490467498e-05, "loss": 0.1745, "step": 13953 }, { "epoch": 4.265973708346071, "grad_norm": 0.6264888644218445, "learning_rate": 2.741157488004756e-05, "loss": 0.185, "step": 13954 }, { "epoch": 4.2662794252522165, "grad_norm": 1.2697755098342896, "learning_rate": 2.741115026962762e-05, "loss": 0.1977, "step": 13955 }, { "epoch": 4.266585142158362, "grad_norm": 0.6657447814941406, "learning_rate": 2.7410725659207677e-05, "loss": 0.2113, "step": 13956 }, { "epoch": 4.266890859064507, "grad_norm": 1.2380702495574951, "learning_rate": 2.741030104878774e-05, "loss": 0.1788, "step": 13957 }, { "epoch": 4.267196575970651, "grad_norm": 1.1780891418457031, "learning_rate": 2.7409876438367798e-05, "loss": 0.2227, "step": 13958 }, { "epoch": 4.267502292876796, "grad_norm": 12.714239120483398, "learning_rate": 2.740945182794786e-05, "loss": 0.2221, "step": 13959 }, { "epoch": 4.267808009782941, "grad_norm": 0.35141682624816895, "learning_rate": 2.740902721752792e-05, "loss": 0.1238, "step": 13960 }, { "epoch": 4.268113726689086, "grad_norm": 0.3133683502674103, "learning_rate": 2.740860260710798e-05, "loss": 0.0904, "step": 13961 }, { "epoch": 4.2684194435952305, "grad_norm": 0.2992732524871826, "learning_rate": 2.740817799668804e-05, "loss": 0.0647, "step": 13962 }, { "epoch": 4.268725160501376, "grad_norm": 0.17963995039463043, "learning_rate": 2.74077533862681e-05, "loss": 0.0496, "step": 13963 }, { "epoch": 4.269030877407521, "grad_norm": 0.15333528816699982, "learning_rate": 2.740732877584816e-05, "loss": 0.048, "step": 13964 }, { "epoch": 4.269336594313666, "grad_norm": 0.3516606390476227, "learning_rate": 2.7406904165428222e-05, "loss": 0.0837, "step": 13965 }, { "epoch": 4.26964231121981, "grad_norm": 0.42882871627807617, "learning_rate": 2.740647955500828e-05, "loss": 0.0545, "step": 13966 }, { "epoch": 4.269948028125955, "grad_norm": 0.2359185814857483, "learning_rate": 2.7406054944588343e-05, "loss": 0.0582, "step": 13967 }, { "epoch": 4.2702537450321, "grad_norm": 0.3356930911540985, "learning_rate": 2.7405630334168402e-05, "loss": 0.1277, "step": 13968 }, { "epoch": 4.270559461938245, "grad_norm": 0.289013534784317, "learning_rate": 2.740520572374846e-05, "loss": 0.0761, "step": 13969 }, { "epoch": 4.2708651788443905, "grad_norm": 0.8400011658668518, "learning_rate": 2.7404781113328523e-05, "loss": 0.1227, "step": 13970 }, { "epoch": 4.271170895750535, "grad_norm": 0.31022948026657104, "learning_rate": 2.740435650290858e-05, "loss": 0.1001, "step": 13971 }, { "epoch": 4.27147661265668, "grad_norm": 0.3661097586154938, "learning_rate": 2.7403931892488644e-05, "loss": 0.076, "step": 13972 }, { "epoch": 4.271782329562825, "grad_norm": 0.4364772140979767, "learning_rate": 2.7403507282068702e-05, "loss": 0.1312, "step": 13973 }, { "epoch": 4.27208804646897, "grad_norm": 0.34803634881973267, "learning_rate": 2.7403082671648764e-05, "loss": 0.0988, "step": 13974 }, { "epoch": 4.272393763375114, "grad_norm": 1.2530291080474854, "learning_rate": 2.7402658061228823e-05, "loss": 0.1361, "step": 13975 }, { "epoch": 4.2726994802812595, "grad_norm": 0.3744623064994812, "learning_rate": 2.7402233450808885e-05, "loss": 0.1637, "step": 13976 }, { "epoch": 4.273005197187405, "grad_norm": 0.7661336660385132, "learning_rate": 2.7401808840388944e-05, "loss": 0.1624, "step": 13977 }, { "epoch": 4.27331091409355, "grad_norm": 1.1080929040908813, "learning_rate": 2.7401384229969006e-05, "loss": 0.1533, "step": 13978 }, { "epoch": 4.273616630999694, "grad_norm": 0.5138101577758789, "learning_rate": 2.7400959619549065e-05, "loss": 0.1786, "step": 13979 }, { "epoch": 4.273922347905839, "grad_norm": 0.6618227362632751, "learning_rate": 2.7400535009129127e-05, "loss": 0.198, "step": 13980 }, { "epoch": 4.274228064811984, "grad_norm": 1.413111686706543, "learning_rate": 2.7400110398709185e-05, "loss": 0.1805, "step": 13981 }, { "epoch": 4.274533781718129, "grad_norm": 1.122929334640503, "learning_rate": 2.7399685788289244e-05, "loss": 0.2066, "step": 13982 }, { "epoch": 4.274839498624274, "grad_norm": 0.9309731125831604, "learning_rate": 2.7399261177869306e-05, "loss": 0.1999, "step": 13983 }, { "epoch": 4.275145215530419, "grad_norm": 2.7992069721221924, "learning_rate": 2.7398836567449365e-05, "loss": 0.2539, "step": 13984 }, { "epoch": 4.275450932436564, "grad_norm": 0.5704513192176819, "learning_rate": 2.7398411957029427e-05, "loss": 0.1438, "step": 13985 }, { "epoch": 4.275756649342709, "grad_norm": 0.27063798904418945, "learning_rate": 2.7397987346609486e-05, "loss": 0.0881, "step": 13986 }, { "epoch": 4.276062366248854, "grad_norm": 0.2689717710018158, "learning_rate": 2.7397562736189548e-05, "loss": 0.0726, "step": 13987 }, { "epoch": 4.276368083154998, "grad_norm": 0.3024556338787079, "learning_rate": 2.7397138125769606e-05, "loss": 0.0825, "step": 13988 }, { "epoch": 4.276673800061143, "grad_norm": 0.5549679398536682, "learning_rate": 2.739671351534967e-05, "loss": 0.0437, "step": 13989 }, { "epoch": 4.276979516967288, "grad_norm": 0.13055068254470825, "learning_rate": 2.7396288904929727e-05, "loss": 0.0524, "step": 13990 }, { "epoch": 4.2772852338734335, "grad_norm": 0.3850674629211426, "learning_rate": 2.739586429450979e-05, "loss": 0.0529, "step": 13991 }, { "epoch": 4.277590950779578, "grad_norm": 0.3597561717033386, "learning_rate": 2.7395439684089848e-05, "loss": 0.0663, "step": 13992 }, { "epoch": 4.277896667685723, "grad_norm": 0.27843067049980164, "learning_rate": 2.739501507366991e-05, "loss": 0.0845, "step": 13993 }, { "epoch": 4.278202384591868, "grad_norm": 0.35756778717041016, "learning_rate": 2.739459046324997e-05, "loss": 0.0549, "step": 13994 }, { "epoch": 4.278508101498013, "grad_norm": 0.3351753354072571, "learning_rate": 2.7394165852830028e-05, "loss": 0.096, "step": 13995 }, { "epoch": 4.278813818404158, "grad_norm": 0.3578203022480011, "learning_rate": 2.739374124241009e-05, "loss": 0.0834, "step": 13996 }, { "epoch": 4.279119535310302, "grad_norm": 0.3030267357826233, "learning_rate": 2.7393316631990148e-05, "loss": 0.091, "step": 13997 }, { "epoch": 4.2794252522164475, "grad_norm": 0.9486894011497498, "learning_rate": 2.739289202157021e-05, "loss": 0.1638, "step": 13998 }, { "epoch": 4.279730969122593, "grad_norm": 0.35746490955352783, "learning_rate": 2.739246741115027e-05, "loss": 0.0934, "step": 13999 }, { "epoch": 4.280036686028738, "grad_norm": 0.4406617283821106, "learning_rate": 2.739204280073033e-05, "loss": 0.1522, "step": 14000 }, { "epoch": 4.280036686028738, "eval_cer": 0.18947552180506913, "eval_loss": 0.24142511188983917, "eval_runtime": 19.2022, "eval_samples_per_second": 236.327, "eval_steps_per_second": 0.781, "eval_wer": 0.3308566799810061, "step": 14000 }, { "epoch": 4.280342402934882, "grad_norm": 0.85866379737854, "learning_rate": 2.739161819031039e-05, "loss": 0.1371, "step": 14001 }, { "epoch": 4.280648119841027, "grad_norm": 0.5128108263015747, "learning_rate": 2.7391193579890452e-05, "loss": 0.1593, "step": 14002 }, { "epoch": 4.280953836747172, "grad_norm": 0.7198511362075806, "learning_rate": 2.739076896947051e-05, "loss": 0.168, "step": 14003 }, { "epoch": 4.281259553653317, "grad_norm": 0.8459679484367371, "learning_rate": 2.7390344359050573e-05, "loss": 0.2058, "step": 14004 }, { "epoch": 4.281565270559462, "grad_norm": 0.493407666683197, "learning_rate": 2.738991974863063e-05, "loss": 0.1785, "step": 14005 }, { "epoch": 4.281870987465607, "grad_norm": 0.6606143116950989, "learning_rate": 2.7389495138210694e-05, "loss": 0.185, "step": 14006 }, { "epoch": 4.282176704371752, "grad_norm": 0.7644363641738892, "learning_rate": 2.7389070527790752e-05, "loss": 0.217, "step": 14007 }, { "epoch": 4.282482421277897, "grad_norm": 1.3940538167953491, "learning_rate": 2.738864591737081e-05, "loss": 0.1815, "step": 14008 }, { "epoch": 4.282788138184042, "grad_norm": 3.1635193824768066, "learning_rate": 2.7388221306950873e-05, "loss": 0.2443, "step": 14009 }, { "epoch": 4.283093855090186, "grad_norm": 0.4992072880268097, "learning_rate": 2.7387796696530932e-05, "loss": 0.1617, "step": 14010 }, { "epoch": 4.283399571996331, "grad_norm": 0.31549200415611267, "learning_rate": 2.7387372086110994e-05, "loss": 0.0627, "step": 14011 }, { "epoch": 4.2837052889024765, "grad_norm": 0.5289226770401001, "learning_rate": 2.7386947475691053e-05, "loss": 0.0865, "step": 14012 }, { "epoch": 4.284011005808622, "grad_norm": 0.3030151128768921, "learning_rate": 2.7386522865271115e-05, "loss": 0.0491, "step": 14013 }, { "epoch": 4.284316722714766, "grad_norm": 0.29001349210739136, "learning_rate": 2.7386098254851173e-05, "loss": 0.0635, "step": 14014 }, { "epoch": 4.284622439620911, "grad_norm": 0.3028789162635803, "learning_rate": 2.7385673644431235e-05, "loss": 0.0832, "step": 14015 }, { "epoch": 4.284928156527056, "grad_norm": 0.5700621008872986, "learning_rate": 2.7385249034011294e-05, "loss": 0.0593, "step": 14016 }, { "epoch": 4.285233873433201, "grad_norm": 0.49529504776000977, "learning_rate": 2.7384824423591356e-05, "loss": 0.0575, "step": 14017 }, { "epoch": 4.285539590339345, "grad_norm": 0.2799837291240692, "learning_rate": 2.7384399813171415e-05, "loss": 0.0772, "step": 14018 }, { "epoch": 4.2858453072454905, "grad_norm": 0.4166446328163147, "learning_rate": 2.7383975202751477e-05, "loss": 0.0658, "step": 14019 }, { "epoch": 4.286151024151636, "grad_norm": 0.36522090435028076, "learning_rate": 2.7383550592331536e-05, "loss": 0.1164, "step": 14020 }, { "epoch": 4.286456741057781, "grad_norm": 0.7830626368522644, "learning_rate": 2.7383125981911594e-05, "loss": 0.0634, "step": 14021 }, { "epoch": 4.286762457963926, "grad_norm": 0.42291638255119324, "learning_rate": 2.7382701371491656e-05, "loss": 0.1449, "step": 14022 }, { "epoch": 4.28706817487007, "grad_norm": 0.48512208461761475, "learning_rate": 2.7382276761071715e-05, "loss": 0.0757, "step": 14023 }, { "epoch": 4.287373891776215, "grad_norm": 0.6699188351631165, "learning_rate": 2.7381852150651777e-05, "loss": 0.1204, "step": 14024 }, { "epoch": 4.28767960868236, "grad_norm": 0.9305131435394287, "learning_rate": 2.7381427540231836e-05, "loss": 0.1518, "step": 14025 }, { "epoch": 4.287985325588505, "grad_norm": 0.7198421359062195, "learning_rate": 2.7381002929811898e-05, "loss": 0.1621, "step": 14026 }, { "epoch": 4.28829104249465, "grad_norm": 0.5670399069786072, "learning_rate": 2.7380578319391957e-05, "loss": 0.1513, "step": 14027 }, { "epoch": 4.288596759400795, "grad_norm": 1.095263123512268, "learning_rate": 2.738015370897202e-05, "loss": 0.1598, "step": 14028 }, { "epoch": 4.28890247630694, "grad_norm": 1.9932584762573242, "learning_rate": 2.7379729098552078e-05, "loss": 0.1755, "step": 14029 }, { "epoch": 4.289208193213085, "grad_norm": 0.8447781801223755, "learning_rate": 2.737930448813214e-05, "loss": 0.1549, "step": 14030 }, { "epoch": 4.289513910119229, "grad_norm": 1.1525013446807861, "learning_rate": 2.73788798777122e-05, "loss": 0.1531, "step": 14031 }, { "epoch": 4.289819627025374, "grad_norm": 0.9306631088256836, "learning_rate": 2.737845526729226e-05, "loss": 0.1871, "step": 14032 }, { "epoch": 4.2901253439315195, "grad_norm": 1.7150131464004517, "learning_rate": 2.737803065687232e-05, "loss": 0.2086, "step": 14033 }, { "epoch": 4.290431060837665, "grad_norm": 1.1928538084030151, "learning_rate": 2.7377606046452378e-05, "loss": 0.2262, "step": 14034 }, { "epoch": 4.29073677774381, "grad_norm": 0.4851502478122711, "learning_rate": 2.737718143603244e-05, "loss": 0.1827, "step": 14035 }, { "epoch": 4.291042494649954, "grad_norm": 0.3275355100631714, "learning_rate": 2.73767568256125e-05, "loss": 0.098, "step": 14036 }, { "epoch": 4.291348211556099, "grad_norm": 0.3365787863731384, "learning_rate": 2.737633221519256e-05, "loss": 0.0834, "step": 14037 }, { "epoch": 4.291653928462244, "grad_norm": 0.27382969856262207, "learning_rate": 2.737590760477262e-05, "loss": 0.0574, "step": 14038 }, { "epoch": 4.291959645368389, "grad_norm": 0.23043540120124817, "learning_rate": 2.737548299435268e-05, "loss": 0.0596, "step": 14039 }, { "epoch": 4.2922653622745335, "grad_norm": 0.2352287918329239, "learning_rate": 2.737505838393274e-05, "loss": 0.0518, "step": 14040 }, { "epoch": 4.292571079180679, "grad_norm": 0.3604625463485718, "learning_rate": 2.7374633773512802e-05, "loss": 0.0582, "step": 14041 }, { "epoch": 4.292876796086824, "grad_norm": 0.5341261029243469, "learning_rate": 2.737420916309286e-05, "loss": 0.0573, "step": 14042 }, { "epoch": 4.293182512992969, "grad_norm": 0.6867982149124146, "learning_rate": 2.7373784552672923e-05, "loss": 0.0993, "step": 14043 }, { "epoch": 4.293488229899113, "grad_norm": 0.36157000064849854, "learning_rate": 2.7373359942252985e-05, "loss": 0.061, "step": 14044 }, { "epoch": 4.293793946805258, "grad_norm": 0.8425976037979126, "learning_rate": 2.7372935331833047e-05, "loss": 0.1026, "step": 14045 }, { "epoch": 4.294099663711403, "grad_norm": 0.5143864154815674, "learning_rate": 2.7372510721413106e-05, "loss": 0.0654, "step": 14046 }, { "epoch": 4.294405380617548, "grad_norm": 0.36785101890563965, "learning_rate": 2.7372086110993165e-05, "loss": 0.0637, "step": 14047 }, { "epoch": 4.2947110975236935, "grad_norm": 0.31875407695770264, "learning_rate": 2.7371661500573227e-05, "loss": 0.1083, "step": 14048 }, { "epoch": 4.295016814429838, "grad_norm": 1.0515844821929932, "learning_rate": 2.7371236890153285e-05, "loss": 0.1206, "step": 14049 }, { "epoch": 4.295322531335983, "grad_norm": 0.42662128806114197, "learning_rate": 2.7370812279733347e-05, "loss": 0.1425, "step": 14050 }, { "epoch": 4.295628248242128, "grad_norm": 0.6119645833969116, "learning_rate": 2.7370387669313406e-05, "loss": 0.1739, "step": 14051 }, { "epoch": 4.295933965148273, "grad_norm": 0.8434609770774841, "learning_rate": 2.7369963058893468e-05, "loss": 0.168, "step": 14052 }, { "epoch": 4.296239682054417, "grad_norm": 0.4671836197376251, "learning_rate": 2.7369538448473527e-05, "loss": 0.1735, "step": 14053 }, { "epoch": 4.296545398960562, "grad_norm": 1.2355303764343262, "learning_rate": 2.736911383805359e-05, "loss": 0.1669, "step": 14054 }, { "epoch": 4.2968511158667075, "grad_norm": 0.7317405939102173, "learning_rate": 2.7368689227633648e-05, "loss": 0.1872, "step": 14055 }, { "epoch": 4.297156832772853, "grad_norm": 0.9848539233207703, "learning_rate": 2.736826461721371e-05, "loss": 0.1562, "step": 14056 }, { "epoch": 4.297462549678997, "grad_norm": 1.0983805656433105, "learning_rate": 2.736784000679377e-05, "loss": 0.1987, "step": 14057 }, { "epoch": 4.297768266585142, "grad_norm": 1.417351245880127, "learning_rate": 2.7367415396373827e-05, "loss": 0.2558, "step": 14058 }, { "epoch": 4.298073983491287, "grad_norm": 1.2566285133361816, "learning_rate": 2.736699078595389e-05, "loss": 0.2528, "step": 14059 }, { "epoch": 4.298379700397432, "grad_norm": 0.4753519296646118, "learning_rate": 2.7366566175533948e-05, "loss": 0.1391, "step": 14060 }, { "epoch": 4.298685417303577, "grad_norm": 0.6424202919006348, "learning_rate": 2.736614156511401e-05, "loss": 0.0998, "step": 14061 }, { "epoch": 4.298991134209722, "grad_norm": 0.14744873344898224, "learning_rate": 2.736571695469407e-05, "loss": 0.0544, "step": 14062 }, { "epoch": 4.299296851115867, "grad_norm": 0.22164766490459442, "learning_rate": 2.736529234427413e-05, "loss": 0.094, "step": 14063 }, { "epoch": 4.299602568022012, "grad_norm": 0.2092195451259613, "learning_rate": 2.736486773385419e-05, "loss": 0.0716, "step": 14064 }, { "epoch": 4.299908284928157, "grad_norm": 0.23492150008678436, "learning_rate": 2.7364443123434252e-05, "loss": 0.0915, "step": 14065 }, { "epoch": 4.300214001834301, "grad_norm": 0.19796514511108398, "learning_rate": 2.736401851301431e-05, "loss": 0.0487, "step": 14066 }, { "epoch": 4.300519718740446, "grad_norm": 0.22485575079917908, "learning_rate": 2.7363593902594372e-05, "loss": 0.0599, "step": 14067 }, { "epoch": 4.300825435646591, "grad_norm": 0.1904451996088028, "learning_rate": 2.736316929217443e-05, "loss": 0.0823, "step": 14068 }, { "epoch": 4.3011311525527365, "grad_norm": 0.24391226470470428, "learning_rate": 2.7362744681754493e-05, "loss": 0.0612, "step": 14069 }, { "epoch": 4.301436869458881, "grad_norm": 0.36540937423706055, "learning_rate": 2.7362320071334552e-05, "loss": 0.1293, "step": 14070 }, { "epoch": 4.301742586365026, "grad_norm": 0.3381972908973694, "learning_rate": 2.736189546091461e-05, "loss": 0.0764, "step": 14071 }, { "epoch": 4.302048303271171, "grad_norm": 0.38233742117881775, "learning_rate": 2.7361470850494673e-05, "loss": 0.0814, "step": 14072 }, { "epoch": 4.302354020177316, "grad_norm": 0.41749435663223267, "learning_rate": 2.736104624007473e-05, "loss": 0.1123, "step": 14073 }, { "epoch": 4.302659737083461, "grad_norm": 0.33893030881881714, "learning_rate": 2.7360621629654794e-05, "loss": 0.1153, "step": 14074 }, { "epoch": 4.302965453989605, "grad_norm": 0.7785319685935974, "learning_rate": 2.7360197019234852e-05, "loss": 0.1402, "step": 14075 }, { "epoch": 4.3032711708957505, "grad_norm": 0.40333113074302673, "learning_rate": 2.7359772408814914e-05, "loss": 0.1734, "step": 14076 }, { "epoch": 4.303576887801896, "grad_norm": 1.6344172954559326, "learning_rate": 2.7359347798394973e-05, "loss": 0.1978, "step": 14077 }, { "epoch": 4.303882604708041, "grad_norm": 0.6379988789558411, "learning_rate": 2.7358923187975035e-05, "loss": 0.2301, "step": 14078 }, { "epoch": 4.304188321614185, "grad_norm": 1.2892944812774658, "learning_rate": 2.7358498577555094e-05, "loss": 0.1378, "step": 14079 }, { "epoch": 4.30449403852033, "grad_norm": 0.7156697511672974, "learning_rate": 2.7358073967135156e-05, "loss": 0.1511, "step": 14080 }, { "epoch": 4.304799755426475, "grad_norm": 0.8099051713943481, "learning_rate": 2.7357649356715215e-05, "loss": 0.168, "step": 14081 }, { "epoch": 4.30510547233262, "grad_norm": 0.6347202062606812, "learning_rate": 2.7357224746295277e-05, "loss": 0.1553, "step": 14082 }, { "epoch": 4.3054111892387645, "grad_norm": 0.7102508544921875, "learning_rate": 2.7356800135875335e-05, "loss": 0.1938, "step": 14083 }, { "epoch": 4.30571690614491, "grad_norm": 1.3989017009735107, "learning_rate": 2.7356375525455394e-05, "loss": 0.2899, "step": 14084 }, { "epoch": 4.306022623051055, "grad_norm": 0.30376648902893066, "learning_rate": 2.7355950915035456e-05, "loss": 0.1675, "step": 14085 }, { "epoch": 4.3063283399572, "grad_norm": 0.31352150440216064, "learning_rate": 2.7355526304615515e-05, "loss": 0.1003, "step": 14086 }, { "epoch": 4.306634056863345, "grad_norm": 0.20619480311870575, "learning_rate": 2.7355101694195577e-05, "loss": 0.0708, "step": 14087 }, { "epoch": 4.306939773769489, "grad_norm": 0.3956781327724457, "learning_rate": 2.7354677083775636e-05, "loss": 0.0622, "step": 14088 }, { "epoch": 4.307245490675634, "grad_norm": 0.42238813638687134, "learning_rate": 2.7354252473355698e-05, "loss": 0.067, "step": 14089 }, { "epoch": 4.3075512075817795, "grad_norm": 0.22072207927703857, "learning_rate": 2.7353827862935756e-05, "loss": 0.0501, "step": 14090 }, { "epoch": 4.307856924487925, "grad_norm": 0.4901805520057678, "learning_rate": 2.735340325251582e-05, "loss": 0.0636, "step": 14091 }, { "epoch": 4.308162641394069, "grad_norm": 0.2540338933467865, "learning_rate": 2.7352978642095877e-05, "loss": 0.069, "step": 14092 }, { "epoch": 4.308468358300214, "grad_norm": 0.2271185964345932, "learning_rate": 2.735255403167594e-05, "loss": 0.0439, "step": 14093 }, { "epoch": 4.308774075206359, "grad_norm": 0.24029132723808289, "learning_rate": 2.7352129421255998e-05, "loss": 0.0917, "step": 14094 }, { "epoch": 4.309079792112504, "grad_norm": 0.1983702927827835, "learning_rate": 2.735170481083606e-05, "loss": 0.0568, "step": 14095 }, { "epoch": 4.309385509018648, "grad_norm": 0.33192017674446106, "learning_rate": 2.735128020041612e-05, "loss": 0.085, "step": 14096 }, { "epoch": 4.3096912259247935, "grad_norm": 0.2689114809036255, "learning_rate": 2.7350855589996178e-05, "loss": 0.1079, "step": 14097 }, { "epoch": 4.309996942830939, "grad_norm": 0.4382350742816925, "learning_rate": 2.735043097957624e-05, "loss": 0.1, "step": 14098 }, { "epoch": 4.310302659737084, "grad_norm": 0.8371070623397827, "learning_rate": 2.73500063691563e-05, "loss": 0.125, "step": 14099 }, { "epoch": 4.310608376643229, "grad_norm": 0.5242752432823181, "learning_rate": 2.734958175873636e-05, "loss": 0.1284, "step": 14100 }, { "epoch": 4.310914093549373, "grad_norm": 0.316903293132782, "learning_rate": 2.734915714831642e-05, "loss": 0.1427, "step": 14101 }, { "epoch": 4.311219810455518, "grad_norm": 0.41123077273368835, "learning_rate": 2.734873253789648e-05, "loss": 0.1726, "step": 14102 }, { "epoch": 4.311525527361663, "grad_norm": 3.553513288497925, "learning_rate": 2.734830792747654e-05, "loss": 0.1905, "step": 14103 }, { "epoch": 4.311831244267808, "grad_norm": 0.5284780263900757, "learning_rate": 2.7347883317056602e-05, "loss": 0.1779, "step": 14104 }, { "epoch": 4.312136961173953, "grad_norm": 0.5089156627655029, "learning_rate": 2.734745870663666e-05, "loss": 0.1655, "step": 14105 }, { "epoch": 4.312442678080098, "grad_norm": 0.6542913317680359, "learning_rate": 2.7347034096216723e-05, "loss": 0.1937, "step": 14106 }, { "epoch": 4.312748394986243, "grad_norm": 1.1651510000228882, "learning_rate": 2.734660948579678e-05, "loss": 0.1936, "step": 14107 }, { "epoch": 4.313054111892388, "grad_norm": 0.6851086616516113, "learning_rate": 2.7346184875376844e-05, "loss": 0.1686, "step": 14108 }, { "epoch": 4.313359828798532, "grad_norm": 1.0589467287063599, "learning_rate": 2.7345760264956902e-05, "loss": 0.2518, "step": 14109 }, { "epoch": 4.313665545704677, "grad_norm": 0.5504137873649597, "learning_rate": 2.734533565453696e-05, "loss": 0.159, "step": 14110 }, { "epoch": 4.313971262610822, "grad_norm": 0.45047080516815186, "learning_rate": 2.7344911044117023e-05, "loss": 0.0941, "step": 14111 }, { "epoch": 4.3142769795169675, "grad_norm": 0.7813011407852173, "learning_rate": 2.7344486433697082e-05, "loss": 0.0995, "step": 14112 }, { "epoch": 4.314582696423113, "grad_norm": 0.36480727791786194, "learning_rate": 2.7344061823277144e-05, "loss": 0.073, "step": 14113 }, { "epoch": 4.314888413329257, "grad_norm": 0.18682850897312164, "learning_rate": 2.7343637212857203e-05, "loss": 0.0708, "step": 14114 }, { "epoch": 4.315194130235402, "grad_norm": 0.18934907019138336, "learning_rate": 2.7343212602437265e-05, "loss": 0.0387, "step": 14115 }, { "epoch": 4.315499847141547, "grad_norm": 0.2434484362602234, "learning_rate": 2.7342787992017323e-05, "loss": 0.0514, "step": 14116 }, { "epoch": 4.315805564047692, "grad_norm": 0.3544362187385559, "learning_rate": 2.7342363381597385e-05, "loss": 0.0812, "step": 14117 }, { "epoch": 4.3161112809538364, "grad_norm": 0.31695249676704407, "learning_rate": 2.7341938771177444e-05, "loss": 0.0571, "step": 14118 }, { "epoch": 4.316416997859982, "grad_norm": 0.2700180113315582, "learning_rate": 2.7341514160757506e-05, "loss": 0.0572, "step": 14119 }, { "epoch": 4.316722714766127, "grad_norm": 0.2729020118713379, "learning_rate": 2.7341089550337565e-05, "loss": 0.0871, "step": 14120 }, { "epoch": 4.317028431672272, "grad_norm": 0.5931738018989563, "learning_rate": 2.7340664939917627e-05, "loss": 0.0747, "step": 14121 }, { "epoch": 4.317334148578416, "grad_norm": 0.3919568359851837, "learning_rate": 2.7340240329497686e-05, "loss": 0.0976, "step": 14122 }, { "epoch": 4.317639865484561, "grad_norm": 0.9183927774429321, "learning_rate": 2.7339815719077744e-05, "loss": 0.1332, "step": 14123 }, { "epoch": 4.317945582390706, "grad_norm": 0.3961344063282013, "learning_rate": 2.7339391108657806e-05, "loss": 0.1567, "step": 14124 }, { "epoch": 4.318251299296851, "grad_norm": 0.30138543248176575, "learning_rate": 2.7338966498237865e-05, "loss": 0.1325, "step": 14125 }, { "epoch": 4.3185570162029965, "grad_norm": 0.7328728437423706, "learning_rate": 2.7338541887817927e-05, "loss": 0.1212, "step": 14126 }, { "epoch": 4.318862733109141, "grad_norm": 0.6603227853775024, "learning_rate": 2.7338117277397986e-05, "loss": 0.1621, "step": 14127 }, { "epoch": 4.319168450015286, "grad_norm": 0.518578052520752, "learning_rate": 2.7337692666978048e-05, "loss": 0.1696, "step": 14128 }, { "epoch": 4.319474166921431, "grad_norm": 0.5430636405944824, "learning_rate": 2.7337268056558107e-05, "loss": 0.1604, "step": 14129 }, { "epoch": 4.319779883827576, "grad_norm": 0.45239096879959106, "learning_rate": 2.733684344613817e-05, "loss": 0.1679, "step": 14130 }, { "epoch": 4.32008560073372, "grad_norm": 0.891975998878479, "learning_rate": 2.7336418835718228e-05, "loss": 0.1984, "step": 14131 }, { "epoch": 4.320391317639865, "grad_norm": 0.8088558316230774, "learning_rate": 2.733599422529829e-05, "loss": 0.178, "step": 14132 }, { "epoch": 4.3206970345460105, "grad_norm": 0.9760035872459412, "learning_rate": 2.733556961487835e-05, "loss": 0.2086, "step": 14133 }, { "epoch": 4.321002751452156, "grad_norm": 1.4491702318191528, "learning_rate": 2.733514500445841e-05, "loss": 0.2117, "step": 14134 }, { "epoch": 4.3213084683583, "grad_norm": 0.7593148946762085, "learning_rate": 2.733472039403847e-05, "loss": 0.1837, "step": 14135 }, { "epoch": 4.321614185264445, "grad_norm": 0.27714288234710693, "learning_rate": 2.7334295783618528e-05, "loss": 0.0877, "step": 14136 }, { "epoch": 4.32191990217059, "grad_norm": 0.31007611751556396, "learning_rate": 2.733387117319859e-05, "loss": 0.0687, "step": 14137 }, { "epoch": 4.322225619076735, "grad_norm": 0.3812117278575897, "learning_rate": 2.733344656277865e-05, "loss": 0.0572, "step": 14138 }, { "epoch": 4.32253133598288, "grad_norm": 0.27635809779167175, "learning_rate": 2.733302195235871e-05, "loss": 0.0511, "step": 14139 }, { "epoch": 4.3228370528890245, "grad_norm": 0.178621307015419, "learning_rate": 2.733259734193877e-05, "loss": 0.0541, "step": 14140 }, { "epoch": 4.32314276979517, "grad_norm": 0.2132558822631836, "learning_rate": 2.733217273151883e-05, "loss": 0.0572, "step": 14141 }, { "epoch": 4.323448486701315, "grad_norm": 0.2517443597316742, "learning_rate": 2.733174812109889e-05, "loss": 0.0477, "step": 14142 }, { "epoch": 4.32375420360746, "grad_norm": 0.2982519268989563, "learning_rate": 2.7331323510678952e-05, "loss": 0.0818, "step": 14143 }, { "epoch": 4.324059920513604, "grad_norm": 0.23923645913600922, "learning_rate": 2.733089890025901e-05, "loss": 0.0662, "step": 14144 }, { "epoch": 4.324365637419749, "grad_norm": 0.23974710702896118, "learning_rate": 2.7330474289839073e-05, "loss": 0.0742, "step": 14145 }, { "epoch": 4.324671354325894, "grad_norm": 0.5581826567649841, "learning_rate": 2.7330049679419132e-05, "loss": 0.0776, "step": 14146 }, { "epoch": 4.3249770712320394, "grad_norm": 0.7805964946746826, "learning_rate": 2.7329625068999194e-05, "loss": 0.0783, "step": 14147 }, { "epoch": 4.325282788138184, "grad_norm": 0.5249888300895691, "learning_rate": 2.7329200458579256e-05, "loss": 0.0795, "step": 14148 }, { "epoch": 4.325588505044329, "grad_norm": 0.7751972079277039, "learning_rate": 2.7328775848159315e-05, "loss": 0.15, "step": 14149 }, { "epoch": 4.325894221950474, "grad_norm": 0.6807534098625183, "learning_rate": 2.7328351237739377e-05, "loss": 0.1678, "step": 14150 }, { "epoch": 4.326199938856619, "grad_norm": 0.5125371217727661, "learning_rate": 2.7327926627319435e-05, "loss": 0.1612, "step": 14151 }, { "epoch": 4.326505655762764, "grad_norm": 0.8204265832901001, "learning_rate": 2.7327502016899498e-05, "loss": 0.17, "step": 14152 }, { "epoch": 4.326811372668908, "grad_norm": 0.7689731121063232, "learning_rate": 2.7327077406479556e-05, "loss": 0.1819, "step": 14153 }, { "epoch": 4.3271170895750535, "grad_norm": 0.6164469718933105, "learning_rate": 2.732665279605962e-05, "loss": 0.1746, "step": 14154 }, { "epoch": 4.327422806481199, "grad_norm": 1.392866611480713, "learning_rate": 2.7326228185639677e-05, "loss": 0.1674, "step": 14155 }, { "epoch": 4.327728523387344, "grad_norm": 0.6336075067520142, "learning_rate": 2.732580357521974e-05, "loss": 0.1734, "step": 14156 }, { "epoch": 4.328034240293488, "grad_norm": 0.8843508362770081, "learning_rate": 2.7325378964799798e-05, "loss": 0.1742, "step": 14157 }, { "epoch": 4.328339957199633, "grad_norm": 1.018278956413269, "learning_rate": 2.732495435437986e-05, "loss": 0.2731, "step": 14158 }, { "epoch": 4.328645674105778, "grad_norm": 1.8883224725723267, "learning_rate": 2.732452974395992e-05, "loss": 0.2572, "step": 14159 }, { "epoch": 4.328951391011923, "grad_norm": 0.32279297709465027, "learning_rate": 2.732410513353998e-05, "loss": 0.1468, "step": 14160 }, { "epoch": 4.3292571079180675, "grad_norm": 0.43826696276664734, "learning_rate": 2.732368052312004e-05, "loss": 0.0627, "step": 14161 }, { "epoch": 4.329562824824213, "grad_norm": 0.4750661551952362, "learning_rate": 2.7323255912700098e-05, "loss": 0.0721, "step": 14162 }, { "epoch": 4.329868541730358, "grad_norm": 0.20843777060508728, "learning_rate": 2.732283130228016e-05, "loss": 0.082, "step": 14163 }, { "epoch": 4.330174258636503, "grad_norm": 0.259846031665802, "learning_rate": 2.732240669186022e-05, "loss": 0.0647, "step": 14164 }, { "epoch": 4.330479975542648, "grad_norm": 0.3341767489910126, "learning_rate": 2.732198208144028e-05, "loss": 0.0728, "step": 14165 }, { "epoch": 4.330785692448792, "grad_norm": 0.1567402184009552, "learning_rate": 2.732155747102034e-05, "loss": 0.0491, "step": 14166 }, { "epoch": 4.331091409354937, "grad_norm": 0.24950094521045685, "learning_rate": 2.7321132860600402e-05, "loss": 0.08, "step": 14167 }, { "epoch": 4.331397126261082, "grad_norm": 0.24388036131858826, "learning_rate": 2.732070825018046e-05, "loss": 0.0672, "step": 14168 }, { "epoch": 4.3317028431672275, "grad_norm": 0.4284875988960266, "learning_rate": 2.7320283639760523e-05, "loss": 0.0696, "step": 14169 }, { "epoch": 4.332008560073372, "grad_norm": 0.2301848977804184, "learning_rate": 2.731985902934058e-05, "loss": 0.0783, "step": 14170 }, { "epoch": 4.332314276979517, "grad_norm": 1.4044766426086426, "learning_rate": 2.7319434418920643e-05, "loss": 0.0994, "step": 14171 }, { "epoch": 4.332619993885662, "grad_norm": 0.34617719054222107, "learning_rate": 2.7319009808500702e-05, "loss": 0.1124, "step": 14172 }, { "epoch": 4.332925710791807, "grad_norm": 0.3773300051689148, "learning_rate": 2.731858519808076e-05, "loss": 0.0935, "step": 14173 }, { "epoch": 4.333231427697951, "grad_norm": 0.5911105275154114, "learning_rate": 2.7318160587660823e-05, "loss": 0.118, "step": 14174 }, { "epoch": 4.3335371446040964, "grad_norm": 0.2834663689136505, "learning_rate": 2.731773597724088e-05, "loss": 0.1299, "step": 14175 }, { "epoch": 4.333842861510242, "grad_norm": 0.8483585715293884, "learning_rate": 2.7317311366820944e-05, "loss": 0.1363, "step": 14176 }, { "epoch": 4.334148578416387, "grad_norm": 0.49143701791763306, "learning_rate": 2.7316886756401002e-05, "loss": 0.2062, "step": 14177 }, { "epoch": 4.334454295322532, "grad_norm": 0.5556592345237732, "learning_rate": 2.7316462145981064e-05, "loss": 0.174, "step": 14178 }, { "epoch": 4.334760012228676, "grad_norm": 0.4796590805053711, "learning_rate": 2.7316037535561123e-05, "loss": 0.1742, "step": 14179 }, { "epoch": 4.335065729134821, "grad_norm": 0.7583059668540955, "learning_rate": 2.7315612925141185e-05, "loss": 0.1702, "step": 14180 }, { "epoch": 4.335371446040966, "grad_norm": 1.223792552947998, "learning_rate": 2.7315188314721244e-05, "loss": 0.2119, "step": 14181 }, { "epoch": 4.335677162947111, "grad_norm": 0.6743370294570923, "learning_rate": 2.7314763704301306e-05, "loss": 0.1698, "step": 14182 }, { "epoch": 4.335982879853256, "grad_norm": 0.7641844153404236, "learning_rate": 2.7314339093881365e-05, "loss": 0.2059, "step": 14183 }, { "epoch": 4.336288596759401, "grad_norm": 2.1456127166748047, "learning_rate": 2.7313914483461427e-05, "loss": 0.2151, "step": 14184 }, { "epoch": 4.336594313665546, "grad_norm": 0.8291619420051575, "learning_rate": 2.7313489873041485e-05, "loss": 0.1462, "step": 14185 }, { "epoch": 4.336900030571691, "grad_norm": 0.2801341116428375, "learning_rate": 2.7313065262621544e-05, "loss": 0.091, "step": 14186 }, { "epoch": 4.337205747477835, "grad_norm": 0.30457383394241333, "learning_rate": 2.7312640652201606e-05, "loss": 0.0902, "step": 14187 }, { "epoch": 4.33751146438398, "grad_norm": 0.23733174800872803, "learning_rate": 2.7312216041781665e-05, "loss": 0.0923, "step": 14188 }, { "epoch": 4.337817181290125, "grad_norm": 0.24603864550590515, "learning_rate": 2.7311791431361727e-05, "loss": 0.0497, "step": 14189 }, { "epoch": 4.3381228981962705, "grad_norm": 0.3961407244205475, "learning_rate": 2.7311366820941786e-05, "loss": 0.0839, "step": 14190 }, { "epoch": 4.338428615102416, "grad_norm": 0.26140472292900085, "learning_rate": 2.7310942210521848e-05, "loss": 0.0714, "step": 14191 }, { "epoch": 4.33873433200856, "grad_norm": 0.29579228162765503, "learning_rate": 2.7310517600101907e-05, "loss": 0.0763, "step": 14192 }, { "epoch": 4.339040048914705, "grad_norm": 0.3680647015571594, "learning_rate": 2.731009298968197e-05, "loss": 0.0535, "step": 14193 }, { "epoch": 4.33934576582085, "grad_norm": 0.3159578740596771, "learning_rate": 2.7309668379262027e-05, "loss": 0.0998, "step": 14194 }, { "epoch": 4.339651482726995, "grad_norm": 0.23684266209602356, "learning_rate": 2.730924376884209e-05, "loss": 0.0856, "step": 14195 }, { "epoch": 4.339957199633139, "grad_norm": 0.3143622875213623, "learning_rate": 2.7308819158422148e-05, "loss": 0.0894, "step": 14196 }, { "epoch": 4.3402629165392845, "grad_norm": 0.24580857157707214, "learning_rate": 2.730839454800221e-05, "loss": 0.089, "step": 14197 }, { "epoch": 4.34056863344543, "grad_norm": 0.4972340762615204, "learning_rate": 2.730796993758227e-05, "loss": 0.104, "step": 14198 }, { "epoch": 4.340874350351575, "grad_norm": 0.3855232894420624, "learning_rate": 2.7307545327162328e-05, "loss": 0.1519, "step": 14199 }, { "epoch": 4.341180067257719, "grad_norm": 1.2173281908035278, "learning_rate": 2.730712071674239e-05, "loss": 0.1182, "step": 14200 }, { "epoch": 4.341485784163864, "grad_norm": 0.7494391202926636, "learning_rate": 2.730669610632245e-05, "loss": 0.1576, "step": 14201 }, { "epoch": 4.341791501070009, "grad_norm": 0.9011579155921936, "learning_rate": 2.730627149590251e-05, "loss": 0.1465, "step": 14202 }, { "epoch": 4.342097217976154, "grad_norm": 0.6253871321678162, "learning_rate": 2.730584688548257e-05, "loss": 0.1379, "step": 14203 }, { "epoch": 4.3424029348822994, "grad_norm": 0.6753238439559937, "learning_rate": 2.730542227506263e-05, "loss": 0.1692, "step": 14204 }, { "epoch": 4.342708651788444, "grad_norm": 0.6452096700668335, "learning_rate": 2.730499766464269e-05, "loss": 0.1913, "step": 14205 }, { "epoch": 4.343014368694589, "grad_norm": 0.5909054279327393, "learning_rate": 2.7304573054222752e-05, "loss": 0.1688, "step": 14206 }, { "epoch": 4.343320085600734, "grad_norm": 1.5263280868530273, "learning_rate": 2.730414844380281e-05, "loss": 0.2101, "step": 14207 }, { "epoch": 4.343625802506879, "grad_norm": 0.8992653489112854, "learning_rate": 2.7303723833382873e-05, "loss": 0.1825, "step": 14208 }, { "epoch": 4.343931519413023, "grad_norm": 1.1652889251708984, "learning_rate": 2.730329922296293e-05, "loss": 0.2206, "step": 14209 }, { "epoch": 4.344237236319168, "grad_norm": 0.350009560585022, "learning_rate": 2.7302874612542994e-05, "loss": 0.1653, "step": 14210 }, { "epoch": 4.3445429532253135, "grad_norm": 0.3101792633533478, "learning_rate": 2.7302450002123052e-05, "loss": 0.089, "step": 14211 }, { "epoch": 4.344848670131459, "grad_norm": 0.24774722754955292, "learning_rate": 2.730202539170311e-05, "loss": 0.0527, "step": 14212 }, { "epoch": 4.345154387037603, "grad_norm": 1.1593408584594727, "learning_rate": 2.7301600781283173e-05, "loss": 0.0667, "step": 14213 }, { "epoch": 4.345460103943748, "grad_norm": 0.20398308336734772, "learning_rate": 2.7301176170863232e-05, "loss": 0.0554, "step": 14214 }, { "epoch": 4.345765820849893, "grad_norm": 0.227591872215271, "learning_rate": 2.7300751560443294e-05, "loss": 0.0451, "step": 14215 }, { "epoch": 4.346071537756038, "grad_norm": 0.8197818398475647, "learning_rate": 2.7300326950023353e-05, "loss": 0.0701, "step": 14216 }, { "epoch": 4.346377254662183, "grad_norm": 0.5134749412536621, "learning_rate": 2.7299902339603415e-05, "loss": 0.0559, "step": 14217 }, { "epoch": 4.3466829715683275, "grad_norm": 0.3570065498352051, "learning_rate": 2.7299477729183473e-05, "loss": 0.0623, "step": 14218 }, { "epoch": 4.346988688474473, "grad_norm": 0.1945134699344635, "learning_rate": 2.7299053118763535e-05, "loss": 0.0629, "step": 14219 }, { "epoch": 4.347294405380618, "grad_norm": 0.3434978425502777, "learning_rate": 2.7298628508343594e-05, "loss": 0.1231, "step": 14220 }, { "epoch": 4.347600122286763, "grad_norm": 0.4840087294578552, "learning_rate": 2.7298203897923656e-05, "loss": 0.0896, "step": 14221 }, { "epoch": 4.347905839192907, "grad_norm": 0.4645930528640747, "learning_rate": 2.7297779287503715e-05, "loss": 0.0848, "step": 14222 }, { "epoch": 4.348211556099052, "grad_norm": 0.5596492886543274, "learning_rate": 2.7297354677083777e-05, "loss": 0.0891, "step": 14223 }, { "epoch": 4.348517273005197, "grad_norm": 0.4600352942943573, "learning_rate": 2.7296930066663836e-05, "loss": 0.1098, "step": 14224 }, { "epoch": 4.348822989911342, "grad_norm": 0.4407135248184204, "learning_rate": 2.7296505456243894e-05, "loss": 0.1335, "step": 14225 }, { "epoch": 4.349128706817487, "grad_norm": 1.324535608291626, "learning_rate": 2.7296080845823957e-05, "loss": 0.181, "step": 14226 }, { "epoch": 4.349434423723632, "grad_norm": 4.245535373687744, "learning_rate": 2.7295656235404015e-05, "loss": 0.1662, "step": 14227 }, { "epoch": 4.349740140629777, "grad_norm": 0.5334571003913879, "learning_rate": 2.7295231624984077e-05, "loss": 0.1696, "step": 14228 }, { "epoch": 4.350045857535922, "grad_norm": 0.6738114356994629, "learning_rate": 2.7294807014564136e-05, "loss": 0.175, "step": 14229 }, { "epoch": 4.350351574442067, "grad_norm": 0.5674344897270203, "learning_rate": 2.7294382404144198e-05, "loss": 0.1805, "step": 14230 }, { "epoch": 4.350657291348211, "grad_norm": 1.4564361572265625, "learning_rate": 2.7293957793724257e-05, "loss": 0.18, "step": 14231 }, { "epoch": 4.3509630082543564, "grad_norm": 0.7308028340339661, "learning_rate": 2.729353318330432e-05, "loss": 0.2075, "step": 14232 }, { "epoch": 4.351268725160502, "grad_norm": 2.7206850051879883, "learning_rate": 2.7293108572884378e-05, "loss": 0.2575, "step": 14233 }, { "epoch": 4.351574442066647, "grad_norm": 1.6938326358795166, "learning_rate": 2.729268396246444e-05, "loss": 0.2443, "step": 14234 }, { "epoch": 4.351880158972791, "grad_norm": 0.2746870815753937, "learning_rate": 2.72922593520445e-05, "loss": 0.1338, "step": 14235 }, { "epoch": 4.352185875878936, "grad_norm": 0.3828077018260956, "learning_rate": 2.729183474162456e-05, "loss": 0.0702, "step": 14236 }, { "epoch": 4.352491592785081, "grad_norm": 0.2140248566865921, "learning_rate": 2.729141013120462e-05, "loss": 0.0567, "step": 14237 }, { "epoch": 4.352797309691226, "grad_norm": 0.300726979970932, "learning_rate": 2.7290985520784678e-05, "loss": 0.0661, "step": 14238 }, { "epoch": 4.3531030265973705, "grad_norm": 0.16673973202705383, "learning_rate": 2.729056091036474e-05, "loss": 0.0478, "step": 14239 }, { "epoch": 4.353408743503516, "grad_norm": 0.2490721493959427, "learning_rate": 2.72901362999448e-05, "loss": 0.0542, "step": 14240 }, { "epoch": 4.353714460409661, "grad_norm": 0.9127814769744873, "learning_rate": 2.728971168952486e-05, "loss": 0.0569, "step": 14241 }, { "epoch": 4.354020177315806, "grad_norm": 0.23870331048965454, "learning_rate": 2.728928707910492e-05, "loss": 0.074, "step": 14242 }, { "epoch": 4.354325894221951, "grad_norm": 0.31523698568344116, "learning_rate": 2.728886246868498e-05, "loss": 0.0459, "step": 14243 }, { "epoch": 4.354631611128095, "grad_norm": 0.3899202048778534, "learning_rate": 2.728843785826504e-05, "loss": 0.046, "step": 14244 }, { "epoch": 4.35493732803424, "grad_norm": 0.3767424523830414, "learning_rate": 2.7288013247845102e-05, "loss": 0.0896, "step": 14245 }, { "epoch": 4.355243044940385, "grad_norm": 0.35145530104637146, "learning_rate": 2.728758863742516e-05, "loss": 0.0696, "step": 14246 }, { "epoch": 4.3555487618465305, "grad_norm": 0.6427071690559387, "learning_rate": 2.7287164027005223e-05, "loss": 0.1117, "step": 14247 }, { "epoch": 4.355854478752675, "grad_norm": 0.2868991792201996, "learning_rate": 2.7286739416585282e-05, "loss": 0.0903, "step": 14248 }, { "epoch": 4.35616019565882, "grad_norm": 0.8051638007164001, "learning_rate": 2.7286314806165344e-05, "loss": 0.1335, "step": 14249 }, { "epoch": 4.356465912564965, "grad_norm": 0.31484684348106384, "learning_rate": 2.7285890195745406e-05, "loss": 0.1355, "step": 14250 }, { "epoch": 4.35677162947111, "grad_norm": 0.4529702663421631, "learning_rate": 2.7285465585325465e-05, "loss": 0.1486, "step": 14251 }, { "epoch": 4.357077346377254, "grad_norm": 1.024579405784607, "learning_rate": 2.7285040974905527e-05, "loss": 0.158, "step": 14252 }, { "epoch": 4.357383063283399, "grad_norm": 0.6130483150482178, "learning_rate": 2.7284616364485585e-05, "loss": 0.1577, "step": 14253 }, { "epoch": 4.3576887801895445, "grad_norm": 0.47328925132751465, "learning_rate": 2.7284191754065648e-05, "loss": 0.1957, "step": 14254 }, { "epoch": 4.35799449709569, "grad_norm": 0.5040074586868286, "learning_rate": 2.7283767143645706e-05, "loss": 0.1587, "step": 14255 }, { "epoch": 4.358300214001835, "grad_norm": 0.9729853272438049, "learning_rate": 2.728334253322577e-05, "loss": 0.1744, "step": 14256 }, { "epoch": 4.358605930907979, "grad_norm": 3.7966649532318115, "learning_rate": 2.7282917922805827e-05, "loss": 0.1875, "step": 14257 }, { "epoch": 4.358911647814124, "grad_norm": 1.0616683959960938, "learning_rate": 2.728249331238589e-05, "loss": 0.2414, "step": 14258 }, { "epoch": 4.359217364720269, "grad_norm": 9.237844467163086, "learning_rate": 2.7282068701965948e-05, "loss": 0.2434, "step": 14259 }, { "epoch": 4.359523081626414, "grad_norm": 0.5932806134223938, "learning_rate": 2.728164409154601e-05, "loss": 0.1523, "step": 14260 }, { "epoch": 4.359828798532559, "grad_norm": 0.23908744752407074, "learning_rate": 2.728121948112607e-05, "loss": 0.0949, "step": 14261 }, { "epoch": 4.360134515438704, "grad_norm": 0.3862096667289734, "learning_rate": 2.728079487070613e-05, "loss": 0.0691, "step": 14262 }, { "epoch": 4.360440232344849, "grad_norm": 1.0996389389038086, "learning_rate": 2.728037026028619e-05, "loss": 0.0615, "step": 14263 }, { "epoch": 4.360745949250994, "grad_norm": 0.2865076959133148, "learning_rate": 2.7279945649866248e-05, "loss": 0.0809, "step": 14264 }, { "epoch": 4.361051666157138, "grad_norm": 0.2803371846675873, "learning_rate": 2.727952103944631e-05, "loss": 0.0702, "step": 14265 }, { "epoch": 4.361357383063283, "grad_norm": 1.1599035263061523, "learning_rate": 2.727909642902637e-05, "loss": 0.0535, "step": 14266 }, { "epoch": 4.361663099969428, "grad_norm": 0.17760521173477173, "learning_rate": 2.727867181860643e-05, "loss": 0.0517, "step": 14267 }, { "epoch": 4.3619688168755735, "grad_norm": 0.3654142916202545, "learning_rate": 2.727824720818649e-05, "loss": 0.0721, "step": 14268 }, { "epoch": 4.362274533781719, "grad_norm": 0.1909300982952118, "learning_rate": 2.7277822597766552e-05, "loss": 0.0556, "step": 14269 }, { "epoch": 4.362580250687863, "grad_norm": 0.30288422107696533, "learning_rate": 2.727739798734661e-05, "loss": 0.0835, "step": 14270 }, { "epoch": 4.362885967594008, "grad_norm": 0.6070531606674194, "learning_rate": 2.7276973376926673e-05, "loss": 0.0976, "step": 14271 }, { "epoch": 4.363191684500153, "grad_norm": 0.3637755215167999, "learning_rate": 2.727654876650673e-05, "loss": 0.1086, "step": 14272 }, { "epoch": 4.363497401406298, "grad_norm": 0.5051128268241882, "learning_rate": 2.7276124156086793e-05, "loss": 0.1251, "step": 14273 }, { "epoch": 4.363803118312442, "grad_norm": 4.192461013793945, "learning_rate": 2.7275699545666852e-05, "loss": 0.1071, "step": 14274 }, { "epoch": 4.3641088352185875, "grad_norm": 1.505017638206482, "learning_rate": 2.7275274935246914e-05, "loss": 0.1527, "step": 14275 }, { "epoch": 4.364414552124733, "grad_norm": 0.682859480381012, "learning_rate": 2.7274850324826973e-05, "loss": 0.1512, "step": 14276 }, { "epoch": 4.364720269030878, "grad_norm": 0.7609994411468506, "learning_rate": 2.727442571440703e-05, "loss": 0.1632, "step": 14277 }, { "epoch": 4.365025985937022, "grad_norm": 0.6938153505325317, "learning_rate": 2.7274001103987094e-05, "loss": 0.1665, "step": 14278 }, { "epoch": 4.365331702843167, "grad_norm": 0.7799692749977112, "learning_rate": 2.7273576493567152e-05, "loss": 0.1832, "step": 14279 }, { "epoch": 4.365637419749312, "grad_norm": 0.6296644806861877, "learning_rate": 2.7273151883147214e-05, "loss": 0.1812, "step": 14280 }, { "epoch": 4.365943136655457, "grad_norm": 0.9093557596206665, "learning_rate": 2.7272727272727273e-05, "loss": 0.1902, "step": 14281 }, { "epoch": 4.366248853561602, "grad_norm": 0.7285810112953186, "learning_rate": 2.7272302662307335e-05, "loss": 0.2086, "step": 14282 }, { "epoch": 4.366554570467747, "grad_norm": 1.201517939567566, "learning_rate": 2.7271878051887394e-05, "loss": 0.2348, "step": 14283 }, { "epoch": 4.366860287373892, "grad_norm": 1.0778921842575073, "learning_rate": 2.7271453441467456e-05, "loss": 0.2203, "step": 14284 }, { "epoch": 4.367166004280037, "grad_norm": 0.9760522842407227, "learning_rate": 2.7271028831047515e-05, "loss": 0.154, "step": 14285 }, { "epoch": 4.367471721186182, "grad_norm": 0.39916887879371643, "learning_rate": 2.7270604220627577e-05, "loss": 0.0714, "step": 14286 }, { "epoch": 4.367777438092326, "grad_norm": 0.19271422922611237, "learning_rate": 2.7270179610207635e-05, "loss": 0.0617, "step": 14287 }, { "epoch": 4.368083154998471, "grad_norm": 0.33832067251205444, "learning_rate": 2.7269754999787694e-05, "loss": 0.069, "step": 14288 }, { "epoch": 4.368388871904616, "grad_norm": 0.5417702794075012, "learning_rate": 2.7269330389367756e-05, "loss": 0.0812, "step": 14289 }, { "epoch": 4.368694588810762, "grad_norm": 0.2913169264793396, "learning_rate": 2.7268905778947815e-05, "loss": 0.0498, "step": 14290 }, { "epoch": 4.369000305716906, "grad_norm": 0.3764575719833374, "learning_rate": 2.7268481168527877e-05, "loss": 0.0616, "step": 14291 }, { "epoch": 4.369306022623051, "grad_norm": 0.20870304107666016, "learning_rate": 2.7268056558107936e-05, "loss": 0.0637, "step": 14292 }, { "epoch": 4.369611739529196, "grad_norm": 0.4190097153186798, "learning_rate": 2.7267631947687998e-05, "loss": 0.057, "step": 14293 }, { "epoch": 4.369917456435341, "grad_norm": 0.27470675110816956, "learning_rate": 2.7267207337268057e-05, "loss": 0.0792, "step": 14294 }, { "epoch": 4.370223173341486, "grad_norm": 0.3366176187992096, "learning_rate": 2.726678272684812e-05, "loss": 0.0914, "step": 14295 }, { "epoch": 4.3705288902476305, "grad_norm": 0.48107069730758667, "learning_rate": 2.7266358116428177e-05, "loss": 0.0811, "step": 14296 }, { "epoch": 4.370834607153776, "grad_norm": 0.5460048913955688, "learning_rate": 2.726593350600824e-05, "loss": 0.1035, "step": 14297 }, { "epoch": 4.371140324059921, "grad_norm": 0.3763912618160248, "learning_rate": 2.7265508895588298e-05, "loss": 0.1394, "step": 14298 }, { "epoch": 4.371446040966066, "grad_norm": 0.41793110966682434, "learning_rate": 2.726508428516836e-05, "loss": 0.0981, "step": 14299 }, { "epoch": 4.37175175787221, "grad_norm": 0.3967112600803375, "learning_rate": 2.726465967474842e-05, "loss": 0.1208, "step": 14300 }, { "epoch": 4.372057474778355, "grad_norm": 0.4578920602798462, "learning_rate": 2.7264235064328478e-05, "loss": 0.1322, "step": 14301 }, { "epoch": 4.3723631916845, "grad_norm": 0.6243684887886047, "learning_rate": 2.726381045390854e-05, "loss": 0.1366, "step": 14302 }, { "epoch": 4.372668908590645, "grad_norm": 3.125779867172241, "learning_rate": 2.72633858434886e-05, "loss": 0.1533, "step": 14303 }, { "epoch": 4.37297462549679, "grad_norm": 1.1410385370254517, "learning_rate": 2.726296123306866e-05, "loss": 0.1598, "step": 14304 }, { "epoch": 4.373280342402935, "grad_norm": 1.864339828491211, "learning_rate": 2.726253662264872e-05, "loss": 0.1697, "step": 14305 }, { "epoch": 4.37358605930908, "grad_norm": 0.47193339467048645, "learning_rate": 2.726211201222878e-05, "loss": 0.1805, "step": 14306 }, { "epoch": 4.373891776215225, "grad_norm": 0.4943760335445404, "learning_rate": 2.726168740180884e-05, "loss": 0.1722, "step": 14307 }, { "epoch": 4.37419749312137, "grad_norm": 0.8484916687011719, "learning_rate": 2.7261262791388902e-05, "loss": 0.2151, "step": 14308 }, { "epoch": 4.374503210027514, "grad_norm": 1.3669146299362183, "learning_rate": 2.726083818096896e-05, "loss": 0.3079, "step": 14309 }, { "epoch": 4.374808926933659, "grad_norm": 0.2767006456851959, "learning_rate": 2.7260413570549023e-05, "loss": 0.1285, "step": 14310 }, { "epoch": 4.3751146438398045, "grad_norm": 0.35759463906288147, "learning_rate": 2.725998896012908e-05, "loss": 0.0742, "step": 14311 }, { "epoch": 4.37542036074595, "grad_norm": 0.37136778235435486, "learning_rate": 2.7259564349709144e-05, "loss": 0.0785, "step": 14312 }, { "epoch": 4.375726077652094, "grad_norm": 0.1898246705532074, "learning_rate": 2.7259139739289202e-05, "loss": 0.0431, "step": 14313 }, { "epoch": 4.376031794558239, "grad_norm": 0.3954302668571472, "learning_rate": 2.725871512886926e-05, "loss": 0.0852, "step": 14314 }, { "epoch": 4.376337511464384, "grad_norm": 0.1930275857448578, "learning_rate": 2.7258290518449323e-05, "loss": 0.0468, "step": 14315 }, { "epoch": 4.376643228370529, "grad_norm": 0.27929583191871643, "learning_rate": 2.7257865908029382e-05, "loss": 0.0571, "step": 14316 }, { "epoch": 4.376948945276673, "grad_norm": 0.18468016386032104, "learning_rate": 2.7257441297609444e-05, "loss": 0.0581, "step": 14317 }, { "epoch": 4.377254662182819, "grad_norm": 0.4246963858604431, "learning_rate": 2.7257016687189503e-05, "loss": 0.1093, "step": 14318 }, { "epoch": 4.377560379088964, "grad_norm": 0.22468972206115723, "learning_rate": 2.7256592076769565e-05, "loss": 0.0621, "step": 14319 }, { "epoch": 4.377866095995109, "grad_norm": 0.5065781474113464, "learning_rate": 2.7256167466349623e-05, "loss": 0.0889, "step": 14320 }, { "epoch": 4.378171812901254, "grad_norm": 0.2253914475440979, "learning_rate": 2.7255742855929685e-05, "loss": 0.077, "step": 14321 }, { "epoch": 4.378477529807398, "grad_norm": 0.39910009503364563, "learning_rate": 2.7255318245509744e-05, "loss": 0.0963, "step": 14322 }, { "epoch": 4.378783246713543, "grad_norm": 0.963843822479248, "learning_rate": 2.7254893635089806e-05, "loss": 0.1055, "step": 14323 }, { "epoch": 4.379088963619688, "grad_norm": 0.444304496049881, "learning_rate": 2.7254469024669865e-05, "loss": 0.1246, "step": 14324 }, { "epoch": 4.3793946805258335, "grad_norm": 1.5424660444259644, "learning_rate": 2.7254044414249927e-05, "loss": 0.1433, "step": 14325 }, { "epoch": 4.379700397431978, "grad_norm": 0.7562119364738464, "learning_rate": 2.7253619803829986e-05, "loss": 0.1851, "step": 14326 }, { "epoch": 4.380006114338123, "grad_norm": 0.5305968523025513, "learning_rate": 2.7253195193410044e-05, "loss": 0.1799, "step": 14327 }, { "epoch": 4.380311831244268, "grad_norm": 0.7370104193687439, "learning_rate": 2.7252770582990107e-05, "loss": 0.1691, "step": 14328 }, { "epoch": 4.380617548150413, "grad_norm": 0.48119068145751953, "learning_rate": 2.7252345972570165e-05, "loss": 0.1487, "step": 14329 }, { "epoch": 4.380923265056557, "grad_norm": 1.5127848386764526, "learning_rate": 2.7251921362150227e-05, "loss": 0.173, "step": 14330 }, { "epoch": 4.381228981962702, "grad_norm": 1.1392748355865479, "learning_rate": 2.7251496751730286e-05, "loss": 0.1893, "step": 14331 }, { "epoch": 4.3815346988688475, "grad_norm": 1.2895402908325195, "learning_rate": 2.7251072141310348e-05, "loss": 0.1736, "step": 14332 }, { "epoch": 4.381840415774993, "grad_norm": 4.046663284301758, "learning_rate": 2.7250647530890407e-05, "loss": 0.1992, "step": 14333 }, { "epoch": 4.382146132681138, "grad_norm": 1.7768094539642334, "learning_rate": 2.725022292047047e-05, "loss": 0.2829, "step": 14334 }, { "epoch": 4.382451849587282, "grad_norm": 0.45087459683418274, "learning_rate": 2.7249798310050528e-05, "loss": 0.1349, "step": 14335 }, { "epoch": 4.382757566493427, "grad_norm": 0.3623490035533905, "learning_rate": 2.724937369963059e-05, "loss": 0.0792, "step": 14336 }, { "epoch": 4.383063283399572, "grad_norm": 0.2914038896560669, "learning_rate": 2.724894908921065e-05, "loss": 0.0788, "step": 14337 }, { "epoch": 4.383369000305717, "grad_norm": 0.45274779200553894, "learning_rate": 2.724852447879071e-05, "loss": 0.0619, "step": 14338 }, { "epoch": 4.3836747172118615, "grad_norm": 0.47049999237060547, "learning_rate": 2.724809986837077e-05, "loss": 0.0555, "step": 14339 }, { "epoch": 4.383980434118007, "grad_norm": 0.21526163816452026, "learning_rate": 2.7247675257950828e-05, "loss": 0.0479, "step": 14340 }, { "epoch": 4.384286151024152, "grad_norm": 0.1837645322084427, "learning_rate": 2.724725064753089e-05, "loss": 0.0337, "step": 14341 }, { "epoch": 4.384591867930297, "grad_norm": 0.2838267982006073, "learning_rate": 2.724682603711095e-05, "loss": 0.0663, "step": 14342 }, { "epoch": 4.384897584836441, "grad_norm": 0.9670231938362122, "learning_rate": 2.724640142669101e-05, "loss": 0.0647, "step": 14343 }, { "epoch": 4.385203301742586, "grad_norm": 0.2953696548938751, "learning_rate": 2.724597681627107e-05, "loss": 0.0741, "step": 14344 }, { "epoch": 4.385509018648731, "grad_norm": 1.0236716270446777, "learning_rate": 2.724555220585113e-05, "loss": 0.0979, "step": 14345 }, { "epoch": 4.385814735554876, "grad_norm": 2.1024210453033447, "learning_rate": 2.724512759543119e-05, "loss": 0.0704, "step": 14346 }, { "epoch": 4.386120452461022, "grad_norm": 0.4353124499320984, "learning_rate": 2.7244702985011252e-05, "loss": 0.117, "step": 14347 }, { "epoch": 4.386426169367166, "grad_norm": 1.0803172588348389, "learning_rate": 2.724427837459131e-05, "loss": 0.1462, "step": 14348 }, { "epoch": 4.386731886273311, "grad_norm": 0.6661896705627441, "learning_rate": 2.7243853764171373e-05, "loss": 0.1207, "step": 14349 }, { "epoch": 4.387037603179456, "grad_norm": 1.0019354820251465, "learning_rate": 2.7243429153751432e-05, "loss": 0.1637, "step": 14350 }, { "epoch": 4.387343320085601, "grad_norm": 0.644331157207489, "learning_rate": 2.7243004543331494e-05, "loss": 0.2067, "step": 14351 }, { "epoch": 4.387649036991745, "grad_norm": 0.5394116640090942, "learning_rate": 2.7242579932911556e-05, "loss": 0.1576, "step": 14352 }, { "epoch": 4.3879547538978905, "grad_norm": 1.2662062644958496, "learning_rate": 2.7242155322491615e-05, "loss": 0.1818, "step": 14353 }, { "epoch": 4.388260470804036, "grad_norm": 1.3087377548217773, "learning_rate": 2.7241730712071677e-05, "loss": 0.2359, "step": 14354 }, { "epoch": 4.388566187710181, "grad_norm": 0.7593356966972351, "learning_rate": 2.7241306101651735e-05, "loss": 0.1747, "step": 14355 }, { "epoch": 4.388871904616325, "grad_norm": 0.7735068798065186, "learning_rate": 2.7240881491231798e-05, "loss": 0.1892, "step": 14356 }, { "epoch": 4.38917762152247, "grad_norm": 0.7341505289077759, "learning_rate": 2.7240456880811856e-05, "loss": 0.1803, "step": 14357 }, { "epoch": 4.389483338428615, "grad_norm": 1.019860863685608, "learning_rate": 2.724003227039192e-05, "loss": 0.2235, "step": 14358 }, { "epoch": 4.38978905533476, "grad_norm": 1.2741323709487915, "learning_rate": 2.7239607659971977e-05, "loss": 0.2566, "step": 14359 }, { "epoch": 4.390094772240905, "grad_norm": 0.3941652178764343, "learning_rate": 2.723918304955204e-05, "loss": 0.1486, "step": 14360 }, { "epoch": 4.39040048914705, "grad_norm": 0.498052179813385, "learning_rate": 2.7238758439132098e-05, "loss": 0.0759, "step": 14361 }, { "epoch": 4.390706206053195, "grad_norm": 0.16004516184329987, "learning_rate": 2.723833382871216e-05, "loss": 0.0624, "step": 14362 }, { "epoch": 4.39101192295934, "grad_norm": 0.3149740993976593, "learning_rate": 2.723790921829222e-05, "loss": 0.0705, "step": 14363 }, { "epoch": 4.391317639865485, "grad_norm": 0.34052133560180664, "learning_rate": 2.723748460787228e-05, "loss": 0.0485, "step": 14364 }, { "epoch": 4.391623356771629, "grad_norm": 0.44678741693496704, "learning_rate": 2.723705999745234e-05, "loss": 0.0858, "step": 14365 }, { "epoch": 4.391929073677774, "grad_norm": 0.15014933049678802, "learning_rate": 2.7236635387032398e-05, "loss": 0.0416, "step": 14366 }, { "epoch": 4.392234790583919, "grad_norm": 0.4283859431743622, "learning_rate": 2.723621077661246e-05, "loss": 0.0452, "step": 14367 }, { "epoch": 4.3925405074900645, "grad_norm": 0.5156069397926331, "learning_rate": 2.723578616619252e-05, "loss": 0.0928, "step": 14368 }, { "epoch": 4.392846224396209, "grad_norm": 0.48713886737823486, "learning_rate": 2.723536155577258e-05, "loss": 0.077, "step": 14369 }, { "epoch": 4.393151941302354, "grad_norm": 0.6752716302871704, "learning_rate": 2.723493694535264e-05, "loss": 0.0878, "step": 14370 }, { "epoch": 4.393457658208499, "grad_norm": 0.305848091840744, "learning_rate": 2.7234512334932702e-05, "loss": 0.0884, "step": 14371 }, { "epoch": 4.393763375114644, "grad_norm": 0.2887207567691803, "learning_rate": 2.723408772451276e-05, "loss": 0.0923, "step": 14372 }, { "epoch": 4.394069092020789, "grad_norm": 0.5574854612350464, "learning_rate": 2.7233663114092823e-05, "loss": 0.122, "step": 14373 }, { "epoch": 4.394374808926933, "grad_norm": 0.3658098578453064, "learning_rate": 2.723323850367288e-05, "loss": 0.1189, "step": 14374 }, { "epoch": 4.3946805258330786, "grad_norm": 1.0054949522018433, "learning_rate": 2.7232813893252943e-05, "loss": 0.1295, "step": 14375 }, { "epoch": 4.394986242739224, "grad_norm": 0.8232479691505432, "learning_rate": 2.7232389282833002e-05, "loss": 0.1311, "step": 14376 }, { "epoch": 4.395291959645369, "grad_norm": 0.5624608397483826, "learning_rate": 2.7231964672413064e-05, "loss": 0.1739, "step": 14377 }, { "epoch": 4.395597676551513, "grad_norm": 1.0468456745147705, "learning_rate": 2.7231540061993123e-05, "loss": 0.1721, "step": 14378 }, { "epoch": 4.395903393457658, "grad_norm": 1.247160792350769, "learning_rate": 2.723111545157318e-05, "loss": 0.162, "step": 14379 }, { "epoch": 4.396209110363803, "grad_norm": 1.45285165309906, "learning_rate": 2.7230690841153244e-05, "loss": 0.1872, "step": 14380 }, { "epoch": 4.396514827269948, "grad_norm": 0.9678321480751038, "learning_rate": 2.7230266230733302e-05, "loss": 0.2009, "step": 14381 }, { "epoch": 4.396820544176093, "grad_norm": 1.2475181818008423, "learning_rate": 2.7229841620313364e-05, "loss": 0.2081, "step": 14382 }, { "epoch": 4.397126261082238, "grad_norm": 1.3967269659042358, "learning_rate": 2.7229417009893423e-05, "loss": 0.1942, "step": 14383 }, { "epoch": 4.397431977988383, "grad_norm": 1.0328490734100342, "learning_rate": 2.7228992399473485e-05, "loss": 0.2336, "step": 14384 }, { "epoch": 4.397737694894528, "grad_norm": 0.24771282076835632, "learning_rate": 2.7228567789053544e-05, "loss": 0.1193, "step": 14385 }, { "epoch": 4.398043411800673, "grad_norm": 0.2626550495624542, "learning_rate": 2.7228143178633606e-05, "loss": 0.0858, "step": 14386 }, { "epoch": 4.398349128706817, "grad_norm": 0.29993149638175964, "learning_rate": 2.7227718568213665e-05, "loss": 0.062, "step": 14387 }, { "epoch": 4.398654845612962, "grad_norm": 0.26886773109436035, "learning_rate": 2.7227293957793727e-05, "loss": 0.0526, "step": 14388 }, { "epoch": 4.3989605625191075, "grad_norm": 0.23662430047988892, "learning_rate": 2.7226869347373786e-05, "loss": 0.0461, "step": 14389 }, { "epoch": 4.399266279425253, "grad_norm": 0.23339003324508667, "learning_rate": 2.7226444736953848e-05, "loss": 0.0575, "step": 14390 }, { "epoch": 4.399571996331397, "grad_norm": 0.3219698965549469, "learning_rate": 2.7226020126533906e-05, "loss": 0.0598, "step": 14391 }, { "epoch": 4.399877713237542, "grad_norm": 0.29429832100868225, "learning_rate": 2.7225595516113965e-05, "loss": 0.0463, "step": 14392 }, { "epoch": 4.400183430143687, "grad_norm": 0.37255653738975525, "learning_rate": 2.7225170905694027e-05, "loss": 0.1078, "step": 14393 }, { "epoch": 4.400489147049832, "grad_norm": 0.37738069891929626, "learning_rate": 2.7224746295274086e-05, "loss": 0.0615, "step": 14394 }, { "epoch": 4.400794863955976, "grad_norm": 0.40627893805503845, "learning_rate": 2.7224321684854148e-05, "loss": 0.0779, "step": 14395 }, { "epoch": 4.4011005808621215, "grad_norm": 0.7666425704956055, "learning_rate": 2.7223897074434207e-05, "loss": 0.0671, "step": 14396 }, { "epoch": 4.401406297768267, "grad_norm": 0.3099823296070099, "learning_rate": 2.722347246401427e-05, "loss": 0.0863, "step": 14397 }, { "epoch": 4.401712014674412, "grad_norm": 0.5862555503845215, "learning_rate": 2.7223047853594327e-05, "loss": 0.1249, "step": 14398 }, { "epoch": 4.402017731580557, "grad_norm": 0.3953700363636017, "learning_rate": 2.722262324317439e-05, "loss": 0.1158, "step": 14399 }, { "epoch": 4.402323448486701, "grad_norm": 0.5585213899612427, "learning_rate": 2.7222198632754448e-05, "loss": 0.1318, "step": 14400 }, { "epoch": 4.402629165392846, "grad_norm": 1.9296774864196777, "learning_rate": 2.722177402233451e-05, "loss": 0.1705, "step": 14401 }, { "epoch": 4.402934882298991, "grad_norm": 1.030532717704773, "learning_rate": 2.722134941191457e-05, "loss": 0.2196, "step": 14402 }, { "epoch": 4.403240599205136, "grad_norm": 0.6951220035552979, "learning_rate": 2.722092480149463e-05, "loss": 0.1772, "step": 14403 }, { "epoch": 4.403546316111281, "grad_norm": 0.6964686512947083, "learning_rate": 2.722050019107469e-05, "loss": 0.1607, "step": 14404 }, { "epoch": 4.403852033017426, "grad_norm": 1.630038857460022, "learning_rate": 2.722007558065475e-05, "loss": 0.1827, "step": 14405 }, { "epoch": 4.404157749923571, "grad_norm": 1.4509801864624023, "learning_rate": 2.721965097023481e-05, "loss": 0.1913, "step": 14406 }, { "epoch": 4.404463466829716, "grad_norm": 1.2261152267456055, "learning_rate": 2.721922635981487e-05, "loss": 0.1803, "step": 14407 }, { "epoch": 4.40476918373586, "grad_norm": 1.082324504852295, "learning_rate": 2.721880174939493e-05, "loss": 0.263, "step": 14408 }, { "epoch": 4.405074900642005, "grad_norm": 1.338180661201477, "learning_rate": 2.721837713897499e-05, "loss": 0.2564, "step": 14409 }, { "epoch": 4.4053806175481505, "grad_norm": 0.5680637359619141, "learning_rate": 2.7217952528555052e-05, "loss": 0.1247, "step": 14410 }, { "epoch": 4.405686334454296, "grad_norm": 0.2874000668525696, "learning_rate": 2.721752791813511e-05, "loss": 0.0864, "step": 14411 }, { "epoch": 4.405992051360441, "grad_norm": 0.31927186250686646, "learning_rate": 2.7217103307715173e-05, "loss": 0.0616, "step": 14412 }, { "epoch": 4.406297768266585, "grad_norm": 0.395269513130188, "learning_rate": 2.721667869729523e-05, "loss": 0.0523, "step": 14413 }, { "epoch": 4.40660348517273, "grad_norm": 0.1703472137451172, "learning_rate": 2.7216254086875294e-05, "loss": 0.0511, "step": 14414 }, { "epoch": 4.406909202078875, "grad_norm": 0.26668426394462585, "learning_rate": 2.7215829476455352e-05, "loss": 0.044, "step": 14415 }, { "epoch": 4.40721491898502, "grad_norm": 0.29748231172561646, "learning_rate": 2.721540486603541e-05, "loss": 0.0533, "step": 14416 }, { "epoch": 4.4075206358911645, "grad_norm": 0.25659820437431335, "learning_rate": 2.7214980255615473e-05, "loss": 0.0577, "step": 14417 }, { "epoch": 4.40782635279731, "grad_norm": 0.4146851897239685, "learning_rate": 2.7214555645195532e-05, "loss": 0.0627, "step": 14418 }, { "epoch": 4.408132069703455, "grad_norm": 0.22579941153526306, "learning_rate": 2.7214131034775594e-05, "loss": 0.0908, "step": 14419 }, { "epoch": 4.4084377866096, "grad_norm": 0.518698513507843, "learning_rate": 2.7213706424355653e-05, "loss": 0.0902, "step": 14420 }, { "epoch": 4.408743503515744, "grad_norm": 0.5573223829269409, "learning_rate": 2.7213281813935715e-05, "loss": 0.0839, "step": 14421 }, { "epoch": 4.409049220421889, "grad_norm": 0.5346655249595642, "learning_rate": 2.7212857203515773e-05, "loss": 0.102, "step": 14422 }, { "epoch": 4.409354937328034, "grad_norm": 1.4240003824234009, "learning_rate": 2.7212432593095836e-05, "loss": 0.1395, "step": 14423 }, { "epoch": 4.409660654234179, "grad_norm": 0.35537219047546387, "learning_rate": 2.7212007982675894e-05, "loss": 0.114, "step": 14424 }, { "epoch": 4.4099663711403245, "grad_norm": 0.7458460927009583, "learning_rate": 2.7211583372255956e-05, "loss": 0.1179, "step": 14425 }, { "epoch": 4.410272088046469, "grad_norm": 0.8013120889663696, "learning_rate": 2.7211158761836015e-05, "loss": 0.1491, "step": 14426 }, { "epoch": 4.410577804952614, "grad_norm": 0.718469500541687, "learning_rate": 2.7210734151416077e-05, "loss": 0.1607, "step": 14427 }, { "epoch": 4.410883521858759, "grad_norm": 2.917618989944458, "learning_rate": 2.7210309540996136e-05, "loss": 0.1488, "step": 14428 }, { "epoch": 4.411189238764904, "grad_norm": 1.346650242805481, "learning_rate": 2.7209884930576194e-05, "loss": 0.2249, "step": 14429 }, { "epoch": 4.411494955671048, "grad_norm": 0.8746318221092224, "learning_rate": 2.7209460320156257e-05, "loss": 0.1471, "step": 14430 }, { "epoch": 4.411800672577193, "grad_norm": 0.6121952533721924, "learning_rate": 2.7209035709736315e-05, "loss": 0.1886, "step": 14431 }, { "epoch": 4.4121063894833386, "grad_norm": 1.1550586223602295, "learning_rate": 2.7208611099316377e-05, "loss": 0.2075, "step": 14432 }, { "epoch": 4.412412106389484, "grad_norm": 0.9827742576599121, "learning_rate": 2.7208186488896436e-05, "loss": 0.1999, "step": 14433 }, { "epoch": 4.412717823295628, "grad_norm": 1.2860413789749146, "learning_rate": 2.7207761878476498e-05, "loss": 0.2608, "step": 14434 }, { "epoch": 4.413023540201773, "grad_norm": 0.4302273988723755, "learning_rate": 2.7207337268056557e-05, "loss": 0.1528, "step": 14435 }, { "epoch": 4.413329257107918, "grad_norm": 0.4492913484573364, "learning_rate": 2.720691265763662e-05, "loss": 0.0928, "step": 14436 }, { "epoch": 4.413634974014063, "grad_norm": 0.311043381690979, "learning_rate": 2.7206488047216678e-05, "loss": 0.0645, "step": 14437 }, { "epoch": 4.413940690920208, "grad_norm": 0.2703157365322113, "learning_rate": 2.720606343679674e-05, "loss": 0.0671, "step": 14438 }, { "epoch": 4.414246407826353, "grad_norm": 0.3756953775882721, "learning_rate": 2.72056388263768e-05, "loss": 0.0779, "step": 14439 }, { "epoch": 4.414552124732498, "grad_norm": 0.2738822102546692, "learning_rate": 2.720521421595686e-05, "loss": 0.0588, "step": 14440 }, { "epoch": 4.414857841638643, "grad_norm": 0.2296927273273468, "learning_rate": 2.720478960553692e-05, "loss": 0.0609, "step": 14441 }, { "epoch": 4.415163558544788, "grad_norm": 0.9186120629310608, "learning_rate": 2.7204364995116978e-05, "loss": 0.0873, "step": 14442 }, { "epoch": 4.415469275450932, "grad_norm": 0.7810412049293518, "learning_rate": 2.720394038469704e-05, "loss": 0.0739, "step": 14443 }, { "epoch": 4.415774992357077, "grad_norm": 0.2865216135978699, "learning_rate": 2.72035157742771e-05, "loss": 0.0728, "step": 14444 }, { "epoch": 4.416080709263222, "grad_norm": 0.4821767807006836, "learning_rate": 2.720309116385716e-05, "loss": 0.1077, "step": 14445 }, { "epoch": 4.4163864261693675, "grad_norm": 0.4669110178947449, "learning_rate": 2.720266655343722e-05, "loss": 0.0807, "step": 14446 }, { "epoch": 4.416692143075512, "grad_norm": 0.8638492226600647, "learning_rate": 2.720224194301728e-05, "loss": 0.1021, "step": 14447 }, { "epoch": 4.416997859981657, "grad_norm": 0.5377247333526611, "learning_rate": 2.720181733259734e-05, "loss": 0.1442, "step": 14448 }, { "epoch": 4.417303576887802, "grad_norm": 0.5027090907096863, "learning_rate": 2.7201392722177402e-05, "loss": 0.1155, "step": 14449 }, { "epoch": 4.417609293793947, "grad_norm": 0.5024706721305847, "learning_rate": 2.720096811175746e-05, "loss": 0.139, "step": 14450 }, { "epoch": 4.417915010700092, "grad_norm": 0.9865642189979553, "learning_rate": 2.7200543501337523e-05, "loss": 0.1411, "step": 14451 }, { "epoch": 4.418220727606236, "grad_norm": 0.47437921166419983, "learning_rate": 2.7200118890917582e-05, "loss": 0.122, "step": 14452 }, { "epoch": 4.4185264445123815, "grad_norm": 3.9902634620666504, "learning_rate": 2.7199694280497644e-05, "loss": 0.1583, "step": 14453 }, { "epoch": 4.418832161418527, "grad_norm": 0.8493024706840515, "learning_rate": 2.7199269670077706e-05, "loss": 0.2127, "step": 14454 }, { "epoch": 4.419137878324672, "grad_norm": 0.8781498074531555, "learning_rate": 2.7198845059657765e-05, "loss": 0.162, "step": 14455 }, { "epoch": 4.419443595230816, "grad_norm": 0.8596749901771545, "learning_rate": 2.7198420449237827e-05, "loss": 0.1962, "step": 14456 }, { "epoch": 4.419749312136961, "grad_norm": 0.8181713819503784, "learning_rate": 2.7197995838817886e-05, "loss": 0.1819, "step": 14457 }, { "epoch": 4.420055029043106, "grad_norm": 0.9634520411491394, "learning_rate": 2.7197571228397948e-05, "loss": 0.1783, "step": 14458 }, { "epoch": 4.420360745949251, "grad_norm": 1.1027075052261353, "learning_rate": 2.7197146617978006e-05, "loss": 0.2648, "step": 14459 }, { "epoch": 4.4206664628553956, "grad_norm": 0.34837740659713745, "learning_rate": 2.719672200755807e-05, "loss": 0.1173, "step": 14460 }, { "epoch": 4.420972179761541, "grad_norm": 0.21744465827941895, "learning_rate": 2.7196297397138127e-05, "loss": 0.0697, "step": 14461 }, { "epoch": 4.421277896667686, "grad_norm": 0.30970263481140137, "learning_rate": 2.719587278671819e-05, "loss": 0.0817, "step": 14462 }, { "epoch": 4.421583613573831, "grad_norm": 0.2430933266878128, "learning_rate": 2.7195448176298248e-05, "loss": 0.0555, "step": 14463 }, { "epoch": 4.421889330479976, "grad_norm": 0.21608836948871613, "learning_rate": 2.719502356587831e-05, "loss": 0.0638, "step": 14464 }, { "epoch": 4.42219504738612, "grad_norm": 0.25816917419433594, "learning_rate": 2.719459895545837e-05, "loss": 0.0706, "step": 14465 }, { "epoch": 4.422500764292265, "grad_norm": 0.1650068312883377, "learning_rate": 2.719417434503843e-05, "loss": 0.0442, "step": 14466 }, { "epoch": 4.4228064811984105, "grad_norm": 0.1985924392938614, "learning_rate": 2.719374973461849e-05, "loss": 0.0458, "step": 14467 }, { "epoch": 4.423112198104556, "grad_norm": 1.0052459239959717, "learning_rate": 2.7193325124198548e-05, "loss": 0.0878, "step": 14468 }, { "epoch": 4.4234179150107, "grad_norm": 0.3143148720264435, "learning_rate": 2.719290051377861e-05, "loss": 0.0757, "step": 14469 }, { "epoch": 4.423723631916845, "grad_norm": 0.46869075298309326, "learning_rate": 2.719247590335867e-05, "loss": 0.0584, "step": 14470 }, { "epoch": 4.42402934882299, "grad_norm": 0.41491958498954773, "learning_rate": 2.719205129293873e-05, "loss": 0.0917, "step": 14471 }, { "epoch": 4.424335065729135, "grad_norm": 0.6012251377105713, "learning_rate": 2.719162668251879e-05, "loss": 0.0868, "step": 14472 }, { "epoch": 4.424640782635279, "grad_norm": 0.5651818513870239, "learning_rate": 2.7191202072098852e-05, "loss": 0.1123, "step": 14473 }, { "epoch": 4.4249464995414245, "grad_norm": 0.5108380913734436, "learning_rate": 2.719077746167891e-05, "loss": 0.1234, "step": 14474 }, { "epoch": 4.42525221644757, "grad_norm": 0.6670518517494202, "learning_rate": 2.7190352851258973e-05, "loss": 0.1478, "step": 14475 }, { "epoch": 4.425557933353715, "grad_norm": 0.5771527290344238, "learning_rate": 2.718992824083903e-05, "loss": 0.1508, "step": 14476 }, { "epoch": 4.42586365025986, "grad_norm": 0.845485270023346, "learning_rate": 2.7189503630419093e-05, "loss": 0.1431, "step": 14477 }, { "epoch": 4.426169367166004, "grad_norm": 1.4712663888931274, "learning_rate": 2.7189079019999152e-05, "loss": 0.1828, "step": 14478 }, { "epoch": 4.426475084072149, "grad_norm": 2.094494581222534, "learning_rate": 2.7188654409579214e-05, "loss": 0.203, "step": 14479 }, { "epoch": 4.426780800978294, "grad_norm": 0.6756488084793091, "learning_rate": 2.7188229799159273e-05, "loss": 0.2032, "step": 14480 }, { "epoch": 4.427086517884439, "grad_norm": 0.7764524817466736, "learning_rate": 2.718780518873933e-05, "loss": 0.19, "step": 14481 }, { "epoch": 4.427392234790584, "grad_norm": 0.8307797908782959, "learning_rate": 2.7187380578319394e-05, "loss": 0.211, "step": 14482 }, { "epoch": 4.427697951696729, "grad_norm": 2.0276448726654053, "learning_rate": 2.7186955967899452e-05, "loss": 0.1669, "step": 14483 }, { "epoch": 4.428003668602874, "grad_norm": 0.8568150401115417, "learning_rate": 2.7186531357479514e-05, "loss": 0.2406, "step": 14484 }, { "epoch": 4.428309385509019, "grad_norm": 0.3828549385070801, "learning_rate": 2.7186106747059573e-05, "loss": 0.1369, "step": 14485 }, { "epoch": 4.428615102415163, "grad_norm": 1.3293501138687134, "learning_rate": 2.7185682136639635e-05, "loss": 0.076, "step": 14486 }, { "epoch": 4.428920819321308, "grad_norm": 0.39316466450691223, "learning_rate": 2.7185257526219694e-05, "loss": 0.0875, "step": 14487 }, { "epoch": 4.429226536227453, "grad_norm": 0.2121610790491104, "learning_rate": 2.7184832915799756e-05, "loss": 0.0604, "step": 14488 }, { "epoch": 4.4295322531335986, "grad_norm": 0.2553686201572418, "learning_rate": 2.7184408305379815e-05, "loss": 0.0605, "step": 14489 }, { "epoch": 4.429837970039744, "grad_norm": 0.3056836724281311, "learning_rate": 2.7183983694959877e-05, "loss": 0.0691, "step": 14490 }, { "epoch": 4.430143686945888, "grad_norm": 0.6520987749099731, "learning_rate": 2.7183559084539936e-05, "loss": 0.0424, "step": 14491 }, { "epoch": 4.430449403852033, "grad_norm": 0.3343188762664795, "learning_rate": 2.7183134474119998e-05, "loss": 0.0535, "step": 14492 }, { "epoch": 4.430755120758178, "grad_norm": 0.22639048099517822, "learning_rate": 2.7182709863700056e-05, "loss": 0.0953, "step": 14493 }, { "epoch": 4.431060837664323, "grad_norm": 0.3100228011608124, "learning_rate": 2.7182285253280115e-05, "loss": 0.0556, "step": 14494 }, { "epoch": 4.4313665545704675, "grad_norm": 0.6832736730575562, "learning_rate": 2.7181860642860177e-05, "loss": 0.1186, "step": 14495 }, { "epoch": 4.431672271476613, "grad_norm": 0.5886963605880737, "learning_rate": 2.7181436032440236e-05, "loss": 0.0805, "step": 14496 }, { "epoch": 4.431977988382758, "grad_norm": 0.41123658418655396, "learning_rate": 2.7181011422020298e-05, "loss": 0.0919, "step": 14497 }, { "epoch": 4.432283705288903, "grad_norm": 0.478213906288147, "learning_rate": 2.7180586811600357e-05, "loss": 0.0852, "step": 14498 }, { "epoch": 4.432589422195047, "grad_norm": 0.28009626269340515, "learning_rate": 2.718016220118042e-05, "loss": 0.1386, "step": 14499 }, { "epoch": 4.432895139101192, "grad_norm": 0.5742916464805603, "learning_rate": 2.7179737590760477e-05, "loss": 0.1088, "step": 14500 }, { "epoch": 4.433200856007337, "grad_norm": 0.6948607563972473, "learning_rate": 2.717931298034054e-05, "loss": 0.1674, "step": 14501 }, { "epoch": 4.433506572913482, "grad_norm": 0.4177840054035187, "learning_rate": 2.7178888369920598e-05, "loss": 0.1256, "step": 14502 }, { "epoch": 4.4338122898196275, "grad_norm": 4.1104207038879395, "learning_rate": 2.717846375950066e-05, "loss": 0.1636, "step": 14503 }, { "epoch": 4.434118006725772, "grad_norm": 0.6636064648628235, "learning_rate": 2.717803914908072e-05, "loss": 0.1905, "step": 14504 }, { "epoch": 4.434423723631917, "grad_norm": 0.9027283787727356, "learning_rate": 2.717761453866078e-05, "loss": 0.1546, "step": 14505 }, { "epoch": 4.434729440538062, "grad_norm": 0.5822070837020874, "learning_rate": 2.717718992824084e-05, "loss": 0.1915, "step": 14506 }, { "epoch": 4.435035157444207, "grad_norm": 1.2817941904067993, "learning_rate": 2.71767653178209e-05, "loss": 0.2246, "step": 14507 }, { "epoch": 4.435340874350351, "grad_norm": 1.6138834953308105, "learning_rate": 2.717634070740096e-05, "loss": 0.2069, "step": 14508 }, { "epoch": 4.435646591256496, "grad_norm": 2.0528242588043213, "learning_rate": 2.717591609698102e-05, "loss": 0.2096, "step": 14509 }, { "epoch": 4.4359523081626415, "grad_norm": 0.32442811131477356, "learning_rate": 2.717549148656108e-05, "loss": 0.1323, "step": 14510 }, { "epoch": 4.436258025068787, "grad_norm": 0.33326295018196106, "learning_rate": 2.717506687614114e-05, "loss": 0.0593, "step": 14511 }, { "epoch": 4.436563741974931, "grad_norm": 0.23748835921287537, "learning_rate": 2.7174642265721202e-05, "loss": 0.0969, "step": 14512 }, { "epoch": 4.436869458881076, "grad_norm": 0.2153518944978714, "learning_rate": 2.717421765530126e-05, "loss": 0.0514, "step": 14513 }, { "epoch": 4.437175175787221, "grad_norm": 0.28278517723083496, "learning_rate": 2.7173793044881323e-05, "loss": 0.0529, "step": 14514 }, { "epoch": 4.437480892693366, "grad_norm": 0.23319923877716064, "learning_rate": 2.717336843446138e-05, "loss": 0.0711, "step": 14515 }, { "epoch": 4.437786609599511, "grad_norm": 0.24093782901763916, "learning_rate": 2.7172943824041444e-05, "loss": 0.0693, "step": 14516 }, { "epoch": 4.4380923265056555, "grad_norm": 0.3606530427932739, "learning_rate": 2.7172519213621502e-05, "loss": 0.0793, "step": 14517 }, { "epoch": 4.438398043411801, "grad_norm": 1.554999589920044, "learning_rate": 2.7172094603201564e-05, "loss": 0.0894, "step": 14518 }, { "epoch": 4.438703760317946, "grad_norm": 0.3363978862762451, "learning_rate": 2.7171669992781623e-05, "loss": 0.0651, "step": 14519 }, { "epoch": 4.439009477224091, "grad_norm": 0.45908352732658386, "learning_rate": 2.7171245382361682e-05, "loss": 0.0932, "step": 14520 }, { "epoch": 4.439315194130235, "grad_norm": 0.38531097769737244, "learning_rate": 2.7170820771941744e-05, "loss": 0.0781, "step": 14521 }, { "epoch": 4.43962091103638, "grad_norm": 0.4421522617340088, "learning_rate": 2.7170396161521803e-05, "loss": 0.1154, "step": 14522 }, { "epoch": 4.439926627942525, "grad_norm": 0.67113196849823, "learning_rate": 2.7169971551101865e-05, "loss": 0.1209, "step": 14523 }, { "epoch": 4.4402323448486705, "grad_norm": 0.48752743005752563, "learning_rate": 2.7169546940681923e-05, "loss": 0.1116, "step": 14524 }, { "epoch": 4.440538061754815, "grad_norm": 0.575541615486145, "learning_rate": 2.7169122330261986e-05, "loss": 0.1473, "step": 14525 }, { "epoch": 4.44084377866096, "grad_norm": 0.4590758681297302, "learning_rate": 2.7168697719842044e-05, "loss": 0.1166, "step": 14526 }, { "epoch": 4.441149495567105, "grad_norm": 1.0132930278778076, "learning_rate": 2.7168273109422106e-05, "loss": 0.1612, "step": 14527 }, { "epoch": 4.44145521247325, "grad_norm": 0.6839370727539062, "learning_rate": 2.7167848499002165e-05, "loss": 0.1703, "step": 14528 }, { "epoch": 4.441760929379395, "grad_norm": 0.685167670249939, "learning_rate": 2.7167423888582227e-05, "loss": 0.2153, "step": 14529 }, { "epoch": 4.442066646285539, "grad_norm": 0.8313899636268616, "learning_rate": 2.7166999278162286e-05, "loss": 0.1615, "step": 14530 }, { "epoch": 4.4423723631916845, "grad_norm": 0.7201988101005554, "learning_rate": 2.7166574667742345e-05, "loss": 0.1563, "step": 14531 }, { "epoch": 4.44267808009783, "grad_norm": 0.7806335687637329, "learning_rate": 2.7166150057322407e-05, "loss": 0.1873, "step": 14532 }, { "epoch": 4.442983797003975, "grad_norm": 1.3637127876281738, "learning_rate": 2.7165725446902465e-05, "loss": 0.1869, "step": 14533 }, { "epoch": 4.443289513910119, "grad_norm": 1.9744242429733276, "learning_rate": 2.7165300836482527e-05, "loss": 0.2519, "step": 14534 }, { "epoch": 4.443595230816264, "grad_norm": 0.3875206708908081, "learning_rate": 2.7164876226062586e-05, "loss": 0.1593, "step": 14535 }, { "epoch": 4.443900947722409, "grad_norm": 0.5389423966407776, "learning_rate": 2.7164451615642648e-05, "loss": 0.0771, "step": 14536 }, { "epoch": 4.444206664628554, "grad_norm": 0.40691739320755005, "learning_rate": 2.7164027005222707e-05, "loss": 0.0767, "step": 14537 }, { "epoch": 4.4445123815346985, "grad_norm": 0.48493310809135437, "learning_rate": 2.716360239480277e-05, "loss": 0.0762, "step": 14538 }, { "epoch": 4.444818098440844, "grad_norm": 0.9798800945281982, "learning_rate": 2.7163177784382828e-05, "loss": 0.0515, "step": 14539 }, { "epoch": 4.445123815346989, "grad_norm": 0.166558176279068, "learning_rate": 2.716275317396289e-05, "loss": 0.0423, "step": 14540 }, { "epoch": 4.445429532253134, "grad_norm": 0.2790682315826416, "learning_rate": 2.716232856354295e-05, "loss": 0.0558, "step": 14541 }, { "epoch": 4.445735249159279, "grad_norm": 0.28538811206817627, "learning_rate": 2.716190395312301e-05, "loss": 0.0846, "step": 14542 }, { "epoch": 4.446040966065423, "grad_norm": 0.2762254476547241, "learning_rate": 2.716147934270307e-05, "loss": 0.0479, "step": 14543 }, { "epoch": 4.446346682971568, "grad_norm": 0.4479845464229584, "learning_rate": 2.7161054732283128e-05, "loss": 0.0659, "step": 14544 }, { "epoch": 4.446652399877713, "grad_norm": 0.3160566985607147, "learning_rate": 2.716063012186319e-05, "loss": 0.0967, "step": 14545 }, { "epoch": 4.4469581167838586, "grad_norm": 0.6429881453514099, "learning_rate": 2.716020551144325e-05, "loss": 0.0857, "step": 14546 }, { "epoch": 4.447263833690003, "grad_norm": 0.6039536595344543, "learning_rate": 2.715978090102331e-05, "loss": 0.1139, "step": 14547 }, { "epoch": 4.447569550596148, "grad_norm": 0.43419915437698364, "learning_rate": 2.715935629060337e-05, "loss": 0.0942, "step": 14548 }, { "epoch": 4.447875267502293, "grad_norm": 0.5530080795288086, "learning_rate": 2.715893168018343e-05, "loss": 0.115, "step": 14549 }, { "epoch": 4.448180984408438, "grad_norm": 0.5810584425926208, "learning_rate": 2.715850706976349e-05, "loss": 0.1369, "step": 14550 }, { "epoch": 4.448486701314582, "grad_norm": 1.4688329696655273, "learning_rate": 2.7158082459343552e-05, "loss": 0.1645, "step": 14551 }, { "epoch": 4.4487924182207275, "grad_norm": 0.5219430327415466, "learning_rate": 2.715765784892361e-05, "loss": 0.1445, "step": 14552 }, { "epoch": 4.449098135126873, "grad_norm": 0.6024319529533386, "learning_rate": 2.7157233238503673e-05, "loss": 0.1885, "step": 14553 }, { "epoch": 4.449403852033018, "grad_norm": 3.57478404045105, "learning_rate": 2.7156808628083732e-05, "loss": 0.1875, "step": 14554 }, { "epoch": 4.449709568939163, "grad_norm": 3.3994922637939453, "learning_rate": 2.7156384017663794e-05, "loss": 0.1861, "step": 14555 }, { "epoch": 4.450015285845307, "grad_norm": 1.0106528997421265, "learning_rate": 2.7155959407243856e-05, "loss": 0.1953, "step": 14556 }, { "epoch": 4.450321002751452, "grad_norm": 0.8375324010848999, "learning_rate": 2.7155534796823915e-05, "loss": 0.1949, "step": 14557 }, { "epoch": 4.450626719657597, "grad_norm": 1.56394624710083, "learning_rate": 2.7155110186403977e-05, "loss": 0.1714, "step": 14558 }, { "epoch": 4.450932436563742, "grad_norm": 1.6562318801879883, "learning_rate": 2.7154685575984036e-05, "loss": 0.2144, "step": 14559 }, { "epoch": 4.451238153469887, "grad_norm": 0.3570365905761719, "learning_rate": 2.7154260965564098e-05, "loss": 0.1179, "step": 14560 }, { "epoch": 4.451543870376032, "grad_norm": 0.324855238199234, "learning_rate": 2.7153836355144156e-05, "loss": 0.0863, "step": 14561 }, { "epoch": 4.451849587282177, "grad_norm": 0.20178170502185822, "learning_rate": 2.715341174472422e-05, "loss": 0.0563, "step": 14562 }, { "epoch": 4.452155304188322, "grad_norm": 0.4093068540096283, "learning_rate": 2.7152987134304277e-05, "loss": 0.0669, "step": 14563 }, { "epoch": 4.452461021094466, "grad_norm": 0.21677224338054657, "learning_rate": 2.715256252388434e-05, "loss": 0.0647, "step": 14564 }, { "epoch": 4.452766738000611, "grad_norm": 0.41955286264419556, "learning_rate": 2.7152137913464398e-05, "loss": 0.0598, "step": 14565 }, { "epoch": 4.453072454906756, "grad_norm": 0.24472182989120483, "learning_rate": 2.715171330304446e-05, "loss": 0.0532, "step": 14566 }, { "epoch": 4.4533781718129015, "grad_norm": 0.2900508940219879, "learning_rate": 2.715128869262452e-05, "loss": 0.0575, "step": 14567 }, { "epoch": 4.453683888719046, "grad_norm": 0.2906894087791443, "learning_rate": 2.715086408220458e-05, "loss": 0.0722, "step": 14568 }, { "epoch": 4.453989605625191, "grad_norm": 0.6541433334350586, "learning_rate": 2.715043947178464e-05, "loss": 0.0666, "step": 14569 }, { "epoch": 4.454295322531336, "grad_norm": 0.3380293846130371, "learning_rate": 2.7150014861364698e-05, "loss": 0.0813, "step": 14570 }, { "epoch": 4.454601039437481, "grad_norm": 0.2808278799057007, "learning_rate": 2.714959025094476e-05, "loss": 0.0593, "step": 14571 }, { "epoch": 4.454906756343626, "grad_norm": 0.7964391708374023, "learning_rate": 2.714916564052482e-05, "loss": 0.1142, "step": 14572 }, { "epoch": 4.45521247324977, "grad_norm": 0.5377917885780334, "learning_rate": 2.714874103010488e-05, "loss": 0.1132, "step": 14573 }, { "epoch": 4.4555181901559155, "grad_norm": 1.1152527332305908, "learning_rate": 2.714831641968494e-05, "loss": 0.1306, "step": 14574 }, { "epoch": 4.455823907062061, "grad_norm": 0.7706248164176941, "learning_rate": 2.7147891809265002e-05, "loss": 0.1335, "step": 14575 }, { "epoch": 4.456129623968206, "grad_norm": 0.5587882399559021, "learning_rate": 2.714746719884506e-05, "loss": 0.1802, "step": 14576 }, { "epoch": 4.45643534087435, "grad_norm": 0.7314732074737549, "learning_rate": 2.7147042588425123e-05, "loss": 0.1708, "step": 14577 }, { "epoch": 4.456741057780495, "grad_norm": 0.6374103426933289, "learning_rate": 2.714661797800518e-05, "loss": 0.1519, "step": 14578 }, { "epoch": 4.45704677468664, "grad_norm": 1.7685569524765015, "learning_rate": 2.7146193367585243e-05, "loss": 0.1736, "step": 14579 }, { "epoch": 4.457352491592785, "grad_norm": 1.07774019241333, "learning_rate": 2.7145768757165302e-05, "loss": 0.1731, "step": 14580 }, { "epoch": 4.45765820849893, "grad_norm": 0.8443768620491028, "learning_rate": 2.7145344146745364e-05, "loss": 0.1791, "step": 14581 }, { "epoch": 4.457963925405075, "grad_norm": 1.3887746334075928, "learning_rate": 2.7144919536325423e-05, "loss": 0.1817, "step": 14582 }, { "epoch": 4.45826964231122, "grad_norm": 0.9004783034324646, "learning_rate": 2.714449492590548e-05, "loss": 0.1749, "step": 14583 }, { "epoch": 4.458575359217365, "grad_norm": 1.5196189880371094, "learning_rate": 2.7144070315485544e-05, "loss": 0.2415, "step": 14584 }, { "epoch": 4.45888107612351, "grad_norm": 1.2667794227600098, "learning_rate": 2.7143645705065602e-05, "loss": 0.1357, "step": 14585 }, { "epoch": 4.459186793029654, "grad_norm": 0.5894879698753357, "learning_rate": 2.7143221094645664e-05, "loss": 0.0866, "step": 14586 }, { "epoch": 4.459492509935799, "grad_norm": 1.2526775598526, "learning_rate": 2.7142796484225723e-05, "loss": 0.0722, "step": 14587 }, { "epoch": 4.4597982268419445, "grad_norm": 0.17965327203273773, "learning_rate": 2.7142371873805785e-05, "loss": 0.0538, "step": 14588 }, { "epoch": 4.46010394374809, "grad_norm": 0.38206571340560913, "learning_rate": 2.7141947263385844e-05, "loss": 0.0577, "step": 14589 }, { "epoch": 4.460409660654234, "grad_norm": 0.19419115781784058, "learning_rate": 2.7141522652965906e-05, "loss": 0.0406, "step": 14590 }, { "epoch": 4.460715377560379, "grad_norm": 0.27740243077278137, "learning_rate": 2.7141098042545965e-05, "loss": 0.0365, "step": 14591 }, { "epoch": 4.461021094466524, "grad_norm": 0.38763293623924255, "learning_rate": 2.7140673432126027e-05, "loss": 0.0915, "step": 14592 }, { "epoch": 4.461326811372669, "grad_norm": 0.26536694169044495, "learning_rate": 2.7140248821706086e-05, "loss": 0.0551, "step": 14593 }, { "epoch": 4.461632528278813, "grad_norm": 0.29181358218193054, "learning_rate": 2.7139824211286148e-05, "loss": 0.043, "step": 14594 }, { "epoch": 4.4619382451849585, "grad_norm": 0.8534982204437256, "learning_rate": 2.7139399600866206e-05, "loss": 0.1131, "step": 14595 }, { "epoch": 4.462243962091104, "grad_norm": 0.4536721408367157, "learning_rate": 2.7138974990446265e-05, "loss": 0.0601, "step": 14596 }, { "epoch": 4.462549678997249, "grad_norm": 0.8021422028541565, "learning_rate": 2.7138550380026327e-05, "loss": 0.0853, "step": 14597 }, { "epoch": 4.462855395903394, "grad_norm": 0.37400203943252563, "learning_rate": 2.7138125769606386e-05, "loss": 0.1112, "step": 14598 }, { "epoch": 4.463161112809538, "grad_norm": 0.5264663696289062, "learning_rate": 2.7137701159186448e-05, "loss": 0.133, "step": 14599 }, { "epoch": 4.463466829715683, "grad_norm": 1.3493903875350952, "learning_rate": 2.7137276548766507e-05, "loss": 0.1437, "step": 14600 }, { "epoch": 4.463772546621828, "grad_norm": 1.5559134483337402, "learning_rate": 2.713685193834657e-05, "loss": 0.1485, "step": 14601 }, { "epoch": 4.464078263527973, "grad_norm": 1.2769638299942017, "learning_rate": 2.7136427327926627e-05, "loss": 0.1537, "step": 14602 }, { "epoch": 4.464383980434118, "grad_norm": 1.1096384525299072, "learning_rate": 2.713600271750669e-05, "loss": 0.1415, "step": 14603 }, { "epoch": 4.464689697340263, "grad_norm": 1.0325286388397217, "learning_rate": 2.7135578107086748e-05, "loss": 0.1685, "step": 14604 }, { "epoch": 4.464995414246408, "grad_norm": 1.0272083282470703, "learning_rate": 2.713515349666681e-05, "loss": 0.1729, "step": 14605 }, { "epoch": 4.465301131152553, "grad_norm": 2.1394472122192383, "learning_rate": 2.713472888624687e-05, "loss": 0.1885, "step": 14606 }, { "epoch": 4.465606848058697, "grad_norm": 1.3892558813095093, "learning_rate": 2.713430427582693e-05, "loss": 0.214, "step": 14607 }, { "epoch": 4.465912564964842, "grad_norm": 0.9586330056190491, "learning_rate": 2.713387966540699e-05, "loss": 0.2251, "step": 14608 }, { "epoch": 4.4662182818709875, "grad_norm": 2.983154773712158, "learning_rate": 2.713345505498705e-05, "loss": 0.2805, "step": 14609 }, { "epoch": 4.466523998777133, "grad_norm": 0.4760669767856598, "learning_rate": 2.713303044456711e-05, "loss": 0.1319, "step": 14610 }, { "epoch": 4.466829715683278, "grad_norm": 0.432526171207428, "learning_rate": 2.713260583414717e-05, "loss": 0.0864, "step": 14611 }, { "epoch": 4.467135432589422, "grad_norm": 0.23594166338443756, "learning_rate": 2.713218122372723e-05, "loss": 0.0654, "step": 14612 }, { "epoch": 4.467441149495567, "grad_norm": 0.35211896896362305, "learning_rate": 2.713175661330729e-05, "loss": 0.0895, "step": 14613 }, { "epoch": 4.467746866401712, "grad_norm": 0.3382732570171356, "learning_rate": 2.7131332002887352e-05, "loss": 0.0536, "step": 14614 }, { "epoch": 4.468052583307857, "grad_norm": 0.2115425318479538, "learning_rate": 2.713090739246741e-05, "loss": 0.0482, "step": 14615 }, { "epoch": 4.4683583002140015, "grad_norm": 1.2285183668136597, "learning_rate": 2.7130482782047473e-05, "loss": 0.1013, "step": 14616 }, { "epoch": 4.468664017120147, "grad_norm": 0.582547664642334, "learning_rate": 2.713005817162753e-05, "loss": 0.0533, "step": 14617 }, { "epoch": 4.468969734026292, "grad_norm": 0.6298527121543884, "learning_rate": 2.7129633561207594e-05, "loss": 0.0692, "step": 14618 }, { "epoch": 4.469275450932437, "grad_norm": 1.1538721323013306, "learning_rate": 2.7129208950787652e-05, "loss": 0.0682, "step": 14619 }, { "epoch": 4.469581167838581, "grad_norm": 0.3133822977542877, "learning_rate": 2.7128784340367715e-05, "loss": 0.0906, "step": 14620 }, { "epoch": 4.469886884744726, "grad_norm": 2.354862689971924, "learning_rate": 2.7128359729947773e-05, "loss": 0.0881, "step": 14621 }, { "epoch": 4.470192601650871, "grad_norm": 0.3252435326576233, "learning_rate": 2.7127935119527832e-05, "loss": 0.085, "step": 14622 }, { "epoch": 4.470498318557016, "grad_norm": 0.40219900012016296, "learning_rate": 2.7127510509107894e-05, "loss": 0.1361, "step": 14623 }, { "epoch": 4.4708040354631615, "grad_norm": 0.8030990362167358, "learning_rate": 2.7127085898687953e-05, "loss": 0.1119, "step": 14624 }, { "epoch": 4.471109752369306, "grad_norm": 0.38726451992988586, "learning_rate": 2.7126661288268015e-05, "loss": 0.1273, "step": 14625 }, { "epoch": 4.471415469275451, "grad_norm": 0.7036991715431213, "learning_rate": 2.7126236677848073e-05, "loss": 0.1462, "step": 14626 }, { "epoch": 4.471721186181596, "grad_norm": 3.54960036277771, "learning_rate": 2.7125812067428136e-05, "loss": 0.1783, "step": 14627 }, { "epoch": 4.472026903087741, "grad_norm": 0.732994794845581, "learning_rate": 2.7125387457008194e-05, "loss": 0.1805, "step": 14628 }, { "epoch": 4.472332619993885, "grad_norm": 0.9843920469284058, "learning_rate": 2.7124962846588256e-05, "loss": 0.1635, "step": 14629 }, { "epoch": 4.47263833690003, "grad_norm": 0.7376084923744202, "learning_rate": 2.7124538236168315e-05, "loss": 0.1619, "step": 14630 }, { "epoch": 4.4729440538061755, "grad_norm": 0.5584295988082886, "learning_rate": 2.7124113625748377e-05, "loss": 0.1452, "step": 14631 }, { "epoch": 4.473249770712321, "grad_norm": 0.945618748664856, "learning_rate": 2.7123689015328436e-05, "loss": 0.193, "step": 14632 }, { "epoch": 4.473555487618465, "grad_norm": 2.0925111770629883, "learning_rate": 2.7123264404908498e-05, "loss": 0.2052, "step": 14633 }, { "epoch": 4.47386120452461, "grad_norm": 4.201822757720947, "learning_rate": 2.7122839794488557e-05, "loss": 0.2288, "step": 14634 }, { "epoch": 4.474166921430755, "grad_norm": 0.4587872326374054, "learning_rate": 2.7122415184068615e-05, "loss": 0.1613, "step": 14635 }, { "epoch": 4.4744726383369, "grad_norm": 0.3018239438533783, "learning_rate": 2.7121990573648677e-05, "loss": 0.0885, "step": 14636 }, { "epoch": 4.474778355243045, "grad_norm": 0.2557578682899475, "learning_rate": 2.7121565963228736e-05, "loss": 0.0439, "step": 14637 }, { "epoch": 4.47508407214919, "grad_norm": 0.2682934105396271, "learning_rate": 2.7121141352808798e-05, "loss": 0.0622, "step": 14638 }, { "epoch": 4.475389789055335, "grad_norm": 0.3103990852832794, "learning_rate": 2.7120716742388857e-05, "loss": 0.0532, "step": 14639 }, { "epoch": 4.47569550596148, "grad_norm": 0.39876842498779297, "learning_rate": 2.712029213196892e-05, "loss": 0.0427, "step": 14640 }, { "epoch": 4.476001222867625, "grad_norm": 0.32197389006614685, "learning_rate": 2.7119867521548978e-05, "loss": 0.0419, "step": 14641 }, { "epoch": 4.476306939773769, "grad_norm": 0.3960363566875458, "learning_rate": 2.711944291112904e-05, "loss": 0.0964, "step": 14642 }, { "epoch": 4.476612656679914, "grad_norm": 0.6471211910247803, "learning_rate": 2.71190183007091e-05, "loss": 0.0444, "step": 14643 }, { "epoch": 4.476918373586059, "grad_norm": 0.17382307350635529, "learning_rate": 2.711859369028916e-05, "loss": 0.039, "step": 14644 }, { "epoch": 4.4772240904922045, "grad_norm": 1.0236260890960693, "learning_rate": 2.711816907986922e-05, "loss": 0.1122, "step": 14645 }, { "epoch": 4.477529807398349, "grad_norm": 0.8119114637374878, "learning_rate": 2.7117744469449278e-05, "loss": 0.0795, "step": 14646 }, { "epoch": 4.477835524304494, "grad_norm": 0.3618432879447937, "learning_rate": 2.711731985902934e-05, "loss": 0.1171, "step": 14647 }, { "epoch": 4.478141241210639, "grad_norm": 2.16438627243042, "learning_rate": 2.71168952486094e-05, "loss": 0.1587, "step": 14648 }, { "epoch": 4.478446958116784, "grad_norm": 0.45639991760253906, "learning_rate": 2.711647063818946e-05, "loss": 0.1581, "step": 14649 }, { "epoch": 4.478752675022929, "grad_norm": 0.5467153191566467, "learning_rate": 2.711604602776952e-05, "loss": 0.1318, "step": 14650 }, { "epoch": 4.479058391929073, "grad_norm": 0.5044739842414856, "learning_rate": 2.711562141734958e-05, "loss": 0.1501, "step": 14651 }, { "epoch": 4.4793641088352185, "grad_norm": 0.9199202060699463, "learning_rate": 2.711519680692964e-05, "loss": 0.1621, "step": 14652 }, { "epoch": 4.479669825741364, "grad_norm": 1.0798022747039795, "learning_rate": 2.7114772196509702e-05, "loss": 0.1827, "step": 14653 }, { "epoch": 4.479975542647509, "grad_norm": 1.1513031721115112, "learning_rate": 2.711434758608976e-05, "loss": 0.1629, "step": 14654 }, { "epoch": 4.480281259553653, "grad_norm": 0.7053007483482361, "learning_rate": 2.7113922975669823e-05, "loss": 0.1792, "step": 14655 }, { "epoch": 4.480586976459798, "grad_norm": 0.7904508709907532, "learning_rate": 2.7113498365249882e-05, "loss": 0.1633, "step": 14656 }, { "epoch": 4.480892693365943, "grad_norm": 0.992485523223877, "learning_rate": 2.7113073754829944e-05, "loss": 0.192, "step": 14657 }, { "epoch": 4.481198410272088, "grad_norm": 2.2368431091308594, "learning_rate": 2.7112649144410006e-05, "loss": 0.2166, "step": 14658 }, { "epoch": 4.4815041271782325, "grad_norm": 1.5853204727172852, "learning_rate": 2.7112224533990065e-05, "loss": 0.2089, "step": 14659 }, { "epoch": 4.481809844084378, "grad_norm": 0.37052732706069946, "learning_rate": 2.7111799923570127e-05, "loss": 0.1465, "step": 14660 }, { "epoch": 4.482115560990523, "grad_norm": 0.532674252986908, "learning_rate": 2.7111375313150186e-05, "loss": 0.1162, "step": 14661 }, { "epoch": 4.482421277896668, "grad_norm": 0.4104871451854706, "learning_rate": 2.7110950702730248e-05, "loss": 0.0678, "step": 14662 }, { "epoch": 4.482726994802813, "grad_norm": 0.26746463775634766, "learning_rate": 2.7110526092310306e-05, "loss": 0.0847, "step": 14663 }, { "epoch": 4.483032711708957, "grad_norm": 0.3744140565395355, "learning_rate": 2.711010148189037e-05, "loss": 0.0894, "step": 14664 }, { "epoch": 4.483338428615102, "grad_norm": 0.2668428421020508, "learning_rate": 2.7109676871470427e-05, "loss": 0.0449, "step": 14665 }, { "epoch": 4.4836441455212475, "grad_norm": 0.46617385745048523, "learning_rate": 2.710925226105049e-05, "loss": 0.0662, "step": 14666 }, { "epoch": 4.483949862427393, "grad_norm": 0.32966530323028564, "learning_rate": 2.7108827650630548e-05, "loss": 0.0574, "step": 14667 }, { "epoch": 4.484255579333537, "grad_norm": 0.29605984687805176, "learning_rate": 2.710840304021061e-05, "loss": 0.0643, "step": 14668 }, { "epoch": 4.484561296239682, "grad_norm": 0.49442756175994873, "learning_rate": 2.710797842979067e-05, "loss": 0.061, "step": 14669 }, { "epoch": 4.484867013145827, "grad_norm": 0.3792601525783539, "learning_rate": 2.710755381937073e-05, "loss": 0.0846, "step": 14670 }, { "epoch": 4.485172730051972, "grad_norm": 0.29025277495384216, "learning_rate": 2.710712920895079e-05, "loss": 0.0757, "step": 14671 }, { "epoch": 4.485478446958116, "grad_norm": 0.4439960718154907, "learning_rate": 2.7106704598530848e-05, "loss": 0.0632, "step": 14672 }, { "epoch": 4.4857841638642615, "grad_norm": 0.6261323094367981, "learning_rate": 2.710627998811091e-05, "loss": 0.1281, "step": 14673 }, { "epoch": 4.486089880770407, "grad_norm": 0.8312703371047974, "learning_rate": 2.710585537769097e-05, "loss": 0.1104, "step": 14674 }, { "epoch": 4.486395597676552, "grad_norm": 0.6052479147911072, "learning_rate": 2.710543076727103e-05, "loss": 0.128, "step": 14675 }, { "epoch": 4.486701314582697, "grad_norm": 1.078845739364624, "learning_rate": 2.710500615685109e-05, "loss": 0.1741, "step": 14676 }, { "epoch": 4.487007031488841, "grad_norm": 1.0710182189941406, "learning_rate": 2.7104581546431152e-05, "loss": 0.1862, "step": 14677 }, { "epoch": 4.487312748394986, "grad_norm": 0.6280719637870789, "learning_rate": 2.710415693601121e-05, "loss": 0.1604, "step": 14678 }, { "epoch": 4.487618465301131, "grad_norm": 1.1692910194396973, "learning_rate": 2.7103732325591273e-05, "loss": 0.1733, "step": 14679 }, { "epoch": 4.487924182207276, "grad_norm": 1.2505896091461182, "learning_rate": 2.710330771517133e-05, "loss": 0.1839, "step": 14680 }, { "epoch": 4.488229899113421, "grad_norm": 2.500971555709839, "learning_rate": 2.7102883104751393e-05, "loss": 0.1987, "step": 14681 }, { "epoch": 4.488535616019566, "grad_norm": 2.7866642475128174, "learning_rate": 2.7102458494331452e-05, "loss": 0.1687, "step": 14682 }, { "epoch": 4.488841332925711, "grad_norm": 1.8618919849395752, "learning_rate": 2.7102033883911514e-05, "loss": 0.1928, "step": 14683 }, { "epoch": 4.489147049831856, "grad_norm": 1.9939110279083252, "learning_rate": 2.7101609273491573e-05, "loss": 0.2478, "step": 14684 }, { "epoch": 4.489452766738, "grad_norm": 0.6005783677101135, "learning_rate": 2.710118466307163e-05, "loss": 0.1896, "step": 14685 }, { "epoch": 4.489758483644145, "grad_norm": 1.2610020637512207, "learning_rate": 2.7100760052651694e-05, "loss": 0.0933, "step": 14686 }, { "epoch": 4.49006420055029, "grad_norm": 0.4526810646057129, "learning_rate": 2.7100335442231752e-05, "loss": 0.0602, "step": 14687 }, { "epoch": 4.4903699174564355, "grad_norm": 0.3923143744468689, "learning_rate": 2.7099910831811815e-05, "loss": 0.0615, "step": 14688 }, { "epoch": 4.490675634362581, "grad_norm": 0.18970432877540588, "learning_rate": 2.7099486221391873e-05, "loss": 0.0599, "step": 14689 }, { "epoch": 4.490981351268725, "grad_norm": 0.3835609257221222, "learning_rate": 2.7099061610971935e-05, "loss": 0.0616, "step": 14690 }, { "epoch": 4.49128706817487, "grad_norm": 0.2205372005701065, "learning_rate": 2.7098637000551994e-05, "loss": 0.0473, "step": 14691 }, { "epoch": 4.491592785081015, "grad_norm": 0.48273009061813354, "learning_rate": 2.7098212390132056e-05, "loss": 0.0616, "step": 14692 }, { "epoch": 4.49189850198716, "grad_norm": 0.46579504013061523, "learning_rate": 2.7097787779712115e-05, "loss": 0.059, "step": 14693 }, { "epoch": 4.4922042188933045, "grad_norm": 0.5092663764953613, "learning_rate": 2.7097363169292177e-05, "loss": 0.0581, "step": 14694 }, { "epoch": 4.49250993579945, "grad_norm": 0.38288551568984985, "learning_rate": 2.7096938558872236e-05, "loss": 0.107, "step": 14695 }, { "epoch": 4.492815652705595, "grad_norm": 0.2884606122970581, "learning_rate": 2.7096513948452298e-05, "loss": 0.0702, "step": 14696 }, { "epoch": 4.49312136961174, "grad_norm": 0.28578463196754456, "learning_rate": 2.7096089338032356e-05, "loss": 0.094, "step": 14697 }, { "epoch": 4.493427086517884, "grad_norm": 1.162040114402771, "learning_rate": 2.7095664727612415e-05, "loss": 0.1092, "step": 14698 }, { "epoch": 4.493732803424029, "grad_norm": 7.481799125671387, "learning_rate": 2.7095240117192477e-05, "loss": 0.1158, "step": 14699 }, { "epoch": 4.494038520330174, "grad_norm": 0.6691536903381348, "learning_rate": 2.7094815506772536e-05, "loss": 0.1604, "step": 14700 }, { "epoch": 4.494344237236319, "grad_norm": 1.436592936515808, "learning_rate": 2.7094390896352598e-05, "loss": 0.1737, "step": 14701 }, { "epoch": 4.4946499541424645, "grad_norm": 0.8436417579650879, "learning_rate": 2.7093966285932657e-05, "loss": 0.1507, "step": 14702 }, { "epoch": 4.494955671048609, "grad_norm": 0.684350311756134, "learning_rate": 2.709354167551272e-05, "loss": 0.1768, "step": 14703 }, { "epoch": 4.495261387954754, "grad_norm": 0.6975675225257874, "learning_rate": 2.7093117065092777e-05, "loss": 0.1694, "step": 14704 }, { "epoch": 4.495567104860899, "grad_norm": 1.6493197679519653, "learning_rate": 2.709269245467284e-05, "loss": 0.2038, "step": 14705 }, { "epoch": 4.495872821767044, "grad_norm": 0.5836867690086365, "learning_rate": 2.7092267844252898e-05, "loss": 0.1725, "step": 14706 }, { "epoch": 4.496178538673188, "grad_norm": 1.0197755098342896, "learning_rate": 2.709184323383296e-05, "loss": 0.1845, "step": 14707 }, { "epoch": 4.496484255579333, "grad_norm": 1.1208999156951904, "learning_rate": 2.709141862341302e-05, "loss": 0.203, "step": 14708 }, { "epoch": 4.4967899724854785, "grad_norm": 1.0751042366027832, "learning_rate": 2.709099401299308e-05, "loss": 0.189, "step": 14709 }, { "epoch": 4.497095689391624, "grad_norm": 0.5890744924545288, "learning_rate": 2.709056940257314e-05, "loss": 0.1295, "step": 14710 }, { "epoch": 4.497401406297768, "grad_norm": 0.6536648273468018, "learning_rate": 2.70901447921532e-05, "loss": 0.0844, "step": 14711 }, { "epoch": 4.497707123203913, "grad_norm": 0.4078136682510376, "learning_rate": 2.708972018173326e-05, "loss": 0.0701, "step": 14712 }, { "epoch": 4.498012840110058, "grad_norm": 0.3349440395832062, "learning_rate": 2.708929557131332e-05, "loss": 0.075, "step": 14713 }, { "epoch": 4.498318557016203, "grad_norm": 0.25032681226730347, "learning_rate": 2.708887096089338e-05, "loss": 0.0524, "step": 14714 }, { "epoch": 4.498624273922348, "grad_norm": 0.2766275405883789, "learning_rate": 2.708844635047344e-05, "loss": 0.0592, "step": 14715 }, { "epoch": 4.4989299908284925, "grad_norm": 0.2494514286518097, "learning_rate": 2.7088021740053502e-05, "loss": 0.0411, "step": 14716 }, { "epoch": 4.499235707734638, "grad_norm": 0.3997732102870941, "learning_rate": 2.708759712963356e-05, "loss": 0.0437, "step": 14717 }, { "epoch": 4.499541424640783, "grad_norm": 0.473504900932312, "learning_rate": 2.7087172519213623e-05, "loss": 0.0859, "step": 14718 }, { "epoch": 4.499847141546928, "grad_norm": 0.9715701341629028, "learning_rate": 2.708674790879368e-05, "loss": 0.0654, "step": 14719 }, { "epoch": 4.500152858453072, "grad_norm": 2.9738004207611084, "learning_rate": 2.7086323298373744e-05, "loss": 0.1064, "step": 14720 }, { "epoch": 4.500458575359217, "grad_norm": 0.36440402269363403, "learning_rate": 2.7085898687953802e-05, "loss": 0.0874, "step": 14721 }, { "epoch": 4.500764292265362, "grad_norm": 0.3338824510574341, "learning_rate": 2.7085474077533865e-05, "loss": 0.1069, "step": 14722 }, { "epoch": 4.5010700091715075, "grad_norm": 0.5944037437438965, "learning_rate": 2.7085049467113923e-05, "loss": 0.0924, "step": 14723 }, { "epoch": 4.501375726077653, "grad_norm": 0.9349799752235413, "learning_rate": 2.7084624856693982e-05, "loss": 0.1082, "step": 14724 }, { "epoch": 4.501681442983797, "grad_norm": 0.5763305425643921, "learning_rate": 2.7084200246274044e-05, "loss": 0.1072, "step": 14725 }, { "epoch": 4.501987159889942, "grad_norm": 0.918502926826477, "learning_rate": 2.7083775635854103e-05, "loss": 0.1594, "step": 14726 }, { "epoch": 4.502292876796087, "grad_norm": 1.1917542219161987, "learning_rate": 2.7083351025434165e-05, "loss": 0.1947, "step": 14727 }, { "epoch": 4.502598593702231, "grad_norm": 0.8161969780921936, "learning_rate": 2.7082926415014224e-05, "loss": 0.1922, "step": 14728 }, { "epoch": 4.502904310608376, "grad_norm": 0.8762136101722717, "learning_rate": 2.7082501804594286e-05, "loss": 0.23, "step": 14729 }, { "epoch": 4.5032100275145215, "grad_norm": 2.169820547103882, "learning_rate": 2.7082077194174344e-05, "loss": 0.1618, "step": 14730 }, { "epoch": 4.503515744420667, "grad_norm": 0.7765315175056458, "learning_rate": 2.7081652583754406e-05, "loss": 0.2086, "step": 14731 }, { "epoch": 4.503821461326812, "grad_norm": 1.5260450839996338, "learning_rate": 2.7081227973334465e-05, "loss": 0.1899, "step": 14732 }, { "epoch": 4.504127178232956, "grad_norm": 1.3156319856643677, "learning_rate": 2.7080803362914527e-05, "loss": 0.2158, "step": 14733 }, { "epoch": 4.504432895139101, "grad_norm": 1.8828531503677368, "learning_rate": 2.7080378752494586e-05, "loss": 0.2271, "step": 14734 }, { "epoch": 4.504738612045246, "grad_norm": 0.4742388129234314, "learning_rate": 2.7079954142074648e-05, "loss": 0.1372, "step": 14735 }, { "epoch": 4.505044328951391, "grad_norm": 0.33463945984840393, "learning_rate": 2.7079529531654707e-05, "loss": 0.0712, "step": 14736 }, { "epoch": 4.505350045857536, "grad_norm": 0.3448033630847931, "learning_rate": 2.7079104921234765e-05, "loss": 0.0747, "step": 14737 }, { "epoch": 4.505655762763681, "grad_norm": 0.35684654116630554, "learning_rate": 2.7078680310814827e-05, "loss": 0.064, "step": 14738 }, { "epoch": 4.505961479669826, "grad_norm": 0.2351716309785843, "learning_rate": 2.7078255700394886e-05, "loss": 0.0602, "step": 14739 }, { "epoch": 4.506267196575971, "grad_norm": 0.219261035323143, "learning_rate": 2.7077831089974948e-05, "loss": 0.0456, "step": 14740 }, { "epoch": 4.506572913482115, "grad_norm": 3.1828866004943848, "learning_rate": 2.7077406479555007e-05, "loss": 0.0868, "step": 14741 }, { "epoch": 4.50687863038826, "grad_norm": 0.28527623414993286, "learning_rate": 2.707698186913507e-05, "loss": 0.0592, "step": 14742 }, { "epoch": 4.507184347294405, "grad_norm": 0.2965519428253174, "learning_rate": 2.7076557258715128e-05, "loss": 0.0659, "step": 14743 }, { "epoch": 4.50749006420055, "grad_norm": 0.3076885938644409, "learning_rate": 2.707613264829519e-05, "loss": 0.0704, "step": 14744 }, { "epoch": 4.5077957811066955, "grad_norm": 0.5311720371246338, "learning_rate": 2.707570803787525e-05, "loss": 0.1254, "step": 14745 }, { "epoch": 4.50810149801284, "grad_norm": 1.5945791006088257, "learning_rate": 2.707528342745531e-05, "loss": 0.1038, "step": 14746 }, { "epoch": 4.508407214918985, "grad_norm": 0.35260578989982605, "learning_rate": 2.707485881703537e-05, "loss": 0.1064, "step": 14747 }, { "epoch": 4.50871293182513, "grad_norm": 0.6601476669311523, "learning_rate": 2.707443420661543e-05, "loss": 0.0951, "step": 14748 }, { "epoch": 4.509018648731275, "grad_norm": 1.1886520385742188, "learning_rate": 2.707400959619549e-05, "loss": 0.1735, "step": 14749 }, { "epoch": 4.50932436563742, "grad_norm": 0.5027493834495544, "learning_rate": 2.707358498577555e-05, "loss": 0.1144, "step": 14750 }, { "epoch": 4.5096300825435645, "grad_norm": 1.3971549272537231, "learning_rate": 2.707316037535561e-05, "loss": 0.1623, "step": 14751 }, { "epoch": 4.50993579944971, "grad_norm": 1.67288076877594, "learning_rate": 2.707273576493567e-05, "loss": 0.1671, "step": 14752 }, { "epoch": 4.510241516355855, "grad_norm": 1.9229497909545898, "learning_rate": 2.707231115451573e-05, "loss": 0.1761, "step": 14753 }, { "epoch": 4.510547233261999, "grad_norm": 0.835918128490448, "learning_rate": 2.707188654409579e-05, "loss": 0.152, "step": 14754 }, { "epoch": 4.510852950168144, "grad_norm": 0.6021867990493774, "learning_rate": 2.7071461933675852e-05, "loss": 0.1886, "step": 14755 }, { "epoch": 4.511158667074289, "grad_norm": 1.0740183591842651, "learning_rate": 2.707103732325591e-05, "loss": 0.1907, "step": 14756 }, { "epoch": 4.511464383980434, "grad_norm": 0.9533546566963196, "learning_rate": 2.7070612712835973e-05, "loss": 0.2025, "step": 14757 }, { "epoch": 4.511770100886579, "grad_norm": 3.2363052368164062, "learning_rate": 2.7070188102416032e-05, "loss": 0.2126, "step": 14758 }, { "epoch": 4.512075817792724, "grad_norm": 1.6013679504394531, "learning_rate": 2.7069763491996094e-05, "loss": 0.2038, "step": 14759 }, { "epoch": 4.512381534698869, "grad_norm": 0.3161483108997345, "learning_rate": 2.7069338881576156e-05, "loss": 0.1538, "step": 14760 }, { "epoch": 4.512687251605014, "grad_norm": 0.26916173100471497, "learning_rate": 2.7068914271156215e-05, "loss": 0.0648, "step": 14761 }, { "epoch": 4.512992968511159, "grad_norm": 0.26968786120414734, "learning_rate": 2.7068489660736277e-05, "loss": 0.0668, "step": 14762 }, { "epoch": 4.513298685417304, "grad_norm": 0.22486186027526855, "learning_rate": 2.7068065050316336e-05, "loss": 0.0559, "step": 14763 }, { "epoch": 4.513604402323448, "grad_norm": 0.6188250780105591, "learning_rate": 2.7067640439896398e-05, "loss": 0.0622, "step": 14764 }, { "epoch": 4.513910119229593, "grad_norm": 0.2628366947174072, "learning_rate": 2.7067215829476456e-05, "loss": 0.0539, "step": 14765 }, { "epoch": 4.5142158361357385, "grad_norm": 0.20291449129581451, "learning_rate": 2.706679121905652e-05, "loss": 0.0639, "step": 14766 }, { "epoch": 4.514521553041883, "grad_norm": 0.33971109986305237, "learning_rate": 2.7066366608636577e-05, "loss": 0.0564, "step": 14767 }, { "epoch": 4.514827269948028, "grad_norm": 0.5866163372993469, "learning_rate": 2.706594199821664e-05, "loss": 0.0769, "step": 14768 }, { "epoch": 4.515132986854173, "grad_norm": 0.39589381217956543, "learning_rate": 2.7065517387796698e-05, "loss": 0.0747, "step": 14769 }, { "epoch": 4.515438703760318, "grad_norm": 0.3922659158706665, "learning_rate": 2.706509277737676e-05, "loss": 0.1217, "step": 14770 }, { "epoch": 4.515744420666463, "grad_norm": 0.33495935797691345, "learning_rate": 2.706466816695682e-05, "loss": 0.0597, "step": 14771 }, { "epoch": 4.516050137572607, "grad_norm": 0.27315598726272583, "learning_rate": 2.706424355653688e-05, "loss": 0.0921, "step": 14772 }, { "epoch": 4.5163558544787525, "grad_norm": 1.4362210035324097, "learning_rate": 2.706381894611694e-05, "loss": 0.1206, "step": 14773 }, { "epoch": 4.516661571384898, "grad_norm": 0.4380008280277252, "learning_rate": 2.7063394335696998e-05, "loss": 0.109, "step": 14774 }, { "epoch": 4.516967288291043, "grad_norm": 1.35433828830719, "learning_rate": 2.706296972527706e-05, "loss": 0.1463, "step": 14775 }, { "epoch": 4.517273005197188, "grad_norm": 0.5674862265586853, "learning_rate": 2.706254511485712e-05, "loss": 0.1828, "step": 14776 }, { "epoch": 4.517578722103332, "grad_norm": 1.3458890914916992, "learning_rate": 2.706212050443718e-05, "loss": 0.1679, "step": 14777 }, { "epoch": 4.517884439009477, "grad_norm": 1.1777271032333374, "learning_rate": 2.706169589401724e-05, "loss": 0.1634, "step": 14778 }, { "epoch": 4.518190155915622, "grad_norm": 6.521132946014404, "learning_rate": 2.7061271283597302e-05, "loss": 0.1736, "step": 14779 }, { "epoch": 4.518495872821767, "grad_norm": 1.4661821126937866, "learning_rate": 2.706084667317736e-05, "loss": 0.1722, "step": 14780 }, { "epoch": 4.518801589727912, "grad_norm": 0.883078396320343, "learning_rate": 2.7060422062757423e-05, "loss": 0.2016, "step": 14781 }, { "epoch": 4.519107306634057, "grad_norm": 0.8692835569381714, "learning_rate": 2.705999745233748e-05, "loss": 0.1762, "step": 14782 }, { "epoch": 4.519413023540202, "grad_norm": 3.8810338973999023, "learning_rate": 2.7059572841917543e-05, "loss": 0.2196, "step": 14783 }, { "epoch": 4.519718740446347, "grad_norm": 1.8193141222000122, "learning_rate": 2.7059148231497602e-05, "loss": 0.2436, "step": 14784 }, { "epoch": 4.520024457352491, "grad_norm": 0.39066120982170105, "learning_rate": 2.7058723621077664e-05, "loss": 0.1647, "step": 14785 }, { "epoch": 4.520330174258636, "grad_norm": 0.2882489264011383, "learning_rate": 2.7058299010657723e-05, "loss": 0.0805, "step": 14786 }, { "epoch": 4.5206358911647815, "grad_norm": 0.1971578299999237, "learning_rate": 2.705787440023778e-05, "loss": 0.0672, "step": 14787 }, { "epoch": 4.520941608070927, "grad_norm": 0.22073523700237274, "learning_rate": 2.7057449789817844e-05, "loss": 0.0616, "step": 14788 }, { "epoch": 4.521247324977072, "grad_norm": 0.22122080624103546, "learning_rate": 2.7057025179397902e-05, "loss": 0.0804, "step": 14789 }, { "epoch": 4.521553041883216, "grad_norm": 0.49878954887390137, "learning_rate": 2.7056600568977965e-05, "loss": 0.0799, "step": 14790 }, { "epoch": 4.521858758789361, "grad_norm": 0.2743287980556488, "learning_rate": 2.7056175958558023e-05, "loss": 0.0516, "step": 14791 }, { "epoch": 4.522164475695506, "grad_norm": 0.9782485365867615, "learning_rate": 2.7055751348138085e-05, "loss": 0.065, "step": 14792 }, { "epoch": 4.52247019260165, "grad_norm": 0.4328322112560272, "learning_rate": 2.7055326737718144e-05, "loss": 0.0862, "step": 14793 }, { "epoch": 4.5227759095077955, "grad_norm": 0.22307279706001282, "learning_rate": 2.7054902127298206e-05, "loss": 0.0599, "step": 14794 }, { "epoch": 4.523081626413941, "grad_norm": 0.6302967071533203, "learning_rate": 2.7054477516878265e-05, "loss": 0.0915, "step": 14795 }, { "epoch": 4.523387343320086, "grad_norm": 0.3268224895000458, "learning_rate": 2.7054052906458327e-05, "loss": 0.1008, "step": 14796 }, { "epoch": 4.523693060226231, "grad_norm": 0.6864125728607178, "learning_rate": 2.7053628296038386e-05, "loss": 0.0906, "step": 14797 }, { "epoch": 4.523998777132375, "grad_norm": 0.5060561895370483, "learning_rate": 2.7053203685618448e-05, "loss": 0.1306, "step": 14798 }, { "epoch": 4.52430449403852, "grad_norm": 0.6881362199783325, "learning_rate": 2.7052779075198506e-05, "loss": 0.1218, "step": 14799 }, { "epoch": 4.524610210944665, "grad_norm": 0.5049127340316772, "learning_rate": 2.7052354464778565e-05, "loss": 0.1748, "step": 14800 }, { "epoch": 4.52491592785081, "grad_norm": 0.6985663175582886, "learning_rate": 2.7051929854358627e-05, "loss": 0.1568, "step": 14801 }, { "epoch": 4.5252216447569555, "grad_norm": 0.8005124926567078, "learning_rate": 2.7051505243938686e-05, "loss": 0.1607, "step": 14802 }, { "epoch": 4.5255273616631, "grad_norm": 0.6921008229255676, "learning_rate": 2.7051080633518748e-05, "loss": 0.1847, "step": 14803 }, { "epoch": 4.525833078569245, "grad_norm": 1.8891507387161255, "learning_rate": 2.7050656023098807e-05, "loss": 0.2029, "step": 14804 }, { "epoch": 4.52613879547539, "grad_norm": 0.6989970207214355, "learning_rate": 2.705023141267887e-05, "loss": 0.2008, "step": 14805 }, { "epoch": 4.526444512381534, "grad_norm": 1.0183818340301514, "learning_rate": 2.7049806802258927e-05, "loss": 0.1784, "step": 14806 }, { "epoch": 4.526750229287679, "grad_norm": 1.0845355987548828, "learning_rate": 2.704938219183899e-05, "loss": 0.1979, "step": 14807 }, { "epoch": 4.5270559461938245, "grad_norm": 1.1368950605392456, "learning_rate": 2.7048957581419048e-05, "loss": 0.2297, "step": 14808 }, { "epoch": 4.52736166309997, "grad_norm": 1.7636812925338745, "learning_rate": 2.704853297099911e-05, "loss": 0.2534, "step": 14809 }, { "epoch": 4.527667380006115, "grad_norm": 0.7931777834892273, "learning_rate": 2.704810836057917e-05, "loss": 0.1387, "step": 14810 }, { "epoch": 4.527973096912259, "grad_norm": 0.26984405517578125, "learning_rate": 2.704768375015923e-05, "loss": 0.0868, "step": 14811 }, { "epoch": 4.528278813818404, "grad_norm": 0.18215540051460266, "learning_rate": 2.704725913973929e-05, "loss": 0.0667, "step": 14812 }, { "epoch": 4.528584530724549, "grad_norm": 1.4925904273986816, "learning_rate": 2.704683452931935e-05, "loss": 0.0571, "step": 14813 }, { "epoch": 4.528890247630694, "grad_norm": 0.2931714653968811, "learning_rate": 2.704640991889941e-05, "loss": 0.0684, "step": 14814 }, { "epoch": 4.529195964536839, "grad_norm": 0.315741628408432, "learning_rate": 2.704598530847947e-05, "loss": 0.0415, "step": 14815 }, { "epoch": 4.529501681442984, "grad_norm": 0.5511412620544434, "learning_rate": 2.704556069805953e-05, "loss": 0.0659, "step": 14816 }, { "epoch": 4.529807398349129, "grad_norm": 0.31155145168304443, "learning_rate": 2.704513608763959e-05, "loss": 0.0725, "step": 14817 }, { "epoch": 4.530113115255274, "grad_norm": 0.47198551893234253, "learning_rate": 2.7044711477219652e-05, "loss": 0.0735, "step": 14818 }, { "epoch": 4.530418832161418, "grad_norm": 0.32094088196754456, "learning_rate": 2.704428686679971e-05, "loss": 0.0757, "step": 14819 }, { "epoch": 4.530724549067563, "grad_norm": 0.2514004111289978, "learning_rate": 2.7043862256379773e-05, "loss": 0.0938, "step": 14820 }, { "epoch": 4.531030265973708, "grad_norm": 0.3013887107372284, "learning_rate": 2.704343764595983e-05, "loss": 0.0448, "step": 14821 }, { "epoch": 4.531335982879853, "grad_norm": 0.6414461135864258, "learning_rate": 2.7043013035539894e-05, "loss": 0.0918, "step": 14822 }, { "epoch": 4.5316416997859985, "grad_norm": 0.9852994084358215, "learning_rate": 2.7042588425119952e-05, "loss": 0.1316, "step": 14823 }, { "epoch": 4.531947416692143, "grad_norm": 0.5713377594947815, "learning_rate": 2.7042163814700015e-05, "loss": 0.1359, "step": 14824 }, { "epoch": 4.532253133598288, "grad_norm": 0.3797421157360077, "learning_rate": 2.7041739204280073e-05, "loss": 0.1503, "step": 14825 }, { "epoch": 4.532558850504433, "grad_norm": 0.6027655601501465, "learning_rate": 2.7041314593860132e-05, "loss": 0.1339, "step": 14826 }, { "epoch": 4.532864567410578, "grad_norm": 0.6524612307548523, "learning_rate": 2.7040889983440194e-05, "loss": 0.187, "step": 14827 }, { "epoch": 4.533170284316723, "grad_norm": 0.7371341586112976, "learning_rate": 2.7040465373020253e-05, "loss": 0.1618, "step": 14828 }, { "epoch": 4.533476001222867, "grad_norm": 0.9278116226196289, "learning_rate": 2.7040040762600315e-05, "loss": 0.1758, "step": 14829 }, { "epoch": 4.5337817181290125, "grad_norm": 0.5187748670578003, "learning_rate": 2.7039616152180374e-05, "loss": 0.2073, "step": 14830 }, { "epoch": 4.534087435035158, "grad_norm": 1.463538408279419, "learning_rate": 2.7039191541760436e-05, "loss": 0.2052, "step": 14831 }, { "epoch": 4.534393151941302, "grad_norm": 0.9562026858329773, "learning_rate": 2.7038766931340494e-05, "loss": 0.2131, "step": 14832 }, { "epoch": 4.534698868847447, "grad_norm": 1.1933927536010742, "learning_rate": 2.7038342320920556e-05, "loss": 0.1654, "step": 14833 }, { "epoch": 4.535004585753592, "grad_norm": 1.7027921676635742, "learning_rate": 2.7037917710500615e-05, "loss": 0.2772, "step": 14834 }, { "epoch": 4.535310302659737, "grad_norm": 0.4133804738521576, "learning_rate": 2.7037493100080677e-05, "loss": 0.1493, "step": 14835 }, { "epoch": 4.535616019565882, "grad_norm": 0.2961902916431427, "learning_rate": 2.7037068489660736e-05, "loss": 0.0856, "step": 14836 }, { "epoch": 4.535921736472027, "grad_norm": 0.5693686008453369, "learning_rate": 2.7036643879240798e-05, "loss": 0.0698, "step": 14837 }, { "epoch": 4.536227453378172, "grad_norm": 0.20612819492816925, "learning_rate": 2.7036219268820857e-05, "loss": 0.0553, "step": 14838 }, { "epoch": 4.536533170284317, "grad_norm": 0.33406367897987366, "learning_rate": 2.7035794658400915e-05, "loss": 0.0506, "step": 14839 }, { "epoch": 4.536838887190462, "grad_norm": 0.4911133050918579, "learning_rate": 2.7035370047980977e-05, "loss": 0.0565, "step": 14840 }, { "epoch": 4.537144604096607, "grad_norm": 0.6341552138328552, "learning_rate": 2.7034945437561036e-05, "loss": 0.0612, "step": 14841 }, { "epoch": 4.537450321002751, "grad_norm": 0.6239396929740906, "learning_rate": 2.7034520827141098e-05, "loss": 0.0694, "step": 14842 }, { "epoch": 4.537756037908896, "grad_norm": 0.21840937435626984, "learning_rate": 2.7034096216721157e-05, "loss": 0.0919, "step": 14843 }, { "epoch": 4.5380617548150415, "grad_norm": 0.31200864911079407, "learning_rate": 2.703367160630122e-05, "loss": 0.0534, "step": 14844 }, { "epoch": 4.538367471721186, "grad_norm": 0.5030868649482727, "learning_rate": 2.7033246995881278e-05, "loss": 0.1237, "step": 14845 }, { "epoch": 4.538673188627331, "grad_norm": 0.3357924520969391, "learning_rate": 2.703282238546134e-05, "loss": 0.0938, "step": 14846 }, { "epoch": 4.538978905533476, "grad_norm": 0.4850807189941406, "learning_rate": 2.70323977750414e-05, "loss": 0.0877, "step": 14847 }, { "epoch": 4.539284622439621, "grad_norm": 0.9268652200698853, "learning_rate": 2.703197316462146e-05, "loss": 0.1255, "step": 14848 }, { "epoch": 4.539590339345766, "grad_norm": 0.522497296333313, "learning_rate": 2.703154855420152e-05, "loss": 0.1165, "step": 14849 }, { "epoch": 4.53989605625191, "grad_norm": 0.6166905164718628, "learning_rate": 2.703112394378158e-05, "loss": 0.1256, "step": 14850 }, { "epoch": 4.5402017731580555, "grad_norm": 0.48097914457321167, "learning_rate": 2.703069933336164e-05, "loss": 0.1547, "step": 14851 }, { "epoch": 4.540507490064201, "grad_norm": 1.7278571128845215, "learning_rate": 2.70302747229417e-05, "loss": 0.1453, "step": 14852 }, { "epoch": 4.540813206970346, "grad_norm": 0.4890095591545105, "learning_rate": 2.702985011252176e-05, "loss": 0.1538, "step": 14853 }, { "epoch": 4.541118923876491, "grad_norm": 0.6399084329605103, "learning_rate": 2.702942550210182e-05, "loss": 0.1852, "step": 14854 }, { "epoch": 4.541424640782635, "grad_norm": 1.4783660173416138, "learning_rate": 2.7029000891681882e-05, "loss": 0.2181, "step": 14855 }, { "epoch": 4.54173035768878, "grad_norm": 0.9138280749320984, "learning_rate": 2.702857628126194e-05, "loss": 0.2099, "step": 14856 }, { "epoch": 4.542036074594925, "grad_norm": 0.9538041949272156, "learning_rate": 2.7028151670842002e-05, "loss": 0.1746, "step": 14857 }, { "epoch": 4.5423417915010695, "grad_norm": 1.2266292572021484, "learning_rate": 2.702772706042206e-05, "loss": 0.1922, "step": 14858 }, { "epoch": 4.542647508407215, "grad_norm": 1.1527867317199707, "learning_rate": 2.7027302450002123e-05, "loss": 0.2388, "step": 14859 }, { "epoch": 4.54295322531336, "grad_norm": 0.5505346059799194, "learning_rate": 2.7026877839582182e-05, "loss": 0.1532, "step": 14860 }, { "epoch": 4.543258942219505, "grad_norm": 0.3910045623779297, "learning_rate": 2.7026453229162244e-05, "loss": 0.075, "step": 14861 }, { "epoch": 4.54356465912565, "grad_norm": 0.314776748418808, "learning_rate": 2.7026028618742306e-05, "loss": 0.0746, "step": 14862 }, { "epoch": 4.543870376031794, "grad_norm": 0.3686797618865967, "learning_rate": 2.7025604008322368e-05, "loss": 0.0667, "step": 14863 }, { "epoch": 4.544176092937939, "grad_norm": 0.2034263163805008, "learning_rate": 2.7025179397902427e-05, "loss": 0.0534, "step": 14864 }, { "epoch": 4.5444818098440845, "grad_norm": 0.18224723637104034, "learning_rate": 2.7024754787482486e-05, "loss": 0.0652, "step": 14865 }, { "epoch": 4.54478752675023, "grad_norm": 0.5696367621421814, "learning_rate": 2.7024330177062548e-05, "loss": 0.0549, "step": 14866 }, { "epoch": 4.545093243656375, "grad_norm": 0.5601726174354553, "learning_rate": 2.7023905566642606e-05, "loss": 0.0697, "step": 14867 }, { "epoch": 4.545398960562519, "grad_norm": 0.4659078121185303, "learning_rate": 2.702348095622267e-05, "loss": 0.0818, "step": 14868 }, { "epoch": 4.545704677468664, "grad_norm": 0.3706096410751343, "learning_rate": 2.7023056345802727e-05, "loss": 0.0653, "step": 14869 }, { "epoch": 4.546010394374809, "grad_norm": 0.2835104763507843, "learning_rate": 2.702263173538279e-05, "loss": 0.0874, "step": 14870 }, { "epoch": 4.546316111280953, "grad_norm": 0.5863632559776306, "learning_rate": 2.7022207124962848e-05, "loss": 0.092, "step": 14871 }, { "epoch": 4.5466218281870985, "grad_norm": 1.002048373222351, "learning_rate": 2.702178251454291e-05, "loss": 0.0964, "step": 14872 }, { "epoch": 4.546927545093244, "grad_norm": 0.598137378692627, "learning_rate": 2.702135790412297e-05, "loss": 0.1245, "step": 14873 }, { "epoch": 4.547233261999389, "grad_norm": 0.41773661971092224, "learning_rate": 2.702093329370303e-05, "loss": 0.1268, "step": 14874 }, { "epoch": 4.547538978905534, "grad_norm": 1.2518714666366577, "learning_rate": 2.702050868328309e-05, "loss": 0.1226, "step": 14875 }, { "epoch": 4.547844695811678, "grad_norm": 0.5120351314544678, "learning_rate": 2.702008407286315e-05, "loss": 0.1664, "step": 14876 }, { "epoch": 4.548150412717823, "grad_norm": 0.5585694909095764, "learning_rate": 2.701965946244321e-05, "loss": 0.1688, "step": 14877 }, { "epoch": 4.548456129623968, "grad_norm": 0.9118504524230957, "learning_rate": 2.701923485202327e-05, "loss": 0.1772, "step": 14878 }, { "epoch": 4.548761846530113, "grad_norm": 1.1457693576812744, "learning_rate": 2.701881024160333e-05, "loss": 0.2184, "step": 14879 }, { "epoch": 4.5490675634362585, "grad_norm": 1.2347692251205444, "learning_rate": 2.701838563118339e-05, "loss": 0.1931, "step": 14880 }, { "epoch": 4.549373280342403, "grad_norm": 1.580915927886963, "learning_rate": 2.7017961020763452e-05, "loss": 0.1997, "step": 14881 }, { "epoch": 4.549678997248548, "grad_norm": 0.5547152757644653, "learning_rate": 2.701753641034351e-05, "loss": 0.1617, "step": 14882 }, { "epoch": 4.549984714154693, "grad_norm": 1.2903021574020386, "learning_rate": 2.7017111799923573e-05, "loss": 0.1943, "step": 14883 }, { "epoch": 4.550290431060837, "grad_norm": 3.3250350952148438, "learning_rate": 2.701668718950363e-05, "loss": 0.2621, "step": 14884 }, { "epoch": 4.550596147966982, "grad_norm": 0.4999689757823944, "learning_rate": 2.7016262579083694e-05, "loss": 0.1105, "step": 14885 }, { "epoch": 4.550901864873127, "grad_norm": 0.3333258032798767, "learning_rate": 2.7015837968663752e-05, "loss": 0.0866, "step": 14886 }, { "epoch": 4.5512075817792725, "grad_norm": 0.5079188942909241, "learning_rate": 2.7015413358243814e-05, "loss": 0.1186, "step": 14887 }, { "epoch": 4.551513298685418, "grad_norm": 0.24999099969863892, "learning_rate": 2.7014988747823873e-05, "loss": 0.0561, "step": 14888 }, { "epoch": 4.551819015591562, "grad_norm": 1.154921293258667, "learning_rate": 2.7014564137403932e-05, "loss": 0.0554, "step": 14889 }, { "epoch": 4.552124732497707, "grad_norm": 0.6944872140884399, "learning_rate": 2.7014139526983994e-05, "loss": 0.0448, "step": 14890 }, { "epoch": 4.552430449403852, "grad_norm": 0.2704637944698334, "learning_rate": 2.7013714916564052e-05, "loss": 0.0661, "step": 14891 }, { "epoch": 4.552736166309997, "grad_norm": 0.44853100180625916, "learning_rate": 2.7013290306144115e-05, "loss": 0.0852, "step": 14892 }, { "epoch": 4.553041883216142, "grad_norm": 0.34047186374664307, "learning_rate": 2.7012865695724173e-05, "loss": 0.0737, "step": 14893 }, { "epoch": 4.553347600122287, "grad_norm": 0.30823782086372375, "learning_rate": 2.7012441085304235e-05, "loss": 0.0693, "step": 14894 }, { "epoch": 4.553653317028432, "grad_norm": 0.4286544919013977, "learning_rate": 2.7012016474884294e-05, "loss": 0.1039, "step": 14895 }, { "epoch": 4.553959033934577, "grad_norm": 0.5385163426399231, "learning_rate": 2.7011591864464356e-05, "loss": 0.0794, "step": 14896 }, { "epoch": 4.554264750840721, "grad_norm": 0.7024672627449036, "learning_rate": 2.7011167254044415e-05, "loss": 0.0804, "step": 14897 }, { "epoch": 4.554570467746866, "grad_norm": 0.3335869014263153, "learning_rate": 2.7010742643624477e-05, "loss": 0.1154, "step": 14898 }, { "epoch": 4.554876184653011, "grad_norm": 0.7063404321670532, "learning_rate": 2.7010318033204536e-05, "loss": 0.0911, "step": 14899 }, { "epoch": 4.555181901559156, "grad_norm": 0.32749080657958984, "learning_rate": 2.7009893422784598e-05, "loss": 0.127, "step": 14900 }, { "epoch": 4.5554876184653015, "grad_norm": 0.8630606532096863, "learning_rate": 2.7009468812364656e-05, "loss": 0.1923, "step": 14901 }, { "epoch": 4.555793335371446, "grad_norm": 0.4955720901489258, "learning_rate": 2.7009044201944715e-05, "loss": 0.1438, "step": 14902 }, { "epoch": 4.556099052277591, "grad_norm": 0.6138044595718384, "learning_rate": 2.7008619591524777e-05, "loss": 0.2111, "step": 14903 }, { "epoch": 4.556404769183736, "grad_norm": 0.6516501307487488, "learning_rate": 2.7008194981104836e-05, "loss": 0.1741, "step": 14904 }, { "epoch": 4.556710486089881, "grad_norm": 0.9500826001167297, "learning_rate": 2.7007770370684898e-05, "loss": 0.2289, "step": 14905 }, { "epoch": 4.557016202996026, "grad_norm": 0.6472358107566833, "learning_rate": 2.7007345760264957e-05, "loss": 0.1931, "step": 14906 }, { "epoch": 4.55732191990217, "grad_norm": 0.9247979521751404, "learning_rate": 2.700692114984502e-05, "loss": 0.1822, "step": 14907 }, { "epoch": 4.5576276368083155, "grad_norm": 0.9206299185752869, "learning_rate": 2.7006496539425078e-05, "loss": 0.18, "step": 14908 }, { "epoch": 4.557933353714461, "grad_norm": 2.778468370437622, "learning_rate": 2.700607192900514e-05, "loss": 0.2321, "step": 14909 }, { "epoch": 4.558239070620605, "grad_norm": 0.31127992272377014, "learning_rate": 2.7005647318585198e-05, "loss": 0.1582, "step": 14910 }, { "epoch": 4.55854478752675, "grad_norm": 0.2955670654773712, "learning_rate": 2.700522270816526e-05, "loss": 0.0714, "step": 14911 }, { "epoch": 4.558850504432895, "grad_norm": 0.3444644808769226, "learning_rate": 2.700479809774532e-05, "loss": 0.0565, "step": 14912 }, { "epoch": 4.55915622133904, "grad_norm": 0.1735016405582428, "learning_rate": 2.700437348732538e-05, "loss": 0.0565, "step": 14913 }, { "epoch": 4.559461938245185, "grad_norm": 0.19415093958377838, "learning_rate": 2.700394887690544e-05, "loss": 0.0473, "step": 14914 }, { "epoch": 4.5597676551513295, "grad_norm": 0.3808739483356476, "learning_rate": 2.70035242664855e-05, "loss": 0.0696, "step": 14915 }, { "epoch": 4.560073372057475, "grad_norm": 0.40080592036247253, "learning_rate": 2.700309965606556e-05, "loss": 0.0483, "step": 14916 }, { "epoch": 4.56037908896362, "grad_norm": 0.2668180763721466, "learning_rate": 2.700267504564562e-05, "loss": 0.0815, "step": 14917 }, { "epoch": 4.560684805869765, "grad_norm": 0.5147748589515686, "learning_rate": 2.700225043522568e-05, "loss": 0.0476, "step": 14918 }, { "epoch": 4.56099052277591, "grad_norm": 0.2653060853481293, "learning_rate": 2.700182582480574e-05, "loss": 0.0795, "step": 14919 }, { "epoch": 4.561296239682054, "grad_norm": 0.39520612359046936, "learning_rate": 2.7001401214385802e-05, "loss": 0.1199, "step": 14920 }, { "epoch": 4.561601956588199, "grad_norm": 1.7614059448242188, "learning_rate": 2.700097660396586e-05, "loss": 0.0881, "step": 14921 }, { "epoch": 4.5619076734943445, "grad_norm": 0.2555614411830902, "learning_rate": 2.7000551993545923e-05, "loss": 0.0738, "step": 14922 }, { "epoch": 4.562213390400489, "grad_norm": 0.5330464839935303, "learning_rate": 2.7000127383125982e-05, "loss": 0.1352, "step": 14923 }, { "epoch": 4.562519107306634, "grad_norm": 0.5741540789604187, "learning_rate": 2.6999702772706044e-05, "loss": 0.1164, "step": 14924 }, { "epoch": 4.562824824212779, "grad_norm": 0.9640870690345764, "learning_rate": 2.6999278162286103e-05, "loss": 0.1282, "step": 14925 }, { "epoch": 4.563130541118924, "grad_norm": 0.5625866651535034, "learning_rate": 2.6998853551866165e-05, "loss": 0.1592, "step": 14926 }, { "epoch": 4.563436258025069, "grad_norm": 0.972978949546814, "learning_rate": 2.6998428941446223e-05, "loss": 0.1756, "step": 14927 }, { "epoch": 4.563741974931213, "grad_norm": 0.9643951058387756, "learning_rate": 2.6998004331026282e-05, "loss": 0.1849, "step": 14928 }, { "epoch": 4.5640476918373585, "grad_norm": 0.6974555253982544, "learning_rate": 2.6997579720606344e-05, "loss": 0.1711, "step": 14929 }, { "epoch": 4.564353408743504, "grad_norm": 1.5074429512023926, "learning_rate": 2.6997155110186403e-05, "loss": 0.1685, "step": 14930 }, { "epoch": 4.564659125649649, "grad_norm": 1.2557170391082764, "learning_rate": 2.6996730499766465e-05, "loss": 0.1864, "step": 14931 }, { "epoch": 4.564964842555794, "grad_norm": 1.0719654560089111, "learning_rate": 2.6996305889346524e-05, "loss": 0.1926, "step": 14932 }, { "epoch": 4.565270559461938, "grad_norm": 1.0041496753692627, "learning_rate": 2.6995881278926586e-05, "loss": 0.2365, "step": 14933 }, { "epoch": 4.565576276368083, "grad_norm": 1.2809662818908691, "learning_rate": 2.6995456668506644e-05, "loss": 0.2212, "step": 14934 }, { "epoch": 4.565881993274228, "grad_norm": 0.6069086790084839, "learning_rate": 2.6995032058086706e-05, "loss": 0.1272, "step": 14935 }, { "epoch": 4.5661877101803725, "grad_norm": 0.2804504930973053, "learning_rate": 2.6994607447666765e-05, "loss": 0.0727, "step": 14936 }, { "epoch": 4.566493427086518, "grad_norm": 0.3524111211299896, "learning_rate": 2.6994182837246827e-05, "loss": 0.0635, "step": 14937 }, { "epoch": 4.566799143992663, "grad_norm": 0.32603222131729126, "learning_rate": 2.6993758226826886e-05, "loss": 0.0833, "step": 14938 }, { "epoch": 4.567104860898808, "grad_norm": 0.2473733276128769, "learning_rate": 2.6993333616406948e-05, "loss": 0.0509, "step": 14939 }, { "epoch": 4.567410577804953, "grad_norm": 0.30418989062309265, "learning_rate": 2.6992909005987007e-05, "loss": 0.0761, "step": 14940 }, { "epoch": 4.567716294711097, "grad_norm": 0.35203853249549866, "learning_rate": 2.6992484395567065e-05, "loss": 0.0477, "step": 14941 }, { "epoch": 4.568022011617242, "grad_norm": 0.46780863404273987, "learning_rate": 2.6992059785147128e-05, "loss": 0.0642, "step": 14942 }, { "epoch": 4.568327728523387, "grad_norm": 0.42103323340415955, "learning_rate": 2.6991635174727186e-05, "loss": 0.104, "step": 14943 }, { "epoch": 4.5686334454295325, "grad_norm": 0.3064846694469452, "learning_rate": 2.6991210564307248e-05, "loss": 0.0653, "step": 14944 }, { "epoch": 4.568939162335678, "grad_norm": 0.4001339077949524, "learning_rate": 2.6990785953887307e-05, "loss": 0.0928, "step": 14945 }, { "epoch": 4.569244879241822, "grad_norm": 0.5763856768608093, "learning_rate": 2.699036134346737e-05, "loss": 0.0866, "step": 14946 }, { "epoch": 4.569550596147967, "grad_norm": 0.35504984855651855, "learning_rate": 2.6989936733047428e-05, "loss": 0.0807, "step": 14947 }, { "epoch": 4.569856313054112, "grad_norm": 0.5493974685668945, "learning_rate": 2.698951212262749e-05, "loss": 0.1414, "step": 14948 }, { "epoch": 4.570162029960256, "grad_norm": 0.6327942609786987, "learning_rate": 2.698908751220755e-05, "loss": 0.12, "step": 14949 }, { "epoch": 4.5704677468664014, "grad_norm": 0.41471877694129944, "learning_rate": 2.698866290178761e-05, "loss": 0.1384, "step": 14950 }, { "epoch": 4.570773463772547, "grad_norm": 1.077652096748352, "learning_rate": 2.698823829136767e-05, "loss": 0.1938, "step": 14951 }, { "epoch": 4.571079180678692, "grad_norm": 0.4237973690032959, "learning_rate": 2.698781368094773e-05, "loss": 0.1428, "step": 14952 }, { "epoch": 4.571384897584837, "grad_norm": 0.48982328176498413, "learning_rate": 2.698738907052779e-05, "loss": 0.2036, "step": 14953 }, { "epoch": 4.571690614490981, "grad_norm": 2.5601966381073, "learning_rate": 2.698696446010785e-05, "loss": 0.1769, "step": 14954 }, { "epoch": 4.571996331397126, "grad_norm": 0.831434965133667, "learning_rate": 2.698653984968791e-05, "loss": 0.1813, "step": 14955 }, { "epoch": 4.572302048303271, "grad_norm": 1.0109599828720093, "learning_rate": 2.698611523926797e-05, "loss": 0.1873, "step": 14956 }, { "epoch": 4.572607765209416, "grad_norm": 0.9640165567398071, "learning_rate": 2.6985690628848032e-05, "loss": 0.2264, "step": 14957 }, { "epoch": 4.5729134821155615, "grad_norm": 0.7435038685798645, "learning_rate": 2.698526601842809e-05, "loss": 0.2108, "step": 14958 }, { "epoch": 4.573219199021706, "grad_norm": 2.236658811569214, "learning_rate": 2.6984841408008153e-05, "loss": 0.2791, "step": 14959 }, { "epoch": 4.573524915927851, "grad_norm": 0.49283018708229065, "learning_rate": 2.698441679758821e-05, "loss": 0.1284, "step": 14960 }, { "epoch": 4.573830632833996, "grad_norm": 0.20558547973632812, "learning_rate": 2.6983992187168273e-05, "loss": 0.0851, "step": 14961 }, { "epoch": 4.57413634974014, "grad_norm": 0.21827438473701477, "learning_rate": 2.6983567576748332e-05, "loss": 0.0658, "step": 14962 }, { "epoch": 4.574442066646285, "grad_norm": 0.2595449686050415, "learning_rate": 2.6983142966328394e-05, "loss": 0.0662, "step": 14963 }, { "epoch": 4.57474778355243, "grad_norm": 0.18963636457920074, "learning_rate": 2.6982718355908453e-05, "loss": 0.0507, "step": 14964 }, { "epoch": 4.5750535004585755, "grad_norm": 0.3720925748348236, "learning_rate": 2.6982293745488515e-05, "loss": 0.0947, "step": 14965 }, { "epoch": 4.575359217364721, "grad_norm": 0.7021269798278809, "learning_rate": 2.6981869135068577e-05, "loss": 0.0419, "step": 14966 }, { "epoch": 4.575664934270865, "grad_norm": 0.23062840104103088, "learning_rate": 2.6981444524648636e-05, "loss": 0.079, "step": 14967 }, { "epoch": 4.57597065117701, "grad_norm": 0.3909221589565277, "learning_rate": 2.6981019914228698e-05, "loss": 0.0619, "step": 14968 }, { "epoch": 4.576276368083155, "grad_norm": 0.35167932510375977, "learning_rate": 2.6980595303808756e-05, "loss": 0.0767, "step": 14969 }, { "epoch": 4.5765820849893, "grad_norm": 0.37443622946739197, "learning_rate": 2.698017069338882e-05, "loss": 0.093, "step": 14970 }, { "epoch": 4.576887801895445, "grad_norm": 0.2960302233695984, "learning_rate": 2.6979746082968877e-05, "loss": 0.092, "step": 14971 }, { "epoch": 4.5771935188015895, "grad_norm": 0.2280861884355545, "learning_rate": 2.697932147254894e-05, "loss": 0.0891, "step": 14972 }, { "epoch": 4.577499235707735, "grad_norm": 0.45881927013397217, "learning_rate": 2.6978896862128998e-05, "loss": 0.1222, "step": 14973 }, { "epoch": 4.57780495261388, "grad_norm": 0.8559582829475403, "learning_rate": 2.697847225170906e-05, "loss": 0.1726, "step": 14974 }, { "epoch": 4.578110669520024, "grad_norm": 0.6454943418502808, "learning_rate": 2.697804764128912e-05, "loss": 0.1345, "step": 14975 }, { "epoch": 4.578416386426169, "grad_norm": 0.40975528955459595, "learning_rate": 2.697762303086918e-05, "loss": 0.1661, "step": 14976 }, { "epoch": 4.578722103332314, "grad_norm": 1.1681833267211914, "learning_rate": 2.697719842044924e-05, "loss": 0.1721, "step": 14977 }, { "epoch": 4.579027820238459, "grad_norm": 1.1271781921386719, "learning_rate": 2.6976773810029302e-05, "loss": 0.18, "step": 14978 }, { "epoch": 4.5793335371446044, "grad_norm": 0.9664425253868103, "learning_rate": 2.697634919960936e-05, "loss": 0.177, "step": 14979 }, { "epoch": 4.579639254050749, "grad_norm": 0.6236177086830139, "learning_rate": 2.697592458918942e-05, "loss": 0.2033, "step": 14980 }, { "epoch": 4.579944970956894, "grad_norm": 1.2009315490722656, "learning_rate": 2.697549997876948e-05, "loss": 0.1812, "step": 14981 }, { "epoch": 4.580250687863039, "grad_norm": 1.392148494720459, "learning_rate": 2.697507536834954e-05, "loss": 0.1974, "step": 14982 }, { "epoch": 4.580556404769184, "grad_norm": 0.8505409955978394, "learning_rate": 2.6974650757929602e-05, "loss": 0.1906, "step": 14983 }, { "epoch": 4.580862121675329, "grad_norm": 1.0854783058166504, "learning_rate": 2.697422614750966e-05, "loss": 0.2498, "step": 14984 }, { "epoch": 4.581167838581473, "grad_norm": 0.5763256549835205, "learning_rate": 2.6973801537089723e-05, "loss": 0.136, "step": 14985 }, { "epoch": 4.5814735554876185, "grad_norm": 0.2716992199420929, "learning_rate": 2.697337692666978e-05, "loss": 0.0671, "step": 14986 }, { "epoch": 4.581779272393764, "grad_norm": 0.30716681480407715, "learning_rate": 2.6972952316249844e-05, "loss": 0.0573, "step": 14987 }, { "epoch": 4.582084989299908, "grad_norm": 1.1491304636001587, "learning_rate": 2.6972527705829902e-05, "loss": 0.065, "step": 14988 }, { "epoch": 4.582390706206053, "grad_norm": 0.2500634789466858, "learning_rate": 2.6972103095409964e-05, "loss": 0.0628, "step": 14989 }, { "epoch": 4.582696423112198, "grad_norm": 0.639246940612793, "learning_rate": 2.6971678484990023e-05, "loss": 0.0958, "step": 14990 }, { "epoch": 4.583002140018343, "grad_norm": 0.34211477637290955, "learning_rate": 2.6971253874570085e-05, "loss": 0.0911, "step": 14991 }, { "epoch": 4.583307856924488, "grad_norm": 0.868474006652832, "learning_rate": 2.6970829264150144e-05, "loss": 0.0668, "step": 14992 }, { "epoch": 4.5836135738306325, "grad_norm": 0.23038412630558014, "learning_rate": 2.6970404653730203e-05, "loss": 0.0647, "step": 14993 }, { "epoch": 4.583919290736778, "grad_norm": 0.31389015913009644, "learning_rate": 2.6969980043310265e-05, "loss": 0.0534, "step": 14994 }, { "epoch": 4.584225007642923, "grad_norm": 0.4099172055721283, "learning_rate": 2.6969555432890323e-05, "loss": 0.1133, "step": 14995 }, { "epoch": 4.584530724549068, "grad_norm": 0.34988707304000854, "learning_rate": 2.6969130822470385e-05, "loss": 0.09, "step": 14996 }, { "epoch": 4.584836441455213, "grad_norm": 0.3698484003543854, "learning_rate": 2.6968706212050444e-05, "loss": 0.1021, "step": 14997 }, { "epoch": 4.585142158361357, "grad_norm": 0.4871501922607422, "learning_rate": 2.6968281601630506e-05, "loss": 0.1323, "step": 14998 }, { "epoch": 4.585447875267502, "grad_norm": 0.8304914832115173, "learning_rate": 2.6967856991210565e-05, "loss": 0.1211, "step": 14999 }, { "epoch": 4.585753592173647, "grad_norm": 0.3969796299934387, "learning_rate": 2.6967432380790627e-05, "loss": 0.1586, "step": 15000 }, { "epoch": 4.585753592173647, "eval_cer": 0.18941551285828068, "eval_loss": 0.240526020526886, "eval_runtime": 19.1934, "eval_samples_per_second": 236.435, "eval_steps_per_second": 0.782, "eval_wer": 0.33351023714421385, "step": 15000 }, { "epoch": 4.586059309079792, "grad_norm": 0.6260948181152344, "learning_rate": 2.6967007770370686e-05, "loss": 0.1371, "step": 15001 }, { "epoch": 4.586365025985937, "grad_norm": 0.5781628489494324, "learning_rate": 2.6966583159950748e-05, "loss": 0.1742, "step": 15002 }, { "epoch": 4.586670742892082, "grad_norm": 0.5682535171508789, "learning_rate": 2.6966158549530806e-05, "loss": 0.2109, "step": 15003 }, { "epoch": 4.586976459798227, "grad_norm": 1.506842017173767, "learning_rate": 2.6965733939110865e-05, "loss": 0.1972, "step": 15004 }, { "epoch": 4.587282176704372, "grad_norm": 1.0217894315719604, "learning_rate": 2.6965309328690927e-05, "loss": 0.1575, "step": 15005 }, { "epoch": 4.587587893610516, "grad_norm": 4.185604095458984, "learning_rate": 2.6964884718270986e-05, "loss": 0.2213, "step": 15006 }, { "epoch": 4.5878936105166614, "grad_norm": 1.4952973127365112, "learning_rate": 2.6964460107851048e-05, "loss": 0.2096, "step": 15007 }, { "epoch": 4.588199327422807, "grad_norm": 1.0874594449996948, "learning_rate": 2.6964035497431107e-05, "loss": 0.1957, "step": 15008 }, { "epoch": 4.588505044328952, "grad_norm": 0.8711608648300171, "learning_rate": 2.696361088701117e-05, "loss": 0.2458, "step": 15009 }, { "epoch": 4.588810761235097, "grad_norm": 0.25036147236824036, "learning_rate": 2.6963186276591228e-05, "loss": 0.1513, "step": 15010 }, { "epoch": 4.589116478141241, "grad_norm": 0.29544931650161743, "learning_rate": 2.696276166617129e-05, "loss": 0.1203, "step": 15011 }, { "epoch": 4.589422195047386, "grad_norm": 0.4793064594268799, "learning_rate": 2.696233705575135e-05, "loss": 0.0663, "step": 15012 }, { "epoch": 4.589727911953531, "grad_norm": 0.20759721100330353, "learning_rate": 2.696191244533141e-05, "loss": 0.0583, "step": 15013 }, { "epoch": 4.5900336288596755, "grad_norm": 0.17350339889526367, "learning_rate": 2.696148783491147e-05, "loss": 0.0554, "step": 15014 }, { "epoch": 4.590339345765821, "grad_norm": 0.38694337010383606, "learning_rate": 2.696106322449153e-05, "loss": 0.0382, "step": 15015 }, { "epoch": 4.590645062671966, "grad_norm": 0.2642088532447815, "learning_rate": 2.696063861407159e-05, "loss": 0.0596, "step": 15016 }, { "epoch": 4.590950779578111, "grad_norm": 0.32291319966316223, "learning_rate": 2.696021400365165e-05, "loss": 0.062, "step": 15017 }, { "epoch": 4.591256496484256, "grad_norm": 0.3639121651649475, "learning_rate": 2.695978939323171e-05, "loss": 0.0685, "step": 15018 }, { "epoch": 4.5915622133904, "grad_norm": 0.8569014072418213, "learning_rate": 2.695936478281177e-05, "loss": 0.0648, "step": 15019 }, { "epoch": 4.591867930296545, "grad_norm": 0.5733409523963928, "learning_rate": 2.695894017239183e-05, "loss": 0.0963, "step": 15020 }, { "epoch": 4.59217364720269, "grad_norm": 0.33063700795173645, "learning_rate": 2.695851556197189e-05, "loss": 0.0774, "step": 15021 }, { "epoch": 4.5924793641088355, "grad_norm": 0.7219920754432678, "learning_rate": 2.6958090951551952e-05, "loss": 0.0899, "step": 15022 }, { "epoch": 4.592785081014981, "grad_norm": 0.7562730312347412, "learning_rate": 2.695766634113201e-05, "loss": 0.1249, "step": 15023 }, { "epoch": 4.593090797921125, "grad_norm": 0.7267432808876038, "learning_rate": 2.6957241730712073e-05, "loss": 0.1317, "step": 15024 }, { "epoch": 4.59339651482727, "grad_norm": 0.9073829054832458, "learning_rate": 2.6956817120292132e-05, "loss": 0.1204, "step": 15025 }, { "epoch": 4.593702231733415, "grad_norm": 1.612046718597412, "learning_rate": 2.6956392509872194e-05, "loss": 0.1567, "step": 15026 }, { "epoch": 4.594007948639559, "grad_norm": 1.1423768997192383, "learning_rate": 2.6955967899452253e-05, "loss": 0.1685, "step": 15027 }, { "epoch": 4.594313665545704, "grad_norm": 0.4015604555606842, "learning_rate": 2.6955543289032315e-05, "loss": 0.1546, "step": 15028 }, { "epoch": 4.5946193824518495, "grad_norm": 0.8418240547180176, "learning_rate": 2.6955118678612373e-05, "loss": 0.199, "step": 15029 }, { "epoch": 4.594925099357995, "grad_norm": 0.7686911821365356, "learning_rate": 2.6954694068192432e-05, "loss": 0.2037, "step": 15030 }, { "epoch": 4.59523081626414, "grad_norm": 0.7611731886863708, "learning_rate": 2.6954269457772494e-05, "loss": 0.2123, "step": 15031 }, { "epoch": 4.595536533170284, "grad_norm": 0.9330403208732605, "learning_rate": 2.6953844847352553e-05, "loss": 0.1826, "step": 15032 }, { "epoch": 4.595842250076429, "grad_norm": 0.6911360025405884, "learning_rate": 2.6953420236932615e-05, "loss": 0.2167, "step": 15033 }, { "epoch": 4.596147966982574, "grad_norm": 1.378420352935791, "learning_rate": 2.6952995626512674e-05, "loss": 0.2436, "step": 15034 }, { "epoch": 4.596453683888719, "grad_norm": 0.5242282748222351, "learning_rate": 2.6952571016092736e-05, "loss": 0.1335, "step": 15035 }, { "epoch": 4.5967594007948644, "grad_norm": 0.3025785982608795, "learning_rate": 2.6952146405672794e-05, "loss": 0.0931, "step": 15036 }, { "epoch": 4.597065117701009, "grad_norm": 0.6385506391525269, "learning_rate": 2.6951721795252856e-05, "loss": 0.0542, "step": 15037 }, { "epoch": 4.597370834607154, "grad_norm": 0.2902139723300934, "learning_rate": 2.6951297184832915e-05, "loss": 0.0637, "step": 15038 }, { "epoch": 4.597676551513299, "grad_norm": 0.17104268074035645, "learning_rate": 2.6950872574412977e-05, "loss": 0.0542, "step": 15039 }, { "epoch": 4.597982268419443, "grad_norm": 0.43846625089645386, "learning_rate": 2.6950447963993036e-05, "loss": 0.0627, "step": 15040 }, { "epoch": 4.598287985325588, "grad_norm": 0.22803696990013123, "learning_rate": 2.6950023353573098e-05, "loss": 0.0642, "step": 15041 }, { "epoch": 4.598593702231733, "grad_norm": 0.5540205240249634, "learning_rate": 2.6949598743153157e-05, "loss": 0.059, "step": 15042 }, { "epoch": 4.5988994191378785, "grad_norm": 0.3119412064552307, "learning_rate": 2.6949174132733215e-05, "loss": 0.0596, "step": 15043 }, { "epoch": 4.599205136044024, "grad_norm": 0.3839174807071686, "learning_rate": 2.6948749522313278e-05, "loss": 0.0576, "step": 15044 }, { "epoch": 4.599510852950168, "grad_norm": 0.7480129599571228, "learning_rate": 2.6948324911893336e-05, "loss": 0.1116, "step": 15045 }, { "epoch": 4.599816569856313, "grad_norm": 0.43624863028526306, "learning_rate": 2.69479003014734e-05, "loss": 0.0811, "step": 15046 }, { "epoch": 4.600122286762458, "grad_norm": 0.33065417408943176, "learning_rate": 2.6947475691053457e-05, "loss": 0.0783, "step": 15047 }, { "epoch": 4.600428003668603, "grad_norm": 0.5015982985496521, "learning_rate": 2.694705108063352e-05, "loss": 0.1411, "step": 15048 }, { "epoch": 4.600733720574748, "grad_norm": 0.29242685437202454, "learning_rate": 2.6946626470213578e-05, "loss": 0.1112, "step": 15049 }, { "epoch": 4.6010394374808925, "grad_norm": 0.524789035320282, "learning_rate": 2.694620185979364e-05, "loss": 0.1361, "step": 15050 }, { "epoch": 4.601345154387038, "grad_norm": 0.746990978717804, "learning_rate": 2.69457772493737e-05, "loss": 0.1347, "step": 15051 }, { "epoch": 4.601650871293183, "grad_norm": 0.3913774788379669, "learning_rate": 2.694535263895376e-05, "loss": 0.18, "step": 15052 }, { "epoch": 4.601956588199327, "grad_norm": 1.067986011505127, "learning_rate": 2.694492802853382e-05, "loss": 0.1999, "step": 15053 }, { "epoch": 4.602262305105472, "grad_norm": 0.5268691182136536, "learning_rate": 2.694450341811388e-05, "loss": 0.1694, "step": 15054 }, { "epoch": 4.602568022011617, "grad_norm": 0.6951779723167419, "learning_rate": 2.694407880769394e-05, "loss": 0.141, "step": 15055 }, { "epoch": 4.602873738917762, "grad_norm": 1.2696363925933838, "learning_rate": 2.6943654197274e-05, "loss": 0.2061, "step": 15056 }, { "epoch": 4.603179455823907, "grad_norm": 0.9224441647529602, "learning_rate": 2.694322958685406e-05, "loss": 0.2158, "step": 15057 }, { "epoch": 4.603485172730052, "grad_norm": 0.7144806385040283, "learning_rate": 2.694280497643412e-05, "loss": 0.2178, "step": 15058 }, { "epoch": 4.603790889636197, "grad_norm": 7.927372455596924, "learning_rate": 2.6942380366014182e-05, "loss": 0.2686, "step": 15059 }, { "epoch": 4.604096606542342, "grad_norm": 0.37572234869003296, "learning_rate": 2.694195575559424e-05, "loss": 0.1379, "step": 15060 }, { "epoch": 4.604402323448487, "grad_norm": 0.32846951484680176, "learning_rate": 2.6941531145174303e-05, "loss": 0.1095, "step": 15061 }, { "epoch": 4.604708040354632, "grad_norm": 0.5028209686279297, "learning_rate": 2.694110653475436e-05, "loss": 0.088, "step": 15062 }, { "epoch": 4.605013757260776, "grad_norm": 0.24946264922618866, "learning_rate": 2.6940681924334423e-05, "loss": 0.0717, "step": 15063 }, { "epoch": 4.6053194741669214, "grad_norm": 0.3689137399196625, "learning_rate": 2.6940257313914482e-05, "loss": 0.0537, "step": 15064 }, { "epoch": 4.605625191073067, "grad_norm": 0.3194081485271454, "learning_rate": 2.6939832703494544e-05, "loss": 0.0717, "step": 15065 }, { "epoch": 4.605930907979211, "grad_norm": 0.26006263494491577, "learning_rate": 2.6939408093074603e-05, "loss": 0.0636, "step": 15066 }, { "epoch": 4.606236624885356, "grad_norm": 0.4427160322666168, "learning_rate": 2.6938983482654665e-05, "loss": 0.0677, "step": 15067 }, { "epoch": 4.606542341791501, "grad_norm": 0.4724999666213989, "learning_rate": 2.6938558872234727e-05, "loss": 0.091, "step": 15068 }, { "epoch": 4.606848058697646, "grad_norm": 0.29389262199401855, "learning_rate": 2.6938134261814786e-05, "loss": 0.0605, "step": 15069 }, { "epoch": 4.607153775603791, "grad_norm": 0.2102939933538437, "learning_rate": 2.6937709651394848e-05, "loss": 0.0599, "step": 15070 }, { "epoch": 4.6074594925099355, "grad_norm": 0.3449554145336151, "learning_rate": 2.6937285040974906e-05, "loss": 0.0842, "step": 15071 }, { "epoch": 4.607765209416081, "grad_norm": 0.4155147671699524, "learning_rate": 2.693686043055497e-05, "loss": 0.1282, "step": 15072 }, { "epoch": 4.608070926322226, "grad_norm": 0.6399480700492859, "learning_rate": 2.6936435820135027e-05, "loss": 0.1477, "step": 15073 }, { "epoch": 4.608376643228371, "grad_norm": 0.6013297438621521, "learning_rate": 2.693601120971509e-05, "loss": 0.1329, "step": 15074 }, { "epoch": 4.608682360134516, "grad_norm": 1.1854428052902222, "learning_rate": 2.6935586599295148e-05, "loss": 0.1526, "step": 15075 }, { "epoch": 4.60898807704066, "grad_norm": 0.4602837860584259, "learning_rate": 2.693516198887521e-05, "loss": 0.1426, "step": 15076 }, { "epoch": 4.609293793946805, "grad_norm": 0.41110891103744507, "learning_rate": 2.693473737845527e-05, "loss": 0.151, "step": 15077 }, { "epoch": 4.60959951085295, "grad_norm": 1.4958088397979736, "learning_rate": 2.693431276803533e-05, "loss": 0.1821, "step": 15078 }, { "epoch": 4.609905227759095, "grad_norm": 0.5483261942863464, "learning_rate": 2.693388815761539e-05, "loss": 0.1847, "step": 15079 }, { "epoch": 4.61021094466524, "grad_norm": 1.4252681732177734, "learning_rate": 2.6933463547195452e-05, "loss": 0.1558, "step": 15080 }, { "epoch": 4.610516661571385, "grad_norm": 4.131947040557861, "learning_rate": 2.693303893677551e-05, "loss": 0.1914, "step": 15081 }, { "epoch": 4.61082237847753, "grad_norm": 1.0741747617721558, "learning_rate": 2.693261432635557e-05, "loss": 0.1989, "step": 15082 }, { "epoch": 4.611128095383675, "grad_norm": 1.1442248821258545, "learning_rate": 2.693218971593563e-05, "loss": 0.1979, "step": 15083 }, { "epoch": 4.611433812289819, "grad_norm": 1.1109610795974731, "learning_rate": 2.693176510551569e-05, "loss": 0.2114, "step": 15084 }, { "epoch": 4.611739529195964, "grad_norm": 0.2886922061443329, "learning_rate": 2.6931340495095752e-05, "loss": 0.1518, "step": 15085 }, { "epoch": 4.6120452461021095, "grad_norm": 0.36721014976501465, "learning_rate": 2.693091588467581e-05, "loss": 0.0963, "step": 15086 }, { "epoch": 4.612350963008255, "grad_norm": 0.16611425578594208, "learning_rate": 2.6930491274255873e-05, "loss": 0.0692, "step": 15087 }, { "epoch": 4.6126566799144, "grad_norm": 0.2219206988811493, "learning_rate": 2.693006666383593e-05, "loss": 0.059, "step": 15088 }, { "epoch": 4.612962396820544, "grad_norm": 0.7106373906135559, "learning_rate": 2.6929642053415994e-05, "loss": 0.0681, "step": 15089 }, { "epoch": 4.613268113726689, "grad_norm": 0.22425417602062225, "learning_rate": 2.6929217442996052e-05, "loss": 0.039, "step": 15090 }, { "epoch": 4.613573830632834, "grad_norm": 0.2820930480957031, "learning_rate": 2.6928792832576114e-05, "loss": 0.0789, "step": 15091 }, { "epoch": 4.613879547538978, "grad_norm": 0.23235447704792023, "learning_rate": 2.6928368222156173e-05, "loss": 0.0629, "step": 15092 }, { "epoch": 4.614185264445124, "grad_norm": 0.244040846824646, "learning_rate": 2.6927943611736235e-05, "loss": 0.0563, "step": 15093 }, { "epoch": 4.614490981351269, "grad_norm": 0.5149393677711487, "learning_rate": 2.6927519001316294e-05, "loss": 0.0783, "step": 15094 }, { "epoch": 4.614796698257414, "grad_norm": 0.30953487753868103, "learning_rate": 2.6927094390896353e-05, "loss": 0.0996, "step": 15095 }, { "epoch": 4.615102415163559, "grad_norm": 0.36087051033973694, "learning_rate": 2.6926669780476415e-05, "loss": 0.0868, "step": 15096 }, { "epoch": 4.615408132069703, "grad_norm": 0.6163648366928101, "learning_rate": 2.6926245170056473e-05, "loss": 0.0865, "step": 15097 }, { "epoch": 4.615713848975848, "grad_norm": 0.47493064403533936, "learning_rate": 2.6925820559636535e-05, "loss": 0.1096, "step": 15098 }, { "epoch": 4.616019565881993, "grad_norm": 0.49550512433052063, "learning_rate": 2.6925395949216594e-05, "loss": 0.1385, "step": 15099 }, { "epoch": 4.6163252827881385, "grad_norm": 0.4924461245536804, "learning_rate": 2.6924971338796656e-05, "loss": 0.1291, "step": 15100 }, { "epoch": 4.616630999694284, "grad_norm": 0.774863600730896, "learning_rate": 2.6924546728376715e-05, "loss": 0.156, "step": 15101 }, { "epoch": 4.616936716600428, "grad_norm": 1.686882734298706, "learning_rate": 2.6924122117956777e-05, "loss": 0.191, "step": 15102 }, { "epoch": 4.617242433506573, "grad_norm": 0.5715798735618591, "learning_rate": 2.6923697507536836e-05, "loss": 0.1947, "step": 15103 }, { "epoch": 4.617548150412718, "grad_norm": 0.5831999182701111, "learning_rate": 2.6923272897116898e-05, "loss": 0.1588, "step": 15104 }, { "epoch": 4.617853867318862, "grad_norm": 0.6951306462287903, "learning_rate": 2.6922848286696956e-05, "loss": 0.1837, "step": 15105 }, { "epoch": 4.618159584225007, "grad_norm": 0.6028845310211182, "learning_rate": 2.692242367627702e-05, "loss": 0.2089, "step": 15106 }, { "epoch": 4.6184653011311525, "grad_norm": 0.6380050778388977, "learning_rate": 2.6921999065857077e-05, "loss": 0.1621, "step": 15107 }, { "epoch": 4.618771018037298, "grad_norm": 0.907759428024292, "learning_rate": 2.6921574455437136e-05, "loss": 0.1954, "step": 15108 }, { "epoch": 4.619076734943443, "grad_norm": 0.9245545268058777, "learning_rate": 2.6921149845017198e-05, "loss": 0.2422, "step": 15109 }, { "epoch": 4.619382451849587, "grad_norm": 0.3340117931365967, "learning_rate": 2.6920725234597257e-05, "loss": 0.1159, "step": 15110 }, { "epoch": 4.619688168755732, "grad_norm": 0.8031355142593384, "learning_rate": 2.692030062417732e-05, "loss": 0.0676, "step": 15111 }, { "epoch": 4.619993885661877, "grad_norm": 0.21702249348163605, "learning_rate": 2.6919876013757378e-05, "loss": 0.0631, "step": 15112 }, { "epoch": 4.620299602568022, "grad_norm": 0.8551281690597534, "learning_rate": 2.691945140333744e-05, "loss": 0.0624, "step": 15113 }, { "epoch": 4.620605319474167, "grad_norm": 0.2745366394519806, "learning_rate": 2.69190267929175e-05, "loss": 0.0616, "step": 15114 }, { "epoch": 4.620911036380312, "grad_norm": 0.432184100151062, "learning_rate": 2.691860218249756e-05, "loss": 0.0349, "step": 15115 }, { "epoch": 4.621216753286457, "grad_norm": 0.14929233491420746, "learning_rate": 2.691817757207762e-05, "loss": 0.0424, "step": 15116 }, { "epoch": 4.621522470192602, "grad_norm": 0.6965368986129761, "learning_rate": 2.691775296165768e-05, "loss": 0.1142, "step": 15117 }, { "epoch": 4.621828187098746, "grad_norm": 0.4559759497642517, "learning_rate": 2.691732835123774e-05, "loss": 0.0826, "step": 15118 }, { "epoch": 4.622133904004891, "grad_norm": 0.19531458616256714, "learning_rate": 2.69169037408178e-05, "loss": 0.0735, "step": 15119 }, { "epoch": 4.622439620911036, "grad_norm": 0.3044597804546356, "learning_rate": 2.691647913039786e-05, "loss": 0.1034, "step": 15120 }, { "epoch": 4.622745337817181, "grad_norm": 0.43134868144989014, "learning_rate": 2.691605451997792e-05, "loss": 0.074, "step": 15121 }, { "epoch": 4.623051054723327, "grad_norm": 1.0026837587356567, "learning_rate": 2.691562990955798e-05, "loss": 0.1002, "step": 15122 }, { "epoch": 4.623356771629471, "grad_norm": 0.45424070954322815, "learning_rate": 2.691520529913804e-05, "loss": 0.1228, "step": 15123 }, { "epoch": 4.623662488535616, "grad_norm": 0.40658068656921387, "learning_rate": 2.6914780688718102e-05, "loss": 0.1132, "step": 15124 }, { "epoch": 4.623968205441761, "grad_norm": 0.8966238498687744, "learning_rate": 2.691435607829816e-05, "loss": 0.1425, "step": 15125 }, { "epoch": 4.624273922347906, "grad_norm": 0.4539843201637268, "learning_rate": 2.6913931467878223e-05, "loss": 0.1508, "step": 15126 }, { "epoch": 4.624579639254051, "grad_norm": 0.7656182050704956, "learning_rate": 2.6913506857458282e-05, "loss": 0.1714, "step": 15127 }, { "epoch": 4.6248853561601955, "grad_norm": 0.5403881072998047, "learning_rate": 2.6913082247038344e-05, "loss": 0.1852, "step": 15128 }, { "epoch": 4.625191073066341, "grad_norm": 0.4342227876186371, "learning_rate": 2.6912657636618403e-05, "loss": 0.1965, "step": 15129 }, { "epoch": 4.625496789972486, "grad_norm": 1.9946343898773193, "learning_rate": 2.6912233026198465e-05, "loss": 0.166, "step": 15130 }, { "epoch": 4.62580250687863, "grad_norm": 0.7587636709213257, "learning_rate": 2.6911808415778523e-05, "loss": 0.1653, "step": 15131 }, { "epoch": 4.626108223784775, "grad_norm": 0.8989913463592529, "learning_rate": 2.6911383805358582e-05, "loss": 0.2241, "step": 15132 }, { "epoch": 4.62641394069092, "grad_norm": 0.6607565879821777, "learning_rate": 2.6910959194938644e-05, "loss": 0.1647, "step": 15133 }, { "epoch": 4.626719657597065, "grad_norm": 0.9601176977157593, "learning_rate": 2.6910534584518703e-05, "loss": 0.2055, "step": 15134 }, { "epoch": 4.62702537450321, "grad_norm": 0.2652820944786072, "learning_rate": 2.6910109974098765e-05, "loss": 0.1207, "step": 15135 }, { "epoch": 4.627331091409355, "grad_norm": 0.23987774550914764, "learning_rate": 2.6909685363678824e-05, "loss": 0.09, "step": 15136 }, { "epoch": 4.6276368083155, "grad_norm": 0.6705834865570068, "learning_rate": 2.6909260753258886e-05, "loss": 0.0869, "step": 15137 }, { "epoch": 4.627942525221645, "grad_norm": 0.7058405876159668, "learning_rate": 2.6908836142838944e-05, "loss": 0.0732, "step": 15138 }, { "epoch": 4.62824824212779, "grad_norm": 0.3737628757953644, "learning_rate": 2.6908411532419007e-05, "loss": 0.047, "step": 15139 }, { "epoch": 4.628553959033935, "grad_norm": 0.3702773153781891, "learning_rate": 2.6907986921999065e-05, "loss": 0.0401, "step": 15140 }, { "epoch": 4.628859675940079, "grad_norm": 0.3652614653110504, "learning_rate": 2.6907562311579127e-05, "loss": 0.0495, "step": 15141 }, { "epoch": 4.629165392846224, "grad_norm": 0.8258194327354431, "learning_rate": 2.6907137701159186e-05, "loss": 0.0828, "step": 15142 }, { "epoch": 4.6294711097523695, "grad_norm": 0.37983807921409607, "learning_rate": 2.6906713090739248e-05, "loss": 0.0724, "step": 15143 }, { "epoch": 4.629776826658514, "grad_norm": 0.334555447101593, "learning_rate": 2.6906288480319307e-05, "loss": 0.0732, "step": 15144 }, { "epoch": 4.630082543564659, "grad_norm": 0.5230880379676819, "learning_rate": 2.6905863869899365e-05, "loss": 0.1079, "step": 15145 }, { "epoch": 4.630388260470804, "grad_norm": 0.8327156901359558, "learning_rate": 2.6905439259479428e-05, "loss": 0.077, "step": 15146 }, { "epoch": 4.630693977376949, "grad_norm": 0.42064911127090454, "learning_rate": 2.6905014649059486e-05, "loss": 0.0729, "step": 15147 }, { "epoch": 4.630999694283094, "grad_norm": 0.44110915064811707, "learning_rate": 2.690459003863955e-05, "loss": 0.1362, "step": 15148 }, { "epoch": 4.631305411189238, "grad_norm": 0.7501769661903381, "learning_rate": 2.6904165428219607e-05, "loss": 0.1149, "step": 15149 }, { "epoch": 4.631611128095384, "grad_norm": 0.49550706148147583, "learning_rate": 2.690374081779967e-05, "loss": 0.1447, "step": 15150 }, { "epoch": 4.631916845001529, "grad_norm": 0.501969039440155, "learning_rate": 2.6903316207379728e-05, "loss": 0.1401, "step": 15151 }, { "epoch": 4.632222561907674, "grad_norm": 0.5502749681472778, "learning_rate": 2.690289159695979e-05, "loss": 0.1619, "step": 15152 }, { "epoch": 4.632528278813819, "grad_norm": 0.8900346755981445, "learning_rate": 2.690246698653985e-05, "loss": 0.1636, "step": 15153 }, { "epoch": 4.632833995719963, "grad_norm": 0.5703431963920593, "learning_rate": 2.690204237611991e-05, "loss": 0.1637, "step": 15154 }, { "epoch": 4.633139712626108, "grad_norm": 0.6254262328147888, "learning_rate": 2.690161776569997e-05, "loss": 0.2086, "step": 15155 }, { "epoch": 4.633445429532253, "grad_norm": 0.7208427786827087, "learning_rate": 2.690119315528003e-05, "loss": 0.1714, "step": 15156 }, { "epoch": 4.633751146438398, "grad_norm": 0.695073664188385, "learning_rate": 2.690076854486009e-05, "loss": 0.2045, "step": 15157 }, { "epoch": 4.634056863344543, "grad_norm": 0.8404328227043152, "learning_rate": 2.690034393444015e-05, "loss": 0.1954, "step": 15158 }, { "epoch": 4.634362580250688, "grad_norm": 1.0616143941879272, "learning_rate": 2.689991932402021e-05, "loss": 0.2236, "step": 15159 }, { "epoch": 4.634668297156833, "grad_norm": 0.36238035559654236, "learning_rate": 2.689949471360027e-05, "loss": 0.1685, "step": 15160 }, { "epoch": 4.634974014062978, "grad_norm": 0.2720766067504883, "learning_rate": 2.6899070103180332e-05, "loss": 0.0791, "step": 15161 }, { "epoch": 4.635279730969122, "grad_norm": 0.3535056710243225, "learning_rate": 2.689864549276039e-05, "loss": 0.0533, "step": 15162 }, { "epoch": 4.635585447875267, "grad_norm": 0.9645991921424866, "learning_rate": 2.6898220882340453e-05, "loss": 0.0417, "step": 15163 }, { "epoch": 4.6358911647814125, "grad_norm": 0.18052572011947632, "learning_rate": 2.689779627192051e-05, "loss": 0.0405, "step": 15164 }, { "epoch": 4.636196881687558, "grad_norm": 0.2536015808582306, "learning_rate": 2.6897371661500573e-05, "loss": 0.0636, "step": 15165 }, { "epoch": 4.636502598593703, "grad_norm": 0.36583447456359863, "learning_rate": 2.6896947051080632e-05, "loss": 0.0536, "step": 15166 }, { "epoch": 4.636808315499847, "grad_norm": 0.7943262457847595, "learning_rate": 2.6896522440660694e-05, "loss": 0.056, "step": 15167 }, { "epoch": 4.637114032405992, "grad_norm": 0.8921501040458679, "learning_rate": 2.6896097830240753e-05, "loss": 0.0786, "step": 15168 }, { "epoch": 4.637419749312137, "grad_norm": 0.31463542580604553, "learning_rate": 2.6895673219820815e-05, "loss": 0.0687, "step": 15169 }, { "epoch": 4.637725466218281, "grad_norm": 0.31115880608558655, "learning_rate": 2.6895248609400877e-05, "loss": 0.0694, "step": 15170 }, { "epoch": 4.6380311831244265, "grad_norm": 0.5317988991737366, "learning_rate": 2.6894823998980936e-05, "loss": 0.0746, "step": 15171 }, { "epoch": 4.638336900030572, "grad_norm": 0.3978286385536194, "learning_rate": 2.6894399388560998e-05, "loss": 0.0809, "step": 15172 }, { "epoch": 4.638642616936717, "grad_norm": 0.3294518291950226, "learning_rate": 2.6893974778141057e-05, "loss": 0.1198, "step": 15173 }, { "epoch": 4.638948333842862, "grad_norm": 0.4218984842300415, "learning_rate": 2.689355016772112e-05, "loss": 0.1372, "step": 15174 }, { "epoch": 4.639254050749006, "grad_norm": 0.3054928779602051, "learning_rate": 2.6893125557301177e-05, "loss": 0.1624, "step": 15175 }, { "epoch": 4.639559767655151, "grad_norm": 0.7906259298324585, "learning_rate": 2.689270094688124e-05, "loss": 0.122, "step": 15176 }, { "epoch": 4.639865484561296, "grad_norm": 0.5020260214805603, "learning_rate": 2.6892276336461298e-05, "loss": 0.18, "step": 15177 }, { "epoch": 4.640171201467441, "grad_norm": 1.0161212682724, "learning_rate": 2.689185172604136e-05, "loss": 0.1789, "step": 15178 }, { "epoch": 4.640476918373587, "grad_norm": 0.5647533535957336, "learning_rate": 2.689142711562142e-05, "loss": 0.1771, "step": 15179 }, { "epoch": 4.640782635279731, "grad_norm": 0.9244574904441833, "learning_rate": 2.689100250520148e-05, "loss": 0.188, "step": 15180 }, { "epoch": 4.641088352185876, "grad_norm": 0.7397893667221069, "learning_rate": 2.689057789478154e-05, "loss": 0.191, "step": 15181 }, { "epoch": 4.641394069092021, "grad_norm": 0.775223970413208, "learning_rate": 2.6890153284361602e-05, "loss": 0.1534, "step": 15182 }, { "epoch": 4.641699785998165, "grad_norm": 0.7356001734733582, "learning_rate": 2.688972867394166e-05, "loss": 0.2314, "step": 15183 }, { "epoch": 4.64200550290431, "grad_norm": 1.47553551197052, "learning_rate": 2.688930406352172e-05, "loss": 0.2716, "step": 15184 }, { "epoch": 4.6423112198104555, "grad_norm": 0.5123687982559204, "learning_rate": 2.688887945310178e-05, "loss": 0.1734, "step": 15185 }, { "epoch": 4.642616936716601, "grad_norm": 0.2246991991996765, "learning_rate": 2.688845484268184e-05, "loss": 0.0802, "step": 15186 }, { "epoch": 4.642922653622746, "grad_norm": 0.40346959233283997, "learning_rate": 2.6888030232261902e-05, "loss": 0.0559, "step": 15187 }, { "epoch": 4.64322837052889, "grad_norm": 0.21397967636585236, "learning_rate": 2.688760562184196e-05, "loss": 0.0646, "step": 15188 }, { "epoch": 4.643534087435035, "grad_norm": 0.2726128399372101, "learning_rate": 2.6887181011422023e-05, "loss": 0.0586, "step": 15189 }, { "epoch": 4.64383980434118, "grad_norm": 0.3098944127559662, "learning_rate": 2.688675640100208e-05, "loss": 0.0543, "step": 15190 }, { "epoch": 4.644145521247325, "grad_norm": 0.661821722984314, "learning_rate": 2.6886331790582144e-05, "loss": 0.0471, "step": 15191 }, { "epoch": 4.64445123815347, "grad_norm": 0.29412132501602173, "learning_rate": 2.6885907180162202e-05, "loss": 0.0534, "step": 15192 }, { "epoch": 4.644756955059615, "grad_norm": 0.34156185388565063, "learning_rate": 2.6885482569742264e-05, "loss": 0.0724, "step": 15193 }, { "epoch": 4.64506267196576, "grad_norm": 0.4843927323818207, "learning_rate": 2.6885057959322323e-05, "loss": 0.0734, "step": 15194 }, { "epoch": 4.645368388871905, "grad_norm": 0.5436331629753113, "learning_rate": 2.6884633348902385e-05, "loss": 0.1072, "step": 15195 }, { "epoch": 4.645674105778049, "grad_norm": 0.24658960103988647, "learning_rate": 2.6884208738482444e-05, "loss": 0.077, "step": 15196 }, { "epoch": 4.645979822684194, "grad_norm": 0.4144369065761566, "learning_rate": 2.6883784128062503e-05, "loss": 0.0913, "step": 15197 }, { "epoch": 4.646285539590339, "grad_norm": 0.6634828448295593, "learning_rate": 2.6883359517642565e-05, "loss": 0.1032, "step": 15198 }, { "epoch": 4.646591256496484, "grad_norm": 0.6143760085105896, "learning_rate": 2.6882934907222623e-05, "loss": 0.1645, "step": 15199 }, { "epoch": 4.6468969734026295, "grad_norm": 0.9502337574958801, "learning_rate": 2.6882510296802685e-05, "loss": 0.1725, "step": 15200 }, { "epoch": 4.647202690308774, "grad_norm": 0.8705599904060364, "learning_rate": 2.6882085686382744e-05, "loss": 0.1545, "step": 15201 }, { "epoch": 4.647508407214919, "grad_norm": 1.1893494129180908, "learning_rate": 2.6881661075962806e-05, "loss": 0.1716, "step": 15202 }, { "epoch": 4.647814124121064, "grad_norm": 0.4830440580844879, "learning_rate": 2.6881236465542865e-05, "loss": 0.1555, "step": 15203 }, { "epoch": 4.648119841027209, "grad_norm": 1.0627738237380981, "learning_rate": 2.6880811855122927e-05, "loss": 0.1987, "step": 15204 }, { "epoch": 4.648425557933354, "grad_norm": 1.1093292236328125, "learning_rate": 2.6880387244702986e-05, "loss": 0.194, "step": 15205 }, { "epoch": 4.648731274839498, "grad_norm": 0.83600914478302, "learning_rate": 2.6879962634283048e-05, "loss": 0.2047, "step": 15206 }, { "epoch": 4.6490369917456436, "grad_norm": 1.2058535814285278, "learning_rate": 2.6879538023863107e-05, "loss": 0.1987, "step": 15207 }, { "epoch": 4.649342708651789, "grad_norm": 0.7006507515907288, "learning_rate": 2.687911341344317e-05, "loss": 0.1735, "step": 15208 }, { "epoch": 4.649648425557933, "grad_norm": 1.2555279731750488, "learning_rate": 2.6878688803023227e-05, "loss": 0.2231, "step": 15209 }, { "epoch": 4.649954142464078, "grad_norm": 0.7170481085777283, "learning_rate": 2.6878264192603286e-05, "loss": 0.1453, "step": 15210 }, { "epoch": 4.650259859370223, "grad_norm": 1.0965518951416016, "learning_rate": 2.6877839582183348e-05, "loss": 0.0976, "step": 15211 }, { "epoch": 4.650565576276368, "grad_norm": 0.3107706308364868, "learning_rate": 2.6877414971763407e-05, "loss": 0.0867, "step": 15212 }, { "epoch": 4.650871293182513, "grad_norm": 0.5631162524223328, "learning_rate": 2.687699036134347e-05, "loss": 0.0625, "step": 15213 }, { "epoch": 4.651177010088658, "grad_norm": 0.3368028402328491, "learning_rate": 2.6876565750923528e-05, "loss": 0.0506, "step": 15214 }, { "epoch": 4.651482726994803, "grad_norm": 0.3614712059497833, "learning_rate": 2.687614114050359e-05, "loss": 0.0679, "step": 15215 }, { "epoch": 4.651788443900948, "grad_norm": 0.5514152646064758, "learning_rate": 2.687571653008365e-05, "loss": 0.0584, "step": 15216 }, { "epoch": 4.652094160807093, "grad_norm": 0.6279008388519287, "learning_rate": 2.687529191966371e-05, "loss": 0.0765, "step": 15217 }, { "epoch": 4.652399877713238, "grad_norm": 0.34611135721206665, "learning_rate": 2.687486730924377e-05, "loss": 0.0748, "step": 15218 }, { "epoch": 4.652705594619382, "grad_norm": 0.5160683989524841, "learning_rate": 2.687444269882383e-05, "loss": 0.0654, "step": 15219 }, { "epoch": 4.653011311525527, "grad_norm": 0.4075981676578522, "learning_rate": 2.687401808840389e-05, "loss": 0.0931, "step": 15220 }, { "epoch": 4.6533170284316725, "grad_norm": 0.34837695956230164, "learning_rate": 2.6873593477983952e-05, "loss": 0.066, "step": 15221 }, { "epoch": 4.653622745337817, "grad_norm": 0.45649248361587524, "learning_rate": 2.687316886756401e-05, "loss": 0.0693, "step": 15222 }, { "epoch": 4.653928462243962, "grad_norm": 0.843285322189331, "learning_rate": 2.687274425714407e-05, "loss": 0.1222, "step": 15223 }, { "epoch": 4.654234179150107, "grad_norm": 0.4295459985733032, "learning_rate": 2.687231964672413e-05, "loss": 0.0863, "step": 15224 }, { "epoch": 4.654539896056252, "grad_norm": 1.2018663883209229, "learning_rate": 2.687189503630419e-05, "loss": 0.1718, "step": 15225 }, { "epoch": 4.654845612962397, "grad_norm": 0.6621891260147095, "learning_rate": 2.6871470425884252e-05, "loss": 0.136, "step": 15226 }, { "epoch": 4.655151329868541, "grad_norm": 1.70203697681427, "learning_rate": 2.687104581546431e-05, "loss": 0.1557, "step": 15227 }, { "epoch": 4.6554570467746865, "grad_norm": 0.6863077282905579, "learning_rate": 2.6870621205044373e-05, "loss": 0.1662, "step": 15228 }, { "epoch": 4.655762763680832, "grad_norm": 1.2471818923950195, "learning_rate": 2.6870196594624432e-05, "loss": 0.2048, "step": 15229 }, { "epoch": 4.656068480586977, "grad_norm": 0.8999255299568176, "learning_rate": 2.6869771984204494e-05, "loss": 0.1761, "step": 15230 }, { "epoch": 4.656374197493121, "grad_norm": 1.106839656829834, "learning_rate": 2.6869347373784553e-05, "loss": 0.1499, "step": 15231 }, { "epoch": 4.656679914399266, "grad_norm": 0.9184419512748718, "learning_rate": 2.6868922763364615e-05, "loss": 0.2189, "step": 15232 }, { "epoch": 4.656985631305411, "grad_norm": 1.544843077659607, "learning_rate": 2.6868498152944673e-05, "loss": 0.1845, "step": 15233 }, { "epoch": 4.657291348211556, "grad_norm": 2.688816785812378, "learning_rate": 2.6868073542524732e-05, "loss": 0.2339, "step": 15234 }, { "epoch": 4.6575970651177006, "grad_norm": 1.6780455112457275, "learning_rate": 2.6867648932104794e-05, "loss": 0.1654, "step": 15235 }, { "epoch": 4.657902782023846, "grad_norm": 0.26196378469467163, "learning_rate": 2.6867224321684853e-05, "loss": 0.0901, "step": 15236 }, { "epoch": 4.658208498929991, "grad_norm": 0.3065689206123352, "learning_rate": 2.6866799711264915e-05, "loss": 0.0585, "step": 15237 }, { "epoch": 4.658514215836136, "grad_norm": 0.19421719014644623, "learning_rate": 2.6866375100844974e-05, "loss": 0.0568, "step": 15238 }, { "epoch": 4.658819932742281, "grad_norm": 0.46164360642433167, "learning_rate": 2.6865950490425036e-05, "loss": 0.0663, "step": 15239 }, { "epoch": 4.659125649648425, "grad_norm": 0.42243725061416626, "learning_rate": 2.6865525880005094e-05, "loss": 0.0583, "step": 15240 }, { "epoch": 4.65943136655457, "grad_norm": 0.26366862654685974, "learning_rate": 2.6865101269585157e-05, "loss": 0.0597, "step": 15241 }, { "epoch": 4.6597370834607155, "grad_norm": 1.1541013717651367, "learning_rate": 2.6864676659165215e-05, "loss": 0.0755, "step": 15242 }, { "epoch": 4.660042800366861, "grad_norm": 0.30905359983444214, "learning_rate": 2.6864252048745277e-05, "loss": 0.08, "step": 15243 }, { "epoch": 4.660348517273005, "grad_norm": 0.20018704235553741, "learning_rate": 2.6863827438325336e-05, "loss": 0.0408, "step": 15244 }, { "epoch": 4.66065423417915, "grad_norm": 0.7446248531341553, "learning_rate": 2.6863402827905398e-05, "loss": 0.0849, "step": 15245 }, { "epoch": 4.660959951085295, "grad_norm": 1.2874201536178589, "learning_rate": 2.6862978217485457e-05, "loss": 0.1101, "step": 15246 }, { "epoch": 4.66126566799144, "grad_norm": 1.163586974143982, "learning_rate": 2.6862553607065516e-05, "loss": 0.1414, "step": 15247 }, { "epoch": 4.661571384897584, "grad_norm": 1.1391769647598267, "learning_rate": 2.6862128996645578e-05, "loss": 0.1096, "step": 15248 }, { "epoch": 4.6618771018037295, "grad_norm": 0.7310516834259033, "learning_rate": 2.6861704386225636e-05, "loss": 0.1061, "step": 15249 }, { "epoch": 4.662182818709875, "grad_norm": 0.6041595339775085, "learning_rate": 2.68612797758057e-05, "loss": 0.1446, "step": 15250 }, { "epoch": 4.66248853561602, "grad_norm": 0.5414738059043884, "learning_rate": 2.6860855165385757e-05, "loss": 0.1721, "step": 15251 }, { "epoch": 4.662794252522165, "grad_norm": 0.6315814852714539, "learning_rate": 2.686043055496582e-05, "loss": 0.1662, "step": 15252 }, { "epoch": 4.663099969428309, "grad_norm": 1.2664172649383545, "learning_rate": 2.6860005944545878e-05, "loss": 0.1459, "step": 15253 }, { "epoch": 4.663405686334454, "grad_norm": 0.8090726733207703, "learning_rate": 2.685958133412594e-05, "loss": 0.2082, "step": 15254 }, { "epoch": 4.663711403240599, "grad_norm": 0.6546986103057861, "learning_rate": 2.6859156723706e-05, "loss": 0.2062, "step": 15255 }, { "epoch": 4.664017120146744, "grad_norm": 0.7409433722496033, "learning_rate": 2.685873211328606e-05, "loss": 0.1923, "step": 15256 }, { "epoch": 4.664322837052889, "grad_norm": 0.8616195917129517, "learning_rate": 2.685830750286612e-05, "loss": 0.1756, "step": 15257 }, { "epoch": 4.664628553959034, "grad_norm": 1.5144380331039429, "learning_rate": 2.685788289244618e-05, "loss": 0.2079, "step": 15258 }, { "epoch": 4.664934270865179, "grad_norm": 4.65247106552124, "learning_rate": 2.685745828202624e-05, "loss": 0.2176, "step": 15259 }, { "epoch": 4.665239987771324, "grad_norm": 0.37513479590415955, "learning_rate": 2.68570336716063e-05, "loss": 0.1337, "step": 15260 }, { "epoch": 4.665545704677468, "grad_norm": 0.2784704267978668, "learning_rate": 2.685660906118636e-05, "loss": 0.0801, "step": 15261 }, { "epoch": 4.665851421583613, "grad_norm": 1.253129005432129, "learning_rate": 2.685618445076642e-05, "loss": 0.0655, "step": 15262 }, { "epoch": 4.666157138489758, "grad_norm": 0.27287474274635315, "learning_rate": 2.6855759840346482e-05, "loss": 0.0811, "step": 15263 }, { "epoch": 4.6664628553959036, "grad_norm": 0.3435080349445343, "learning_rate": 2.685533522992654e-05, "loss": 0.0475, "step": 15264 }, { "epoch": 4.666768572302049, "grad_norm": 0.23666918277740479, "learning_rate": 2.6854910619506603e-05, "loss": 0.054, "step": 15265 }, { "epoch": 4.667074289208193, "grad_norm": 0.49712640047073364, "learning_rate": 2.685448600908666e-05, "loss": 0.0947, "step": 15266 }, { "epoch": 4.667380006114338, "grad_norm": 0.2990431785583496, "learning_rate": 2.6854061398666723e-05, "loss": 0.066, "step": 15267 }, { "epoch": 4.667685723020483, "grad_norm": 0.2399953156709671, "learning_rate": 2.6853636788246782e-05, "loss": 0.0802, "step": 15268 }, { "epoch": 4.667991439926628, "grad_norm": 0.549022912979126, "learning_rate": 2.6853212177826844e-05, "loss": 0.0569, "step": 15269 }, { "epoch": 4.6682971568327725, "grad_norm": 0.31125667691230774, "learning_rate": 2.6852787567406903e-05, "loss": 0.1204, "step": 15270 }, { "epoch": 4.668602873738918, "grad_norm": 0.3690160810947418, "learning_rate": 2.6852362956986965e-05, "loss": 0.1038, "step": 15271 }, { "epoch": 4.668908590645063, "grad_norm": 0.41195112466812134, "learning_rate": 2.6851938346567027e-05, "loss": 0.1094, "step": 15272 }, { "epoch": 4.669214307551208, "grad_norm": 0.6160513162612915, "learning_rate": 2.6851513736147086e-05, "loss": 0.0983, "step": 15273 }, { "epoch": 4.669520024457352, "grad_norm": 0.32999229431152344, "learning_rate": 2.6851089125727148e-05, "loss": 0.119, "step": 15274 }, { "epoch": 4.669825741363497, "grad_norm": 0.4671297073364258, "learning_rate": 2.6850664515307207e-05, "loss": 0.1332, "step": 15275 }, { "epoch": 4.670131458269642, "grad_norm": 0.42172563076019287, "learning_rate": 2.685023990488727e-05, "loss": 0.1622, "step": 15276 }, { "epoch": 4.670437175175787, "grad_norm": 0.7595468163490295, "learning_rate": 2.6849815294467327e-05, "loss": 0.153, "step": 15277 }, { "epoch": 4.6707428920819325, "grad_norm": 0.572964608669281, "learning_rate": 2.684939068404739e-05, "loss": 0.1841, "step": 15278 }, { "epoch": 4.671048608988077, "grad_norm": 1.2081363201141357, "learning_rate": 2.6848966073627448e-05, "loss": 0.1791, "step": 15279 }, { "epoch": 4.671354325894222, "grad_norm": 0.5043083429336548, "learning_rate": 2.684854146320751e-05, "loss": 0.1643, "step": 15280 }, { "epoch": 4.671660042800367, "grad_norm": 1.2053991556167603, "learning_rate": 2.684811685278757e-05, "loss": 0.2139, "step": 15281 }, { "epoch": 4.671965759706512, "grad_norm": 1.2170993089675903, "learning_rate": 2.684769224236763e-05, "loss": 0.1954, "step": 15282 }, { "epoch": 4.672271476612656, "grad_norm": 4.4421257972717285, "learning_rate": 2.684726763194769e-05, "loss": 0.2367, "step": 15283 }, { "epoch": 4.672577193518801, "grad_norm": 2.0551795959472656, "learning_rate": 2.6846843021527752e-05, "loss": 0.233, "step": 15284 }, { "epoch": 4.6728829104249465, "grad_norm": 0.4052104651927948, "learning_rate": 2.684641841110781e-05, "loss": 0.1375, "step": 15285 }, { "epoch": 4.673188627331092, "grad_norm": 0.5589884519577026, "learning_rate": 2.684599380068787e-05, "loss": 0.0816, "step": 15286 }, { "epoch": 4.673494344237236, "grad_norm": 0.693835973739624, "learning_rate": 2.684556919026793e-05, "loss": 0.0707, "step": 15287 }, { "epoch": 4.673800061143381, "grad_norm": 0.2044287919998169, "learning_rate": 2.684514457984799e-05, "loss": 0.0633, "step": 15288 }, { "epoch": 4.674105778049526, "grad_norm": 0.2552923858165741, "learning_rate": 2.6844719969428052e-05, "loss": 0.0553, "step": 15289 }, { "epoch": 4.674411494955671, "grad_norm": 0.1554703265428543, "learning_rate": 2.684429535900811e-05, "loss": 0.0596, "step": 15290 }, { "epoch": 4.674717211861816, "grad_norm": 0.21028399467468262, "learning_rate": 2.6843870748588173e-05, "loss": 0.0485, "step": 15291 }, { "epoch": 4.6750229287679606, "grad_norm": 0.24622809886932373, "learning_rate": 2.684344613816823e-05, "loss": 0.0809, "step": 15292 }, { "epoch": 4.675328645674106, "grad_norm": 1.4033833742141724, "learning_rate": 2.6843021527748294e-05, "loss": 0.0673, "step": 15293 }, { "epoch": 4.675634362580251, "grad_norm": 0.24114589393138885, "learning_rate": 2.6842596917328352e-05, "loss": 0.066, "step": 15294 }, { "epoch": 4.675940079486396, "grad_norm": 0.32737278938293457, "learning_rate": 2.6842172306908414e-05, "loss": 0.0857, "step": 15295 }, { "epoch": 4.67624579639254, "grad_norm": 1.0100278854370117, "learning_rate": 2.6841747696488473e-05, "loss": 0.094, "step": 15296 }, { "epoch": 4.676551513298685, "grad_norm": 0.446646511554718, "learning_rate": 2.6841323086068535e-05, "loss": 0.1252, "step": 15297 }, { "epoch": 4.67685723020483, "grad_norm": 0.46299585700035095, "learning_rate": 2.6840898475648594e-05, "loss": 0.1268, "step": 15298 }, { "epoch": 4.6771629471109755, "grad_norm": 0.504539966583252, "learning_rate": 2.6840473865228653e-05, "loss": 0.1134, "step": 15299 }, { "epoch": 4.67746866401712, "grad_norm": 0.4523053467273712, "learning_rate": 2.6840049254808715e-05, "loss": 0.1315, "step": 15300 }, { "epoch": 4.677774380923265, "grad_norm": 0.5754449367523193, "learning_rate": 2.6839624644388773e-05, "loss": 0.1661, "step": 15301 }, { "epoch": 4.67808009782941, "grad_norm": 0.43748170137405396, "learning_rate": 2.6839200033968835e-05, "loss": 0.1523, "step": 15302 }, { "epoch": 4.678385814735555, "grad_norm": 1.4039044380187988, "learning_rate": 2.6838775423548894e-05, "loss": 0.1806, "step": 15303 }, { "epoch": 4.6786915316417, "grad_norm": 0.586067259311676, "learning_rate": 2.6838350813128956e-05, "loss": 0.1829, "step": 15304 }, { "epoch": 4.678997248547844, "grad_norm": 3.4631545543670654, "learning_rate": 2.6837926202709015e-05, "loss": 0.166, "step": 15305 }, { "epoch": 4.6793029654539895, "grad_norm": 1.282674789428711, "learning_rate": 2.6837501592289077e-05, "loss": 0.1991, "step": 15306 }, { "epoch": 4.679608682360135, "grad_norm": 0.7190931439399719, "learning_rate": 2.6837076981869136e-05, "loss": 0.2244, "step": 15307 }, { "epoch": 4.67991439926628, "grad_norm": 0.6387807130813599, "learning_rate": 2.6836652371449198e-05, "loss": 0.173, "step": 15308 }, { "epoch": 4.680220116172424, "grad_norm": 1.8184605836868286, "learning_rate": 2.6836227761029257e-05, "loss": 0.2602, "step": 15309 }, { "epoch": 4.680525833078569, "grad_norm": 0.4351781904697418, "learning_rate": 2.683580315060932e-05, "loss": 0.1604, "step": 15310 }, { "epoch": 4.680831549984714, "grad_norm": 0.24008022248744965, "learning_rate": 2.6835378540189377e-05, "loss": 0.065, "step": 15311 }, { "epoch": 4.681137266890859, "grad_norm": 0.19755956530570984, "learning_rate": 2.6834953929769436e-05, "loss": 0.063, "step": 15312 }, { "epoch": 4.6814429837970035, "grad_norm": 0.6949129700660706, "learning_rate": 2.6834529319349498e-05, "loss": 0.0889, "step": 15313 }, { "epoch": 4.681748700703149, "grad_norm": 0.27433809638023376, "learning_rate": 2.6834104708929557e-05, "loss": 0.0603, "step": 15314 }, { "epoch": 4.682054417609294, "grad_norm": 0.4499441981315613, "learning_rate": 2.683368009850962e-05, "loss": 0.0565, "step": 15315 }, { "epoch": 4.682360134515439, "grad_norm": 0.4316006600856781, "learning_rate": 2.6833255488089678e-05, "loss": 0.048, "step": 15316 }, { "epoch": 4.682665851421584, "grad_norm": 0.18989861011505127, "learning_rate": 2.683283087766974e-05, "loss": 0.0441, "step": 15317 }, { "epoch": 4.682971568327728, "grad_norm": 0.3578616976737976, "learning_rate": 2.68324062672498e-05, "loss": 0.0722, "step": 15318 }, { "epoch": 4.683277285233873, "grad_norm": 0.19206783175468445, "learning_rate": 2.683198165682986e-05, "loss": 0.0763, "step": 15319 }, { "epoch": 4.683583002140018, "grad_norm": 0.2714749872684479, "learning_rate": 2.683155704640992e-05, "loss": 0.0937, "step": 15320 }, { "epoch": 4.6838887190461636, "grad_norm": 0.5455918908119202, "learning_rate": 2.683113243598998e-05, "loss": 0.079, "step": 15321 }, { "epoch": 4.684194435952308, "grad_norm": 0.5277292728424072, "learning_rate": 2.683070782557004e-05, "loss": 0.1162, "step": 15322 }, { "epoch": 4.684500152858453, "grad_norm": 0.36300474405288696, "learning_rate": 2.6830283215150102e-05, "loss": 0.1128, "step": 15323 }, { "epoch": 4.684805869764598, "grad_norm": 0.8622922301292419, "learning_rate": 2.682985860473016e-05, "loss": 0.1225, "step": 15324 }, { "epoch": 4.685111586670743, "grad_norm": 0.29505693912506104, "learning_rate": 2.682943399431022e-05, "loss": 0.1307, "step": 15325 }, { "epoch": 4.685417303576887, "grad_norm": 0.3669566512107849, "learning_rate": 2.682900938389028e-05, "loss": 0.1488, "step": 15326 }, { "epoch": 4.6857230204830325, "grad_norm": 0.750697672367096, "learning_rate": 2.682858477347034e-05, "loss": 0.1512, "step": 15327 }, { "epoch": 4.686028737389178, "grad_norm": 0.8791185617446899, "learning_rate": 2.6828160163050402e-05, "loss": 0.1638, "step": 15328 }, { "epoch": 4.686334454295323, "grad_norm": 1.0825074911117554, "learning_rate": 2.682773555263046e-05, "loss": 0.2182, "step": 15329 }, { "epoch": 4.686640171201468, "grad_norm": 0.9205499887466431, "learning_rate": 2.6827310942210523e-05, "loss": 0.2007, "step": 15330 }, { "epoch": 4.686945888107612, "grad_norm": 0.8986067175865173, "learning_rate": 2.6826886331790582e-05, "loss": 0.225, "step": 15331 }, { "epoch": 4.687251605013757, "grad_norm": 1.7092711925506592, "learning_rate": 2.6826461721370644e-05, "loss": 0.176, "step": 15332 }, { "epoch": 4.687557321919902, "grad_norm": 0.7372469902038574, "learning_rate": 2.6826037110950703e-05, "loss": 0.1696, "step": 15333 }, { "epoch": 4.687863038826047, "grad_norm": 1.512389063835144, "learning_rate": 2.6825612500530765e-05, "loss": 0.2533, "step": 15334 }, { "epoch": 4.688168755732192, "grad_norm": 0.36468306183815, "learning_rate": 2.6825187890110823e-05, "loss": 0.1277, "step": 15335 }, { "epoch": 4.688474472638337, "grad_norm": 0.41689255833625793, "learning_rate": 2.6824763279690885e-05, "loss": 0.0976, "step": 15336 }, { "epoch": 4.688780189544482, "grad_norm": 0.6217965483665466, "learning_rate": 2.6824338669270944e-05, "loss": 0.0693, "step": 15337 }, { "epoch": 4.689085906450627, "grad_norm": 0.3376365005970001, "learning_rate": 2.6823914058851003e-05, "loss": 0.0537, "step": 15338 }, { "epoch": 4.689391623356771, "grad_norm": 0.31207549571990967, "learning_rate": 2.6823489448431065e-05, "loss": 0.0971, "step": 15339 }, { "epoch": 4.689697340262916, "grad_norm": 0.2932257652282715, "learning_rate": 2.6823064838011124e-05, "loss": 0.0601, "step": 15340 }, { "epoch": 4.690003057169061, "grad_norm": 0.2606724500656128, "learning_rate": 2.6822640227591186e-05, "loss": 0.0666, "step": 15341 }, { "epoch": 4.6903087740752065, "grad_norm": 0.2213360071182251, "learning_rate": 2.6822215617171244e-05, "loss": 0.0789, "step": 15342 }, { "epoch": 4.690614490981352, "grad_norm": 0.2538449764251709, "learning_rate": 2.6821791006751307e-05, "loss": 0.0695, "step": 15343 }, { "epoch": 4.690920207887496, "grad_norm": 0.46416765451431274, "learning_rate": 2.6821366396331365e-05, "loss": 0.0815, "step": 15344 }, { "epoch": 4.691225924793641, "grad_norm": 0.9298022985458374, "learning_rate": 2.6820941785911427e-05, "loss": 0.073, "step": 15345 }, { "epoch": 4.691531641699786, "grad_norm": 0.27172115445137024, "learning_rate": 2.6820517175491486e-05, "loss": 0.0828, "step": 15346 }, { "epoch": 4.691837358605931, "grad_norm": 0.4552333950996399, "learning_rate": 2.6820092565071548e-05, "loss": 0.0797, "step": 15347 }, { "epoch": 4.692143075512075, "grad_norm": 0.3680846095085144, "learning_rate": 2.6819667954651607e-05, "loss": 0.1053, "step": 15348 }, { "epoch": 4.6924487924182205, "grad_norm": 0.43959516286849976, "learning_rate": 2.6819243344231666e-05, "loss": 0.0938, "step": 15349 }, { "epoch": 4.692754509324366, "grad_norm": 0.4501431882381439, "learning_rate": 2.6818818733811728e-05, "loss": 0.1348, "step": 15350 }, { "epoch": 4.693060226230511, "grad_norm": 1.3817964792251587, "learning_rate": 2.6818394123391786e-05, "loss": 0.121, "step": 15351 }, { "epoch": 4.693365943136655, "grad_norm": 0.422977477312088, "learning_rate": 2.681796951297185e-05, "loss": 0.1438, "step": 15352 }, { "epoch": 4.6936716600428, "grad_norm": 0.5413080453872681, "learning_rate": 2.6817544902551907e-05, "loss": 0.2159, "step": 15353 }, { "epoch": 4.693977376948945, "grad_norm": 1.152778148651123, "learning_rate": 2.681712029213197e-05, "loss": 0.1867, "step": 15354 }, { "epoch": 4.69428309385509, "grad_norm": 0.671959400177002, "learning_rate": 2.6816695681712028e-05, "loss": 0.1658, "step": 15355 }, { "epoch": 4.6945888107612355, "grad_norm": 2.610276699066162, "learning_rate": 2.681627107129209e-05, "loss": 0.1563, "step": 15356 }, { "epoch": 4.69489452766738, "grad_norm": 0.9528270363807678, "learning_rate": 2.681584646087215e-05, "loss": 0.1517, "step": 15357 }, { "epoch": 4.695200244573525, "grad_norm": 0.7946162223815918, "learning_rate": 2.681542185045221e-05, "loss": 0.2041, "step": 15358 }, { "epoch": 4.69550596147967, "grad_norm": 1.4101364612579346, "learning_rate": 2.681499724003227e-05, "loss": 0.214, "step": 15359 }, { "epoch": 4.695811678385815, "grad_norm": 0.37842705845832825, "learning_rate": 2.681457262961233e-05, "loss": 0.1609, "step": 15360 }, { "epoch": 4.696117395291959, "grad_norm": 0.3203057050704956, "learning_rate": 2.681414801919239e-05, "loss": 0.0887, "step": 15361 }, { "epoch": 4.696423112198104, "grad_norm": 0.3610048294067383, "learning_rate": 2.681372340877245e-05, "loss": 0.0903, "step": 15362 }, { "epoch": 4.6967288291042495, "grad_norm": 0.17888808250427246, "learning_rate": 2.681329879835251e-05, "loss": 0.0625, "step": 15363 }, { "epoch": 4.697034546010395, "grad_norm": 0.36954331398010254, "learning_rate": 2.681287418793257e-05, "loss": 0.0447, "step": 15364 }, { "epoch": 4.697340262916539, "grad_norm": 0.1940033882856369, "learning_rate": 2.6812449577512632e-05, "loss": 0.0353, "step": 15365 }, { "epoch": 4.697645979822684, "grad_norm": 0.1958923637866974, "learning_rate": 2.681202496709269e-05, "loss": 0.0572, "step": 15366 }, { "epoch": 4.697951696728829, "grad_norm": 0.6035119295120239, "learning_rate": 2.6811600356672753e-05, "loss": 0.0837, "step": 15367 }, { "epoch": 4.698257413634974, "grad_norm": 0.43439266085624695, "learning_rate": 2.681117574625281e-05, "loss": 0.0641, "step": 15368 }, { "epoch": 4.698563130541119, "grad_norm": 0.3765634000301361, "learning_rate": 2.6810751135832873e-05, "loss": 0.0967, "step": 15369 }, { "epoch": 4.6988688474472635, "grad_norm": 0.28659576177597046, "learning_rate": 2.6810326525412932e-05, "loss": 0.0898, "step": 15370 }, { "epoch": 4.699174564353409, "grad_norm": 0.3753683269023895, "learning_rate": 2.6809901914992994e-05, "loss": 0.0908, "step": 15371 }, { "epoch": 4.699480281259554, "grad_norm": 2.839625358581543, "learning_rate": 2.6809477304573053e-05, "loss": 0.081, "step": 15372 }, { "epoch": 4.699785998165699, "grad_norm": 0.32549476623535156, "learning_rate": 2.6809052694153115e-05, "loss": 0.1058, "step": 15373 }, { "epoch": 4.700091715071843, "grad_norm": 0.6541196703910828, "learning_rate": 2.6808628083733177e-05, "loss": 0.1488, "step": 15374 }, { "epoch": 4.700397431977988, "grad_norm": 0.6634578704833984, "learning_rate": 2.6808203473313236e-05, "loss": 0.1594, "step": 15375 }, { "epoch": 4.700703148884133, "grad_norm": 0.5468353033065796, "learning_rate": 2.6807778862893298e-05, "loss": 0.135, "step": 15376 }, { "epoch": 4.701008865790278, "grad_norm": 0.4541108310222626, "learning_rate": 2.6807354252473357e-05, "loss": 0.1472, "step": 15377 }, { "epoch": 4.701314582696423, "grad_norm": 0.4606721103191376, "learning_rate": 2.680692964205342e-05, "loss": 0.1396, "step": 15378 }, { "epoch": 4.701620299602568, "grad_norm": 1.0668458938598633, "learning_rate": 2.6806505031633477e-05, "loss": 0.1569, "step": 15379 }, { "epoch": 4.701926016508713, "grad_norm": 0.619940459728241, "learning_rate": 2.680608042121354e-05, "loss": 0.1633, "step": 15380 }, { "epoch": 4.702231733414858, "grad_norm": 1.0865042209625244, "learning_rate": 2.6805655810793598e-05, "loss": 0.1702, "step": 15381 }, { "epoch": 4.702537450321003, "grad_norm": 0.9083575010299683, "learning_rate": 2.680523120037366e-05, "loss": 0.2182, "step": 15382 }, { "epoch": 4.702843167227147, "grad_norm": 1.1988587379455566, "learning_rate": 2.680480658995372e-05, "loss": 0.2118, "step": 15383 }, { "epoch": 4.7031488841332925, "grad_norm": 1.034746527671814, "learning_rate": 2.680438197953378e-05, "loss": 0.2419, "step": 15384 }, { "epoch": 4.703454601039438, "grad_norm": 0.2843377888202667, "learning_rate": 2.680395736911384e-05, "loss": 0.1476, "step": 15385 }, { "epoch": 4.703760317945583, "grad_norm": 0.3467104732990265, "learning_rate": 2.6803532758693902e-05, "loss": 0.0891, "step": 15386 }, { "epoch": 4.704066034851727, "grad_norm": 0.22001604735851288, "learning_rate": 2.680310814827396e-05, "loss": 0.0736, "step": 15387 }, { "epoch": 4.704371751757872, "grad_norm": 0.2797246277332306, "learning_rate": 2.680268353785402e-05, "loss": 0.0725, "step": 15388 }, { "epoch": 4.704677468664017, "grad_norm": 0.15810856223106384, "learning_rate": 2.680225892743408e-05, "loss": 0.0668, "step": 15389 }, { "epoch": 4.704983185570162, "grad_norm": 0.298189640045166, "learning_rate": 2.680183431701414e-05, "loss": 0.0549, "step": 15390 }, { "epoch": 4.7052889024763065, "grad_norm": 0.48106440901756287, "learning_rate": 2.6801409706594202e-05, "loss": 0.0549, "step": 15391 }, { "epoch": 4.705594619382452, "grad_norm": 0.1736774742603302, "learning_rate": 2.680098509617426e-05, "loss": 0.0642, "step": 15392 }, { "epoch": 4.705900336288597, "grad_norm": 0.2914932668209076, "learning_rate": 2.6800560485754323e-05, "loss": 0.1128, "step": 15393 }, { "epoch": 4.706206053194742, "grad_norm": 0.31785130500793457, "learning_rate": 2.680013587533438e-05, "loss": 0.0664, "step": 15394 }, { "epoch": 4.706511770100887, "grad_norm": 0.28000208735466003, "learning_rate": 2.6799711264914444e-05, "loss": 0.0758, "step": 15395 }, { "epoch": 4.706817487007031, "grad_norm": 1.1339943408966064, "learning_rate": 2.6799286654494502e-05, "loss": 0.0753, "step": 15396 }, { "epoch": 4.707123203913176, "grad_norm": 0.2626739740371704, "learning_rate": 2.6798862044074564e-05, "loss": 0.0752, "step": 15397 }, { "epoch": 4.707428920819321, "grad_norm": 0.5425975918769836, "learning_rate": 2.6798437433654623e-05, "loss": 0.1406, "step": 15398 }, { "epoch": 4.7077346377254665, "grad_norm": 0.5332227349281311, "learning_rate": 2.6798012823234685e-05, "loss": 0.1289, "step": 15399 }, { "epoch": 4.708040354631611, "grad_norm": 0.8117501139640808, "learning_rate": 2.6797588212814744e-05, "loss": 0.1589, "step": 15400 }, { "epoch": 4.708346071537756, "grad_norm": 0.6738077402114868, "learning_rate": 2.6797163602394803e-05, "loss": 0.1394, "step": 15401 }, { "epoch": 4.708651788443901, "grad_norm": 1.5666565895080566, "learning_rate": 2.6796738991974865e-05, "loss": 0.1475, "step": 15402 }, { "epoch": 4.708957505350046, "grad_norm": 0.6043332815170288, "learning_rate": 2.6796314381554923e-05, "loss": 0.1624, "step": 15403 }, { "epoch": 4.70926322225619, "grad_norm": 1.3583390712738037, "learning_rate": 2.6795889771134986e-05, "loss": 0.1322, "step": 15404 }, { "epoch": 4.709568939162335, "grad_norm": 0.6382148861885071, "learning_rate": 2.6795465160715044e-05, "loss": 0.1723, "step": 15405 }, { "epoch": 4.7098746560684805, "grad_norm": 0.7648168206214905, "learning_rate": 2.6795040550295106e-05, "loss": 0.1708, "step": 15406 }, { "epoch": 4.710180372974626, "grad_norm": 1.538331389427185, "learning_rate": 2.6794615939875165e-05, "loss": 0.1669, "step": 15407 }, { "epoch": 4.710486089880771, "grad_norm": 1.2944849729537964, "learning_rate": 2.6794191329455227e-05, "loss": 0.2103, "step": 15408 }, { "epoch": 4.710791806786915, "grad_norm": 1.098026990890503, "learning_rate": 2.6793766719035286e-05, "loss": 0.2491, "step": 15409 }, { "epoch": 4.71109752369306, "grad_norm": 0.5569389462471008, "learning_rate": 2.6793342108615348e-05, "loss": 0.1315, "step": 15410 }, { "epoch": 4.711403240599205, "grad_norm": 0.4316944479942322, "learning_rate": 2.6792917498195407e-05, "loss": 0.0722, "step": 15411 }, { "epoch": 4.71170895750535, "grad_norm": 0.507596492767334, "learning_rate": 2.679249288777547e-05, "loss": 0.1074, "step": 15412 }, { "epoch": 4.712014674411495, "grad_norm": 0.1910618394613266, "learning_rate": 2.6792068277355527e-05, "loss": 0.0629, "step": 15413 }, { "epoch": 4.71232039131764, "grad_norm": 0.15995095670223236, "learning_rate": 2.6791643666935586e-05, "loss": 0.0452, "step": 15414 }, { "epoch": 4.712626108223785, "grad_norm": 0.25042614340782166, "learning_rate": 2.6791219056515648e-05, "loss": 0.0534, "step": 15415 }, { "epoch": 4.71293182512993, "grad_norm": 0.25537484884262085, "learning_rate": 2.6790794446095707e-05, "loss": 0.0462, "step": 15416 }, { "epoch": 4.713237542036074, "grad_norm": 0.44222912192344666, "learning_rate": 2.679036983567577e-05, "loss": 0.0601, "step": 15417 }, { "epoch": 4.713543258942219, "grad_norm": 0.28733983635902405, "learning_rate": 2.6789945225255828e-05, "loss": 0.0676, "step": 15418 }, { "epoch": 4.713848975848364, "grad_norm": 1.6393259763717651, "learning_rate": 2.678952061483589e-05, "loss": 0.083, "step": 15419 }, { "epoch": 4.7141546927545095, "grad_norm": 0.42061322927474976, "learning_rate": 2.678909600441595e-05, "loss": 0.0826, "step": 15420 }, { "epoch": 4.714460409660655, "grad_norm": 0.44739097356796265, "learning_rate": 2.678867139399601e-05, "loss": 0.0882, "step": 15421 }, { "epoch": 4.714766126566799, "grad_norm": 0.4270138144493103, "learning_rate": 2.678824678357607e-05, "loss": 0.0834, "step": 15422 }, { "epoch": 4.715071843472944, "grad_norm": 0.38622352480888367, "learning_rate": 2.678782217315613e-05, "loss": 0.1171, "step": 15423 }, { "epoch": 4.715377560379089, "grad_norm": 0.6039283871650696, "learning_rate": 2.678739756273619e-05, "loss": 0.1219, "step": 15424 }, { "epoch": 4.715683277285234, "grad_norm": 1.0906482934951782, "learning_rate": 2.6786972952316252e-05, "loss": 0.1291, "step": 15425 }, { "epoch": 4.715988994191378, "grad_norm": 0.8775330781936646, "learning_rate": 2.678654834189631e-05, "loss": 0.1549, "step": 15426 }, { "epoch": 4.7162947110975235, "grad_norm": 0.5587326884269714, "learning_rate": 2.678612373147637e-05, "loss": 0.1489, "step": 15427 }, { "epoch": 4.716600428003669, "grad_norm": 0.5096992254257202, "learning_rate": 2.678569912105643e-05, "loss": 0.1558, "step": 15428 }, { "epoch": 4.716906144909814, "grad_norm": 1.0123767852783203, "learning_rate": 2.678527451063649e-05, "loss": 0.2014, "step": 15429 }, { "epoch": 4.717211861815958, "grad_norm": 0.989216148853302, "learning_rate": 2.6784849900216552e-05, "loss": 0.1596, "step": 15430 }, { "epoch": 4.717517578722103, "grad_norm": 1.4644906520843506, "learning_rate": 2.678442528979661e-05, "loss": 0.2406, "step": 15431 }, { "epoch": 4.717823295628248, "grad_norm": 2.0006353855133057, "learning_rate": 2.6784000679376673e-05, "loss": 0.1588, "step": 15432 }, { "epoch": 4.718129012534393, "grad_norm": 0.9193089604377747, "learning_rate": 2.6783576068956732e-05, "loss": 0.207, "step": 15433 }, { "epoch": 4.718434729440538, "grad_norm": 1.4329336881637573, "learning_rate": 2.6783151458536794e-05, "loss": 0.2527, "step": 15434 }, { "epoch": 4.718740446346683, "grad_norm": 0.5682390928268433, "learning_rate": 2.6782726848116853e-05, "loss": 0.1384, "step": 15435 }, { "epoch": 4.719046163252828, "grad_norm": 0.345245361328125, "learning_rate": 2.6782302237696915e-05, "loss": 0.075, "step": 15436 }, { "epoch": 4.719351880158973, "grad_norm": 0.287173867225647, "learning_rate": 2.6781877627276973e-05, "loss": 0.0845, "step": 15437 }, { "epoch": 4.719657597065118, "grad_norm": 2.1938114166259766, "learning_rate": 2.6781453016857036e-05, "loss": 0.0807, "step": 15438 }, { "epoch": 4.719963313971262, "grad_norm": 0.11572805792093277, "learning_rate": 2.6781028406437094e-05, "loss": 0.0332, "step": 15439 }, { "epoch": 4.720269030877407, "grad_norm": 0.17091436684131622, "learning_rate": 2.6780603796017153e-05, "loss": 0.0462, "step": 15440 }, { "epoch": 4.7205747477835525, "grad_norm": 0.4457049071788788, "learning_rate": 2.6780179185597215e-05, "loss": 0.0862, "step": 15441 }, { "epoch": 4.720880464689698, "grad_norm": 0.3122018575668335, "learning_rate": 2.6779754575177274e-05, "loss": 0.0811, "step": 15442 }, { "epoch": 4.721186181595842, "grad_norm": 0.18318095803260803, "learning_rate": 2.6779329964757336e-05, "loss": 0.0517, "step": 15443 }, { "epoch": 4.721491898501987, "grad_norm": 0.24040205776691437, "learning_rate": 2.6778905354337395e-05, "loss": 0.0687, "step": 15444 }, { "epoch": 4.721797615408132, "grad_norm": 0.27532362937927246, "learning_rate": 2.6778480743917457e-05, "loss": 0.0899, "step": 15445 }, { "epoch": 4.722103332314277, "grad_norm": 1.5184118747711182, "learning_rate": 2.6778056133497515e-05, "loss": 0.0694, "step": 15446 }, { "epoch": 4.722409049220422, "grad_norm": 0.2607455849647522, "learning_rate": 2.6777631523077577e-05, "loss": 0.0988, "step": 15447 }, { "epoch": 4.7227147661265665, "grad_norm": 0.8992821574211121, "learning_rate": 2.6777206912657636e-05, "loss": 0.1001, "step": 15448 }, { "epoch": 4.723020483032712, "grad_norm": 0.6894249320030212, "learning_rate": 2.6776782302237698e-05, "loss": 0.13, "step": 15449 }, { "epoch": 4.723326199938857, "grad_norm": 0.4117873013019562, "learning_rate": 2.6776357691817757e-05, "loss": 0.1253, "step": 15450 }, { "epoch": 4.723631916845002, "grad_norm": 0.534745454788208, "learning_rate": 2.677593308139782e-05, "loss": 0.1612, "step": 15451 }, { "epoch": 4.723937633751146, "grad_norm": 0.656191349029541, "learning_rate": 2.6775508470977878e-05, "loss": 0.204, "step": 15452 }, { "epoch": 4.724243350657291, "grad_norm": 0.6205139756202698, "learning_rate": 2.6775083860557936e-05, "loss": 0.1445, "step": 15453 }, { "epoch": 4.724549067563436, "grad_norm": 0.8376290798187256, "learning_rate": 2.6774659250138e-05, "loss": 0.2001, "step": 15454 }, { "epoch": 4.724854784469581, "grad_norm": 0.9507554769515991, "learning_rate": 2.6774234639718057e-05, "loss": 0.1663, "step": 15455 }, { "epoch": 4.725160501375726, "grad_norm": 1.0310710668563843, "learning_rate": 2.677381002929812e-05, "loss": 0.1562, "step": 15456 }, { "epoch": 4.725466218281871, "grad_norm": 1.2029623985290527, "learning_rate": 2.6773385418878178e-05, "loss": 0.1691, "step": 15457 }, { "epoch": 4.725771935188016, "grad_norm": 3.2564752101898193, "learning_rate": 2.677296080845824e-05, "loss": 0.2172, "step": 15458 }, { "epoch": 4.726077652094161, "grad_norm": 1.4058338403701782, "learning_rate": 2.67725361980383e-05, "loss": 0.2539, "step": 15459 }, { "epoch": 4.726383369000306, "grad_norm": 0.35954463481903076, "learning_rate": 2.677211158761836e-05, "loss": 0.1132, "step": 15460 }, { "epoch": 4.72668908590645, "grad_norm": 0.3148441016674042, "learning_rate": 2.677168697719842e-05, "loss": 0.0897, "step": 15461 }, { "epoch": 4.726994802812595, "grad_norm": 0.22469745576381683, "learning_rate": 2.677126236677848e-05, "loss": 0.0492, "step": 15462 }, { "epoch": 4.7273005197187405, "grad_norm": 1.1033726930618286, "learning_rate": 2.677083775635854e-05, "loss": 0.0661, "step": 15463 }, { "epoch": 4.727606236624886, "grad_norm": 0.25563180446624756, "learning_rate": 2.6770413145938602e-05, "loss": 0.0507, "step": 15464 }, { "epoch": 4.72791195353103, "grad_norm": 0.5432220101356506, "learning_rate": 2.676998853551866e-05, "loss": 0.0637, "step": 15465 }, { "epoch": 4.728217670437175, "grad_norm": 0.35140952467918396, "learning_rate": 2.676956392509872e-05, "loss": 0.0396, "step": 15466 }, { "epoch": 4.72852338734332, "grad_norm": 0.9196953773498535, "learning_rate": 2.6769139314678782e-05, "loss": 0.0733, "step": 15467 }, { "epoch": 4.728829104249465, "grad_norm": 0.6146523952484131, "learning_rate": 2.676871470425884e-05, "loss": 0.0618, "step": 15468 }, { "epoch": 4.7291348211556095, "grad_norm": 0.1619008630514145, "learning_rate": 2.6768290093838903e-05, "loss": 0.0538, "step": 15469 }, { "epoch": 4.729440538061755, "grad_norm": 0.3170586824417114, "learning_rate": 2.676786548341896e-05, "loss": 0.1253, "step": 15470 }, { "epoch": 4.7297462549679, "grad_norm": 0.2742787301540375, "learning_rate": 2.6767440872999023e-05, "loss": 0.0688, "step": 15471 }, { "epoch": 4.730051971874045, "grad_norm": 0.5337661504745483, "learning_rate": 2.6767016262579082e-05, "loss": 0.0816, "step": 15472 }, { "epoch": 4.73035768878019, "grad_norm": 0.40201741456985474, "learning_rate": 2.6766591652159144e-05, "loss": 0.1337, "step": 15473 }, { "epoch": 4.730663405686334, "grad_norm": 0.30427277088165283, "learning_rate": 2.6766167041739203e-05, "loss": 0.1066, "step": 15474 }, { "epoch": 4.730969122592479, "grad_norm": 0.5065839886665344, "learning_rate": 2.6765742431319265e-05, "loss": 0.1288, "step": 15475 }, { "epoch": 4.731274839498624, "grad_norm": 0.4490653872489929, "learning_rate": 2.6765317820899327e-05, "loss": 0.2168, "step": 15476 }, { "epoch": 4.7315805564047695, "grad_norm": 1.2884559631347656, "learning_rate": 2.6764893210479386e-05, "loss": 0.1545, "step": 15477 }, { "epoch": 4.731886273310914, "grad_norm": 0.6753866672515869, "learning_rate": 2.6764468600059448e-05, "loss": 0.1918, "step": 15478 }, { "epoch": 4.732191990217059, "grad_norm": 0.48207879066467285, "learning_rate": 2.6764043989639507e-05, "loss": 0.1508, "step": 15479 }, { "epoch": 4.732497707123204, "grad_norm": 1.3730121850967407, "learning_rate": 2.676361937921957e-05, "loss": 0.1786, "step": 15480 }, { "epoch": 4.732803424029349, "grad_norm": 1.461383581161499, "learning_rate": 2.6763194768799627e-05, "loss": 0.1591, "step": 15481 }, { "epoch": 4.733109140935493, "grad_norm": 1.2919293642044067, "learning_rate": 2.676277015837969e-05, "loss": 0.2012, "step": 15482 }, { "epoch": 4.733414857841638, "grad_norm": 0.932185173034668, "learning_rate": 2.6762345547959748e-05, "loss": 0.1677, "step": 15483 }, { "epoch": 4.7337205747477835, "grad_norm": 2.158461093902588, "learning_rate": 2.676192093753981e-05, "loss": 0.2763, "step": 15484 }, { "epoch": 4.734026291653929, "grad_norm": 0.3461936116218567, "learning_rate": 2.676149632711987e-05, "loss": 0.1509, "step": 15485 }, { "epoch": 4.734332008560074, "grad_norm": 0.2451624870300293, "learning_rate": 2.676107171669993e-05, "loss": 0.1059, "step": 15486 }, { "epoch": 4.734637725466218, "grad_norm": 1.624423861503601, "learning_rate": 2.676064710627999e-05, "loss": 0.0652, "step": 15487 }, { "epoch": 4.734943442372363, "grad_norm": 0.2954976260662079, "learning_rate": 2.6760222495860052e-05, "loss": 0.0671, "step": 15488 }, { "epoch": 4.735249159278508, "grad_norm": 0.20741838216781616, "learning_rate": 2.675979788544011e-05, "loss": 0.0613, "step": 15489 }, { "epoch": 4.735554876184653, "grad_norm": 0.4407372772693634, "learning_rate": 2.675937327502017e-05, "loss": 0.0529, "step": 15490 }, { "epoch": 4.7358605930907975, "grad_norm": 0.22910399734973907, "learning_rate": 2.675894866460023e-05, "loss": 0.0482, "step": 15491 }, { "epoch": 4.736166309996943, "grad_norm": 0.47492101788520813, "learning_rate": 2.675852405418029e-05, "loss": 0.0573, "step": 15492 }, { "epoch": 4.736472026903088, "grad_norm": 0.3531329035758972, "learning_rate": 2.6758099443760352e-05, "loss": 0.1076, "step": 15493 }, { "epoch": 4.736777743809233, "grad_norm": 0.6856342554092407, "learning_rate": 2.675767483334041e-05, "loss": 0.0519, "step": 15494 }, { "epoch": 4.737083460715377, "grad_norm": 0.7447728514671326, "learning_rate": 2.6757250222920473e-05, "loss": 0.0601, "step": 15495 }, { "epoch": 4.737389177621522, "grad_norm": 0.4982115924358368, "learning_rate": 2.675682561250053e-05, "loss": 0.0648, "step": 15496 }, { "epoch": 4.737694894527667, "grad_norm": 0.6089572310447693, "learning_rate": 2.6756401002080594e-05, "loss": 0.0997, "step": 15497 }, { "epoch": 4.7380006114338125, "grad_norm": 0.5358493328094482, "learning_rate": 2.6755976391660652e-05, "loss": 0.1358, "step": 15498 }, { "epoch": 4.738306328339958, "grad_norm": 0.48963049054145813, "learning_rate": 2.6755551781240714e-05, "loss": 0.1355, "step": 15499 }, { "epoch": 4.738612045246102, "grad_norm": 0.6579869985580444, "learning_rate": 2.6755127170820773e-05, "loss": 0.1515, "step": 15500 }, { "epoch": 4.738917762152247, "grad_norm": 0.5658666491508484, "learning_rate": 2.6754702560400835e-05, "loss": 0.2008, "step": 15501 }, { "epoch": 4.739223479058392, "grad_norm": 0.558836817741394, "learning_rate": 2.6754277949980894e-05, "loss": 0.1847, "step": 15502 }, { "epoch": 4.739529195964537, "grad_norm": 0.3692103624343872, "learning_rate": 2.6753853339560953e-05, "loss": 0.1498, "step": 15503 }, { "epoch": 4.739834912870681, "grad_norm": 0.6196784973144531, "learning_rate": 2.6753428729141015e-05, "loss": 0.179, "step": 15504 }, { "epoch": 4.7401406297768265, "grad_norm": 0.8737592697143555, "learning_rate": 2.6753004118721073e-05, "loss": 0.1832, "step": 15505 }, { "epoch": 4.740446346682972, "grad_norm": 0.6430594325065613, "learning_rate": 2.6752579508301136e-05, "loss": 0.1807, "step": 15506 }, { "epoch": 4.740752063589117, "grad_norm": 1.2821934223175049, "learning_rate": 2.6752154897881194e-05, "loss": 0.1868, "step": 15507 }, { "epoch": 4.741057780495261, "grad_norm": 0.9619268774986267, "learning_rate": 2.6751730287461256e-05, "loss": 0.1911, "step": 15508 }, { "epoch": 4.741363497401406, "grad_norm": 1.3166967630386353, "learning_rate": 2.6751305677041315e-05, "loss": 0.2541, "step": 15509 }, { "epoch": 4.741669214307551, "grad_norm": 0.3904224634170532, "learning_rate": 2.6750881066621377e-05, "loss": 0.1274, "step": 15510 }, { "epoch": 4.741974931213696, "grad_norm": 0.20141516625881195, "learning_rate": 2.6750456456201436e-05, "loss": 0.0676, "step": 15511 }, { "epoch": 4.742280648119841, "grad_norm": 1.0985835790634155, "learning_rate": 2.6750031845781498e-05, "loss": 0.0406, "step": 15512 }, { "epoch": 4.742586365025986, "grad_norm": 0.1772218644618988, "learning_rate": 2.6749607235361557e-05, "loss": 0.0364, "step": 15513 }, { "epoch": 4.742892081932131, "grad_norm": 0.4501854181289673, "learning_rate": 2.674918262494162e-05, "loss": 0.0551, "step": 15514 }, { "epoch": 4.743197798838276, "grad_norm": 0.2976124584674835, "learning_rate": 2.6748758014521677e-05, "loss": 0.0687, "step": 15515 }, { "epoch": 4.743503515744421, "grad_norm": 0.39957892894744873, "learning_rate": 2.6748333404101736e-05, "loss": 0.0581, "step": 15516 }, { "epoch": 4.743809232650565, "grad_norm": 0.24489343166351318, "learning_rate": 2.6747908793681798e-05, "loss": 0.0661, "step": 15517 }, { "epoch": 4.74411494955671, "grad_norm": 0.26595693826675415, "learning_rate": 2.6747484183261857e-05, "loss": 0.0603, "step": 15518 }, { "epoch": 4.744420666462855, "grad_norm": 0.24549883604049683, "learning_rate": 2.674705957284192e-05, "loss": 0.0784, "step": 15519 }, { "epoch": 4.7447263833690005, "grad_norm": 0.4347473680973053, "learning_rate": 2.6746634962421978e-05, "loss": 0.1004, "step": 15520 }, { "epoch": 4.745032100275145, "grad_norm": 0.3296695947647095, "learning_rate": 2.674621035200204e-05, "loss": 0.1184, "step": 15521 }, { "epoch": 4.74533781718129, "grad_norm": 0.26916539669036865, "learning_rate": 2.67457857415821e-05, "loss": 0.075, "step": 15522 }, { "epoch": 4.745643534087435, "grad_norm": 0.36684224009513855, "learning_rate": 2.674536113116216e-05, "loss": 0.116, "step": 15523 }, { "epoch": 4.74594925099358, "grad_norm": 0.5757076740264893, "learning_rate": 2.674493652074222e-05, "loss": 0.1482, "step": 15524 }, { "epoch": 4.746254967899725, "grad_norm": 0.8671665787696838, "learning_rate": 2.674451191032228e-05, "loss": 0.1172, "step": 15525 }, { "epoch": 4.7465606848058695, "grad_norm": 0.6456646919250488, "learning_rate": 2.674408729990234e-05, "loss": 0.1747, "step": 15526 }, { "epoch": 4.746866401712015, "grad_norm": 0.47880035638809204, "learning_rate": 2.6743662689482402e-05, "loss": 0.1441, "step": 15527 }, { "epoch": 4.74717211861816, "grad_norm": 0.7509108185768127, "learning_rate": 2.674323807906246e-05, "loss": 0.1844, "step": 15528 }, { "epoch": 4.747477835524305, "grad_norm": 1.6328896284103394, "learning_rate": 2.674281346864252e-05, "loss": 0.161, "step": 15529 }, { "epoch": 4.747783552430449, "grad_norm": 1.1460351943969727, "learning_rate": 2.674238885822258e-05, "loss": 0.1621, "step": 15530 }, { "epoch": 4.748089269336594, "grad_norm": 1.4284043312072754, "learning_rate": 2.674196424780264e-05, "loss": 0.2171, "step": 15531 }, { "epoch": 4.748394986242739, "grad_norm": 0.9615164399147034, "learning_rate": 2.6741539637382702e-05, "loss": 0.1944, "step": 15532 }, { "epoch": 4.748700703148884, "grad_norm": 0.9136574864387512, "learning_rate": 2.674111502696276e-05, "loss": 0.2337, "step": 15533 }, { "epoch": 4.749006420055029, "grad_norm": 2.727055549621582, "learning_rate": 2.6740690416542823e-05, "loss": 0.2814, "step": 15534 }, { "epoch": 4.749312136961174, "grad_norm": 0.6243181228637695, "learning_rate": 2.6740265806122882e-05, "loss": 0.1398, "step": 15535 }, { "epoch": 4.749617853867319, "grad_norm": 0.38999006152153015, "learning_rate": 2.6739841195702944e-05, "loss": 0.0763, "step": 15536 }, { "epoch": 4.749923570773464, "grad_norm": 0.36471548676490784, "learning_rate": 2.6739416585283003e-05, "loss": 0.0867, "step": 15537 }, { "epoch": 4.750229287679609, "grad_norm": 0.21289163827896118, "learning_rate": 2.6738991974863065e-05, "loss": 0.0489, "step": 15538 }, { "epoch": 4.750535004585753, "grad_norm": 0.15584729611873627, "learning_rate": 2.6738567364443123e-05, "loss": 0.0492, "step": 15539 }, { "epoch": 4.750840721491898, "grad_norm": 0.21512000262737274, "learning_rate": 2.6738142754023186e-05, "loss": 0.0543, "step": 15540 }, { "epoch": 4.7511464383980435, "grad_norm": 0.7727904319763184, "learning_rate": 2.6737718143603244e-05, "loss": 0.0622, "step": 15541 }, { "epoch": 4.751452155304189, "grad_norm": 1.095444679260254, "learning_rate": 2.6737293533183303e-05, "loss": 0.087, "step": 15542 }, { "epoch": 4.751757872210333, "grad_norm": 0.15629497170448303, "learning_rate": 2.6736868922763365e-05, "loss": 0.0469, "step": 15543 }, { "epoch": 4.752063589116478, "grad_norm": 0.3776550889015198, "learning_rate": 2.6736444312343424e-05, "loss": 0.0538, "step": 15544 }, { "epoch": 4.752369306022623, "grad_norm": 0.5671632885932922, "learning_rate": 2.6736019701923486e-05, "loss": 0.1129, "step": 15545 }, { "epoch": 4.752675022928768, "grad_norm": 0.19016113877296448, "learning_rate": 2.6735595091503545e-05, "loss": 0.0625, "step": 15546 }, { "epoch": 4.752980739834912, "grad_norm": 1.0341042280197144, "learning_rate": 2.6735170481083607e-05, "loss": 0.0783, "step": 15547 }, { "epoch": 4.7532864567410575, "grad_norm": 0.43117955327033997, "learning_rate": 2.6734745870663665e-05, "loss": 0.1146, "step": 15548 }, { "epoch": 4.753592173647203, "grad_norm": 0.4532405734062195, "learning_rate": 2.6734321260243727e-05, "loss": 0.1271, "step": 15549 }, { "epoch": 4.753897890553348, "grad_norm": 0.6279839277267456, "learning_rate": 2.6733896649823786e-05, "loss": 0.1128, "step": 15550 }, { "epoch": 4.754203607459493, "grad_norm": 0.29787498712539673, "learning_rate": 2.6733472039403848e-05, "loss": 0.1362, "step": 15551 }, { "epoch": 4.754509324365637, "grad_norm": 0.5428005456924438, "learning_rate": 2.6733047428983907e-05, "loss": 0.1668, "step": 15552 }, { "epoch": 4.754815041271782, "grad_norm": 1.1838384866714478, "learning_rate": 2.673262281856397e-05, "loss": 0.195, "step": 15553 }, { "epoch": 4.755120758177927, "grad_norm": 0.5044512152671814, "learning_rate": 2.6732198208144028e-05, "loss": 0.1696, "step": 15554 }, { "epoch": 4.7554264750840725, "grad_norm": 0.5703250765800476, "learning_rate": 2.6731773597724086e-05, "loss": 0.1625, "step": 15555 }, { "epoch": 4.755732191990217, "grad_norm": 1.1772677898406982, "learning_rate": 2.673134898730415e-05, "loss": 0.2038, "step": 15556 }, { "epoch": 4.756037908896362, "grad_norm": 0.7101632952690125, "learning_rate": 2.6730924376884207e-05, "loss": 0.1859, "step": 15557 }, { "epoch": 4.756343625802507, "grad_norm": 2.2967405319213867, "learning_rate": 2.673049976646427e-05, "loss": 0.1961, "step": 15558 }, { "epoch": 4.756649342708652, "grad_norm": 0.9420740008354187, "learning_rate": 2.6730075156044328e-05, "loss": 0.2302, "step": 15559 }, { "epoch": 4.756955059614796, "grad_norm": 0.37894871830940247, "learning_rate": 2.672965054562439e-05, "loss": 0.1494, "step": 15560 }, { "epoch": 4.757260776520941, "grad_norm": 0.2704842984676361, "learning_rate": 2.672922593520445e-05, "loss": 0.079, "step": 15561 }, { "epoch": 4.7575664934270865, "grad_norm": 0.3644258677959442, "learning_rate": 2.672880132478451e-05, "loss": 0.0724, "step": 15562 }, { "epoch": 4.757872210333232, "grad_norm": 0.17737939953804016, "learning_rate": 2.672837671436457e-05, "loss": 0.0486, "step": 15563 }, { "epoch": 4.758177927239377, "grad_norm": 0.297868549823761, "learning_rate": 2.672795210394463e-05, "loss": 0.063, "step": 15564 }, { "epoch": 4.758483644145521, "grad_norm": 0.3873581290245056, "learning_rate": 2.672752749352469e-05, "loss": 0.0507, "step": 15565 }, { "epoch": 4.758789361051666, "grad_norm": 0.27237963676452637, "learning_rate": 2.6727102883104752e-05, "loss": 0.0737, "step": 15566 }, { "epoch": 4.759095077957811, "grad_norm": 0.26742658019065857, "learning_rate": 2.672667827268481e-05, "loss": 0.0472, "step": 15567 }, { "epoch": 4.759400794863956, "grad_norm": 0.38208094239234924, "learning_rate": 2.672625366226487e-05, "loss": 0.0847, "step": 15568 }, { "epoch": 4.7597065117701005, "grad_norm": 0.7327196002006531, "learning_rate": 2.6725829051844932e-05, "loss": 0.0725, "step": 15569 }, { "epoch": 4.760012228676246, "grad_norm": 0.9418108463287354, "learning_rate": 2.672540444142499e-05, "loss": 0.0621, "step": 15570 }, { "epoch": 4.760317945582391, "grad_norm": 0.45676612854003906, "learning_rate": 2.6724979831005053e-05, "loss": 0.0816, "step": 15571 }, { "epoch": 4.760623662488536, "grad_norm": 0.4362925589084625, "learning_rate": 2.672455522058511e-05, "loss": 0.0878, "step": 15572 }, { "epoch": 4.76092937939468, "grad_norm": 0.6080014109611511, "learning_rate": 2.6724130610165173e-05, "loss": 0.1084, "step": 15573 }, { "epoch": 4.761235096300825, "grad_norm": 0.7287077903747559, "learning_rate": 2.6723705999745232e-05, "loss": 0.1294, "step": 15574 }, { "epoch": 4.76154081320697, "grad_norm": 0.4062003195285797, "learning_rate": 2.6723281389325294e-05, "loss": 0.1239, "step": 15575 }, { "epoch": 4.761846530113115, "grad_norm": 0.8360328078269958, "learning_rate": 2.6722856778905353e-05, "loss": 0.1714, "step": 15576 }, { "epoch": 4.7621522470192605, "grad_norm": 0.8267832398414612, "learning_rate": 2.6722432168485415e-05, "loss": 0.1912, "step": 15577 }, { "epoch": 4.762457963925405, "grad_norm": 0.8864356279373169, "learning_rate": 2.6722007558065477e-05, "loss": 0.1839, "step": 15578 }, { "epoch": 4.76276368083155, "grad_norm": 0.4981704652309418, "learning_rate": 2.672158294764554e-05, "loss": 0.1597, "step": 15579 }, { "epoch": 4.763069397737695, "grad_norm": 0.9783864617347717, "learning_rate": 2.6721158337225598e-05, "loss": 0.2098, "step": 15580 }, { "epoch": 4.76337511464384, "grad_norm": 4.111319541931152, "learning_rate": 2.6720733726805657e-05, "loss": 0.2021, "step": 15581 }, { "epoch": 4.763680831549984, "grad_norm": 1.2487386465072632, "learning_rate": 2.672030911638572e-05, "loss": 0.168, "step": 15582 }, { "epoch": 4.7639865484561295, "grad_norm": 1.5650361776351929, "learning_rate": 2.6719884505965777e-05, "loss": 0.1795, "step": 15583 }, { "epoch": 4.764292265362275, "grad_norm": 1.8927040100097656, "learning_rate": 2.671945989554584e-05, "loss": 0.2614, "step": 15584 }, { "epoch": 4.76459798226842, "grad_norm": 0.33663904666900635, "learning_rate": 2.6719035285125898e-05, "loss": 0.1479, "step": 15585 }, { "epoch": 4.764903699174564, "grad_norm": 1.1730259656906128, "learning_rate": 2.671861067470596e-05, "loss": 0.0885, "step": 15586 }, { "epoch": 4.765209416080709, "grad_norm": 0.2831679880619049, "learning_rate": 2.671818606428602e-05, "loss": 0.0834, "step": 15587 }, { "epoch": 4.765515132986854, "grad_norm": 0.2535910904407501, "learning_rate": 2.671776145386608e-05, "loss": 0.0673, "step": 15588 }, { "epoch": 4.765820849892999, "grad_norm": 0.2642778754234314, "learning_rate": 2.671733684344614e-05, "loss": 0.0595, "step": 15589 }, { "epoch": 4.766126566799144, "grad_norm": 0.2686872184276581, "learning_rate": 2.6716912233026202e-05, "loss": 0.0492, "step": 15590 }, { "epoch": 4.766432283705289, "grad_norm": 1.3936817646026611, "learning_rate": 2.671648762260626e-05, "loss": 0.0952, "step": 15591 }, { "epoch": 4.766738000611434, "grad_norm": 0.2997973561286926, "learning_rate": 2.671606301218632e-05, "loss": 0.0871, "step": 15592 }, { "epoch": 4.767043717517579, "grad_norm": 0.3567797839641571, "learning_rate": 2.671563840176638e-05, "loss": 0.056, "step": 15593 }, { "epoch": 4.767349434423724, "grad_norm": 0.3195769190788269, "learning_rate": 2.671521379134644e-05, "loss": 0.0814, "step": 15594 }, { "epoch": 4.767655151329868, "grad_norm": 0.3628385066986084, "learning_rate": 2.6714789180926502e-05, "loss": 0.0782, "step": 15595 }, { "epoch": 4.767960868236013, "grad_norm": 1.8942173719406128, "learning_rate": 2.671436457050656e-05, "loss": 0.1037, "step": 15596 }, { "epoch": 4.768266585142158, "grad_norm": 0.6837552189826965, "learning_rate": 2.6713939960086623e-05, "loss": 0.1019, "step": 15597 }, { "epoch": 4.7685723020483035, "grad_norm": 0.8912591338157654, "learning_rate": 2.671351534966668e-05, "loss": 0.1452, "step": 15598 }, { "epoch": 4.768878018954448, "grad_norm": 0.4051685333251953, "learning_rate": 2.6713090739246744e-05, "loss": 0.1196, "step": 15599 }, { "epoch": 4.769183735860593, "grad_norm": 0.43861308693885803, "learning_rate": 2.6712666128826802e-05, "loss": 0.1539, "step": 15600 }, { "epoch": 4.769489452766738, "grad_norm": 0.5856853127479553, "learning_rate": 2.6712241518406865e-05, "loss": 0.1539, "step": 15601 }, { "epoch": 4.769795169672883, "grad_norm": 0.5344411134719849, "learning_rate": 2.6711816907986923e-05, "loss": 0.1465, "step": 15602 }, { "epoch": 4.770100886579028, "grad_norm": 1.1220037937164307, "learning_rate": 2.6711392297566985e-05, "loss": 0.2094, "step": 15603 }, { "epoch": 4.770406603485172, "grad_norm": 0.48453405499458313, "learning_rate": 2.6710967687147044e-05, "loss": 0.1608, "step": 15604 }, { "epoch": 4.7707123203913175, "grad_norm": 0.9990222454071045, "learning_rate": 2.6710543076727103e-05, "loss": 0.1951, "step": 15605 }, { "epoch": 4.771018037297463, "grad_norm": 1.3574508428573608, "learning_rate": 2.6710118466307165e-05, "loss": 0.1426, "step": 15606 }, { "epoch": 4.771323754203608, "grad_norm": 0.7748913168907166, "learning_rate": 2.6709693855887223e-05, "loss": 0.189, "step": 15607 }, { "epoch": 4.771629471109752, "grad_norm": 0.9638360142707825, "learning_rate": 2.6709269245467286e-05, "loss": 0.1819, "step": 15608 }, { "epoch": 4.771935188015897, "grad_norm": 2.0785417556762695, "learning_rate": 2.6708844635047344e-05, "loss": 0.268, "step": 15609 }, { "epoch": 4.772240904922042, "grad_norm": 0.33042609691619873, "learning_rate": 2.6708420024627406e-05, "loss": 0.142, "step": 15610 }, { "epoch": 4.772546621828187, "grad_norm": 0.3634949028491974, "learning_rate": 2.6707995414207465e-05, "loss": 0.091, "step": 15611 }, { "epoch": 4.772852338734332, "grad_norm": 0.25966474413871765, "learning_rate": 2.6707570803787527e-05, "loss": 0.0538, "step": 15612 }, { "epoch": 4.773158055640477, "grad_norm": 0.4378645718097687, "learning_rate": 2.6707146193367586e-05, "loss": 0.0635, "step": 15613 }, { "epoch": 4.773463772546622, "grad_norm": 0.34623128175735474, "learning_rate": 2.6706721582947648e-05, "loss": 0.0476, "step": 15614 }, { "epoch": 4.773769489452767, "grad_norm": 0.3951106071472168, "learning_rate": 2.6706296972527707e-05, "loss": 0.095, "step": 15615 }, { "epoch": 4.774075206358912, "grad_norm": 0.2664748430252075, "learning_rate": 2.670587236210777e-05, "loss": 0.0567, "step": 15616 }, { "epoch": 4.774380923265056, "grad_norm": 1.5128334760665894, "learning_rate": 2.6705447751687827e-05, "loss": 0.1132, "step": 15617 }, { "epoch": 4.774686640171201, "grad_norm": 0.4229755699634552, "learning_rate": 2.6705023141267886e-05, "loss": 0.0869, "step": 15618 }, { "epoch": 4.7749923570773465, "grad_norm": 3.255587339401245, "learning_rate": 2.6704598530847948e-05, "loss": 0.06, "step": 15619 }, { "epoch": 4.775298073983492, "grad_norm": 0.1927347034215927, "learning_rate": 2.6704173920428007e-05, "loss": 0.0544, "step": 15620 }, { "epoch": 4.775603790889636, "grad_norm": 0.30803191661834717, "learning_rate": 2.670374931000807e-05, "loss": 0.0882, "step": 15621 }, { "epoch": 4.775909507795781, "grad_norm": 0.8772420883178711, "learning_rate": 2.6703324699588128e-05, "loss": 0.0919, "step": 15622 }, { "epoch": 4.776215224701926, "grad_norm": 0.9117602109909058, "learning_rate": 2.670290008916819e-05, "loss": 0.1228, "step": 15623 }, { "epoch": 4.776520941608071, "grad_norm": 0.4405968189239502, "learning_rate": 2.670247547874825e-05, "loss": 0.0992, "step": 15624 }, { "epoch": 4.776826658514215, "grad_norm": 0.4758959412574768, "learning_rate": 2.670205086832831e-05, "loss": 0.1439, "step": 15625 }, { "epoch": 4.7771323754203605, "grad_norm": 0.7873270511627197, "learning_rate": 2.670162625790837e-05, "loss": 0.1901, "step": 15626 }, { "epoch": 4.777438092326506, "grad_norm": 1.1970916986465454, "learning_rate": 2.670120164748843e-05, "loss": 0.1404, "step": 15627 }, { "epoch": 4.777743809232651, "grad_norm": 0.8034089207649231, "learning_rate": 2.670077703706849e-05, "loss": 0.1548, "step": 15628 }, { "epoch": 4.778049526138796, "grad_norm": 0.808716356754303, "learning_rate": 2.6700352426648552e-05, "loss": 0.1697, "step": 15629 }, { "epoch": 4.77835524304494, "grad_norm": 0.7088771462440491, "learning_rate": 2.669992781622861e-05, "loss": 0.1875, "step": 15630 }, { "epoch": 4.778660959951085, "grad_norm": 0.914836049079895, "learning_rate": 2.669950320580867e-05, "loss": 0.2082, "step": 15631 }, { "epoch": 4.77896667685723, "grad_norm": 1.7761976718902588, "learning_rate": 2.669907859538873e-05, "loss": 0.2202, "step": 15632 }, { "epoch": 4.779272393763375, "grad_norm": 0.7730107307434082, "learning_rate": 2.669865398496879e-05, "loss": 0.1907, "step": 15633 }, { "epoch": 4.77957811066952, "grad_norm": 2.255401372909546, "learning_rate": 2.6698229374548852e-05, "loss": 0.2879, "step": 15634 }, { "epoch": 4.779883827575665, "grad_norm": 0.9744365215301514, "learning_rate": 2.669780476412891e-05, "loss": 0.1292, "step": 15635 }, { "epoch": 4.78018954448181, "grad_norm": 0.2563032805919647, "learning_rate": 2.6697380153708973e-05, "loss": 0.1088, "step": 15636 }, { "epoch": 4.780495261387955, "grad_norm": 0.24860912561416626, "learning_rate": 2.6696955543289032e-05, "loss": 0.0696, "step": 15637 }, { "epoch": 4.780800978294099, "grad_norm": 0.7129545211791992, "learning_rate": 2.6696530932869094e-05, "loss": 0.0751, "step": 15638 }, { "epoch": 4.781106695200244, "grad_norm": 0.27117493748664856, "learning_rate": 2.6696106322449153e-05, "loss": 0.0583, "step": 15639 }, { "epoch": 4.7814124121063895, "grad_norm": 0.5525006651878357, "learning_rate": 2.6695681712029215e-05, "loss": 0.0509, "step": 15640 }, { "epoch": 4.781718129012535, "grad_norm": 0.24007654190063477, "learning_rate": 2.6695257101609273e-05, "loss": 0.065, "step": 15641 }, { "epoch": 4.78202384591868, "grad_norm": 0.2752619981765747, "learning_rate": 2.6694832491189336e-05, "loss": 0.0785, "step": 15642 }, { "epoch": 4.782329562824824, "grad_norm": 0.7001988887786865, "learning_rate": 2.6694407880769394e-05, "loss": 0.072, "step": 15643 }, { "epoch": 4.782635279730969, "grad_norm": 0.3578706979751587, "learning_rate": 2.6693983270349453e-05, "loss": 0.0815, "step": 15644 }, { "epoch": 4.782940996637114, "grad_norm": 0.40610039234161377, "learning_rate": 2.6693558659929515e-05, "loss": 0.0856, "step": 15645 }, { "epoch": 4.783246713543259, "grad_norm": 1.173287034034729, "learning_rate": 2.6693134049509574e-05, "loss": 0.0924, "step": 15646 }, { "epoch": 4.7835524304494035, "grad_norm": 0.31334197521209717, "learning_rate": 2.6692709439089636e-05, "loss": 0.0992, "step": 15647 }, { "epoch": 4.783858147355549, "grad_norm": 0.396474152803421, "learning_rate": 2.6692284828669695e-05, "loss": 0.1344, "step": 15648 }, { "epoch": 4.784163864261694, "grad_norm": 0.49313855171203613, "learning_rate": 2.6691860218249757e-05, "loss": 0.1096, "step": 15649 }, { "epoch": 4.784469581167839, "grad_norm": 0.570652186870575, "learning_rate": 2.6691435607829815e-05, "loss": 0.1431, "step": 15650 }, { "epoch": 4.784775298073983, "grad_norm": 0.6634118556976318, "learning_rate": 2.6691010997409877e-05, "loss": 0.1865, "step": 15651 }, { "epoch": 4.785081014980128, "grad_norm": 1.0125056505203247, "learning_rate": 2.6690586386989936e-05, "loss": 0.207, "step": 15652 }, { "epoch": 4.785386731886273, "grad_norm": 2.116285562515259, "learning_rate": 2.6690161776569998e-05, "loss": 0.1432, "step": 15653 }, { "epoch": 4.785692448792418, "grad_norm": 0.6631580591201782, "learning_rate": 2.6689737166150057e-05, "loss": 0.1918, "step": 15654 }, { "epoch": 4.7859981656985635, "grad_norm": 0.657293438911438, "learning_rate": 2.668931255573012e-05, "loss": 0.1901, "step": 15655 }, { "epoch": 4.786303882604708, "grad_norm": 1.3305116891860962, "learning_rate": 2.6688887945310178e-05, "loss": 0.1904, "step": 15656 }, { "epoch": 4.786609599510853, "grad_norm": 0.807292640209198, "learning_rate": 2.6688463334890236e-05, "loss": 0.1999, "step": 15657 }, { "epoch": 4.786915316416998, "grad_norm": 1.637930989265442, "learning_rate": 2.66880387244703e-05, "loss": 0.204, "step": 15658 }, { "epoch": 4.787221033323143, "grad_norm": 2.344982624053955, "learning_rate": 2.6687614114050357e-05, "loss": 0.2809, "step": 15659 }, { "epoch": 4.787526750229287, "grad_norm": 0.6961372494697571, "learning_rate": 2.668718950363042e-05, "loss": 0.109, "step": 15660 }, { "epoch": 4.787832467135432, "grad_norm": 0.38653308153152466, "learning_rate": 2.6686764893210478e-05, "loss": 0.1048, "step": 15661 }, { "epoch": 4.7881381840415775, "grad_norm": 0.2327260971069336, "learning_rate": 2.668634028279054e-05, "loss": 0.0975, "step": 15662 }, { "epoch": 4.788443900947723, "grad_norm": 0.2015862911939621, "learning_rate": 2.66859156723706e-05, "loss": 0.0673, "step": 15663 }, { "epoch": 4.788749617853867, "grad_norm": 0.36131036281585693, "learning_rate": 2.668549106195066e-05, "loss": 0.0672, "step": 15664 }, { "epoch": 4.789055334760012, "grad_norm": 0.33856508135795593, "learning_rate": 2.668506645153072e-05, "loss": 0.0378, "step": 15665 }, { "epoch": 4.789361051666157, "grad_norm": 0.30548039078712463, "learning_rate": 2.668464184111078e-05, "loss": 0.1177, "step": 15666 }, { "epoch": 4.789666768572302, "grad_norm": 0.31199556589126587, "learning_rate": 2.668421723069084e-05, "loss": 0.0479, "step": 15667 }, { "epoch": 4.789972485478447, "grad_norm": 0.31618350744247437, "learning_rate": 2.6683792620270902e-05, "loss": 0.0693, "step": 15668 }, { "epoch": 4.790278202384592, "grad_norm": 0.5224365592002869, "learning_rate": 2.668336800985096e-05, "loss": 0.0976, "step": 15669 }, { "epoch": 4.790583919290737, "grad_norm": 0.4872288107872009, "learning_rate": 2.668294339943102e-05, "loss": 0.0691, "step": 15670 }, { "epoch": 4.790889636196882, "grad_norm": 0.400310218334198, "learning_rate": 2.6682518789011082e-05, "loss": 0.0803, "step": 15671 }, { "epoch": 4.791195353103027, "grad_norm": 0.8948304057121277, "learning_rate": 2.668209417859114e-05, "loss": 0.1155, "step": 15672 }, { "epoch": 4.791501070009171, "grad_norm": 0.3543936610221863, "learning_rate": 2.6681669568171203e-05, "loss": 0.1242, "step": 15673 }, { "epoch": 4.791806786915316, "grad_norm": 0.34561073780059814, "learning_rate": 2.668124495775126e-05, "loss": 0.1359, "step": 15674 }, { "epoch": 4.792112503821461, "grad_norm": 0.64077228307724, "learning_rate": 2.6680820347331324e-05, "loss": 0.1213, "step": 15675 }, { "epoch": 4.7924182207276065, "grad_norm": 0.42780613899230957, "learning_rate": 2.6680395736911382e-05, "loss": 0.1442, "step": 15676 }, { "epoch": 4.792723937633751, "grad_norm": 0.6903275847434998, "learning_rate": 2.6679971126491444e-05, "loss": 0.1523, "step": 15677 }, { "epoch": 4.793029654539896, "grad_norm": 1.1424777507781982, "learning_rate": 2.6679546516071503e-05, "loss": 0.1732, "step": 15678 }, { "epoch": 4.793335371446041, "grad_norm": 0.5057557225227356, "learning_rate": 2.6679121905651565e-05, "loss": 0.1645, "step": 15679 }, { "epoch": 4.793641088352186, "grad_norm": 0.5947940945625305, "learning_rate": 2.6678697295231627e-05, "loss": 0.1757, "step": 15680 }, { "epoch": 4.793946805258331, "grad_norm": 0.46336764097213745, "learning_rate": 2.667827268481169e-05, "loss": 0.1602, "step": 15681 }, { "epoch": 4.794252522164475, "grad_norm": 1.8710860013961792, "learning_rate": 2.6677848074391748e-05, "loss": 0.2929, "step": 15682 }, { "epoch": 4.7945582390706205, "grad_norm": 0.9564358592033386, "learning_rate": 2.6677423463971807e-05, "loss": 0.21, "step": 15683 }, { "epoch": 4.794863955976766, "grad_norm": 1.0703682899475098, "learning_rate": 2.667699885355187e-05, "loss": 0.2569, "step": 15684 }, { "epoch": 4.795169672882911, "grad_norm": 0.529213011264801, "learning_rate": 2.6676574243131927e-05, "loss": 0.1379, "step": 15685 }, { "epoch": 4.795475389789055, "grad_norm": 0.6338291168212891, "learning_rate": 2.667614963271199e-05, "loss": 0.1035, "step": 15686 }, { "epoch": 4.7957811066952, "grad_norm": 0.17024636268615723, "learning_rate": 2.6675725022292048e-05, "loss": 0.0489, "step": 15687 }, { "epoch": 4.796086823601345, "grad_norm": 0.17667967081069946, "learning_rate": 2.667530041187211e-05, "loss": 0.0526, "step": 15688 }, { "epoch": 4.79639254050749, "grad_norm": 0.5144132971763611, "learning_rate": 2.667487580145217e-05, "loss": 0.0595, "step": 15689 }, { "epoch": 4.7966982574136345, "grad_norm": 0.23151050508022308, "learning_rate": 2.667445119103223e-05, "loss": 0.0488, "step": 15690 }, { "epoch": 4.79700397431978, "grad_norm": 0.36293861269950867, "learning_rate": 2.667402658061229e-05, "loss": 0.0795, "step": 15691 }, { "epoch": 4.797309691225925, "grad_norm": 0.4968498647212982, "learning_rate": 2.6673601970192352e-05, "loss": 0.0825, "step": 15692 }, { "epoch": 4.79761540813207, "grad_norm": 0.277657151222229, "learning_rate": 2.667317735977241e-05, "loss": 0.0778, "step": 15693 }, { "epoch": 4.797921125038215, "grad_norm": 0.5195251703262329, "learning_rate": 2.6672752749352473e-05, "loss": 0.0739, "step": 15694 }, { "epoch": 4.798226841944359, "grad_norm": 0.2520146667957306, "learning_rate": 2.667232813893253e-05, "loss": 0.0828, "step": 15695 }, { "epoch": 4.798532558850504, "grad_norm": 0.45456743240356445, "learning_rate": 2.667190352851259e-05, "loss": 0.063, "step": 15696 }, { "epoch": 4.7988382757566495, "grad_norm": 0.33080974221229553, "learning_rate": 2.6671478918092652e-05, "loss": 0.1221, "step": 15697 }, { "epoch": 4.799143992662795, "grad_norm": 0.27165597677230835, "learning_rate": 2.667105430767271e-05, "loss": 0.1092, "step": 15698 }, { "epoch": 4.799449709568939, "grad_norm": 0.9649966359138489, "learning_rate": 2.6670629697252773e-05, "loss": 0.1087, "step": 15699 }, { "epoch": 4.799755426475084, "grad_norm": 0.6683133840560913, "learning_rate": 2.667020508683283e-05, "loss": 0.1345, "step": 15700 }, { "epoch": 4.800061143381229, "grad_norm": 0.7035684585571289, "learning_rate": 2.6669780476412894e-05, "loss": 0.1612, "step": 15701 }, { "epoch": 4.800366860287374, "grad_norm": 0.7257013320922852, "learning_rate": 2.6669355865992952e-05, "loss": 0.1875, "step": 15702 }, { "epoch": 4.800672577193518, "grad_norm": 0.5434814095497131, "learning_rate": 2.6668931255573015e-05, "loss": 0.1469, "step": 15703 }, { "epoch": 4.8009782940996635, "grad_norm": 0.6231019496917725, "learning_rate": 2.6668506645153073e-05, "loss": 0.1574, "step": 15704 }, { "epoch": 4.801284011005809, "grad_norm": 0.6849789023399353, "learning_rate": 2.6668082034733135e-05, "loss": 0.209, "step": 15705 }, { "epoch": 4.801589727911954, "grad_norm": 1.2462950944900513, "learning_rate": 2.6667657424313194e-05, "loss": 0.1946, "step": 15706 }, { "epoch": 4.801895444818099, "grad_norm": 1.4298832416534424, "learning_rate": 2.6667232813893253e-05, "loss": 0.1689, "step": 15707 }, { "epoch": 4.802201161724243, "grad_norm": 0.8686357140541077, "learning_rate": 2.6666808203473315e-05, "loss": 0.2294, "step": 15708 }, { "epoch": 4.802506878630388, "grad_norm": 1.4436752796173096, "learning_rate": 2.6666383593053374e-05, "loss": 0.2673, "step": 15709 }, { "epoch": 4.802812595536533, "grad_norm": 0.40931448340415955, "learning_rate": 2.6665958982633436e-05, "loss": 0.1319, "step": 15710 }, { "epoch": 4.803118312442678, "grad_norm": 0.47706466913223267, "learning_rate": 2.6665534372213494e-05, "loss": 0.0877, "step": 15711 }, { "epoch": 4.803424029348823, "grad_norm": 0.5428346991539001, "learning_rate": 2.6665109761793556e-05, "loss": 0.0854, "step": 15712 }, { "epoch": 4.803729746254968, "grad_norm": 0.3708249032497406, "learning_rate": 2.6664685151373615e-05, "loss": 0.0546, "step": 15713 }, { "epoch": 4.804035463161113, "grad_norm": 0.21375881135463715, "learning_rate": 2.6664260540953677e-05, "loss": 0.0446, "step": 15714 }, { "epoch": 4.804341180067258, "grad_norm": 0.3092779517173767, "learning_rate": 2.6663835930533736e-05, "loss": 0.0707, "step": 15715 }, { "epoch": 4.804646896973402, "grad_norm": 0.21842053532600403, "learning_rate": 2.6663411320113798e-05, "loss": 0.0583, "step": 15716 }, { "epoch": 4.804952613879547, "grad_norm": 0.24462075531482697, "learning_rate": 2.6662986709693857e-05, "loss": 0.0579, "step": 15717 }, { "epoch": 4.805258330785692, "grad_norm": 0.27174434065818787, "learning_rate": 2.666256209927392e-05, "loss": 0.0711, "step": 15718 }, { "epoch": 4.8055640476918375, "grad_norm": 0.2850550413131714, "learning_rate": 2.6662137488853977e-05, "loss": 0.07, "step": 15719 }, { "epoch": 4.805869764597983, "grad_norm": 0.28589001297950745, "learning_rate": 2.6661712878434036e-05, "loss": 0.1017, "step": 15720 }, { "epoch": 4.806175481504127, "grad_norm": 0.25585728883743286, "learning_rate": 2.6661288268014098e-05, "loss": 0.0871, "step": 15721 }, { "epoch": 4.806481198410272, "grad_norm": 0.6289474964141846, "learning_rate": 2.6660863657594157e-05, "loss": 0.0873, "step": 15722 }, { "epoch": 4.806786915316417, "grad_norm": 0.3271428346633911, "learning_rate": 2.666043904717422e-05, "loss": 0.1242, "step": 15723 }, { "epoch": 4.807092632222562, "grad_norm": 0.5494830012321472, "learning_rate": 2.6660014436754278e-05, "loss": 0.1368, "step": 15724 }, { "epoch": 4.8073983491287064, "grad_norm": 0.491993248462677, "learning_rate": 2.665958982633434e-05, "loss": 0.1769, "step": 15725 }, { "epoch": 4.807704066034852, "grad_norm": 0.5476564764976501, "learning_rate": 2.66591652159144e-05, "loss": 0.1328, "step": 15726 }, { "epoch": 4.808009782940997, "grad_norm": 0.7452865839004517, "learning_rate": 2.665874060549446e-05, "loss": 0.1718, "step": 15727 }, { "epoch": 4.808315499847142, "grad_norm": 0.5329870581626892, "learning_rate": 2.665831599507452e-05, "loss": 0.2256, "step": 15728 }, { "epoch": 4.808621216753286, "grad_norm": 0.4099826514720917, "learning_rate": 2.665789138465458e-05, "loss": 0.1427, "step": 15729 }, { "epoch": 4.808926933659431, "grad_norm": 0.686372697353363, "learning_rate": 2.665746677423464e-05, "loss": 0.1666, "step": 15730 }, { "epoch": 4.809232650565576, "grad_norm": 0.9409127235412598, "learning_rate": 2.6657042163814702e-05, "loss": 0.1569, "step": 15731 }, { "epoch": 4.809538367471721, "grad_norm": 0.874701738357544, "learning_rate": 2.665661755339476e-05, "loss": 0.2321, "step": 15732 }, { "epoch": 4.8098440843778665, "grad_norm": 2.204712390899658, "learning_rate": 2.665619294297482e-05, "loss": 0.1991, "step": 15733 }, { "epoch": 4.810149801284011, "grad_norm": 5.6752824783325195, "learning_rate": 2.665576833255488e-05, "loss": 0.249, "step": 15734 }, { "epoch": 4.810455518190156, "grad_norm": 0.5237113833427429, "learning_rate": 2.665534372213494e-05, "loss": 0.1501, "step": 15735 }, { "epoch": 4.810761235096301, "grad_norm": 0.2572159469127655, "learning_rate": 2.6654919111715002e-05, "loss": 0.0833, "step": 15736 }, { "epoch": 4.811066952002446, "grad_norm": 0.39703062176704407, "learning_rate": 2.665449450129506e-05, "loss": 0.0696, "step": 15737 }, { "epoch": 4.81137266890859, "grad_norm": 0.18090301752090454, "learning_rate": 2.6654069890875123e-05, "loss": 0.0521, "step": 15738 }, { "epoch": 4.811678385814735, "grad_norm": 0.13991428911685944, "learning_rate": 2.6653645280455182e-05, "loss": 0.048, "step": 15739 }, { "epoch": 4.8119841027208805, "grad_norm": 0.17378424108028412, "learning_rate": 2.6653220670035244e-05, "loss": 0.0644, "step": 15740 }, { "epoch": 4.812289819627026, "grad_norm": 0.2420710027217865, "learning_rate": 2.6652796059615303e-05, "loss": 0.0523, "step": 15741 }, { "epoch": 4.81259553653317, "grad_norm": 0.22850501537322998, "learning_rate": 2.6652371449195365e-05, "loss": 0.0902, "step": 15742 }, { "epoch": 4.812901253439315, "grad_norm": 0.3278219699859619, "learning_rate": 2.6651946838775424e-05, "loss": 0.0889, "step": 15743 }, { "epoch": 4.81320697034546, "grad_norm": 0.2799775004386902, "learning_rate": 2.6651522228355486e-05, "loss": 0.0653, "step": 15744 }, { "epoch": 4.813512687251605, "grad_norm": 0.26626715064048767, "learning_rate": 2.6651097617935544e-05, "loss": 0.0935, "step": 15745 }, { "epoch": 4.81381840415775, "grad_norm": 0.29610225558280945, "learning_rate": 2.6650673007515603e-05, "loss": 0.0723, "step": 15746 }, { "epoch": 4.8141241210638945, "grad_norm": 0.32684940099716187, "learning_rate": 2.6650248397095665e-05, "loss": 0.1015, "step": 15747 }, { "epoch": 4.81442983797004, "grad_norm": 0.30269700288772583, "learning_rate": 2.6649823786675724e-05, "loss": 0.1145, "step": 15748 }, { "epoch": 4.814735554876185, "grad_norm": 0.2822059392929077, "learning_rate": 2.6649399176255786e-05, "loss": 0.1122, "step": 15749 }, { "epoch": 4.81504127178233, "grad_norm": 0.5509469509124756, "learning_rate": 2.6648974565835845e-05, "loss": 0.1391, "step": 15750 }, { "epoch": 4.815346988688474, "grad_norm": 2.772015333175659, "learning_rate": 2.6648549955415907e-05, "loss": 0.1522, "step": 15751 }, { "epoch": 4.815652705594619, "grad_norm": 0.5280134677886963, "learning_rate": 2.6648125344995965e-05, "loss": 0.1509, "step": 15752 }, { "epoch": 4.815958422500764, "grad_norm": 0.5185939073562622, "learning_rate": 2.6647700734576027e-05, "loss": 0.1484, "step": 15753 }, { "epoch": 4.8162641394069095, "grad_norm": 0.48052778840065, "learning_rate": 2.6647276124156086e-05, "loss": 0.1897, "step": 15754 }, { "epoch": 4.816569856313054, "grad_norm": 1.0945007801055908, "learning_rate": 2.6646851513736148e-05, "loss": 0.1521, "step": 15755 }, { "epoch": 4.816875573219199, "grad_norm": 1.0221805572509766, "learning_rate": 2.6646426903316207e-05, "loss": 0.2171, "step": 15756 }, { "epoch": 4.817181290125344, "grad_norm": 2.6333484649658203, "learning_rate": 2.664600229289627e-05, "loss": 0.2123, "step": 15757 }, { "epoch": 4.817487007031489, "grad_norm": 1.0313090085983276, "learning_rate": 2.6645577682476328e-05, "loss": 0.1937, "step": 15758 }, { "epoch": 4.817792723937634, "grad_norm": 1.2255678176879883, "learning_rate": 2.6645153072056386e-05, "loss": 0.2605, "step": 15759 }, { "epoch": 4.818098440843778, "grad_norm": 0.5466446280479431, "learning_rate": 2.664472846163645e-05, "loss": 0.1314, "step": 15760 }, { "epoch": 4.8184041577499235, "grad_norm": 0.19584687054157257, "learning_rate": 2.6644303851216507e-05, "loss": 0.0863, "step": 15761 }, { "epoch": 4.818709874656069, "grad_norm": 0.24220392107963562, "learning_rate": 2.664387924079657e-05, "loss": 0.0611, "step": 15762 }, { "epoch": 4.819015591562214, "grad_norm": 0.23055948317050934, "learning_rate": 2.6643454630376628e-05, "loss": 0.0731, "step": 15763 }, { "epoch": 4.819321308468358, "grad_norm": 0.17270275950431824, "learning_rate": 2.664303001995669e-05, "loss": 0.049, "step": 15764 }, { "epoch": 4.819627025374503, "grad_norm": 0.41637346148490906, "learning_rate": 2.664260540953675e-05, "loss": 0.0639, "step": 15765 }, { "epoch": 4.819932742280648, "grad_norm": 0.1561410129070282, "learning_rate": 2.664218079911681e-05, "loss": 0.0527, "step": 15766 }, { "epoch": 4.820238459186793, "grad_norm": 0.18265902996063232, "learning_rate": 2.664175618869687e-05, "loss": 0.0429, "step": 15767 }, { "epoch": 4.8205441760929375, "grad_norm": 0.3275531828403473, "learning_rate": 2.664133157827693e-05, "loss": 0.0658, "step": 15768 }, { "epoch": 4.820849892999083, "grad_norm": 0.27334657311439514, "learning_rate": 2.664090696785699e-05, "loss": 0.0637, "step": 15769 }, { "epoch": 4.821155609905228, "grad_norm": 0.40117427706718445, "learning_rate": 2.6640482357437052e-05, "loss": 0.105, "step": 15770 }, { "epoch": 4.821461326811373, "grad_norm": 0.2736455798149109, "learning_rate": 2.664005774701711e-05, "loss": 0.0975, "step": 15771 }, { "epoch": 4.821767043717518, "grad_norm": 0.5808291435241699, "learning_rate": 2.663963313659717e-05, "loss": 0.0938, "step": 15772 }, { "epoch": 4.822072760623662, "grad_norm": 0.5836955904960632, "learning_rate": 2.6639208526177232e-05, "loss": 0.0908, "step": 15773 }, { "epoch": 4.822378477529807, "grad_norm": 0.433731347322464, "learning_rate": 2.663878391575729e-05, "loss": 0.1685, "step": 15774 }, { "epoch": 4.822684194435952, "grad_norm": 1.5629578828811646, "learning_rate": 2.6638359305337353e-05, "loss": 0.1187, "step": 15775 }, { "epoch": 4.8229899113420975, "grad_norm": 0.7119711637496948, "learning_rate": 2.663793469491741e-05, "loss": 0.1461, "step": 15776 }, { "epoch": 4.823295628248242, "grad_norm": 0.5904694199562073, "learning_rate": 2.6637510084497474e-05, "loss": 0.1417, "step": 15777 }, { "epoch": 4.823601345154387, "grad_norm": 1.0829190015792847, "learning_rate": 2.6637085474077532e-05, "loss": 0.1808, "step": 15778 }, { "epoch": 4.823907062060532, "grad_norm": 0.867216169834137, "learning_rate": 2.6636660863657594e-05, "loss": 0.181, "step": 15779 }, { "epoch": 4.824212778966677, "grad_norm": 1.9116909503936768, "learning_rate": 2.6636236253237653e-05, "loss": 0.2046, "step": 15780 }, { "epoch": 4.824518495872821, "grad_norm": 0.7722105979919434, "learning_rate": 2.6635811642817715e-05, "loss": 0.1793, "step": 15781 }, { "epoch": 4.8248242127789664, "grad_norm": 1.5438627004623413, "learning_rate": 2.6635387032397774e-05, "loss": 0.1861, "step": 15782 }, { "epoch": 4.825129929685112, "grad_norm": 0.9481801986694336, "learning_rate": 2.6634962421977836e-05, "loss": 0.2012, "step": 15783 }, { "epoch": 4.825435646591257, "grad_norm": 2.7041356563568115, "learning_rate": 2.6634537811557898e-05, "loss": 0.2514, "step": 15784 }, { "epoch": 4.825741363497402, "grad_norm": 0.29591163992881775, "learning_rate": 2.6634113201137957e-05, "loss": 0.1506, "step": 15785 }, { "epoch": 4.826047080403546, "grad_norm": 0.4726213216781616, "learning_rate": 2.663368859071802e-05, "loss": 0.0751, "step": 15786 }, { "epoch": 4.826352797309691, "grad_norm": 0.204535573720932, "learning_rate": 2.6633263980298077e-05, "loss": 0.0618, "step": 15787 }, { "epoch": 4.826658514215836, "grad_norm": 0.6023572683334351, "learning_rate": 2.663283936987814e-05, "loss": 0.0813, "step": 15788 }, { "epoch": 4.826964231121981, "grad_norm": 0.3038691580295563, "learning_rate": 2.6632414759458198e-05, "loss": 0.0541, "step": 15789 }, { "epoch": 4.827269948028126, "grad_norm": 0.4809274971485138, "learning_rate": 2.663199014903826e-05, "loss": 0.064, "step": 15790 }, { "epoch": 4.827575664934271, "grad_norm": 0.4091047942638397, "learning_rate": 2.663156553861832e-05, "loss": 0.0574, "step": 15791 }, { "epoch": 4.827881381840416, "grad_norm": 0.2030009627342224, "learning_rate": 2.663114092819838e-05, "loss": 0.0514, "step": 15792 }, { "epoch": 4.828187098746561, "grad_norm": 0.28209567070007324, "learning_rate": 2.663071631777844e-05, "loss": 0.0783, "step": 15793 }, { "epoch": 4.828492815652705, "grad_norm": 0.3193736672401428, "learning_rate": 2.6630291707358502e-05, "loss": 0.0575, "step": 15794 }, { "epoch": 4.82879853255885, "grad_norm": 0.542600691318512, "learning_rate": 2.662986709693856e-05, "loss": 0.0746, "step": 15795 }, { "epoch": 4.829104249464995, "grad_norm": 0.3376907706260681, "learning_rate": 2.6629442486518623e-05, "loss": 0.0774, "step": 15796 }, { "epoch": 4.8294099663711405, "grad_norm": 0.5592567324638367, "learning_rate": 2.662901787609868e-05, "loss": 0.1195, "step": 15797 }, { "epoch": 4.829715683277286, "grad_norm": 0.565908670425415, "learning_rate": 2.662859326567874e-05, "loss": 0.1057, "step": 15798 }, { "epoch": 4.83002140018343, "grad_norm": 0.7756862044334412, "learning_rate": 2.6628168655258802e-05, "loss": 0.1448, "step": 15799 }, { "epoch": 4.830327117089575, "grad_norm": 0.4286160469055176, "learning_rate": 2.662774404483886e-05, "loss": 0.1331, "step": 15800 }, { "epoch": 4.83063283399572, "grad_norm": 0.8199710845947266, "learning_rate": 2.6627319434418923e-05, "loss": 0.1456, "step": 15801 }, { "epoch": 4.830938550901865, "grad_norm": 0.5051546096801758, "learning_rate": 2.6626894823998982e-05, "loss": 0.1374, "step": 15802 }, { "epoch": 4.831244267808009, "grad_norm": 0.7567780017852783, "learning_rate": 2.6626470213579044e-05, "loss": 0.1449, "step": 15803 }, { "epoch": 4.8315499847141545, "grad_norm": 0.9036616086959839, "learning_rate": 2.6626045603159102e-05, "loss": 0.2169, "step": 15804 }, { "epoch": 4.8318557016203, "grad_norm": 1.12540602684021, "learning_rate": 2.6625620992739165e-05, "loss": 0.1771, "step": 15805 }, { "epoch": 4.832161418526445, "grad_norm": 1.753917932510376, "learning_rate": 2.6625196382319223e-05, "loss": 0.1888, "step": 15806 }, { "epoch": 4.832467135432589, "grad_norm": 2.5195324420928955, "learning_rate": 2.6624771771899285e-05, "loss": 0.1653, "step": 15807 }, { "epoch": 4.832772852338734, "grad_norm": 1.1539461612701416, "learning_rate": 2.6624347161479344e-05, "loss": 0.2228, "step": 15808 }, { "epoch": 4.833078569244879, "grad_norm": 2.046140432357788, "learning_rate": 2.6623922551059406e-05, "loss": 0.2299, "step": 15809 }, { "epoch": 4.833384286151024, "grad_norm": 0.3307948708534241, "learning_rate": 2.6623497940639465e-05, "loss": 0.1471, "step": 15810 }, { "epoch": 4.8336900030571694, "grad_norm": 0.1889675110578537, "learning_rate": 2.6623073330219524e-05, "loss": 0.0687, "step": 15811 }, { "epoch": 4.833995719963314, "grad_norm": 0.23347435891628265, "learning_rate": 2.6622648719799586e-05, "loss": 0.0598, "step": 15812 }, { "epoch": 4.834301436869459, "grad_norm": 0.2024346888065338, "learning_rate": 2.6622224109379644e-05, "loss": 0.0684, "step": 15813 }, { "epoch": 4.834607153775604, "grad_norm": 0.23489680886268616, "learning_rate": 2.6621799498959706e-05, "loss": 0.05, "step": 15814 }, { "epoch": 4.834912870681749, "grad_norm": 0.24270108342170715, "learning_rate": 2.6621374888539765e-05, "loss": 0.048, "step": 15815 }, { "epoch": 4.835218587587893, "grad_norm": 0.22955083847045898, "learning_rate": 2.6620950278119827e-05, "loss": 0.0541, "step": 15816 }, { "epoch": 4.835524304494038, "grad_norm": 0.3753277063369751, "learning_rate": 2.6620525667699886e-05, "loss": 0.0744, "step": 15817 }, { "epoch": 4.8358300214001835, "grad_norm": 0.24463260173797607, "learning_rate": 2.6620101057279948e-05, "loss": 0.0533, "step": 15818 }, { "epoch": 4.836135738306329, "grad_norm": 0.2125495821237564, "learning_rate": 2.6619676446860007e-05, "loss": 0.0737, "step": 15819 }, { "epoch": 4.836441455212473, "grad_norm": 0.43351665139198303, "learning_rate": 2.661925183644007e-05, "loss": 0.1297, "step": 15820 }, { "epoch": 4.836747172118618, "grad_norm": 1.634965181350708, "learning_rate": 2.6618827226020127e-05, "loss": 0.1117, "step": 15821 }, { "epoch": 4.837052889024763, "grad_norm": 0.35520100593566895, "learning_rate": 2.6618402615600186e-05, "loss": 0.0901, "step": 15822 }, { "epoch": 4.837358605930908, "grad_norm": 0.4502006769180298, "learning_rate": 2.6617978005180248e-05, "loss": 0.1385, "step": 15823 }, { "epoch": 4.837664322837053, "grad_norm": 0.3962273895740509, "learning_rate": 2.6617553394760307e-05, "loss": 0.1274, "step": 15824 }, { "epoch": 4.8379700397431975, "grad_norm": 0.44470837712287903, "learning_rate": 2.661712878434037e-05, "loss": 0.1645, "step": 15825 }, { "epoch": 4.838275756649343, "grad_norm": 0.9815933704376221, "learning_rate": 2.6616704173920428e-05, "loss": 0.1458, "step": 15826 }, { "epoch": 4.838581473555488, "grad_norm": 0.4462904930114746, "learning_rate": 2.661627956350049e-05, "loss": 0.1358, "step": 15827 }, { "epoch": 4.838887190461633, "grad_norm": 0.4917019009590149, "learning_rate": 2.661585495308055e-05, "loss": 0.1566, "step": 15828 }, { "epoch": 4.839192907367777, "grad_norm": 0.55172199010849, "learning_rate": 2.661543034266061e-05, "loss": 0.1711, "step": 15829 }, { "epoch": 4.839498624273922, "grad_norm": 1.1347404718399048, "learning_rate": 2.661500573224067e-05, "loss": 0.1974, "step": 15830 }, { "epoch": 4.839804341180067, "grad_norm": 0.958901584148407, "learning_rate": 2.661458112182073e-05, "loss": 0.196, "step": 15831 }, { "epoch": 4.840110058086212, "grad_norm": 1.623146653175354, "learning_rate": 2.661415651140079e-05, "loss": 0.1738, "step": 15832 }, { "epoch": 4.840415774992357, "grad_norm": 4.829529285430908, "learning_rate": 2.6613731900980852e-05, "loss": 0.1902, "step": 15833 }, { "epoch": 4.840721491898502, "grad_norm": 0.7619117498397827, "learning_rate": 2.661330729056091e-05, "loss": 0.2204, "step": 15834 }, { "epoch": 4.841027208804647, "grad_norm": 0.5584035515785217, "learning_rate": 2.661288268014097e-05, "loss": 0.1746, "step": 15835 }, { "epoch": 4.841332925710792, "grad_norm": 0.26595115661621094, "learning_rate": 2.6612458069721032e-05, "loss": 0.0916, "step": 15836 }, { "epoch": 4.841638642616937, "grad_norm": 0.5035415887832642, "learning_rate": 2.661203345930109e-05, "loss": 0.0953, "step": 15837 }, { "epoch": 4.841944359523081, "grad_norm": 0.2502012252807617, "learning_rate": 2.6611608848881152e-05, "loss": 0.0525, "step": 15838 }, { "epoch": 4.8422500764292264, "grad_norm": 0.16931454837322235, "learning_rate": 2.661118423846121e-05, "loss": 0.0591, "step": 15839 }, { "epoch": 4.842555793335372, "grad_norm": 0.5793378949165344, "learning_rate": 2.6610759628041273e-05, "loss": 0.0859, "step": 15840 }, { "epoch": 4.842861510241517, "grad_norm": 0.5243752598762512, "learning_rate": 2.6610335017621332e-05, "loss": 0.0596, "step": 15841 }, { "epoch": 4.843167227147661, "grad_norm": 0.21455161273479462, "learning_rate": 2.6609910407201394e-05, "loss": 0.0783, "step": 15842 }, { "epoch": 4.843472944053806, "grad_norm": 0.4389013350009918, "learning_rate": 2.6609485796781453e-05, "loss": 0.0888, "step": 15843 }, { "epoch": 4.843778660959951, "grad_norm": 0.3322286307811737, "learning_rate": 2.6609061186361515e-05, "loss": 0.0539, "step": 15844 }, { "epoch": 4.844084377866096, "grad_norm": 0.23339693248271942, "learning_rate": 2.6608636575941574e-05, "loss": 0.1233, "step": 15845 }, { "epoch": 4.8443900947722405, "grad_norm": 0.3977592885494232, "learning_rate": 2.6608211965521636e-05, "loss": 0.0896, "step": 15846 }, { "epoch": 4.844695811678386, "grad_norm": 0.9959298372268677, "learning_rate": 2.6607787355101694e-05, "loss": 0.1012, "step": 15847 }, { "epoch": 4.845001528584531, "grad_norm": 0.6548387408256531, "learning_rate": 2.6607362744681753e-05, "loss": 0.1368, "step": 15848 }, { "epoch": 4.845307245490676, "grad_norm": 2.4059996604919434, "learning_rate": 2.6606938134261815e-05, "loss": 0.1437, "step": 15849 }, { "epoch": 4.845612962396821, "grad_norm": 0.6465174555778503, "learning_rate": 2.6606513523841874e-05, "loss": 0.1663, "step": 15850 }, { "epoch": 4.845918679302965, "grad_norm": 0.4249199330806732, "learning_rate": 2.6606088913421936e-05, "loss": 0.1919, "step": 15851 }, { "epoch": 4.84622439620911, "grad_norm": 3.1344316005706787, "learning_rate": 2.6605664303001995e-05, "loss": 0.1601, "step": 15852 }, { "epoch": 4.846530113115255, "grad_norm": 0.8127695322036743, "learning_rate": 2.6605239692582057e-05, "loss": 0.1524, "step": 15853 }, { "epoch": 4.8468358300214005, "grad_norm": 0.5423147678375244, "learning_rate": 2.6604815082162115e-05, "loss": 0.1694, "step": 15854 }, { "epoch": 4.847141546927545, "grad_norm": 0.6240178346633911, "learning_rate": 2.6604390471742177e-05, "loss": 0.1752, "step": 15855 }, { "epoch": 4.84744726383369, "grad_norm": 0.5441421866416931, "learning_rate": 2.6603965861322236e-05, "loss": 0.1414, "step": 15856 }, { "epoch": 4.847752980739835, "grad_norm": 0.8501322865486145, "learning_rate": 2.6603541250902298e-05, "loss": 0.1842, "step": 15857 }, { "epoch": 4.84805869764598, "grad_norm": 1.6940313577651978, "learning_rate": 2.6603116640482357e-05, "loss": 0.1999, "step": 15858 }, { "epoch": 4.848364414552124, "grad_norm": 1.437025547027588, "learning_rate": 2.660269203006242e-05, "loss": 0.243, "step": 15859 }, { "epoch": 4.848670131458269, "grad_norm": 0.3204152286052704, "learning_rate": 2.6602267419642478e-05, "loss": 0.1282, "step": 15860 }, { "epoch": 4.8489758483644145, "grad_norm": 0.3235352635383606, "learning_rate": 2.6601842809222536e-05, "loss": 0.0982, "step": 15861 }, { "epoch": 4.84928156527056, "grad_norm": 0.19379356503486633, "learning_rate": 2.66014181988026e-05, "loss": 0.0742, "step": 15862 }, { "epoch": 4.849587282176705, "grad_norm": 0.6489826440811157, "learning_rate": 2.6600993588382657e-05, "loss": 0.0644, "step": 15863 }, { "epoch": 4.849892999082849, "grad_norm": 0.18576966226100922, "learning_rate": 2.660056897796272e-05, "loss": 0.0486, "step": 15864 }, { "epoch": 4.850198715988994, "grad_norm": 0.32578256726264954, "learning_rate": 2.6600144367542778e-05, "loss": 0.0527, "step": 15865 }, { "epoch": 4.850504432895139, "grad_norm": 0.25108247995376587, "learning_rate": 2.659971975712284e-05, "loss": 0.0586, "step": 15866 }, { "epoch": 4.850810149801284, "grad_norm": 0.27227839827537537, "learning_rate": 2.65992951467029e-05, "loss": 0.0696, "step": 15867 }, { "epoch": 4.851115866707429, "grad_norm": 0.3875209391117096, "learning_rate": 2.659887053628296e-05, "loss": 0.064, "step": 15868 }, { "epoch": 4.851421583613574, "grad_norm": 0.3232678472995758, "learning_rate": 2.659844592586302e-05, "loss": 0.0845, "step": 15869 }, { "epoch": 4.851727300519719, "grad_norm": 0.39022740721702576, "learning_rate": 2.6598021315443082e-05, "loss": 0.07, "step": 15870 }, { "epoch": 4.852033017425864, "grad_norm": 0.7930917739868164, "learning_rate": 2.659759670502314e-05, "loss": 0.0691, "step": 15871 }, { "epoch": 4.852338734332008, "grad_norm": 0.5103524923324585, "learning_rate": 2.6597172094603203e-05, "loss": 0.0819, "step": 15872 }, { "epoch": 4.852644451238153, "grad_norm": 0.37459972500801086, "learning_rate": 2.659674748418326e-05, "loss": 0.1058, "step": 15873 }, { "epoch": 4.852950168144298, "grad_norm": 0.4057163596153259, "learning_rate": 2.659632287376332e-05, "loss": 0.1071, "step": 15874 }, { "epoch": 4.8532558850504435, "grad_norm": 0.36398231983184814, "learning_rate": 2.6595898263343382e-05, "loss": 0.1168, "step": 15875 }, { "epoch": 4.853561601956589, "grad_norm": 0.581800639629364, "learning_rate": 2.659547365292344e-05, "loss": 0.1482, "step": 15876 }, { "epoch": 4.853867318862733, "grad_norm": 0.488392174243927, "learning_rate": 2.6595049042503503e-05, "loss": 0.1776, "step": 15877 }, { "epoch": 4.854173035768878, "grad_norm": 1.228584885597229, "learning_rate": 2.659462443208356e-05, "loss": 0.1647, "step": 15878 }, { "epoch": 4.854478752675023, "grad_norm": 1.143633246421814, "learning_rate": 2.6594199821663624e-05, "loss": 0.1922, "step": 15879 }, { "epoch": 4.854784469581168, "grad_norm": 1.1147656440734863, "learning_rate": 2.6593775211243682e-05, "loss": 0.201, "step": 15880 }, { "epoch": 4.855090186487312, "grad_norm": 0.9125921726226807, "learning_rate": 2.6593350600823744e-05, "loss": 0.2225, "step": 15881 }, { "epoch": 4.8553959033934575, "grad_norm": 0.9918628334999084, "learning_rate": 2.6592925990403803e-05, "loss": 0.1857, "step": 15882 }, { "epoch": 4.855701620299603, "grad_norm": 2.495972156524658, "learning_rate": 2.6592501379983865e-05, "loss": 0.1818, "step": 15883 }, { "epoch": 4.856007337205748, "grad_norm": 2.0985138416290283, "learning_rate": 2.6592076769563924e-05, "loss": 0.2075, "step": 15884 }, { "epoch": 4.856313054111892, "grad_norm": 0.279201865196228, "learning_rate": 2.6591652159143986e-05, "loss": 0.1619, "step": 15885 }, { "epoch": 4.856618771018037, "grad_norm": 0.23677530884742737, "learning_rate": 2.6591227548724048e-05, "loss": 0.0843, "step": 15886 }, { "epoch": 4.856924487924182, "grad_norm": 0.3359810411930084, "learning_rate": 2.6590802938304107e-05, "loss": 0.0772, "step": 15887 }, { "epoch": 4.857230204830327, "grad_norm": 0.20665787160396576, "learning_rate": 2.659037832788417e-05, "loss": 0.0463, "step": 15888 }, { "epoch": 4.857535921736472, "grad_norm": 0.3864682912826538, "learning_rate": 2.6589953717464228e-05, "loss": 0.0595, "step": 15889 }, { "epoch": 4.857841638642617, "grad_norm": 0.23731385171413422, "learning_rate": 2.658952910704429e-05, "loss": 0.0532, "step": 15890 }, { "epoch": 4.858147355548762, "grad_norm": 0.21371111273765564, "learning_rate": 2.6589104496624348e-05, "loss": 0.0369, "step": 15891 }, { "epoch": 4.858453072454907, "grad_norm": 0.44977501034736633, "learning_rate": 2.658867988620441e-05, "loss": 0.082, "step": 15892 }, { "epoch": 4.858758789361052, "grad_norm": 0.7213362455368042, "learning_rate": 2.658825527578447e-05, "loss": 0.0576, "step": 15893 }, { "epoch": 4.859064506267196, "grad_norm": 0.24743175506591797, "learning_rate": 2.658783066536453e-05, "loss": 0.0625, "step": 15894 }, { "epoch": 4.859370223173341, "grad_norm": 0.4178747832775116, "learning_rate": 2.658740605494459e-05, "loss": 0.1042, "step": 15895 }, { "epoch": 4.8596759400794864, "grad_norm": 0.5316694378852844, "learning_rate": 2.6586981444524652e-05, "loss": 0.0698, "step": 15896 }, { "epoch": 4.859981656985632, "grad_norm": 0.26702749729156494, "learning_rate": 2.658655683410471e-05, "loss": 0.0767, "step": 15897 }, { "epoch": 4.860287373891776, "grad_norm": 1.1349657773971558, "learning_rate": 2.6586132223684773e-05, "loss": 0.1691, "step": 15898 }, { "epoch": 4.860593090797921, "grad_norm": 0.46722257137298584, "learning_rate": 2.658570761326483e-05, "loss": 0.0954, "step": 15899 }, { "epoch": 4.860898807704066, "grad_norm": 0.3748873174190521, "learning_rate": 2.658528300284489e-05, "loss": 0.1442, "step": 15900 }, { "epoch": 4.861204524610211, "grad_norm": 0.6062159538269043, "learning_rate": 2.6584858392424952e-05, "loss": 0.1824, "step": 15901 }, { "epoch": 4.861510241516356, "grad_norm": 0.7128743529319763, "learning_rate": 2.658443378200501e-05, "loss": 0.1457, "step": 15902 }, { "epoch": 4.8618159584225005, "grad_norm": 0.49684494733810425, "learning_rate": 2.6584009171585073e-05, "loss": 0.169, "step": 15903 }, { "epoch": 4.862121675328646, "grad_norm": 0.6522027254104614, "learning_rate": 2.6583584561165132e-05, "loss": 0.1638, "step": 15904 }, { "epoch": 4.862427392234791, "grad_norm": 0.9215797781944275, "learning_rate": 2.6583159950745194e-05, "loss": 0.1613, "step": 15905 }, { "epoch": 4.862733109140936, "grad_norm": 0.7951763868331909, "learning_rate": 2.6582735340325253e-05, "loss": 0.2236, "step": 15906 }, { "epoch": 4.86303882604708, "grad_norm": 0.8084176182746887, "learning_rate": 2.6582310729905315e-05, "loss": 0.2183, "step": 15907 }, { "epoch": 4.863344542953225, "grad_norm": 1.7800360918045044, "learning_rate": 2.6581886119485373e-05, "loss": 0.1569, "step": 15908 }, { "epoch": 4.86365025985937, "grad_norm": 2.4765937328338623, "learning_rate": 2.6581461509065435e-05, "loss": 0.2207, "step": 15909 }, { "epoch": 4.863955976765515, "grad_norm": 0.6977350115776062, "learning_rate": 2.6581036898645494e-05, "loss": 0.1661, "step": 15910 }, { "epoch": 4.86426169367166, "grad_norm": 0.19283360242843628, "learning_rate": 2.6580612288225556e-05, "loss": 0.073, "step": 15911 }, { "epoch": 4.864567410577805, "grad_norm": 0.2130376100540161, "learning_rate": 2.6580187677805615e-05, "loss": 0.0651, "step": 15912 }, { "epoch": 4.86487312748395, "grad_norm": 0.2311042696237564, "learning_rate": 2.6579763067385674e-05, "loss": 0.0549, "step": 15913 }, { "epoch": 4.865178844390095, "grad_norm": 0.24147599935531616, "learning_rate": 2.6579338456965736e-05, "loss": 0.0668, "step": 15914 }, { "epoch": 4.86548456129624, "grad_norm": 0.33184462785720825, "learning_rate": 2.6578913846545794e-05, "loss": 0.0831, "step": 15915 }, { "epoch": 4.865790278202384, "grad_norm": 0.21544794738292694, "learning_rate": 2.6578489236125856e-05, "loss": 0.0467, "step": 15916 }, { "epoch": 4.866095995108529, "grad_norm": 0.2834005355834961, "learning_rate": 2.6578064625705915e-05, "loss": 0.047, "step": 15917 }, { "epoch": 4.8664017120146745, "grad_norm": 0.23931053280830383, "learning_rate": 2.6577640015285977e-05, "loss": 0.0565, "step": 15918 }, { "epoch": 4.86670742892082, "grad_norm": 0.17997236549854279, "learning_rate": 2.6577215404866036e-05, "loss": 0.0604, "step": 15919 }, { "epoch": 4.867013145826964, "grad_norm": 0.36292946338653564, "learning_rate": 2.6576790794446098e-05, "loss": 0.1081, "step": 15920 }, { "epoch": 4.867318862733109, "grad_norm": 0.38767609000205994, "learning_rate": 2.6576366184026157e-05, "loss": 0.0782, "step": 15921 }, { "epoch": 4.867624579639254, "grad_norm": 0.3593500852584839, "learning_rate": 2.657594157360622e-05, "loss": 0.0784, "step": 15922 }, { "epoch": 4.867930296545399, "grad_norm": 0.5006045699119568, "learning_rate": 2.6575516963186278e-05, "loss": 0.1606, "step": 15923 }, { "epoch": 4.868236013451543, "grad_norm": 1.0938889980316162, "learning_rate": 2.657509235276634e-05, "loss": 0.1169, "step": 15924 }, { "epoch": 4.868541730357689, "grad_norm": 0.6373082995414734, "learning_rate": 2.6574667742346398e-05, "loss": 0.1453, "step": 15925 }, { "epoch": 4.868847447263834, "grad_norm": 0.4715053141117096, "learning_rate": 2.6574243131926457e-05, "loss": 0.1512, "step": 15926 }, { "epoch": 4.869153164169979, "grad_norm": 0.7867876291275024, "learning_rate": 2.657381852150652e-05, "loss": 0.1738, "step": 15927 }, { "epoch": 4.869458881076124, "grad_norm": 1.0895073413848877, "learning_rate": 2.6573393911086578e-05, "loss": 0.1733, "step": 15928 }, { "epoch": 4.869764597982268, "grad_norm": 1.280938744544983, "learning_rate": 2.657296930066664e-05, "loss": 0.173, "step": 15929 }, { "epoch": 4.870070314888413, "grad_norm": 0.6439650058746338, "learning_rate": 2.65725446902467e-05, "loss": 0.1941, "step": 15930 }, { "epoch": 4.870376031794558, "grad_norm": 0.5923189520835876, "learning_rate": 2.657212007982676e-05, "loss": 0.1994, "step": 15931 }, { "epoch": 4.8706817487007035, "grad_norm": 1.0379735231399536, "learning_rate": 2.657169546940682e-05, "loss": 0.1977, "step": 15932 }, { "epoch": 4.870987465606848, "grad_norm": 0.7177295088768005, "learning_rate": 2.657127085898688e-05, "loss": 0.1765, "step": 15933 }, { "epoch": 4.871293182512993, "grad_norm": 2.5255544185638428, "learning_rate": 2.657084624856694e-05, "loss": 0.2761, "step": 15934 }, { "epoch": 4.871598899419138, "grad_norm": 0.3669639229774475, "learning_rate": 2.6570421638147002e-05, "loss": 0.132, "step": 15935 }, { "epoch": 4.871904616325283, "grad_norm": 0.29439032077789307, "learning_rate": 2.656999702772706e-05, "loss": 0.0982, "step": 15936 }, { "epoch": 4.872210333231427, "grad_norm": 0.2885611951351166, "learning_rate": 2.6569572417307123e-05, "loss": 0.0572, "step": 15937 }, { "epoch": 4.872516050137572, "grad_norm": 0.38596290349960327, "learning_rate": 2.6569147806887182e-05, "loss": 0.0959, "step": 15938 }, { "epoch": 4.8728217670437175, "grad_norm": 0.19018425047397614, "learning_rate": 2.656872319646724e-05, "loss": 0.0428, "step": 15939 }, { "epoch": 4.873127483949863, "grad_norm": 0.2627294063568115, "learning_rate": 2.6568298586047303e-05, "loss": 0.0427, "step": 15940 }, { "epoch": 4.873433200856008, "grad_norm": 0.19431759417057037, "learning_rate": 2.656787397562736e-05, "loss": 0.0559, "step": 15941 }, { "epoch": 4.873738917762152, "grad_norm": 0.3714583218097687, "learning_rate": 2.6567449365207423e-05, "loss": 0.0539, "step": 15942 }, { "epoch": 4.874044634668297, "grad_norm": 0.3366328179836273, "learning_rate": 2.6567024754787482e-05, "loss": 0.0717, "step": 15943 }, { "epoch": 4.874350351574442, "grad_norm": 0.36678048968315125, "learning_rate": 2.6566600144367544e-05, "loss": 0.0693, "step": 15944 }, { "epoch": 4.874656068480587, "grad_norm": 0.33923766016960144, "learning_rate": 2.6566175533947603e-05, "loss": 0.0957, "step": 15945 }, { "epoch": 4.8749617853867315, "grad_norm": 0.28224796056747437, "learning_rate": 2.6565750923527665e-05, "loss": 0.0811, "step": 15946 }, { "epoch": 4.875267502292877, "grad_norm": 0.3905019760131836, "learning_rate": 2.6565326313107724e-05, "loss": 0.0921, "step": 15947 }, { "epoch": 4.875573219199022, "grad_norm": 0.5047869086265564, "learning_rate": 2.6564901702687786e-05, "loss": 0.0961, "step": 15948 }, { "epoch": 4.875878936105167, "grad_norm": 0.3660680055618286, "learning_rate": 2.6564477092267844e-05, "loss": 0.1011, "step": 15949 }, { "epoch": 4.876184653011311, "grad_norm": 0.8059324026107788, "learning_rate": 2.6564052481847903e-05, "loss": 0.1386, "step": 15950 }, { "epoch": 4.876490369917456, "grad_norm": 0.6230055689811707, "learning_rate": 2.6563627871427965e-05, "loss": 0.1665, "step": 15951 }, { "epoch": 4.876796086823601, "grad_norm": 0.4991663098335266, "learning_rate": 2.6563203261008024e-05, "loss": 0.1654, "step": 15952 }, { "epoch": 4.877101803729746, "grad_norm": 0.3988023102283478, "learning_rate": 2.6562778650588086e-05, "loss": 0.156, "step": 15953 }, { "epoch": 4.877407520635892, "grad_norm": 0.9014497995376587, "learning_rate": 2.6562354040168145e-05, "loss": 0.1969, "step": 15954 }, { "epoch": 4.877713237542036, "grad_norm": 0.7094293832778931, "learning_rate": 2.6561929429748207e-05, "loss": 0.1446, "step": 15955 }, { "epoch": 4.878018954448181, "grad_norm": 0.6405656337738037, "learning_rate": 2.6561504819328265e-05, "loss": 0.1538, "step": 15956 }, { "epoch": 4.878324671354326, "grad_norm": 1.5922199487686157, "learning_rate": 2.6561080208908328e-05, "loss": 0.2022, "step": 15957 }, { "epoch": 4.878630388260471, "grad_norm": 1.903275728225708, "learning_rate": 2.6560655598488386e-05, "loss": 0.1906, "step": 15958 }, { "epoch": 4.878936105166615, "grad_norm": 2.236989974975586, "learning_rate": 2.656023098806845e-05, "loss": 0.2222, "step": 15959 }, { "epoch": 4.8792418220727605, "grad_norm": 0.3971293568611145, "learning_rate": 2.6559806377648507e-05, "loss": 0.1357, "step": 15960 }, { "epoch": 4.879547538978906, "grad_norm": 0.5977848768234253, "learning_rate": 2.655938176722857e-05, "loss": 0.0622, "step": 15961 }, { "epoch": 4.879853255885051, "grad_norm": 0.4003983438014984, "learning_rate": 2.6558957156808628e-05, "loss": 0.0714, "step": 15962 }, { "epoch": 4.880158972791195, "grad_norm": 0.8494381308555603, "learning_rate": 2.6558532546388687e-05, "loss": 0.0644, "step": 15963 }, { "epoch": 4.88046468969734, "grad_norm": 0.20500536262989044, "learning_rate": 2.655810793596875e-05, "loss": 0.0499, "step": 15964 }, { "epoch": 4.880770406603485, "grad_norm": 0.6279383897781372, "learning_rate": 2.6557683325548807e-05, "loss": 0.061, "step": 15965 }, { "epoch": 4.88107612350963, "grad_norm": 0.26431792974472046, "learning_rate": 2.655725871512887e-05, "loss": 0.0448, "step": 15966 }, { "epoch": 4.881381840415775, "grad_norm": 0.25802937150001526, "learning_rate": 2.6556834104708928e-05, "loss": 0.0842, "step": 15967 }, { "epoch": 4.88168755732192, "grad_norm": 0.2347501516342163, "learning_rate": 2.655640949428899e-05, "loss": 0.0495, "step": 15968 }, { "epoch": 4.881993274228065, "grad_norm": 0.20796233415603638, "learning_rate": 2.655598488386905e-05, "loss": 0.0498, "step": 15969 }, { "epoch": 4.88229899113421, "grad_norm": 0.3157995641231537, "learning_rate": 2.655556027344911e-05, "loss": 0.0886, "step": 15970 }, { "epoch": 4.882604708040355, "grad_norm": 0.2890511751174927, "learning_rate": 2.655513566302917e-05, "loss": 0.0959, "step": 15971 }, { "epoch": 4.882910424946499, "grad_norm": 0.7896401286125183, "learning_rate": 2.6554711052609232e-05, "loss": 0.0966, "step": 15972 }, { "epoch": 4.883216141852644, "grad_norm": 0.41245314478874207, "learning_rate": 2.655428644218929e-05, "loss": 0.0955, "step": 15973 }, { "epoch": 4.883521858758789, "grad_norm": 0.34747743606567383, "learning_rate": 2.6553861831769353e-05, "loss": 0.1086, "step": 15974 }, { "epoch": 4.8838275756649345, "grad_norm": 0.5498822331428528, "learning_rate": 2.655343722134941e-05, "loss": 0.126, "step": 15975 }, { "epoch": 4.884133292571079, "grad_norm": 0.7886852025985718, "learning_rate": 2.655301261092947e-05, "loss": 0.1588, "step": 15976 }, { "epoch": 4.884439009477224, "grad_norm": 1.03469717502594, "learning_rate": 2.6552588000509532e-05, "loss": 0.1444, "step": 15977 }, { "epoch": 4.884744726383369, "grad_norm": 0.5886448621749878, "learning_rate": 2.655216339008959e-05, "loss": 0.1665, "step": 15978 }, { "epoch": 4.885050443289514, "grad_norm": 0.7311325073242188, "learning_rate": 2.6551738779669653e-05, "loss": 0.176, "step": 15979 }, { "epoch": 4.885356160195659, "grad_norm": 1.2947033643722534, "learning_rate": 2.655131416924971e-05, "loss": 0.1425, "step": 15980 }, { "epoch": 4.885661877101803, "grad_norm": 0.9980518817901611, "learning_rate": 2.6550889558829774e-05, "loss": 0.1889, "step": 15981 }, { "epoch": 4.8859675940079486, "grad_norm": 0.6144388318061829, "learning_rate": 2.6550464948409832e-05, "loss": 0.1842, "step": 15982 }, { "epoch": 4.886273310914094, "grad_norm": 0.8020000457763672, "learning_rate": 2.6550040337989894e-05, "loss": 0.2041, "step": 15983 }, { "epoch": 4.886579027820239, "grad_norm": 2.2071962356567383, "learning_rate": 2.6549615727569953e-05, "loss": 0.2553, "step": 15984 }, { "epoch": 4.886884744726383, "grad_norm": 0.60760498046875, "learning_rate": 2.6549191117150015e-05, "loss": 0.1314, "step": 15985 }, { "epoch": 4.887190461632528, "grad_norm": 0.21043743193149567, "learning_rate": 2.6548766506730074e-05, "loss": 0.0662, "step": 15986 }, { "epoch": 4.887496178538673, "grad_norm": 0.35362640023231506, "learning_rate": 2.6548341896310136e-05, "loss": 0.1123, "step": 15987 }, { "epoch": 4.887801895444818, "grad_norm": 0.28309738636016846, "learning_rate": 2.6547917285890198e-05, "loss": 0.0533, "step": 15988 }, { "epoch": 4.888107612350963, "grad_norm": 0.1543063223361969, "learning_rate": 2.6547492675470257e-05, "loss": 0.0566, "step": 15989 }, { "epoch": 4.888413329257108, "grad_norm": 0.24633049964904785, "learning_rate": 2.654706806505032e-05, "loss": 0.0567, "step": 15990 }, { "epoch": 4.888719046163253, "grad_norm": 0.6743683218955994, "learning_rate": 2.6546643454630378e-05, "loss": 0.0644, "step": 15991 }, { "epoch": 4.889024763069398, "grad_norm": 0.2825789749622345, "learning_rate": 2.654621884421044e-05, "loss": 0.054, "step": 15992 }, { "epoch": 4.889330479975543, "grad_norm": 0.263272762298584, "learning_rate": 2.65457942337905e-05, "loss": 0.0788, "step": 15993 }, { "epoch": 4.889636196881687, "grad_norm": 1.3071494102478027, "learning_rate": 2.654536962337056e-05, "loss": 0.0821, "step": 15994 }, { "epoch": 4.889941913787832, "grad_norm": 0.7215616703033447, "learning_rate": 2.654494501295062e-05, "loss": 0.0795, "step": 15995 }, { "epoch": 4.8902476306939775, "grad_norm": 0.40893790125846863, "learning_rate": 2.654452040253068e-05, "loss": 0.0792, "step": 15996 }, { "epoch": 4.890553347600123, "grad_norm": 0.30646660923957825, "learning_rate": 2.654409579211074e-05, "loss": 0.1271, "step": 15997 }, { "epoch": 4.890859064506267, "grad_norm": 0.8708686828613281, "learning_rate": 2.6543671181690802e-05, "loss": 0.1183, "step": 15998 }, { "epoch": 4.891164781412412, "grad_norm": 0.7050628066062927, "learning_rate": 2.654324657127086e-05, "loss": 0.1158, "step": 15999 }, { "epoch": 4.891470498318557, "grad_norm": 0.4121387004852295, "learning_rate": 2.6542821960850923e-05, "loss": 0.1545, "step": 16000 }, { "epoch": 4.891470498318557, "eval_cer": 0.18924094137671435, "eval_loss": 0.23109658062458038, "eval_runtime": 18.9408, "eval_samples_per_second": 239.589, "eval_steps_per_second": 0.792, "eval_wer": 0.3306611547163487, "step": 16000 }, { "epoch": 4.891776215224702, "grad_norm": 1.119240164756775, "learning_rate": 2.654239735043098e-05, "loss": 0.1372, "step": 16001 }, { "epoch": 4.892081932130846, "grad_norm": 1.4711869955062866, "learning_rate": 2.654197274001104e-05, "loss": 0.1696, "step": 16002 }, { "epoch": 4.8923876490369915, "grad_norm": 0.48931828141212463, "learning_rate": 2.6541548129591102e-05, "loss": 0.1383, "step": 16003 }, { "epoch": 4.892693365943137, "grad_norm": 1.748809814453125, "learning_rate": 2.654112351917116e-05, "loss": 0.1686, "step": 16004 }, { "epoch": 4.892999082849282, "grad_norm": 1.9891473054885864, "learning_rate": 2.6540698908751223e-05, "loss": 0.1954, "step": 16005 }, { "epoch": 4.893304799755427, "grad_norm": 1.5399080514907837, "learning_rate": 2.6540274298331282e-05, "loss": 0.1656, "step": 16006 }, { "epoch": 4.893610516661571, "grad_norm": 1.1881670951843262, "learning_rate": 2.6539849687911344e-05, "loss": 0.2038, "step": 16007 }, { "epoch": 4.893916233567716, "grad_norm": 1.1668936014175415, "learning_rate": 2.6539425077491403e-05, "loss": 0.2165, "step": 16008 }, { "epoch": 4.894221950473861, "grad_norm": 1.160502552986145, "learning_rate": 2.6539000467071465e-05, "loss": 0.2182, "step": 16009 }, { "epoch": 4.894527667380006, "grad_norm": 0.5566250085830688, "learning_rate": 2.6538575856651523e-05, "loss": 0.1505, "step": 16010 }, { "epoch": 4.894833384286151, "grad_norm": 0.4078027606010437, "learning_rate": 2.6538151246231585e-05, "loss": 0.0792, "step": 16011 }, { "epoch": 4.895139101192296, "grad_norm": 0.4372258484363556, "learning_rate": 2.6537726635811644e-05, "loss": 0.0664, "step": 16012 }, { "epoch": 4.895444818098441, "grad_norm": 0.21844764053821564, "learning_rate": 2.6537302025391706e-05, "loss": 0.0572, "step": 16013 }, { "epoch": 4.895750535004586, "grad_norm": 0.17754444479942322, "learning_rate": 2.6536877414971765e-05, "loss": 0.0534, "step": 16014 }, { "epoch": 4.89605625191073, "grad_norm": 0.18550032377243042, "learning_rate": 2.6536452804551824e-05, "loss": 0.0599, "step": 16015 }, { "epoch": 4.896361968816875, "grad_norm": 0.26180440187454224, "learning_rate": 2.6536028194131886e-05, "loss": 0.0762, "step": 16016 }, { "epoch": 4.8966676857230205, "grad_norm": 0.32112932205200195, "learning_rate": 2.6535603583711944e-05, "loss": 0.0904, "step": 16017 }, { "epoch": 4.896973402629166, "grad_norm": 0.37409600615501404, "learning_rate": 2.6535178973292006e-05, "loss": 0.0617, "step": 16018 }, { "epoch": 4.897279119535311, "grad_norm": 0.19478781521320343, "learning_rate": 2.6534754362872065e-05, "loss": 0.064, "step": 16019 }, { "epoch": 4.897584836441455, "grad_norm": 1.2265040874481201, "learning_rate": 2.6534329752452127e-05, "loss": 0.1093, "step": 16020 }, { "epoch": 4.8978905533476, "grad_norm": 0.3794666826725006, "learning_rate": 2.6533905142032186e-05, "loss": 0.0809, "step": 16021 }, { "epoch": 4.898196270253745, "grad_norm": 1.01667320728302, "learning_rate": 2.6533480531612248e-05, "loss": 0.0895, "step": 16022 }, { "epoch": 4.89850198715989, "grad_norm": 0.7847824692726135, "learning_rate": 2.6533055921192307e-05, "loss": 0.124, "step": 16023 }, { "epoch": 4.8988077040660345, "grad_norm": 0.3585977852344513, "learning_rate": 2.653263131077237e-05, "loss": 0.123, "step": 16024 }, { "epoch": 4.89911342097218, "grad_norm": 0.9051998257637024, "learning_rate": 2.6532206700352428e-05, "loss": 0.131, "step": 16025 }, { "epoch": 4.899419137878325, "grad_norm": 1.0993696451187134, "learning_rate": 2.653178208993249e-05, "loss": 0.1618, "step": 16026 }, { "epoch": 4.89972485478447, "grad_norm": 0.4076637923717499, "learning_rate": 2.653135747951255e-05, "loss": 0.1508, "step": 16027 }, { "epoch": 4.900030571690614, "grad_norm": 0.6077864170074463, "learning_rate": 2.6530932869092607e-05, "loss": 0.1568, "step": 16028 }, { "epoch": 4.900336288596759, "grad_norm": 0.7433117032051086, "learning_rate": 2.653050825867267e-05, "loss": 0.1944, "step": 16029 }, { "epoch": 4.900642005502904, "grad_norm": 0.5806362628936768, "learning_rate": 2.6530083648252728e-05, "loss": 0.1747, "step": 16030 }, { "epoch": 4.900947722409049, "grad_norm": 0.7732837796211243, "learning_rate": 2.652965903783279e-05, "loss": 0.1933, "step": 16031 }, { "epoch": 4.9012534393151945, "grad_norm": 0.7941733002662659, "learning_rate": 2.652923442741285e-05, "loss": 0.1969, "step": 16032 }, { "epoch": 4.901559156221339, "grad_norm": 0.6914300918579102, "learning_rate": 2.652880981699291e-05, "loss": 0.1975, "step": 16033 }, { "epoch": 4.901864873127484, "grad_norm": 1.3372652530670166, "learning_rate": 2.652838520657297e-05, "loss": 0.2215, "step": 16034 }, { "epoch": 4.902170590033629, "grad_norm": 0.36221009492874146, "learning_rate": 2.652796059615303e-05, "loss": 0.1424, "step": 16035 }, { "epoch": 4.902476306939774, "grad_norm": 0.3304927945137024, "learning_rate": 2.652753598573309e-05, "loss": 0.0885, "step": 16036 }, { "epoch": 4.902782023845918, "grad_norm": 0.24286635220050812, "learning_rate": 2.6527111375313152e-05, "loss": 0.0806, "step": 16037 }, { "epoch": 4.903087740752063, "grad_norm": 0.3232383728027344, "learning_rate": 2.652668676489321e-05, "loss": 0.0561, "step": 16038 }, { "epoch": 4.9033934576582086, "grad_norm": 0.40518003702163696, "learning_rate": 2.6526262154473273e-05, "loss": 0.0394, "step": 16039 }, { "epoch": 4.903699174564354, "grad_norm": 0.17011404037475586, "learning_rate": 2.6525837544053332e-05, "loss": 0.0489, "step": 16040 }, { "epoch": 4.904004891470498, "grad_norm": 0.28210869431495667, "learning_rate": 2.652541293363339e-05, "loss": 0.0751, "step": 16041 }, { "epoch": 4.904310608376643, "grad_norm": 0.17980434000492096, "learning_rate": 2.6524988323213453e-05, "loss": 0.0537, "step": 16042 }, { "epoch": 4.904616325282788, "grad_norm": 0.3138673007488251, "learning_rate": 2.652456371279351e-05, "loss": 0.0848, "step": 16043 }, { "epoch": 4.904922042188933, "grad_norm": 0.29795464873313904, "learning_rate": 2.6524139102373573e-05, "loss": 0.0862, "step": 16044 }, { "epoch": 4.905227759095078, "grad_norm": 0.48201683163642883, "learning_rate": 2.6523714491953632e-05, "loss": 0.0869, "step": 16045 }, { "epoch": 4.905533476001223, "grad_norm": 0.39987263083457947, "learning_rate": 2.6523289881533694e-05, "loss": 0.0968, "step": 16046 }, { "epoch": 4.905839192907368, "grad_norm": 0.28045737743377686, "learning_rate": 2.6522865271113753e-05, "loss": 0.1017, "step": 16047 }, { "epoch": 4.906144909813513, "grad_norm": 0.21248967945575714, "learning_rate": 2.6522440660693815e-05, "loss": 0.1006, "step": 16048 }, { "epoch": 4.906450626719658, "grad_norm": 0.6393201351165771, "learning_rate": 2.6522016050273874e-05, "loss": 0.141, "step": 16049 }, { "epoch": 4.906756343625802, "grad_norm": 0.8005924224853516, "learning_rate": 2.6521591439853936e-05, "loss": 0.1267, "step": 16050 }, { "epoch": 4.907062060531947, "grad_norm": 0.8773077726364136, "learning_rate": 2.6521166829433994e-05, "loss": 0.1728, "step": 16051 }, { "epoch": 4.907367777438092, "grad_norm": 0.6285409331321716, "learning_rate": 2.6520742219014056e-05, "loss": 0.1519, "step": 16052 }, { "epoch": 4.9076734943442375, "grad_norm": 0.5856149196624756, "learning_rate": 2.6520317608594115e-05, "loss": 0.1707, "step": 16053 }, { "epoch": 4.907979211250382, "grad_norm": 0.5249850153923035, "learning_rate": 2.6519892998174174e-05, "loss": 0.1773, "step": 16054 }, { "epoch": 4.908284928156527, "grad_norm": 0.575958251953125, "learning_rate": 2.6519468387754236e-05, "loss": 0.1867, "step": 16055 }, { "epoch": 4.908590645062672, "grad_norm": 0.5239731073379517, "learning_rate": 2.6519043777334295e-05, "loss": 0.1801, "step": 16056 }, { "epoch": 4.908896361968817, "grad_norm": 0.6911988258361816, "learning_rate": 2.6518619166914357e-05, "loss": 0.1789, "step": 16057 }, { "epoch": 4.909202078874962, "grad_norm": 3.1644673347473145, "learning_rate": 2.6518194556494415e-05, "loss": 0.1433, "step": 16058 }, { "epoch": 4.909507795781106, "grad_norm": 1.4791492223739624, "learning_rate": 2.6517769946074478e-05, "loss": 0.2828, "step": 16059 }, { "epoch": 4.9098135126872515, "grad_norm": 0.6344278454780579, "learning_rate": 2.6517345335654536e-05, "loss": 0.1891, "step": 16060 }, { "epoch": 4.910119229593397, "grad_norm": 0.33652743697166443, "learning_rate": 2.65169207252346e-05, "loss": 0.0804, "step": 16061 }, { "epoch": 4.910424946499542, "grad_norm": 0.4961891770362854, "learning_rate": 2.6516496114814657e-05, "loss": 0.0821, "step": 16062 }, { "epoch": 4.910730663405686, "grad_norm": 0.15934418141841888, "learning_rate": 2.651607150439472e-05, "loss": 0.0539, "step": 16063 }, { "epoch": 4.911036380311831, "grad_norm": 0.38524749875068665, "learning_rate": 2.6515646893974778e-05, "loss": 0.0528, "step": 16064 }, { "epoch": 4.911342097217976, "grad_norm": 1.1722164154052734, "learning_rate": 2.6515222283554837e-05, "loss": 0.0828, "step": 16065 }, { "epoch": 4.911647814124121, "grad_norm": 0.17058725655078888, "learning_rate": 2.65147976731349e-05, "loss": 0.0634, "step": 16066 }, { "epoch": 4.9119535310302656, "grad_norm": 0.23723281919956207, "learning_rate": 2.6514373062714957e-05, "loss": 0.0573, "step": 16067 }, { "epoch": 4.912259247936411, "grad_norm": 0.37426167726516724, "learning_rate": 2.651394845229502e-05, "loss": 0.111, "step": 16068 }, { "epoch": 4.912564964842556, "grad_norm": 0.8686860799789429, "learning_rate": 2.6513523841875078e-05, "loss": 0.0949, "step": 16069 }, { "epoch": 4.912870681748701, "grad_norm": 0.47424593567848206, "learning_rate": 2.651309923145514e-05, "loss": 0.0815, "step": 16070 }, { "epoch": 4.913176398654846, "grad_norm": 0.5699555277824402, "learning_rate": 2.65126746210352e-05, "loss": 0.1008, "step": 16071 }, { "epoch": 4.91348211556099, "grad_norm": 1.0221952199935913, "learning_rate": 2.651225001061526e-05, "loss": 0.1016, "step": 16072 }, { "epoch": 4.913787832467135, "grad_norm": 0.5204733610153198, "learning_rate": 2.651182540019532e-05, "loss": 0.1682, "step": 16073 }, { "epoch": 4.9140935493732805, "grad_norm": 0.30191996693611145, "learning_rate": 2.6511400789775382e-05, "loss": 0.1246, "step": 16074 }, { "epoch": 4.914399266279426, "grad_norm": 0.42567765712738037, "learning_rate": 2.651097617935544e-05, "loss": 0.1344, "step": 16075 }, { "epoch": 4.91470498318557, "grad_norm": 1.2410801649093628, "learning_rate": 2.6510551568935503e-05, "loss": 0.1758, "step": 16076 }, { "epoch": 4.915010700091715, "grad_norm": 0.543576717376709, "learning_rate": 2.651012695851556e-05, "loss": 0.1604, "step": 16077 }, { "epoch": 4.91531641699786, "grad_norm": 2.1974666118621826, "learning_rate": 2.650970234809562e-05, "loss": 0.2055, "step": 16078 }, { "epoch": 4.915622133904005, "grad_norm": 0.47301000356674194, "learning_rate": 2.6509277737675682e-05, "loss": 0.159, "step": 16079 }, { "epoch": 4.915927850810149, "grad_norm": 0.6614408493041992, "learning_rate": 2.650885312725574e-05, "loss": 0.1663, "step": 16080 }, { "epoch": 4.9162335677162945, "grad_norm": 0.5320749878883362, "learning_rate": 2.6508428516835803e-05, "loss": 0.1764, "step": 16081 }, { "epoch": 4.91653928462244, "grad_norm": 4.388126850128174, "learning_rate": 2.650800390641586e-05, "loss": 0.1736, "step": 16082 }, { "epoch": 4.916845001528585, "grad_norm": 1.290460228919983, "learning_rate": 2.6507579295995924e-05, "loss": 0.2014, "step": 16083 }, { "epoch": 4.91715071843473, "grad_norm": 1.0338093042373657, "learning_rate": 2.6507154685575982e-05, "loss": 0.1782, "step": 16084 }, { "epoch": 4.917456435340874, "grad_norm": 0.4085944890975952, "learning_rate": 2.6506730075156044e-05, "loss": 0.1342, "step": 16085 }, { "epoch": 4.917762152247019, "grad_norm": 0.3063693046569824, "learning_rate": 2.6506305464736103e-05, "loss": 0.0864, "step": 16086 }, { "epoch": 4.918067869153164, "grad_norm": 0.27443715929985046, "learning_rate": 2.6505880854316165e-05, "loss": 0.0546, "step": 16087 }, { "epoch": 4.918373586059309, "grad_norm": 0.2307957410812378, "learning_rate": 2.6505456243896224e-05, "loss": 0.0675, "step": 16088 }, { "epoch": 4.918679302965454, "grad_norm": 0.25626569986343384, "learning_rate": 2.6505031633476286e-05, "loss": 0.0497, "step": 16089 }, { "epoch": 4.918985019871599, "grad_norm": 0.3822304606437683, "learning_rate": 2.6504607023056348e-05, "loss": 0.0428, "step": 16090 }, { "epoch": 4.919290736777744, "grad_norm": 0.4988192021846771, "learning_rate": 2.6504182412636407e-05, "loss": 0.0525, "step": 16091 }, { "epoch": 4.919596453683889, "grad_norm": 0.4744739234447479, "learning_rate": 2.650375780221647e-05, "loss": 0.0552, "step": 16092 }, { "epoch": 4.919902170590033, "grad_norm": 0.5877829194068909, "learning_rate": 2.6503333191796528e-05, "loss": 0.0702, "step": 16093 }, { "epoch": 4.920207887496178, "grad_norm": 0.4482937157154083, "learning_rate": 2.650290858137659e-05, "loss": 0.0559, "step": 16094 }, { "epoch": 4.920513604402323, "grad_norm": 0.5888118743896484, "learning_rate": 2.650248397095665e-05, "loss": 0.1144, "step": 16095 }, { "epoch": 4.9208193213084686, "grad_norm": 0.2752913236618042, "learning_rate": 2.650205936053671e-05, "loss": 0.0759, "step": 16096 }, { "epoch": 4.921125038214614, "grad_norm": 0.5999616980552673, "learning_rate": 2.650163475011677e-05, "loss": 0.0849, "step": 16097 }, { "epoch": 4.921430755120758, "grad_norm": 2.693358898162842, "learning_rate": 2.650121013969683e-05, "loss": 0.1522, "step": 16098 }, { "epoch": 4.921736472026903, "grad_norm": 0.42872217297554016, "learning_rate": 2.650078552927689e-05, "loss": 0.1152, "step": 16099 }, { "epoch": 4.922042188933048, "grad_norm": 0.4370652139186859, "learning_rate": 2.6500360918856952e-05, "loss": 0.1373, "step": 16100 }, { "epoch": 4.922347905839193, "grad_norm": 0.35769039392471313, "learning_rate": 2.649993630843701e-05, "loss": 0.1239, "step": 16101 }, { "epoch": 4.9226536227453375, "grad_norm": 0.8935092091560364, "learning_rate": 2.6499511698017073e-05, "loss": 0.1512, "step": 16102 }, { "epoch": 4.922959339651483, "grad_norm": 0.9574534296989441, "learning_rate": 2.649908708759713e-05, "loss": 0.1693, "step": 16103 }, { "epoch": 4.923265056557628, "grad_norm": 0.9061641693115234, "learning_rate": 2.649866247717719e-05, "loss": 0.1658, "step": 16104 }, { "epoch": 4.923570773463773, "grad_norm": 1.1269967555999756, "learning_rate": 2.6498237866757252e-05, "loss": 0.1982, "step": 16105 }, { "epoch": 4.923876490369917, "grad_norm": 1.0049570798873901, "learning_rate": 2.649781325633731e-05, "loss": 0.1581, "step": 16106 }, { "epoch": 4.924182207276062, "grad_norm": 1.18439519405365, "learning_rate": 2.6497388645917373e-05, "loss": 0.1894, "step": 16107 }, { "epoch": 4.924487924182207, "grad_norm": 1.20333993434906, "learning_rate": 2.6496964035497432e-05, "loss": 0.175, "step": 16108 }, { "epoch": 4.924793641088352, "grad_norm": 0.9547755122184753, "learning_rate": 2.6496539425077494e-05, "loss": 0.2045, "step": 16109 }, { "epoch": 4.9250993579944975, "grad_norm": 0.4532049000263214, "learning_rate": 2.6496114814657553e-05, "loss": 0.169, "step": 16110 }, { "epoch": 4.925405074900642, "grad_norm": 0.2280377298593521, "learning_rate": 2.6495690204237615e-05, "loss": 0.0868, "step": 16111 }, { "epoch": 4.925710791806787, "grad_norm": 0.4232746958732605, "learning_rate": 2.6495265593817673e-05, "loss": 0.0635, "step": 16112 }, { "epoch": 4.926016508712932, "grad_norm": 0.24428167939186096, "learning_rate": 2.6494840983397735e-05, "loss": 0.0738, "step": 16113 }, { "epoch": 4.926322225619077, "grad_norm": 0.3160003125667572, "learning_rate": 2.6494416372977794e-05, "loss": 0.0642, "step": 16114 }, { "epoch": 4.926627942525221, "grad_norm": 0.3349120616912842, "learning_rate": 2.6493991762557856e-05, "loss": 0.0461, "step": 16115 }, { "epoch": 4.926933659431366, "grad_norm": 0.34701502323150635, "learning_rate": 2.6493567152137915e-05, "loss": 0.055, "step": 16116 }, { "epoch": 4.9272393763375115, "grad_norm": 0.30333542823791504, "learning_rate": 2.6493142541717974e-05, "loss": 0.0568, "step": 16117 }, { "epoch": 4.927545093243657, "grad_norm": 0.3435628414154053, "learning_rate": 2.6492717931298036e-05, "loss": 0.0931, "step": 16118 }, { "epoch": 4.927850810149801, "grad_norm": 0.2966861426830292, "learning_rate": 2.6492293320878094e-05, "loss": 0.0519, "step": 16119 }, { "epoch": 4.928156527055946, "grad_norm": 0.618125319480896, "learning_rate": 2.6491868710458157e-05, "loss": 0.1019, "step": 16120 }, { "epoch": 4.928462243962091, "grad_norm": 0.8951911926269531, "learning_rate": 2.6491444100038215e-05, "loss": 0.0923, "step": 16121 }, { "epoch": 4.928767960868236, "grad_norm": 0.7747204899787903, "learning_rate": 2.6491019489618277e-05, "loss": 0.0671, "step": 16122 }, { "epoch": 4.929073677774381, "grad_norm": 0.6017471551895142, "learning_rate": 2.6490594879198336e-05, "loss": 0.1392, "step": 16123 }, { "epoch": 4.9293793946805256, "grad_norm": 0.5396701693534851, "learning_rate": 2.6490170268778398e-05, "loss": 0.1302, "step": 16124 }, { "epoch": 4.929685111586671, "grad_norm": 0.6103761196136475, "learning_rate": 2.6489745658358457e-05, "loss": 0.1317, "step": 16125 }, { "epoch": 4.929990828492816, "grad_norm": 1.0165095329284668, "learning_rate": 2.648932104793852e-05, "loss": 0.1335, "step": 16126 }, { "epoch": 4.930296545398961, "grad_norm": 0.6253355741500854, "learning_rate": 2.6488896437518578e-05, "loss": 0.1538, "step": 16127 }, { "epoch": 4.930602262305105, "grad_norm": 1.9943050146102905, "learning_rate": 2.648847182709864e-05, "loss": 0.1662, "step": 16128 }, { "epoch": 4.93090797921125, "grad_norm": 0.5925998091697693, "learning_rate": 2.64880472166787e-05, "loss": 0.1617, "step": 16129 }, { "epoch": 4.931213696117395, "grad_norm": 0.6143059134483337, "learning_rate": 2.6487622606258757e-05, "loss": 0.1657, "step": 16130 }, { "epoch": 4.9315194130235405, "grad_norm": 0.7104598879814148, "learning_rate": 2.648719799583882e-05, "loss": 0.1406, "step": 16131 }, { "epoch": 4.931825129929685, "grad_norm": 0.8259052038192749, "learning_rate": 2.6486773385418878e-05, "loss": 0.209, "step": 16132 }, { "epoch": 4.93213084683583, "grad_norm": 1.7791532278060913, "learning_rate": 2.648634877499894e-05, "loss": 0.1853, "step": 16133 }, { "epoch": 4.932436563741975, "grad_norm": 3.3699681758880615, "learning_rate": 2.6485924164579e-05, "loss": 0.2431, "step": 16134 }, { "epoch": 4.93274228064812, "grad_norm": 0.5384926795959473, "learning_rate": 2.648549955415906e-05, "loss": 0.1588, "step": 16135 }, { "epoch": 4.933047997554265, "grad_norm": 0.3068762421607971, "learning_rate": 2.648507494373912e-05, "loss": 0.0589, "step": 16136 }, { "epoch": 4.933353714460409, "grad_norm": 0.5451555848121643, "learning_rate": 2.648465033331918e-05, "loss": 0.0767, "step": 16137 }, { "epoch": 4.9336594313665545, "grad_norm": 0.36615949869155884, "learning_rate": 2.648422572289924e-05, "loss": 0.0664, "step": 16138 }, { "epoch": 4.9339651482727, "grad_norm": 0.291433721780777, "learning_rate": 2.6483801112479302e-05, "loss": 0.054, "step": 16139 }, { "epoch": 4.934270865178845, "grad_norm": 0.20853009819984436, "learning_rate": 2.648337650205936e-05, "loss": 0.0492, "step": 16140 }, { "epoch": 4.934576582084989, "grad_norm": 0.23909419775009155, "learning_rate": 2.6482951891639423e-05, "loss": 0.0733, "step": 16141 }, { "epoch": 4.934882298991134, "grad_norm": 0.22978001832962036, "learning_rate": 2.6482527281219482e-05, "loss": 0.055, "step": 16142 }, { "epoch": 4.935188015897279, "grad_norm": 0.1824663132429123, "learning_rate": 2.648210267079954e-05, "loss": 0.0636, "step": 16143 }, { "epoch": 4.935493732803424, "grad_norm": 0.24066044390201569, "learning_rate": 2.6481678060379603e-05, "loss": 0.0535, "step": 16144 }, { "epoch": 4.9357994497095685, "grad_norm": 0.3642326295375824, "learning_rate": 2.648125344995966e-05, "loss": 0.0926, "step": 16145 }, { "epoch": 4.936105166615714, "grad_norm": 0.38487887382507324, "learning_rate": 2.6480828839539723e-05, "loss": 0.0932, "step": 16146 }, { "epoch": 4.936410883521859, "grad_norm": 0.6197339296340942, "learning_rate": 2.6480404229119782e-05, "loss": 0.094, "step": 16147 }, { "epoch": 4.936716600428004, "grad_norm": 0.8459832072257996, "learning_rate": 2.6479979618699844e-05, "loss": 0.1199, "step": 16148 }, { "epoch": 4.937022317334149, "grad_norm": 0.6683096885681152, "learning_rate": 2.6479555008279903e-05, "loss": 0.1019, "step": 16149 }, { "epoch": 4.937328034240293, "grad_norm": 0.7628757953643799, "learning_rate": 2.6479130397859965e-05, "loss": 0.1345, "step": 16150 }, { "epoch": 4.937633751146438, "grad_norm": 1.280805230140686, "learning_rate": 2.6478705787440024e-05, "loss": 0.1394, "step": 16151 }, { "epoch": 4.937939468052583, "grad_norm": 0.5588558316230774, "learning_rate": 2.6478281177020086e-05, "loss": 0.1682, "step": 16152 }, { "epoch": 4.9382451849587286, "grad_norm": 0.6438719034194946, "learning_rate": 2.6477856566600144e-05, "loss": 0.1671, "step": 16153 }, { "epoch": 4.938550901864873, "grad_norm": 0.6705468893051147, "learning_rate": 2.6477431956180207e-05, "loss": 0.1547, "step": 16154 }, { "epoch": 4.938856618771018, "grad_norm": 0.6853405237197876, "learning_rate": 2.6477007345760265e-05, "loss": 0.188, "step": 16155 }, { "epoch": 4.939162335677163, "grad_norm": 1.2957031726837158, "learning_rate": 2.6476582735340324e-05, "loss": 0.1434, "step": 16156 }, { "epoch": 4.939468052583308, "grad_norm": 0.7410624623298645, "learning_rate": 2.6476158124920386e-05, "loss": 0.1669, "step": 16157 }, { "epoch": 4.939773769489452, "grad_norm": 2.236218214035034, "learning_rate": 2.6475733514500445e-05, "loss": 0.2322, "step": 16158 }, { "epoch": 4.9400794863955975, "grad_norm": 1.5604541301727295, "learning_rate": 2.6475308904080507e-05, "loss": 0.2342, "step": 16159 }, { "epoch": 4.940385203301743, "grad_norm": 0.5462390184402466, "learning_rate": 2.6474884293660565e-05, "loss": 0.1408, "step": 16160 }, { "epoch": 4.940690920207888, "grad_norm": 0.8692618012428284, "learning_rate": 2.6474459683240628e-05, "loss": 0.0937, "step": 16161 }, { "epoch": 4.940996637114033, "grad_norm": 0.2712526023387909, "learning_rate": 2.6474035072820686e-05, "loss": 0.0513, "step": 16162 }, { "epoch": 4.941302354020177, "grad_norm": 0.13674603402614594, "learning_rate": 2.647361046240075e-05, "loss": 0.0429, "step": 16163 }, { "epoch": 4.941608070926322, "grad_norm": 0.5190302133560181, "learning_rate": 2.6473185851980807e-05, "loss": 0.0603, "step": 16164 }, { "epoch": 4.941913787832467, "grad_norm": 0.2478194385766983, "learning_rate": 2.647276124156087e-05, "loss": 0.0547, "step": 16165 }, { "epoch": 4.942219504738612, "grad_norm": 0.2354096621274948, "learning_rate": 2.6472336631140928e-05, "loss": 0.0422, "step": 16166 }, { "epoch": 4.942525221644757, "grad_norm": 0.26456043124198914, "learning_rate": 2.647191202072099e-05, "loss": 0.0503, "step": 16167 }, { "epoch": 4.942830938550902, "grad_norm": 0.3050549328327179, "learning_rate": 2.647148741030105e-05, "loss": 0.0881, "step": 16168 }, { "epoch": 4.943136655457047, "grad_norm": 0.355782687664032, "learning_rate": 2.6471062799881107e-05, "loss": 0.0961, "step": 16169 }, { "epoch": 4.943442372363192, "grad_norm": 0.29249337315559387, "learning_rate": 2.647063818946117e-05, "loss": 0.0597, "step": 16170 }, { "epoch": 4.943748089269336, "grad_norm": 0.5387439131736755, "learning_rate": 2.6470213579041228e-05, "loss": 0.1143, "step": 16171 }, { "epoch": 4.944053806175481, "grad_norm": 0.8383841514587402, "learning_rate": 2.646978896862129e-05, "loss": 0.0641, "step": 16172 }, { "epoch": 4.944359523081626, "grad_norm": 0.5547565221786499, "learning_rate": 2.646936435820135e-05, "loss": 0.1181, "step": 16173 }, { "epoch": 4.9446652399877715, "grad_norm": 0.4237385094165802, "learning_rate": 2.646893974778141e-05, "loss": 0.1408, "step": 16174 }, { "epoch": 4.944970956893917, "grad_norm": 0.450828492641449, "learning_rate": 2.646851513736147e-05, "loss": 0.1255, "step": 16175 }, { "epoch": 4.945276673800061, "grad_norm": 0.9282153844833374, "learning_rate": 2.6468090526941532e-05, "loss": 0.15, "step": 16176 }, { "epoch": 4.945582390706206, "grad_norm": 0.6778057813644409, "learning_rate": 2.646766591652159e-05, "loss": 0.1794, "step": 16177 }, { "epoch": 4.945888107612351, "grad_norm": 0.6619948744773865, "learning_rate": 2.6467241306101653e-05, "loss": 0.1588, "step": 16178 }, { "epoch": 4.946193824518496, "grad_norm": 0.3997032344341278, "learning_rate": 2.646681669568171e-05, "loss": 0.1617, "step": 16179 }, { "epoch": 4.94649954142464, "grad_norm": 0.8461418747901917, "learning_rate": 2.646639208526177e-05, "loss": 0.1944, "step": 16180 }, { "epoch": 4.9468052583307855, "grad_norm": 0.5402050018310547, "learning_rate": 2.6465967474841832e-05, "loss": 0.1647, "step": 16181 }, { "epoch": 4.947110975236931, "grad_norm": 4.685575485229492, "learning_rate": 2.646554286442189e-05, "loss": 0.2017, "step": 16182 }, { "epoch": 4.947416692143076, "grad_norm": 1.0009102821350098, "learning_rate": 2.6465118254001953e-05, "loss": 0.2181, "step": 16183 }, { "epoch": 4.94772240904922, "grad_norm": 3.5499463081359863, "learning_rate": 2.646469364358201e-05, "loss": 0.25, "step": 16184 }, { "epoch": 4.948028125955365, "grad_norm": 0.555263876914978, "learning_rate": 2.6464269033162074e-05, "loss": 0.156, "step": 16185 }, { "epoch": 4.94833384286151, "grad_norm": 0.6567282676696777, "learning_rate": 2.6463844422742132e-05, "loss": 0.0764, "step": 16186 }, { "epoch": 4.948639559767655, "grad_norm": 0.43958452343940735, "learning_rate": 2.6463419812322194e-05, "loss": 0.0981, "step": 16187 }, { "epoch": 4.9489452766738005, "grad_norm": 0.19926422834396362, "learning_rate": 2.6462995201902253e-05, "loss": 0.0696, "step": 16188 }, { "epoch": 4.949250993579945, "grad_norm": 0.38151440024375916, "learning_rate": 2.6462570591482315e-05, "loss": 0.0797, "step": 16189 }, { "epoch": 4.94955671048609, "grad_norm": 0.2506442368030548, "learning_rate": 2.6462145981062374e-05, "loss": 0.0689, "step": 16190 }, { "epoch": 4.949862427392235, "grad_norm": 0.26800596714019775, "learning_rate": 2.6461721370642436e-05, "loss": 0.0363, "step": 16191 }, { "epoch": 4.95016814429838, "grad_norm": 0.28583404421806335, "learning_rate": 2.6461296760222498e-05, "loss": 0.0647, "step": 16192 }, { "epoch": 4.950473861204524, "grad_norm": 0.24741584062576294, "learning_rate": 2.6460872149802557e-05, "loss": 0.0609, "step": 16193 }, { "epoch": 4.950779578110669, "grad_norm": 0.20727026462554932, "learning_rate": 2.646044753938262e-05, "loss": 0.0497, "step": 16194 }, { "epoch": 4.9510852950168145, "grad_norm": 0.7953619956970215, "learning_rate": 2.6460022928962678e-05, "loss": 0.1257, "step": 16195 }, { "epoch": 4.95139101192296, "grad_norm": 0.6956741213798523, "learning_rate": 2.645959831854274e-05, "loss": 0.0943, "step": 16196 }, { "epoch": 4.951696728829104, "grad_norm": 0.6693007349967957, "learning_rate": 2.64591737081228e-05, "loss": 0.0742, "step": 16197 }, { "epoch": 4.952002445735249, "grad_norm": 0.5977396368980408, "learning_rate": 2.645874909770286e-05, "loss": 0.1409, "step": 16198 }, { "epoch": 4.952308162641394, "grad_norm": 0.6485908031463623, "learning_rate": 2.645832448728292e-05, "loss": 0.1028, "step": 16199 }, { "epoch": 4.952613879547539, "grad_norm": 0.6261325478553772, "learning_rate": 2.645789987686298e-05, "loss": 0.1668, "step": 16200 }, { "epoch": 4.952919596453684, "grad_norm": 0.5063225626945496, "learning_rate": 2.645747526644304e-05, "loss": 0.1246, "step": 16201 }, { "epoch": 4.9532253133598285, "grad_norm": 1.3815869092941284, "learning_rate": 2.6457050656023102e-05, "loss": 0.1566, "step": 16202 }, { "epoch": 4.953531030265974, "grad_norm": 0.6278057098388672, "learning_rate": 2.645662604560316e-05, "loss": 0.1438, "step": 16203 }, { "epoch": 4.953836747172119, "grad_norm": 1.6880429983139038, "learning_rate": 2.6456201435183223e-05, "loss": 0.1896, "step": 16204 }, { "epoch": 4.954142464078264, "grad_norm": 1.0147854089736938, "learning_rate": 2.645577682476328e-05, "loss": 0.1953, "step": 16205 }, { "epoch": 4.954448180984408, "grad_norm": 1.045670509338379, "learning_rate": 2.645535221434334e-05, "loss": 0.177, "step": 16206 }, { "epoch": 4.954753897890553, "grad_norm": 0.6756311655044556, "learning_rate": 2.6454927603923402e-05, "loss": 0.1801, "step": 16207 }, { "epoch": 4.955059614796698, "grad_norm": 1.0499931573867798, "learning_rate": 2.645450299350346e-05, "loss": 0.2029, "step": 16208 }, { "epoch": 4.955365331702843, "grad_norm": 1.1285697221755981, "learning_rate": 2.6454078383083523e-05, "loss": 0.2117, "step": 16209 }, { "epoch": 4.955671048608988, "grad_norm": 0.934039831161499, "learning_rate": 2.6453653772663582e-05, "loss": 0.1435, "step": 16210 }, { "epoch": 4.955976765515133, "grad_norm": 0.46611881256103516, "learning_rate": 2.6453229162243644e-05, "loss": 0.0824, "step": 16211 }, { "epoch": 4.956282482421278, "grad_norm": 0.19744712114334106, "learning_rate": 2.6452804551823703e-05, "loss": 0.0632, "step": 16212 }, { "epoch": 4.956588199327423, "grad_norm": 0.5067269206047058, "learning_rate": 2.6452379941403765e-05, "loss": 0.0955, "step": 16213 }, { "epoch": 4.956893916233568, "grad_norm": 0.5780366659164429, "learning_rate": 2.6451955330983823e-05, "loss": 0.0538, "step": 16214 }, { "epoch": 4.957199633139712, "grad_norm": 0.20543056726455688, "learning_rate": 2.6451530720563885e-05, "loss": 0.0392, "step": 16215 }, { "epoch": 4.9575053500458575, "grad_norm": 0.27941665053367615, "learning_rate": 2.6451106110143944e-05, "loss": 0.0489, "step": 16216 }, { "epoch": 4.957811066952003, "grad_norm": 0.3026092052459717, "learning_rate": 2.6450681499724006e-05, "loss": 0.0625, "step": 16217 }, { "epoch": 4.958116783858148, "grad_norm": 0.3497205376625061, "learning_rate": 2.6450256889304065e-05, "loss": 0.0783, "step": 16218 }, { "epoch": 4.958422500764292, "grad_norm": 0.3224717974662781, "learning_rate": 2.6449832278884124e-05, "loss": 0.069, "step": 16219 }, { "epoch": 4.958728217670437, "grad_norm": 0.5241775512695312, "learning_rate": 2.6449407668464186e-05, "loss": 0.092, "step": 16220 }, { "epoch": 4.959033934576582, "grad_norm": 0.4540558457374573, "learning_rate": 2.6448983058044244e-05, "loss": 0.0715, "step": 16221 }, { "epoch": 4.959339651482727, "grad_norm": 0.6412497758865356, "learning_rate": 2.6448558447624307e-05, "loss": 0.0908, "step": 16222 }, { "epoch": 4.9596453683888715, "grad_norm": 1.2702451944351196, "learning_rate": 2.6448133837204365e-05, "loss": 0.1204, "step": 16223 }, { "epoch": 4.959951085295017, "grad_norm": 1.3516309261322021, "learning_rate": 2.6447709226784427e-05, "loss": 0.129, "step": 16224 }, { "epoch": 4.960256802201162, "grad_norm": 1.1595357656478882, "learning_rate": 2.6447284616364486e-05, "loss": 0.1465, "step": 16225 }, { "epoch": 4.960562519107307, "grad_norm": 0.9924737215042114, "learning_rate": 2.6446860005944548e-05, "loss": 0.1829, "step": 16226 }, { "epoch": 4.960868236013452, "grad_norm": 0.9391486644744873, "learning_rate": 2.6446435395524607e-05, "loss": 0.1732, "step": 16227 }, { "epoch": 4.961173952919596, "grad_norm": 0.6501911282539368, "learning_rate": 2.644601078510467e-05, "loss": 0.158, "step": 16228 }, { "epoch": 4.961479669825741, "grad_norm": 0.8515514135360718, "learning_rate": 2.6445586174684728e-05, "loss": 0.1598, "step": 16229 }, { "epoch": 4.961785386731886, "grad_norm": 1.6769587993621826, "learning_rate": 2.644516156426479e-05, "loss": 0.1447, "step": 16230 }, { "epoch": 4.9620911036380315, "grad_norm": 1.776716709136963, "learning_rate": 2.644473695384485e-05, "loss": 0.1539, "step": 16231 }, { "epoch": 4.962396820544176, "grad_norm": 0.863213837146759, "learning_rate": 2.6444312343424907e-05, "loss": 0.1692, "step": 16232 }, { "epoch": 4.962702537450321, "grad_norm": 0.8694145679473877, "learning_rate": 2.644388773300497e-05, "loss": 0.1776, "step": 16233 }, { "epoch": 4.963008254356466, "grad_norm": 4.588730812072754, "learning_rate": 2.6443463122585028e-05, "loss": 0.2194, "step": 16234 }, { "epoch": 4.963313971262611, "grad_norm": 0.5255776047706604, "learning_rate": 2.644303851216509e-05, "loss": 0.1615, "step": 16235 }, { "epoch": 4.963619688168755, "grad_norm": 1.1640057563781738, "learning_rate": 2.644261390174515e-05, "loss": 0.0777, "step": 16236 }, { "epoch": 4.9639254050749, "grad_norm": 0.19605523347854614, "learning_rate": 2.644218929132521e-05, "loss": 0.0552, "step": 16237 }, { "epoch": 4.9642311219810455, "grad_norm": 0.3822788894176483, "learning_rate": 2.644176468090527e-05, "loss": 0.0706, "step": 16238 }, { "epoch": 4.964536838887191, "grad_norm": 0.3126916289329529, "learning_rate": 2.644134007048533e-05, "loss": 0.0593, "step": 16239 }, { "epoch": 4.964842555793336, "grad_norm": 0.2752786874771118, "learning_rate": 2.644091546006539e-05, "loss": 0.0692, "step": 16240 }, { "epoch": 4.96514827269948, "grad_norm": 0.32149213552474976, "learning_rate": 2.6440490849645452e-05, "loss": 0.0754, "step": 16241 }, { "epoch": 4.965453989605625, "grad_norm": 0.37535929679870605, "learning_rate": 2.644006623922551e-05, "loss": 0.0512, "step": 16242 }, { "epoch": 4.96575970651177, "grad_norm": 0.2844174802303314, "learning_rate": 2.6439641628805573e-05, "loss": 0.0735, "step": 16243 }, { "epoch": 4.966065423417915, "grad_norm": 1.3420312404632568, "learning_rate": 2.6439217018385632e-05, "loss": 0.062, "step": 16244 }, { "epoch": 4.96637114032406, "grad_norm": 1.0133360624313354, "learning_rate": 2.643879240796569e-05, "loss": 0.094, "step": 16245 }, { "epoch": 4.966676857230205, "grad_norm": 0.4830609858036041, "learning_rate": 2.6438367797545753e-05, "loss": 0.0856, "step": 16246 }, { "epoch": 4.96698257413635, "grad_norm": 0.29936954379081726, "learning_rate": 2.643794318712581e-05, "loss": 0.0791, "step": 16247 }, { "epoch": 4.967288291042495, "grad_norm": 0.551843523979187, "learning_rate": 2.6437518576705873e-05, "loss": 0.1428, "step": 16248 }, { "epoch": 4.967594007948639, "grad_norm": 0.46459001302719116, "learning_rate": 2.6437093966285932e-05, "loss": 0.1398, "step": 16249 }, { "epoch": 4.967899724854784, "grad_norm": 2.145453929901123, "learning_rate": 2.6436669355865994e-05, "loss": 0.1266, "step": 16250 }, { "epoch": 4.968205441760929, "grad_norm": 0.3577316999435425, "learning_rate": 2.6436244745446053e-05, "loss": 0.1487, "step": 16251 }, { "epoch": 4.9685111586670745, "grad_norm": 0.7347962856292725, "learning_rate": 2.6435820135026115e-05, "loss": 0.1765, "step": 16252 }, { "epoch": 4.96881687557322, "grad_norm": 0.7608230113983154, "learning_rate": 2.6435395524606174e-05, "loss": 0.1836, "step": 16253 }, { "epoch": 4.969122592479364, "grad_norm": 1.2254383563995361, "learning_rate": 2.6434970914186236e-05, "loss": 0.1906, "step": 16254 }, { "epoch": 4.969428309385509, "grad_norm": 0.9722108244895935, "learning_rate": 2.6434546303766294e-05, "loss": 0.229, "step": 16255 }, { "epoch": 4.969734026291654, "grad_norm": 1.5004971027374268, "learning_rate": 2.6434121693346357e-05, "loss": 0.2169, "step": 16256 }, { "epoch": 4.970039743197799, "grad_norm": 4.245785236358643, "learning_rate": 2.6433697082926415e-05, "loss": 0.1918, "step": 16257 }, { "epoch": 4.970345460103943, "grad_norm": 1.1180176734924316, "learning_rate": 2.6433272472506474e-05, "loss": 0.2259, "step": 16258 }, { "epoch": 4.9706511770100885, "grad_norm": 3.008047342300415, "learning_rate": 2.6432847862086536e-05, "loss": 0.2341, "step": 16259 }, { "epoch": 4.970956893916234, "grad_norm": 0.42520081996917725, "learning_rate": 2.6432423251666595e-05, "loss": 0.1498, "step": 16260 }, { "epoch": 4.971262610822379, "grad_norm": 0.29699474573135376, "learning_rate": 2.6431998641246657e-05, "loss": 0.0854, "step": 16261 }, { "epoch": 4.971568327728523, "grad_norm": 0.5320903062820435, "learning_rate": 2.6431574030826716e-05, "loss": 0.0483, "step": 16262 }, { "epoch": 4.971874044634668, "grad_norm": 0.18682923913002014, "learning_rate": 2.6431149420406778e-05, "loss": 0.0805, "step": 16263 }, { "epoch": 4.972179761540813, "grad_norm": 0.6450976133346558, "learning_rate": 2.6430724809986836e-05, "loss": 0.0582, "step": 16264 }, { "epoch": 4.972485478446958, "grad_norm": 0.299600213766098, "learning_rate": 2.64303001995669e-05, "loss": 0.0529, "step": 16265 }, { "epoch": 4.972791195353103, "grad_norm": 0.4454646110534668, "learning_rate": 2.6429875589146957e-05, "loss": 0.058, "step": 16266 }, { "epoch": 4.973096912259248, "grad_norm": 0.7826207280158997, "learning_rate": 2.642945097872702e-05, "loss": 0.065, "step": 16267 }, { "epoch": 4.973402629165393, "grad_norm": 0.44051361083984375, "learning_rate": 2.6429026368307078e-05, "loss": 0.044, "step": 16268 }, { "epoch": 4.973708346071538, "grad_norm": 0.49163734912872314, "learning_rate": 2.642860175788714e-05, "loss": 0.0585, "step": 16269 }, { "epoch": 4.974014062977683, "grad_norm": 0.4259580969810486, "learning_rate": 2.64281771474672e-05, "loss": 0.0861, "step": 16270 }, { "epoch": 4.974319779883827, "grad_norm": 0.2825963795185089, "learning_rate": 2.6427752537047257e-05, "loss": 0.0736, "step": 16271 }, { "epoch": 4.974625496789972, "grad_norm": 0.36909857392311096, "learning_rate": 2.642732792662732e-05, "loss": 0.0849, "step": 16272 }, { "epoch": 4.9749312136961175, "grad_norm": 0.5704494118690491, "learning_rate": 2.6426903316207378e-05, "loss": 0.0912, "step": 16273 }, { "epoch": 4.975236930602263, "grad_norm": 0.6329939365386963, "learning_rate": 2.642647870578744e-05, "loss": 0.1207, "step": 16274 }, { "epoch": 4.975542647508407, "grad_norm": 0.3876838684082031, "learning_rate": 2.64260540953675e-05, "loss": 0.1204, "step": 16275 }, { "epoch": 4.975848364414552, "grad_norm": 0.8251622319221497, "learning_rate": 2.642562948494756e-05, "loss": 0.1613, "step": 16276 }, { "epoch": 4.976154081320697, "grad_norm": 0.4649137258529663, "learning_rate": 2.642520487452762e-05, "loss": 0.1328, "step": 16277 }, { "epoch": 4.976459798226842, "grad_norm": 0.8467286229133606, "learning_rate": 2.6424780264107682e-05, "loss": 0.197, "step": 16278 }, { "epoch": 4.976765515132987, "grad_norm": 1.4875431060791016, "learning_rate": 2.642435565368774e-05, "loss": 0.1733, "step": 16279 }, { "epoch": 4.9770712320391315, "grad_norm": 1.609748125076294, "learning_rate": 2.6423931043267803e-05, "loss": 0.1636, "step": 16280 }, { "epoch": 4.977376948945277, "grad_norm": 1.1398124694824219, "learning_rate": 2.642350643284786e-05, "loss": 0.1889, "step": 16281 }, { "epoch": 4.977682665851422, "grad_norm": 0.8253848552703857, "learning_rate": 2.6423081822427923e-05, "loss": 0.1809, "step": 16282 }, { "epoch": 4.977988382757567, "grad_norm": 1.9678714275360107, "learning_rate": 2.6422657212007982e-05, "loss": 0.1832, "step": 16283 }, { "epoch": 4.978294099663711, "grad_norm": 1.833385944366455, "learning_rate": 2.642223260158804e-05, "loss": 0.2859, "step": 16284 }, { "epoch": 4.978599816569856, "grad_norm": 0.3439232409000397, "learning_rate": 2.6421807991168103e-05, "loss": 0.1555, "step": 16285 }, { "epoch": 4.978905533476001, "grad_norm": 0.411931574344635, "learning_rate": 2.642138338074816e-05, "loss": 0.0994, "step": 16286 }, { "epoch": 4.979211250382146, "grad_norm": 0.20540018379688263, "learning_rate": 2.6420958770328224e-05, "loss": 0.0649, "step": 16287 }, { "epoch": 4.979516967288291, "grad_norm": 0.20930306613445282, "learning_rate": 2.6420534159908282e-05, "loss": 0.0521, "step": 16288 }, { "epoch": 4.979822684194436, "grad_norm": 0.5270512104034424, "learning_rate": 2.6420109549488344e-05, "loss": 0.0609, "step": 16289 }, { "epoch": 4.980128401100581, "grad_norm": 0.1929650604724884, "learning_rate": 2.6419684939068403e-05, "loss": 0.0445, "step": 16290 }, { "epoch": 4.980434118006726, "grad_norm": 0.2580484449863434, "learning_rate": 2.6419260328648465e-05, "loss": 0.0457, "step": 16291 }, { "epoch": 4.980739834912871, "grad_norm": 0.29407572746276855, "learning_rate": 2.6418835718228524e-05, "loss": 0.0532, "step": 16292 }, { "epoch": 4.981045551819015, "grad_norm": 0.7456240057945251, "learning_rate": 2.6418411107808586e-05, "loss": 0.0686, "step": 16293 }, { "epoch": 4.98135126872516, "grad_norm": 0.392733633518219, "learning_rate": 2.6417986497388648e-05, "loss": 0.0604, "step": 16294 }, { "epoch": 4.9816569856313055, "grad_norm": 0.8671808242797852, "learning_rate": 2.6417561886968707e-05, "loss": 0.0893, "step": 16295 }, { "epoch": 4.981962702537451, "grad_norm": 0.2525700628757477, "learning_rate": 2.641713727654877e-05, "loss": 0.068, "step": 16296 }, { "epoch": 4.982268419443595, "grad_norm": 0.35767409205436707, "learning_rate": 2.6416712666128828e-05, "loss": 0.0954, "step": 16297 }, { "epoch": 4.98257413634974, "grad_norm": 0.7149615287780762, "learning_rate": 2.641628805570889e-05, "loss": 0.1332, "step": 16298 }, { "epoch": 4.982879853255885, "grad_norm": 0.5591803193092346, "learning_rate": 2.641586344528895e-05, "loss": 0.1157, "step": 16299 }, { "epoch": 4.98318557016203, "grad_norm": 3.316666841506958, "learning_rate": 2.641543883486901e-05, "loss": 0.1287, "step": 16300 }, { "epoch": 4.9834912870681745, "grad_norm": 0.9091182351112366, "learning_rate": 2.641501422444907e-05, "loss": 0.1609, "step": 16301 }, { "epoch": 4.98379700397432, "grad_norm": 0.5489398837089539, "learning_rate": 2.641458961402913e-05, "loss": 0.1838, "step": 16302 }, { "epoch": 4.984102720880465, "grad_norm": 0.7959627509117126, "learning_rate": 2.641416500360919e-05, "loss": 0.1415, "step": 16303 }, { "epoch": 4.98440843778661, "grad_norm": 0.9976868629455566, "learning_rate": 2.6413740393189252e-05, "loss": 0.1626, "step": 16304 }, { "epoch": 4.984714154692755, "grad_norm": 0.7502087950706482, "learning_rate": 2.641331578276931e-05, "loss": 0.1976, "step": 16305 }, { "epoch": 4.985019871598899, "grad_norm": 3.9919450283050537, "learning_rate": 2.6412891172349373e-05, "loss": 0.2157, "step": 16306 }, { "epoch": 4.985325588505044, "grad_norm": 1.0622131824493408, "learning_rate": 2.641246656192943e-05, "loss": 0.163, "step": 16307 }, { "epoch": 4.985631305411189, "grad_norm": 2.502284049987793, "learning_rate": 2.641204195150949e-05, "loss": 0.1897, "step": 16308 }, { "epoch": 4.9859370223173345, "grad_norm": 2.4570224285125732, "learning_rate": 2.6411617341089552e-05, "loss": 0.2628, "step": 16309 }, { "epoch": 4.986242739223479, "grad_norm": 0.8203973174095154, "learning_rate": 2.641119273066961e-05, "loss": 0.1414, "step": 16310 }, { "epoch": 4.986548456129624, "grad_norm": 1.0239824056625366, "learning_rate": 2.6410768120249673e-05, "loss": 0.0824, "step": 16311 }, { "epoch": 4.986854173035769, "grad_norm": 0.21523885428905487, "learning_rate": 2.6410343509829732e-05, "loss": 0.0739, "step": 16312 }, { "epoch": 4.987159889941914, "grad_norm": 0.20692579448223114, "learning_rate": 2.6409918899409794e-05, "loss": 0.0474, "step": 16313 }, { "epoch": 4.987465606848058, "grad_norm": 0.3089268207550049, "learning_rate": 2.6409494288989853e-05, "loss": 0.0604, "step": 16314 }, { "epoch": 4.987771323754203, "grad_norm": 0.41296055912971497, "learning_rate": 2.6409069678569915e-05, "loss": 0.0616, "step": 16315 }, { "epoch": 4.9880770406603485, "grad_norm": 0.1936233639717102, "learning_rate": 2.6408645068149973e-05, "loss": 0.0468, "step": 16316 }, { "epoch": 4.988382757566494, "grad_norm": 0.4046232998371124, "learning_rate": 2.6408220457730036e-05, "loss": 0.0904, "step": 16317 }, { "epoch": 4.988688474472639, "grad_norm": 0.40093469619750977, "learning_rate": 2.6407795847310094e-05, "loss": 0.0574, "step": 16318 }, { "epoch": 4.988994191378783, "grad_norm": 0.5713542103767395, "learning_rate": 2.6407371236890156e-05, "loss": 0.0996, "step": 16319 }, { "epoch": 4.989299908284928, "grad_norm": 0.27804842591285706, "learning_rate": 2.6406946626470215e-05, "loss": 0.0734, "step": 16320 }, { "epoch": 4.989605625191073, "grad_norm": 0.3515048921108246, "learning_rate": 2.6406522016050274e-05, "loss": 0.0837, "step": 16321 }, { "epoch": 4.989911342097218, "grad_norm": 1.3136210441589355, "learning_rate": 2.6406097405630336e-05, "loss": 0.1266, "step": 16322 }, { "epoch": 4.9902170590033625, "grad_norm": 0.26073992252349854, "learning_rate": 2.6405672795210394e-05, "loss": 0.0902, "step": 16323 }, { "epoch": 4.990522775909508, "grad_norm": 0.9181076288223267, "learning_rate": 2.6405248184790457e-05, "loss": 0.1341, "step": 16324 }, { "epoch": 4.990828492815653, "grad_norm": 1.6166555881500244, "learning_rate": 2.6404823574370515e-05, "loss": 0.1608, "step": 16325 }, { "epoch": 4.991134209721798, "grad_norm": 2.8133652210235596, "learning_rate": 2.6404398963950577e-05, "loss": 0.1549, "step": 16326 }, { "epoch": 4.991439926627942, "grad_norm": 0.5451922416687012, "learning_rate": 2.6403974353530636e-05, "loss": 0.1243, "step": 16327 }, { "epoch": 4.991745643534087, "grad_norm": 0.45319995284080505, "learning_rate": 2.6403549743110698e-05, "loss": 0.1407, "step": 16328 }, { "epoch": 4.992051360440232, "grad_norm": 1.0955123901367188, "learning_rate": 2.6403125132690757e-05, "loss": 0.1652, "step": 16329 }, { "epoch": 4.9923570773463775, "grad_norm": 2.4507296085357666, "learning_rate": 2.640270052227082e-05, "loss": 0.1963, "step": 16330 }, { "epoch": 4.992662794252523, "grad_norm": 0.8484175801277161, "learning_rate": 2.6402275911850878e-05, "loss": 0.166, "step": 16331 }, { "epoch": 4.992968511158667, "grad_norm": 0.8937247395515442, "learning_rate": 2.640185130143094e-05, "loss": 0.2014, "step": 16332 }, { "epoch": 4.993274228064812, "grad_norm": 10.322090148925781, "learning_rate": 2.6401426691011e-05, "loss": 0.2102, "step": 16333 }, { "epoch": 4.993579944970957, "grad_norm": 1.345955491065979, "learning_rate": 2.6401002080591057e-05, "loss": 0.2686, "step": 16334 }, { "epoch": 4.993885661877102, "grad_norm": 0.591651201248169, "learning_rate": 2.640057747017112e-05, "loss": 0.142, "step": 16335 }, { "epoch": 4.994191378783246, "grad_norm": 0.48485633730888367, "learning_rate": 2.6400152859751178e-05, "loss": 0.0741, "step": 16336 }, { "epoch": 4.9944970956893915, "grad_norm": 0.2908630967140198, "learning_rate": 2.639972824933124e-05, "loss": 0.0754, "step": 16337 }, { "epoch": 4.994802812595537, "grad_norm": 0.3867572247982025, "learning_rate": 2.63993036389113e-05, "loss": 0.0624, "step": 16338 }, { "epoch": 4.995108529501682, "grad_norm": 0.18130439519882202, "learning_rate": 2.639887902849136e-05, "loss": 0.0728, "step": 16339 }, { "epoch": 4.995414246407826, "grad_norm": 0.2162308543920517, "learning_rate": 2.639845441807142e-05, "loss": 0.0659, "step": 16340 }, { "epoch": 4.995719963313971, "grad_norm": 0.217263862490654, "learning_rate": 2.639802980765148e-05, "loss": 0.0412, "step": 16341 }, { "epoch": 4.996025680220116, "grad_norm": 0.23642511665821075, "learning_rate": 2.639760519723154e-05, "loss": 0.0932, "step": 16342 }, { "epoch": 4.996331397126261, "grad_norm": 0.30101045966148376, "learning_rate": 2.6397180586811602e-05, "loss": 0.0747, "step": 16343 }, { "epoch": 4.996637114032406, "grad_norm": 0.6027377843856812, "learning_rate": 2.639675597639166e-05, "loss": 0.0784, "step": 16344 }, { "epoch": 4.996942830938551, "grad_norm": 0.39730706810951233, "learning_rate": 2.6396331365971723e-05, "loss": 0.1213, "step": 16345 }, { "epoch": 4.997248547844696, "grad_norm": 0.3758262097835541, "learning_rate": 2.6395906755551782e-05, "loss": 0.0958, "step": 16346 }, { "epoch": 4.997554264750841, "grad_norm": 0.5462109446525574, "learning_rate": 2.639548214513184e-05, "loss": 0.1643, "step": 16347 }, { "epoch": 4.997859981656986, "grad_norm": 0.8620638847351074, "learning_rate": 2.6395057534711903e-05, "loss": 0.1522, "step": 16348 }, { "epoch": 4.99816569856313, "grad_norm": 0.9141589403152466, "learning_rate": 2.639463292429196e-05, "loss": 0.1941, "step": 16349 }, { "epoch": 4.998471415469275, "grad_norm": 0.5894749760627747, "learning_rate": 2.6394208313872023e-05, "loss": 0.17, "step": 16350 }, { "epoch": 4.99877713237542, "grad_norm": 0.5764205455780029, "learning_rate": 2.6393783703452082e-05, "loss": 0.1786, "step": 16351 }, { "epoch": 4.9990828492815655, "grad_norm": 1.5803017616271973, "learning_rate": 2.6393359093032144e-05, "loss": 0.1734, "step": 16352 }, { "epoch": 4.99938856618771, "grad_norm": 0.8548043966293335, "learning_rate": 2.6392934482612203e-05, "loss": 0.2131, "step": 16353 }, { "epoch": 4.999694283093855, "grad_norm": 1.074985146522522, "learning_rate": 2.6392509872192265e-05, "loss": 0.182, "step": 16354 }, { "epoch": 5.0, "grad_norm": 0.9442436099052429, "learning_rate": 2.6392085261772324e-05, "loss": 0.1781, "step": 16355 }, { "epoch": 5.000305716906145, "grad_norm": 0.42482396960258484, "learning_rate": 2.6391660651352386e-05, "loss": 0.1438, "step": 16356 }, { "epoch": 5.00061143381229, "grad_norm": 0.24956119060516357, "learning_rate": 2.6391236040932444e-05, "loss": 0.0829, "step": 16357 }, { "epoch": 5.0009171507184345, "grad_norm": 0.3592005670070648, "learning_rate": 2.6390811430512507e-05, "loss": 0.0548, "step": 16358 }, { "epoch": 5.00122286762458, "grad_norm": 0.16821086406707764, "learning_rate": 2.6390386820092565e-05, "loss": 0.052, "step": 16359 }, { "epoch": 5.001528584530725, "grad_norm": 0.6343734860420227, "learning_rate": 2.6389962209672624e-05, "loss": 0.0527, "step": 16360 }, { "epoch": 5.00183430143687, "grad_norm": 0.26062875986099243, "learning_rate": 2.6389537599252686e-05, "loss": 0.0516, "step": 16361 }, { "epoch": 5.002140018343014, "grad_norm": 0.4094092547893524, "learning_rate": 2.6389112988832745e-05, "loss": 0.0638, "step": 16362 }, { "epoch": 5.002445735249159, "grad_norm": 0.3926723599433899, "learning_rate": 2.6388688378412807e-05, "loss": 0.077, "step": 16363 }, { "epoch": 5.002751452155304, "grad_norm": 0.21153947710990906, "learning_rate": 2.6388263767992866e-05, "loss": 0.058, "step": 16364 }, { "epoch": 5.003057169061449, "grad_norm": 0.21605686843395233, "learning_rate": 2.6387839157572928e-05, "loss": 0.063, "step": 16365 }, { "epoch": 5.003362885967594, "grad_norm": 0.4699592590332031, "learning_rate": 2.6387414547152986e-05, "loss": 0.0647, "step": 16366 }, { "epoch": 5.003668602873739, "grad_norm": 0.37986406683921814, "learning_rate": 2.638698993673305e-05, "loss": 0.0949, "step": 16367 }, { "epoch": 5.003974319779884, "grad_norm": 0.3874395191669464, "learning_rate": 2.6386565326313107e-05, "loss": 0.0898, "step": 16368 }, { "epoch": 5.004280036686029, "grad_norm": 0.9580787420272827, "learning_rate": 2.638614071589317e-05, "loss": 0.1122, "step": 16369 }, { "epoch": 5.004585753592174, "grad_norm": 2.766045570373535, "learning_rate": 2.6385716105473228e-05, "loss": 0.1061, "step": 16370 }, { "epoch": 5.004891470498318, "grad_norm": 1.0749144554138184, "learning_rate": 2.638529149505329e-05, "loss": 0.105, "step": 16371 }, { "epoch": 5.005197187404463, "grad_norm": 1.3288666009902954, "learning_rate": 2.638486688463335e-05, "loss": 0.1804, "step": 16372 }, { "epoch": 5.0055029043106085, "grad_norm": 0.7543281316757202, "learning_rate": 2.6384442274213407e-05, "loss": 0.1484, "step": 16373 }, { "epoch": 5.005808621216754, "grad_norm": 0.681703507900238, "learning_rate": 2.638401766379347e-05, "loss": 0.1519, "step": 16374 }, { "epoch": 5.006114338122898, "grad_norm": 0.9294992685317993, "learning_rate": 2.6383593053373528e-05, "loss": 0.1781, "step": 16375 }, { "epoch": 5.006420055029043, "grad_norm": 0.8403221964836121, "learning_rate": 2.638316844295359e-05, "loss": 0.1619, "step": 16376 }, { "epoch": 5.006725771935188, "grad_norm": 1.628466010093689, "learning_rate": 2.638274383253365e-05, "loss": 0.1985, "step": 16377 }, { "epoch": 5.007031488841333, "grad_norm": 2.1636393070220947, "learning_rate": 2.638231922211371e-05, "loss": 0.2107, "step": 16378 }, { "epoch": 5.007337205747477, "grad_norm": 9.486687660217285, "learning_rate": 2.638189461169377e-05, "loss": 0.2128, "step": 16379 }, { "epoch": 5.0076429226536225, "grad_norm": 3.6045351028442383, "learning_rate": 2.6381470001273832e-05, "loss": 0.29, "step": 16380 }, { "epoch": 5.007948639559768, "grad_norm": 0.5125231146812439, "learning_rate": 2.638104539085389e-05, "loss": 0.1125, "step": 16381 }, { "epoch": 5.008254356465913, "grad_norm": 0.3328703045845032, "learning_rate": 2.6380620780433953e-05, "loss": 0.0709, "step": 16382 }, { "epoch": 5.008560073372058, "grad_norm": 0.27682390809059143, "learning_rate": 2.638019617001401e-05, "loss": 0.091, "step": 16383 }, { "epoch": 5.008865790278202, "grad_norm": 0.18005388975143433, "learning_rate": 2.6379771559594073e-05, "loss": 0.0616, "step": 16384 }, { "epoch": 5.009171507184347, "grad_norm": 0.3831649124622345, "learning_rate": 2.6379346949174132e-05, "loss": 0.0544, "step": 16385 }, { "epoch": 5.009477224090492, "grad_norm": 0.44338300824165344, "learning_rate": 2.637892233875419e-05, "loss": 0.0643, "step": 16386 }, { "epoch": 5.0097829409966375, "grad_norm": 0.46192222833633423, "learning_rate": 2.6378497728334253e-05, "loss": 0.057, "step": 16387 }, { "epoch": 5.010088657902782, "grad_norm": 0.1747373342514038, "learning_rate": 2.637807311791431e-05, "loss": 0.0413, "step": 16388 }, { "epoch": 5.010394374808927, "grad_norm": 0.39670756459236145, "learning_rate": 2.6377648507494374e-05, "loss": 0.0996, "step": 16389 }, { "epoch": 5.010700091715072, "grad_norm": 0.6066391468048096, "learning_rate": 2.6377223897074432e-05, "loss": 0.0601, "step": 16390 }, { "epoch": 5.011005808621217, "grad_norm": 0.34010249376296997, "learning_rate": 2.6376799286654495e-05, "loss": 0.09, "step": 16391 }, { "epoch": 5.011311525527361, "grad_norm": 0.3338136374950409, "learning_rate": 2.6376374676234553e-05, "loss": 0.0831, "step": 16392 }, { "epoch": 5.011617242433506, "grad_norm": 0.48321253061294556, "learning_rate": 2.6375950065814615e-05, "loss": 0.1032, "step": 16393 }, { "epoch": 5.0119229593396515, "grad_norm": 0.6926617622375488, "learning_rate": 2.6375525455394674e-05, "loss": 0.1049, "step": 16394 }, { "epoch": 5.012228676245797, "grad_norm": 0.470794141292572, "learning_rate": 2.6375100844974736e-05, "loss": 0.1151, "step": 16395 }, { "epoch": 5.012534393151942, "grad_norm": 1.6989997625350952, "learning_rate": 2.6374676234554798e-05, "loss": 0.1347, "step": 16396 }, { "epoch": 5.012840110058086, "grad_norm": 0.45471885800361633, "learning_rate": 2.637425162413486e-05, "loss": 0.1389, "step": 16397 }, { "epoch": 5.013145826964231, "grad_norm": 5.163754463195801, "learning_rate": 2.637382701371492e-05, "loss": 0.1661, "step": 16398 }, { "epoch": 5.013451543870376, "grad_norm": 0.413546085357666, "learning_rate": 2.6373402403294978e-05, "loss": 0.1363, "step": 16399 }, { "epoch": 5.013757260776521, "grad_norm": 1.2378861904144287, "learning_rate": 2.637297779287504e-05, "loss": 0.1806, "step": 16400 }, { "epoch": 5.0140629776826655, "grad_norm": 0.8649237751960754, "learning_rate": 2.63725531824551e-05, "loss": 0.1767, "step": 16401 }, { "epoch": 5.014368694588811, "grad_norm": 0.569861650466919, "learning_rate": 2.637212857203516e-05, "loss": 0.1498, "step": 16402 }, { "epoch": 5.014674411494956, "grad_norm": 1.0313724279403687, "learning_rate": 2.637170396161522e-05, "loss": 0.1864, "step": 16403 }, { "epoch": 5.014980128401101, "grad_norm": 1.3856236934661865, "learning_rate": 2.637127935119528e-05, "loss": 0.2091, "step": 16404 }, { "epoch": 5.015285845307245, "grad_norm": 2.8013575077056885, "learning_rate": 2.637085474077534e-05, "loss": 0.2442, "step": 16405 }, { "epoch": 5.01559156221339, "grad_norm": 0.42359548807144165, "learning_rate": 2.6370430130355402e-05, "loss": 0.1567, "step": 16406 }, { "epoch": 5.015897279119535, "grad_norm": 0.38081085681915283, "learning_rate": 2.637000551993546e-05, "loss": 0.0914, "step": 16407 }, { "epoch": 5.01620299602568, "grad_norm": 0.15456263720989227, "learning_rate": 2.6369580909515523e-05, "loss": 0.0506, "step": 16408 }, { "epoch": 5.0165087129318255, "grad_norm": 0.21941013634204865, "learning_rate": 2.636915629909558e-05, "loss": 0.052, "step": 16409 }, { "epoch": 5.01681442983797, "grad_norm": 0.2602652311325073, "learning_rate": 2.636873168867564e-05, "loss": 0.0745, "step": 16410 }, { "epoch": 5.017120146744115, "grad_norm": 0.6747981309890747, "learning_rate": 2.6368307078255702e-05, "loss": 0.0794, "step": 16411 }, { "epoch": 5.01742586365026, "grad_norm": 0.5002877712249756, "learning_rate": 2.636788246783576e-05, "loss": 0.042, "step": 16412 }, { "epoch": 5.017731580556405, "grad_norm": 0.48290205001831055, "learning_rate": 2.6367457857415823e-05, "loss": 0.0587, "step": 16413 }, { "epoch": 5.018037297462549, "grad_norm": 0.2638358771800995, "learning_rate": 2.6367033246995882e-05, "loss": 0.0532, "step": 16414 }, { "epoch": 5.0183430143686945, "grad_norm": 0.3338315784931183, "learning_rate": 2.6366608636575944e-05, "loss": 0.0757, "step": 16415 }, { "epoch": 5.01864873127484, "grad_norm": 0.3567098379135132, "learning_rate": 2.6366184026156003e-05, "loss": 0.087, "step": 16416 }, { "epoch": 5.018954448180985, "grad_norm": 0.7122564315795898, "learning_rate": 2.6365759415736065e-05, "loss": 0.0978, "step": 16417 }, { "epoch": 5.019260165087129, "grad_norm": 0.5069395899772644, "learning_rate": 2.6365334805316123e-05, "loss": 0.0646, "step": 16418 }, { "epoch": 5.019565881993274, "grad_norm": 1.05270516872406, "learning_rate": 2.6364910194896186e-05, "loss": 0.1508, "step": 16419 }, { "epoch": 5.019871598899419, "grad_norm": 0.8133147358894348, "learning_rate": 2.6364485584476244e-05, "loss": 0.1278, "step": 16420 }, { "epoch": 5.020177315805564, "grad_norm": 0.42177754640579224, "learning_rate": 2.6364060974056306e-05, "loss": 0.1061, "step": 16421 }, { "epoch": 5.020483032711709, "grad_norm": 1.212404727935791, "learning_rate": 2.6363636363636365e-05, "loss": 0.1244, "step": 16422 }, { "epoch": 5.020788749617854, "grad_norm": 0.5405663847923279, "learning_rate": 2.6363211753216424e-05, "loss": 0.1439, "step": 16423 }, { "epoch": 5.021094466523999, "grad_norm": 0.6593935489654541, "learning_rate": 2.6362787142796486e-05, "loss": 0.1975, "step": 16424 }, { "epoch": 5.021400183430144, "grad_norm": 0.9098764657974243, "learning_rate": 2.6362362532376545e-05, "loss": 0.1886, "step": 16425 }, { "epoch": 5.021705900336289, "grad_norm": 0.8823050856590271, "learning_rate": 2.6361937921956607e-05, "loss": 0.1528, "step": 16426 }, { "epoch": 5.022011617242433, "grad_norm": 0.6045442223548889, "learning_rate": 2.6361513311536665e-05, "loss": 0.1746, "step": 16427 }, { "epoch": 5.022317334148578, "grad_norm": 1.0305438041687012, "learning_rate": 2.6361088701116727e-05, "loss": 0.2089, "step": 16428 }, { "epoch": 5.022623051054723, "grad_norm": 1.243304967880249, "learning_rate": 2.6360664090696786e-05, "loss": 0.2358, "step": 16429 }, { "epoch": 5.0229287679608685, "grad_norm": 1.1350843906402588, "learning_rate": 2.6360239480276848e-05, "loss": 0.2472, "step": 16430 }, { "epoch": 5.023234484867013, "grad_norm": 0.3201141059398651, "learning_rate": 2.6359814869856907e-05, "loss": 0.1368, "step": 16431 }, { "epoch": 5.023540201773158, "grad_norm": 0.25508689880371094, "learning_rate": 2.635939025943697e-05, "loss": 0.0957, "step": 16432 }, { "epoch": 5.023845918679303, "grad_norm": 0.45394209027290344, "learning_rate": 2.6358965649017028e-05, "loss": 0.0764, "step": 16433 }, { "epoch": 5.024151635585448, "grad_norm": 0.3409861922264099, "learning_rate": 2.635854103859709e-05, "loss": 0.0566, "step": 16434 }, { "epoch": 5.024457352491593, "grad_norm": 0.8333185315132141, "learning_rate": 2.635811642817715e-05, "loss": 0.0645, "step": 16435 }, { "epoch": 5.024763069397737, "grad_norm": 0.5052927136421204, "learning_rate": 2.6357691817757207e-05, "loss": 0.0636, "step": 16436 }, { "epoch": 5.0250687863038825, "grad_norm": 0.19980919361114502, "learning_rate": 2.635726720733727e-05, "loss": 0.0422, "step": 16437 }, { "epoch": 5.025374503210028, "grad_norm": 1.9356482028961182, "learning_rate": 2.6356842596917328e-05, "loss": 0.0663, "step": 16438 }, { "epoch": 5.025680220116173, "grad_norm": 0.2559940814971924, "learning_rate": 2.635641798649739e-05, "loss": 0.1207, "step": 16439 }, { "epoch": 5.025985937022317, "grad_norm": 0.3152120113372803, "learning_rate": 2.635599337607745e-05, "loss": 0.0714, "step": 16440 }, { "epoch": 5.026291653928462, "grad_norm": 0.29199615120887756, "learning_rate": 2.635556876565751e-05, "loss": 0.0885, "step": 16441 }, { "epoch": 5.026597370834607, "grad_norm": 0.40492990612983704, "learning_rate": 2.635514415523757e-05, "loss": 0.0841, "step": 16442 }, { "epoch": 5.026903087740752, "grad_norm": 1.2099952697753906, "learning_rate": 2.635471954481763e-05, "loss": 0.0696, "step": 16443 }, { "epoch": 5.027208804646897, "grad_norm": 0.7454831600189209, "learning_rate": 2.635429493439769e-05, "loss": 0.1106, "step": 16444 }, { "epoch": 5.027514521553042, "grad_norm": 0.42999815940856934, "learning_rate": 2.6353870323977752e-05, "loss": 0.1393, "step": 16445 }, { "epoch": 5.027820238459187, "grad_norm": 0.4738295078277588, "learning_rate": 2.635344571355781e-05, "loss": 0.1272, "step": 16446 }, { "epoch": 5.028125955365332, "grad_norm": 0.44073686003685, "learning_rate": 2.6353021103137873e-05, "loss": 0.1326, "step": 16447 }, { "epoch": 5.028431672271477, "grad_norm": 1.631782054901123, "learning_rate": 2.6352596492717932e-05, "loss": 0.2273, "step": 16448 }, { "epoch": 5.028737389177621, "grad_norm": 1.068873405456543, "learning_rate": 2.635217188229799e-05, "loss": 0.1704, "step": 16449 }, { "epoch": 5.029043106083766, "grad_norm": 0.7504650950431824, "learning_rate": 2.6351747271878053e-05, "loss": 0.1853, "step": 16450 }, { "epoch": 5.0293488229899115, "grad_norm": 1.8907215595245361, "learning_rate": 2.635132266145811e-05, "loss": 0.1594, "step": 16451 }, { "epoch": 5.029654539896057, "grad_norm": 1.1632335186004639, "learning_rate": 2.6350898051038173e-05, "loss": 0.1931, "step": 16452 }, { "epoch": 5.029960256802201, "grad_norm": 0.8527543544769287, "learning_rate": 2.6350473440618232e-05, "loss": 0.1944, "step": 16453 }, { "epoch": 5.030265973708346, "grad_norm": 0.9776577949523926, "learning_rate": 2.6350048830198294e-05, "loss": 0.2188, "step": 16454 }, { "epoch": 5.030571690614491, "grad_norm": 2.2156805992126465, "learning_rate": 2.6349624219778353e-05, "loss": 0.2111, "step": 16455 }, { "epoch": 5.030877407520636, "grad_norm": 0.2732706069946289, "learning_rate": 2.6349199609358415e-05, "loss": 0.137, "step": 16456 }, { "epoch": 5.03118312442678, "grad_norm": 0.2298620194196701, "learning_rate": 2.6348774998938474e-05, "loss": 0.0971, "step": 16457 }, { "epoch": 5.0314888413329255, "grad_norm": 0.21198643743991852, "learning_rate": 2.6348350388518536e-05, "loss": 0.0536, "step": 16458 }, { "epoch": 5.031794558239071, "grad_norm": 0.29682883620262146, "learning_rate": 2.6347925778098595e-05, "loss": 0.0508, "step": 16459 }, { "epoch": 5.032100275145216, "grad_norm": 0.21364730596542358, "learning_rate": 2.6347501167678657e-05, "loss": 0.0611, "step": 16460 }, { "epoch": 5.032405992051361, "grad_norm": 0.1879059225320816, "learning_rate": 2.6347076557258715e-05, "loss": 0.0524, "step": 16461 }, { "epoch": 5.032711708957505, "grad_norm": 0.4084801971912384, "learning_rate": 2.6346651946838774e-05, "loss": 0.068, "step": 16462 }, { "epoch": 5.03301742586365, "grad_norm": 0.2681211829185486, "learning_rate": 2.6346227336418836e-05, "loss": 0.0524, "step": 16463 }, { "epoch": 5.033323142769795, "grad_norm": 0.43063393235206604, "learning_rate": 2.6345802725998895e-05, "loss": 0.0832, "step": 16464 }, { "epoch": 5.03362885967594, "grad_norm": 0.28433653712272644, "learning_rate": 2.6345378115578957e-05, "loss": 0.0664, "step": 16465 }, { "epoch": 5.033934576582085, "grad_norm": 0.3584614098072052, "learning_rate": 2.6344953505159016e-05, "loss": 0.1049, "step": 16466 }, { "epoch": 5.03424029348823, "grad_norm": 0.2802443206310272, "learning_rate": 2.6344528894739078e-05, "loss": 0.0694, "step": 16467 }, { "epoch": 5.034546010394375, "grad_norm": 0.38705506920814514, "learning_rate": 2.6344104284319136e-05, "loss": 0.098, "step": 16468 }, { "epoch": 5.03485172730052, "grad_norm": 0.3334243893623352, "learning_rate": 2.63436796738992e-05, "loss": 0.1073, "step": 16469 }, { "epoch": 5.035157444206664, "grad_norm": 1.146982192993164, "learning_rate": 2.6343255063479257e-05, "loss": 0.1163, "step": 16470 }, { "epoch": 5.035463161112809, "grad_norm": 0.3531021475791931, "learning_rate": 2.634283045305932e-05, "loss": 0.188, "step": 16471 }, { "epoch": 5.0357688780189545, "grad_norm": 1.008013367652893, "learning_rate": 2.6342405842639378e-05, "loss": 0.1238, "step": 16472 }, { "epoch": 5.0360745949251, "grad_norm": 0.4491884112358093, "learning_rate": 2.634198123221944e-05, "loss": 0.1564, "step": 16473 }, { "epoch": 5.036380311831245, "grad_norm": 0.7455951571464539, "learning_rate": 2.63415566217995e-05, "loss": 0.1657, "step": 16474 }, { "epoch": 5.036686028737389, "grad_norm": 0.8117737770080566, "learning_rate": 2.6341132011379557e-05, "loss": 0.1642, "step": 16475 }, { "epoch": 5.036991745643534, "grad_norm": 1.2864360809326172, "learning_rate": 2.634070740095962e-05, "loss": 0.2181, "step": 16476 }, { "epoch": 5.037297462549679, "grad_norm": 1.1385220289230347, "learning_rate": 2.6340282790539678e-05, "loss": 0.1781, "step": 16477 }, { "epoch": 5.037603179455824, "grad_norm": 0.668286919593811, "learning_rate": 2.633985818011974e-05, "loss": 0.1795, "step": 16478 }, { "epoch": 5.0379088963619685, "grad_norm": 1.1969107389450073, "learning_rate": 2.63394335696998e-05, "loss": 0.1893, "step": 16479 }, { "epoch": 5.038214613268114, "grad_norm": 1.2723103761672974, "learning_rate": 2.633900895927986e-05, "loss": 0.237, "step": 16480 }, { "epoch": 5.038520330174259, "grad_norm": 0.43112409114837646, "learning_rate": 2.633858434885992e-05, "loss": 0.1275, "step": 16481 }, { "epoch": 5.038826047080404, "grad_norm": 0.572668731212616, "learning_rate": 2.6338159738439982e-05, "loss": 0.1183, "step": 16482 }, { "epoch": 5.039131763986548, "grad_norm": 0.18148022890090942, "learning_rate": 2.633773512802004e-05, "loss": 0.0658, "step": 16483 }, { "epoch": 5.039437480892693, "grad_norm": 0.3536089062690735, "learning_rate": 2.6337310517600103e-05, "loss": 0.0724, "step": 16484 }, { "epoch": 5.039743197798838, "grad_norm": 0.7198283076286316, "learning_rate": 2.633688590718016e-05, "loss": 0.067, "step": 16485 }, { "epoch": 5.040048914704983, "grad_norm": 0.548953115940094, "learning_rate": 2.6336461296760223e-05, "loss": 0.0442, "step": 16486 }, { "epoch": 5.0403546316111285, "grad_norm": 0.31466764211654663, "learning_rate": 2.6336036686340282e-05, "loss": 0.0544, "step": 16487 }, { "epoch": 5.040660348517273, "grad_norm": 0.18291887640953064, "learning_rate": 2.633561207592034e-05, "loss": 0.0584, "step": 16488 }, { "epoch": 5.040966065423418, "grad_norm": 0.22092896699905396, "learning_rate": 2.6335187465500403e-05, "loss": 0.072, "step": 16489 }, { "epoch": 5.041271782329563, "grad_norm": 0.22431516647338867, "learning_rate": 2.633476285508046e-05, "loss": 0.0729, "step": 16490 }, { "epoch": 5.041577499235708, "grad_norm": 0.3201832175254822, "learning_rate": 2.6334338244660524e-05, "loss": 0.1119, "step": 16491 }, { "epoch": 5.041883216141852, "grad_norm": 0.2613682746887207, "learning_rate": 2.6333913634240582e-05, "loss": 0.0808, "step": 16492 }, { "epoch": 5.042188933047997, "grad_norm": 0.9195640087127686, "learning_rate": 2.6333489023820645e-05, "loss": 0.1013, "step": 16493 }, { "epoch": 5.0424946499541425, "grad_norm": 0.9079650044441223, "learning_rate": 2.6333064413400703e-05, "loss": 0.1505, "step": 16494 }, { "epoch": 5.042800366860288, "grad_norm": 0.38544243574142456, "learning_rate": 2.6332639802980765e-05, "loss": 0.1182, "step": 16495 }, { "epoch": 5.043106083766432, "grad_norm": 0.5742585062980652, "learning_rate": 2.6332215192560824e-05, "loss": 0.1202, "step": 16496 }, { "epoch": 5.043411800672577, "grad_norm": 0.6690835356712341, "learning_rate": 2.6331790582140886e-05, "loss": 0.1281, "step": 16497 }, { "epoch": 5.043717517578722, "grad_norm": 1.6776182651519775, "learning_rate": 2.6331365971720948e-05, "loss": 0.1454, "step": 16498 }, { "epoch": 5.044023234484867, "grad_norm": 0.4953669309616089, "learning_rate": 2.633094136130101e-05, "loss": 0.1618, "step": 16499 }, { "epoch": 5.044328951391012, "grad_norm": 0.5306150317192078, "learning_rate": 2.633051675088107e-05, "loss": 0.1772, "step": 16500 }, { "epoch": 5.044634668297157, "grad_norm": 0.8569665551185608, "learning_rate": 2.6330092140461128e-05, "loss": 0.1455, "step": 16501 }, { "epoch": 5.044940385203302, "grad_norm": 0.6530666351318359, "learning_rate": 2.632966753004119e-05, "loss": 0.171, "step": 16502 }, { "epoch": 5.045246102109447, "grad_norm": 1.070251226425171, "learning_rate": 2.632924291962125e-05, "loss": 0.1721, "step": 16503 }, { "epoch": 5.045551819015592, "grad_norm": 0.9160354137420654, "learning_rate": 2.632881830920131e-05, "loss": 0.2044, "step": 16504 }, { "epoch": 5.045857535921736, "grad_norm": 0.7550650835037231, "learning_rate": 2.632839369878137e-05, "loss": 0.1813, "step": 16505 }, { "epoch": 5.046163252827881, "grad_norm": 0.6906359195709229, "learning_rate": 2.632796908836143e-05, "loss": 0.1662, "step": 16506 }, { "epoch": 5.046468969734026, "grad_norm": 0.23376408219337463, "learning_rate": 2.632754447794149e-05, "loss": 0.066, "step": 16507 }, { "epoch": 5.0467746866401715, "grad_norm": 0.29271459579467773, "learning_rate": 2.6327119867521552e-05, "loss": 0.0815, "step": 16508 }, { "epoch": 5.047080403546316, "grad_norm": 0.2838662564754486, "learning_rate": 2.632669525710161e-05, "loss": 0.049, "step": 16509 }, { "epoch": 5.047386120452461, "grad_norm": 0.16510547697544098, "learning_rate": 2.6326270646681673e-05, "loss": 0.0403, "step": 16510 }, { "epoch": 5.047691837358606, "grad_norm": 0.18805237114429474, "learning_rate": 2.632584603626173e-05, "loss": 0.0695, "step": 16511 }, { "epoch": 5.047997554264751, "grad_norm": 0.42142656445503235, "learning_rate": 2.6325421425841794e-05, "loss": 0.0791, "step": 16512 }, { "epoch": 5.048303271170896, "grad_norm": 0.38538819551467896, "learning_rate": 2.6324996815421852e-05, "loss": 0.0779, "step": 16513 }, { "epoch": 5.04860898807704, "grad_norm": 0.25570645928382874, "learning_rate": 2.632457220500191e-05, "loss": 0.086, "step": 16514 }, { "epoch": 5.0489147049831855, "grad_norm": 0.39775827527046204, "learning_rate": 2.6324147594581973e-05, "loss": 0.0642, "step": 16515 }, { "epoch": 5.049220421889331, "grad_norm": 0.20755720138549805, "learning_rate": 2.6323722984162032e-05, "loss": 0.0729, "step": 16516 }, { "epoch": 5.049526138795476, "grad_norm": 0.33923399448394775, "learning_rate": 2.6323298373742094e-05, "loss": 0.0889, "step": 16517 }, { "epoch": 5.04983185570162, "grad_norm": 0.34698203206062317, "learning_rate": 2.6322873763322153e-05, "loss": 0.1007, "step": 16518 }, { "epoch": 5.050137572607765, "grad_norm": 0.329685240983963, "learning_rate": 2.6322449152902215e-05, "loss": 0.0999, "step": 16519 }, { "epoch": 5.05044328951391, "grad_norm": 0.6554479598999023, "learning_rate": 2.6322024542482273e-05, "loss": 0.125, "step": 16520 }, { "epoch": 5.050749006420055, "grad_norm": 0.9967379570007324, "learning_rate": 2.6321599932062336e-05, "loss": 0.1263, "step": 16521 }, { "epoch": 5.0510547233261995, "grad_norm": 1.0538371801376343, "learning_rate": 2.6321175321642394e-05, "loss": 0.1978, "step": 16522 }, { "epoch": 5.051360440232345, "grad_norm": 0.6198041439056396, "learning_rate": 2.6320750711222456e-05, "loss": 0.1701, "step": 16523 }, { "epoch": 5.05166615713849, "grad_norm": 0.5469854474067688, "learning_rate": 2.6320326100802515e-05, "loss": 0.1685, "step": 16524 }, { "epoch": 5.051971874044635, "grad_norm": 0.43536946177482605, "learning_rate": 2.6319901490382577e-05, "loss": 0.1424, "step": 16525 }, { "epoch": 5.05227759095078, "grad_norm": 0.8657736778259277, "learning_rate": 2.6319476879962636e-05, "loss": 0.1893, "step": 16526 }, { "epoch": 5.052583307856924, "grad_norm": 0.8070408701896667, "learning_rate": 2.6319052269542695e-05, "loss": 0.1587, "step": 16527 }, { "epoch": 5.052889024763069, "grad_norm": 0.595635712146759, "learning_rate": 2.6318627659122757e-05, "loss": 0.1707, "step": 16528 }, { "epoch": 5.0531947416692145, "grad_norm": 1.4899053573608398, "learning_rate": 2.6318203048702815e-05, "loss": 0.1928, "step": 16529 }, { "epoch": 5.05350045857536, "grad_norm": 1.1263093948364258, "learning_rate": 2.6317778438282877e-05, "loss": 0.1921, "step": 16530 }, { "epoch": 5.053806175481504, "grad_norm": 0.5245074033737183, "learning_rate": 2.6317353827862936e-05, "loss": 0.1381, "step": 16531 }, { "epoch": 5.054111892387649, "grad_norm": 0.45213401317596436, "learning_rate": 2.6316929217442998e-05, "loss": 0.075, "step": 16532 }, { "epoch": 5.054417609293794, "grad_norm": 0.34262615442276, "learning_rate": 2.6316504607023057e-05, "loss": 0.0703, "step": 16533 }, { "epoch": 5.054723326199939, "grad_norm": 0.22703340649604797, "learning_rate": 2.631607999660312e-05, "loss": 0.0569, "step": 16534 }, { "epoch": 5.055029043106083, "grad_norm": 0.16437283158302307, "learning_rate": 2.6315655386183178e-05, "loss": 0.0501, "step": 16535 }, { "epoch": 5.0553347600122285, "grad_norm": 0.2884165942668915, "learning_rate": 2.631523077576324e-05, "loss": 0.0706, "step": 16536 }, { "epoch": 5.055640476918374, "grad_norm": 0.34727299213409424, "learning_rate": 2.63148061653433e-05, "loss": 0.0587, "step": 16537 }, { "epoch": 5.055946193824519, "grad_norm": 0.3823190927505493, "learning_rate": 2.6314381554923357e-05, "loss": 0.0723, "step": 16538 }, { "epoch": 5.056251910730664, "grad_norm": 0.2192797064781189, "learning_rate": 2.631395694450342e-05, "loss": 0.0639, "step": 16539 }, { "epoch": 5.056557627636808, "grad_norm": 0.6096482872962952, "learning_rate": 2.6313532334083478e-05, "loss": 0.0789, "step": 16540 }, { "epoch": 5.056863344542953, "grad_norm": 0.3101189434528351, "learning_rate": 2.631310772366354e-05, "loss": 0.107, "step": 16541 }, { "epoch": 5.057169061449098, "grad_norm": 0.19444406032562256, "learning_rate": 2.63126831132436e-05, "loss": 0.0846, "step": 16542 }, { "epoch": 5.057474778355243, "grad_norm": 0.6031914353370667, "learning_rate": 2.631225850282366e-05, "loss": 0.1088, "step": 16543 }, { "epoch": 5.057780495261388, "grad_norm": 1.3045779466629028, "learning_rate": 2.631183389240372e-05, "loss": 0.1502, "step": 16544 }, { "epoch": 5.058086212167533, "grad_norm": 0.2988913655281067, "learning_rate": 2.631140928198378e-05, "loss": 0.1244, "step": 16545 }, { "epoch": 5.058391929073678, "grad_norm": 0.46721789240837097, "learning_rate": 2.631098467156384e-05, "loss": 0.1645, "step": 16546 }, { "epoch": 5.058697645979823, "grad_norm": 0.3502979874610901, "learning_rate": 2.6310560061143902e-05, "loss": 0.1133, "step": 16547 }, { "epoch": 5.059003362885967, "grad_norm": 0.7475467324256897, "learning_rate": 2.631013545072396e-05, "loss": 0.1512, "step": 16548 }, { "epoch": 5.059309079792112, "grad_norm": 0.6988968849182129, "learning_rate": 2.6309710840304023e-05, "loss": 0.1567, "step": 16549 }, { "epoch": 5.059614796698257, "grad_norm": 0.7540093660354614, "learning_rate": 2.6309286229884082e-05, "loss": 0.2254, "step": 16550 }, { "epoch": 5.0599205136044025, "grad_norm": 0.8164035677909851, "learning_rate": 2.630886161946414e-05, "loss": 0.2161, "step": 16551 }, { "epoch": 5.060226230510548, "grad_norm": 0.7295927405357361, "learning_rate": 2.6308437009044203e-05, "loss": 0.1601, "step": 16552 }, { "epoch": 5.060531947416692, "grad_norm": 0.7783808708190918, "learning_rate": 2.630801239862426e-05, "loss": 0.18, "step": 16553 }, { "epoch": 5.060837664322837, "grad_norm": 0.6929106116294861, "learning_rate": 2.6307587788204323e-05, "loss": 0.1631, "step": 16554 }, { "epoch": 5.061143381228982, "grad_norm": 4.541207313537598, "learning_rate": 2.6307163177784382e-05, "loss": 0.2339, "step": 16555 }, { "epoch": 5.061449098135127, "grad_norm": 0.5112455487251282, "learning_rate": 2.6306738567364444e-05, "loss": 0.1399, "step": 16556 }, { "epoch": 5.0617548150412714, "grad_norm": 0.2692943811416626, "learning_rate": 2.6306313956944503e-05, "loss": 0.0746, "step": 16557 }, { "epoch": 5.062060531947417, "grad_norm": 1.1449533700942993, "learning_rate": 2.6305889346524565e-05, "loss": 0.0953, "step": 16558 }, { "epoch": 5.062366248853562, "grad_norm": 0.44046521186828613, "learning_rate": 2.6305464736104624e-05, "loss": 0.0864, "step": 16559 }, { "epoch": 5.062671965759707, "grad_norm": 0.2712450921535492, "learning_rate": 2.6305040125684686e-05, "loss": 0.0502, "step": 16560 }, { "epoch": 5.062977682665851, "grad_norm": 0.2917913794517517, "learning_rate": 2.6304615515264745e-05, "loss": 0.0606, "step": 16561 }, { "epoch": 5.063283399571996, "grad_norm": 0.5418318510055542, "learning_rate": 2.6304190904844807e-05, "loss": 0.083, "step": 16562 }, { "epoch": 5.063589116478141, "grad_norm": 0.6188943982124329, "learning_rate": 2.6303766294424865e-05, "loss": 0.0669, "step": 16563 }, { "epoch": 5.063894833384286, "grad_norm": 0.7415938377380371, "learning_rate": 2.6303341684004924e-05, "loss": 0.0519, "step": 16564 }, { "epoch": 5.0642005502904315, "grad_norm": 0.23035357892513275, "learning_rate": 2.6302917073584986e-05, "loss": 0.0524, "step": 16565 }, { "epoch": 5.064506267196576, "grad_norm": 0.3066081702709198, "learning_rate": 2.6302492463165045e-05, "loss": 0.0944, "step": 16566 }, { "epoch": 5.064811984102721, "grad_norm": 0.6577243804931641, "learning_rate": 2.6302067852745107e-05, "loss": 0.0788, "step": 16567 }, { "epoch": 5.065117701008866, "grad_norm": 0.6377013921737671, "learning_rate": 2.6301643242325166e-05, "loss": 0.0965, "step": 16568 }, { "epoch": 5.065423417915011, "grad_norm": 1.0677664279937744, "learning_rate": 2.6301218631905228e-05, "loss": 0.1043, "step": 16569 }, { "epoch": 5.065729134821155, "grad_norm": 0.34531816840171814, "learning_rate": 2.6300794021485286e-05, "loss": 0.1017, "step": 16570 }, { "epoch": 5.0660348517273, "grad_norm": 0.9647581577301025, "learning_rate": 2.630036941106535e-05, "loss": 0.1325, "step": 16571 }, { "epoch": 5.0663405686334455, "grad_norm": 0.5898714661598206, "learning_rate": 2.6299944800645407e-05, "loss": 0.1596, "step": 16572 }, { "epoch": 5.066646285539591, "grad_norm": 0.526202380657196, "learning_rate": 2.629952019022547e-05, "loss": 0.1915, "step": 16573 }, { "epoch": 5.066952002445735, "grad_norm": 0.7104379534721375, "learning_rate": 2.6299095579805528e-05, "loss": 0.1782, "step": 16574 }, { "epoch": 5.06725771935188, "grad_norm": 0.6192407608032227, "learning_rate": 2.629867096938559e-05, "loss": 0.1887, "step": 16575 }, { "epoch": 5.067563436258025, "grad_norm": 0.912322461605072, "learning_rate": 2.629824635896565e-05, "loss": 0.2046, "step": 16576 }, { "epoch": 5.06786915316417, "grad_norm": 0.9253473281860352, "learning_rate": 2.6297821748545707e-05, "loss": 0.2065, "step": 16577 }, { "epoch": 5.068174870070315, "grad_norm": 0.8166700601577759, "learning_rate": 2.629739713812577e-05, "loss": 0.2175, "step": 16578 }, { "epoch": 5.0684805869764595, "grad_norm": 0.9864962697029114, "learning_rate": 2.6296972527705828e-05, "loss": 0.1882, "step": 16579 }, { "epoch": 5.068786303882605, "grad_norm": 0.9973411560058594, "learning_rate": 2.629654791728589e-05, "loss": 0.2352, "step": 16580 }, { "epoch": 5.06909202078875, "grad_norm": 0.39580172300338745, "learning_rate": 2.629612330686595e-05, "loss": 0.1217, "step": 16581 }, { "epoch": 5.069397737694895, "grad_norm": 0.3488457500934601, "learning_rate": 2.629569869644601e-05, "loss": 0.076, "step": 16582 }, { "epoch": 5.069703454601039, "grad_norm": 0.29747334122657776, "learning_rate": 2.629527408602607e-05, "loss": 0.0453, "step": 16583 }, { "epoch": 5.070009171507184, "grad_norm": 0.4225180745124817, "learning_rate": 2.6294849475606132e-05, "loss": 0.0467, "step": 16584 }, { "epoch": 5.070314888413329, "grad_norm": 0.19557969272136688, "learning_rate": 2.629442486518619e-05, "loss": 0.057, "step": 16585 }, { "epoch": 5.0706206053194744, "grad_norm": 0.1987408548593521, "learning_rate": 2.6294000254766253e-05, "loss": 0.0747, "step": 16586 }, { "epoch": 5.070926322225619, "grad_norm": 0.26820895075798035, "learning_rate": 2.629357564434631e-05, "loss": 0.0681, "step": 16587 }, { "epoch": 5.071232039131764, "grad_norm": 0.3135540187358856, "learning_rate": 2.6293151033926373e-05, "loss": 0.0702, "step": 16588 }, { "epoch": 5.071537756037909, "grad_norm": 0.2638728618621826, "learning_rate": 2.6292726423506432e-05, "loss": 0.0937, "step": 16589 }, { "epoch": 5.071843472944054, "grad_norm": 0.21826043725013733, "learning_rate": 2.629230181308649e-05, "loss": 0.0799, "step": 16590 }, { "epoch": 5.072149189850199, "grad_norm": 0.45335233211517334, "learning_rate": 2.6291877202666553e-05, "loss": 0.1151, "step": 16591 }, { "epoch": 5.072454906756343, "grad_norm": 0.3917050361633301, "learning_rate": 2.629145259224661e-05, "loss": 0.0771, "step": 16592 }, { "epoch": 5.0727606236624885, "grad_norm": 0.30893126130104065, "learning_rate": 2.6291027981826674e-05, "loss": 0.0854, "step": 16593 }, { "epoch": 5.073066340568634, "grad_norm": 0.8600320219993591, "learning_rate": 2.6290603371406732e-05, "loss": 0.1023, "step": 16594 }, { "epoch": 5.073372057474779, "grad_norm": 0.5759314894676208, "learning_rate": 2.6290178760986795e-05, "loss": 0.1125, "step": 16595 }, { "epoch": 5.073677774380923, "grad_norm": 0.3828466236591339, "learning_rate": 2.6289754150566853e-05, "loss": 0.1442, "step": 16596 }, { "epoch": 5.073983491287068, "grad_norm": 0.6759547591209412, "learning_rate": 2.6289329540146915e-05, "loss": 0.1301, "step": 16597 }, { "epoch": 5.074289208193213, "grad_norm": 0.9448093771934509, "learning_rate": 2.6288904929726974e-05, "loss": 0.1385, "step": 16598 }, { "epoch": 5.074594925099358, "grad_norm": 0.6342674493789673, "learning_rate": 2.6288480319307036e-05, "loss": 0.1823, "step": 16599 }, { "epoch": 5.0749006420055025, "grad_norm": 3.867136240005493, "learning_rate": 2.6288055708887095e-05, "loss": 0.1785, "step": 16600 }, { "epoch": 5.075206358911648, "grad_norm": 1.1060224771499634, "learning_rate": 2.628763109846716e-05, "loss": 0.1618, "step": 16601 }, { "epoch": 5.075512075817793, "grad_norm": 3.198253870010376, "learning_rate": 2.628720648804722e-05, "loss": 0.1725, "step": 16602 }, { "epoch": 5.075817792723938, "grad_norm": 1.2477232217788696, "learning_rate": 2.6286781877627278e-05, "loss": 0.1737, "step": 16603 }, { "epoch": 5.076123509630083, "grad_norm": 1.4768940210342407, "learning_rate": 2.628635726720734e-05, "loss": 0.1913, "step": 16604 }, { "epoch": 5.076429226536227, "grad_norm": 1.2948343753814697, "learning_rate": 2.62859326567874e-05, "loss": 0.2707, "step": 16605 }, { "epoch": 5.076734943442372, "grad_norm": 0.4650837481021881, "learning_rate": 2.628550804636746e-05, "loss": 0.1528, "step": 16606 }, { "epoch": 5.077040660348517, "grad_norm": 0.1872880607843399, "learning_rate": 2.628508343594752e-05, "loss": 0.0627, "step": 16607 }, { "epoch": 5.0773463772546625, "grad_norm": 0.30382487177848816, "learning_rate": 2.628465882552758e-05, "loss": 0.091, "step": 16608 }, { "epoch": 5.077652094160807, "grad_norm": 0.34416458010673523, "learning_rate": 2.628423421510764e-05, "loss": 0.0496, "step": 16609 }, { "epoch": 5.077957811066952, "grad_norm": 0.22626031935214996, "learning_rate": 2.6283809604687702e-05, "loss": 0.0455, "step": 16610 }, { "epoch": 5.078263527973097, "grad_norm": 0.4742296636104584, "learning_rate": 2.628338499426776e-05, "loss": 0.0543, "step": 16611 }, { "epoch": 5.078569244879242, "grad_norm": 0.392506867647171, "learning_rate": 2.6282960383847823e-05, "loss": 0.0441, "step": 16612 }, { "epoch": 5.078874961785386, "grad_norm": 0.4063917100429535, "learning_rate": 2.628253577342788e-05, "loss": 0.0818, "step": 16613 }, { "epoch": 5.0791806786915314, "grad_norm": 0.3948891758918762, "learning_rate": 2.6282111163007944e-05, "loss": 0.0572, "step": 16614 }, { "epoch": 5.079486395597677, "grad_norm": 0.4633733630180359, "learning_rate": 2.6281686552588002e-05, "loss": 0.0863, "step": 16615 }, { "epoch": 5.079792112503822, "grad_norm": 0.3342185616493225, "learning_rate": 2.628126194216806e-05, "loss": 0.0806, "step": 16616 }, { "epoch": 5.080097829409967, "grad_norm": 0.3482495844364166, "learning_rate": 2.6280837331748123e-05, "loss": 0.0706, "step": 16617 }, { "epoch": 5.080403546316111, "grad_norm": 0.6425572037696838, "learning_rate": 2.6280412721328182e-05, "loss": 0.132, "step": 16618 }, { "epoch": 5.080709263222256, "grad_norm": 0.3007427752017975, "learning_rate": 2.6279988110908244e-05, "loss": 0.127, "step": 16619 }, { "epoch": 5.081014980128401, "grad_norm": 0.7881385684013367, "learning_rate": 2.6279563500488303e-05, "loss": 0.124, "step": 16620 }, { "epoch": 5.081320697034546, "grad_norm": 0.6950275897979736, "learning_rate": 2.6279138890068365e-05, "loss": 0.1429, "step": 16621 }, { "epoch": 5.081626413940691, "grad_norm": 1.6144269704818726, "learning_rate": 2.6278714279648424e-05, "loss": 0.1516, "step": 16622 }, { "epoch": 5.081932130846836, "grad_norm": 0.8543024063110352, "learning_rate": 2.6278289669228486e-05, "loss": 0.1455, "step": 16623 }, { "epoch": 5.082237847752981, "grad_norm": 0.7125809788703918, "learning_rate": 2.6277865058808544e-05, "loss": 0.1815, "step": 16624 }, { "epoch": 5.082543564659126, "grad_norm": 0.6730098128318787, "learning_rate": 2.6277440448388606e-05, "loss": 0.2029, "step": 16625 }, { "epoch": 5.08284928156527, "grad_norm": 1.6300499439239502, "learning_rate": 2.6277015837968665e-05, "loss": 0.155, "step": 16626 }, { "epoch": 5.083154998471415, "grad_norm": 0.8103767037391663, "learning_rate": 2.6276591227548727e-05, "loss": 0.164, "step": 16627 }, { "epoch": 5.08346071537756, "grad_norm": 1.690933346748352, "learning_rate": 2.6276166617128786e-05, "loss": 0.1628, "step": 16628 }, { "epoch": 5.0837664322837055, "grad_norm": 0.7641493678092957, "learning_rate": 2.6275742006708845e-05, "loss": 0.1739, "step": 16629 }, { "epoch": 5.084072149189851, "grad_norm": 1.9727596044540405, "learning_rate": 2.6275317396288907e-05, "loss": 0.2414, "step": 16630 }, { "epoch": 5.084377866095995, "grad_norm": 1.7875568866729736, "learning_rate": 2.6274892785868965e-05, "loss": 0.1258, "step": 16631 }, { "epoch": 5.08468358300214, "grad_norm": 0.32835662364959717, "learning_rate": 2.6274468175449027e-05, "loss": 0.0738, "step": 16632 }, { "epoch": 5.084989299908285, "grad_norm": 0.23378118872642517, "learning_rate": 2.6274043565029086e-05, "loss": 0.0508, "step": 16633 }, { "epoch": 5.08529501681443, "grad_norm": 0.4946035146713257, "learning_rate": 2.6273618954609148e-05, "loss": 0.0613, "step": 16634 }, { "epoch": 5.085600733720574, "grad_norm": 0.293958842754364, "learning_rate": 2.6273194344189207e-05, "loss": 0.0568, "step": 16635 }, { "epoch": 5.0859064506267195, "grad_norm": 0.6958744525909424, "learning_rate": 2.627276973376927e-05, "loss": 0.0526, "step": 16636 }, { "epoch": 5.086212167532865, "grad_norm": 0.20352430641651154, "learning_rate": 2.6272345123349328e-05, "loss": 0.0514, "step": 16637 }, { "epoch": 5.08651788443901, "grad_norm": 0.3462166488170624, "learning_rate": 2.627192051292939e-05, "loss": 0.0757, "step": 16638 }, { "epoch": 5.086823601345154, "grad_norm": 0.25846749544143677, "learning_rate": 2.627149590250945e-05, "loss": 0.099, "step": 16639 }, { "epoch": 5.087129318251299, "grad_norm": 0.3270409405231476, "learning_rate": 2.627107129208951e-05, "loss": 0.0681, "step": 16640 }, { "epoch": 5.087435035157444, "grad_norm": 0.8551512956619263, "learning_rate": 2.627064668166957e-05, "loss": 0.0946, "step": 16641 }, { "epoch": 5.087740752063589, "grad_norm": 0.4880159795284271, "learning_rate": 2.6270222071249628e-05, "loss": 0.0869, "step": 16642 }, { "epoch": 5.0880464689697344, "grad_norm": 0.18756075203418732, "learning_rate": 2.626979746082969e-05, "loss": 0.069, "step": 16643 }, { "epoch": 5.088352185875879, "grad_norm": 1.1068153381347656, "learning_rate": 2.626937285040975e-05, "loss": 0.136, "step": 16644 }, { "epoch": 5.088657902782024, "grad_norm": 1.3732315301895142, "learning_rate": 2.626894823998981e-05, "loss": 0.1058, "step": 16645 }, { "epoch": 5.088963619688169, "grad_norm": 0.5059722065925598, "learning_rate": 2.626852362956987e-05, "loss": 0.1429, "step": 16646 }, { "epoch": 5.089269336594314, "grad_norm": 0.30328190326690674, "learning_rate": 2.626809901914993e-05, "loss": 0.1156, "step": 16647 }, { "epoch": 5.089575053500458, "grad_norm": 0.6071717739105225, "learning_rate": 2.626767440872999e-05, "loss": 0.1451, "step": 16648 }, { "epoch": 5.089880770406603, "grad_norm": 1.082592248916626, "learning_rate": 2.6267249798310052e-05, "loss": 0.184, "step": 16649 }, { "epoch": 5.0901864873127485, "grad_norm": 1.2796571254730225, "learning_rate": 2.626682518789011e-05, "loss": 0.2025, "step": 16650 }, { "epoch": 5.090492204218894, "grad_norm": 0.9296587705612183, "learning_rate": 2.6266400577470173e-05, "loss": 0.1718, "step": 16651 }, { "epoch": 5.090797921125038, "grad_norm": 0.7453572154045105, "learning_rate": 2.6265975967050232e-05, "loss": 0.1988, "step": 16652 }, { "epoch": 5.091103638031183, "grad_norm": 0.753042459487915, "learning_rate": 2.626555135663029e-05, "loss": 0.162, "step": 16653 }, { "epoch": 5.091409354937328, "grad_norm": 1.3600707054138184, "learning_rate": 2.6265126746210353e-05, "loss": 0.1936, "step": 16654 }, { "epoch": 5.091715071843473, "grad_norm": 3.6489007472991943, "learning_rate": 2.626470213579041e-05, "loss": 0.2241, "step": 16655 }, { "epoch": 5.092020788749618, "grad_norm": 0.5172454714775085, "learning_rate": 2.6264277525370474e-05, "loss": 0.1424, "step": 16656 }, { "epoch": 5.0923265056557625, "grad_norm": 0.3295772075653076, "learning_rate": 2.6263852914950532e-05, "loss": 0.0703, "step": 16657 }, { "epoch": 5.092632222561908, "grad_norm": 0.4434257447719574, "learning_rate": 2.6263428304530594e-05, "loss": 0.0847, "step": 16658 }, { "epoch": 5.092937939468053, "grad_norm": 0.2237773835659027, "learning_rate": 2.6263003694110653e-05, "loss": 0.0553, "step": 16659 }, { "epoch": 5.093243656374198, "grad_norm": 0.7067477703094482, "learning_rate": 2.6262579083690715e-05, "loss": 0.0483, "step": 16660 }, { "epoch": 5.093549373280342, "grad_norm": 0.19831284880638123, "learning_rate": 2.6262154473270774e-05, "loss": 0.0467, "step": 16661 }, { "epoch": 5.093855090186487, "grad_norm": 0.26069536805152893, "learning_rate": 2.6261729862850836e-05, "loss": 0.0496, "step": 16662 }, { "epoch": 5.094160807092632, "grad_norm": 0.6455317735671997, "learning_rate": 2.6261305252430895e-05, "loss": 0.0582, "step": 16663 }, { "epoch": 5.094466523998777, "grad_norm": 0.4894525408744812, "learning_rate": 2.6260880642010957e-05, "loss": 0.0514, "step": 16664 }, { "epoch": 5.094772240904922, "grad_norm": 0.31482699513435364, "learning_rate": 2.6260456031591015e-05, "loss": 0.0695, "step": 16665 }, { "epoch": 5.095077957811067, "grad_norm": 0.9334471225738525, "learning_rate": 2.6260031421171074e-05, "loss": 0.1033, "step": 16666 }, { "epoch": 5.095383674717212, "grad_norm": 0.7040863037109375, "learning_rate": 2.6259606810751136e-05, "loss": 0.0675, "step": 16667 }, { "epoch": 5.095689391623357, "grad_norm": 0.4580550491809845, "learning_rate": 2.6259182200331195e-05, "loss": 0.0899, "step": 16668 }, { "epoch": 5.095995108529502, "grad_norm": 0.5854487419128418, "learning_rate": 2.6258757589911257e-05, "loss": 0.0973, "step": 16669 }, { "epoch": 5.096300825435646, "grad_norm": 0.6483193039894104, "learning_rate": 2.6258332979491316e-05, "loss": 0.1317, "step": 16670 }, { "epoch": 5.0966065423417914, "grad_norm": 0.5061913728713989, "learning_rate": 2.6257908369071378e-05, "loss": 0.1437, "step": 16671 }, { "epoch": 5.096912259247937, "grad_norm": 0.6436333656311035, "learning_rate": 2.6257483758651436e-05, "loss": 0.1361, "step": 16672 }, { "epoch": 5.097217976154082, "grad_norm": 0.4885595142841339, "learning_rate": 2.62570591482315e-05, "loss": 0.1586, "step": 16673 }, { "epoch": 5.097523693060226, "grad_norm": 1.295068621635437, "learning_rate": 2.6256634537811557e-05, "loss": 0.1825, "step": 16674 }, { "epoch": 5.097829409966371, "grad_norm": 2.693099021911621, "learning_rate": 2.625620992739162e-05, "loss": 0.215, "step": 16675 }, { "epoch": 5.098135126872516, "grad_norm": 3.0279603004455566, "learning_rate": 2.6255785316971678e-05, "loss": 0.1642, "step": 16676 }, { "epoch": 5.098440843778661, "grad_norm": 3.3980295658111572, "learning_rate": 2.625536070655174e-05, "loss": 0.1865, "step": 16677 }, { "epoch": 5.0987465606848055, "grad_norm": 2.182666301727295, "learning_rate": 2.62549360961318e-05, "loss": 0.162, "step": 16678 }, { "epoch": 5.099052277590951, "grad_norm": 2.1183526515960693, "learning_rate": 2.6254511485711857e-05, "loss": 0.179, "step": 16679 }, { "epoch": 5.099357994497096, "grad_norm": 1.617173433303833, "learning_rate": 2.625408687529192e-05, "loss": 0.2927, "step": 16680 }, { "epoch": 5.099663711403241, "grad_norm": 0.4415738880634308, "learning_rate": 2.6253662264871978e-05, "loss": 0.1721, "step": 16681 }, { "epoch": 5.099969428309386, "grad_norm": 0.32738709449768066, "learning_rate": 2.625323765445204e-05, "loss": 0.1086, "step": 16682 }, { "epoch": 5.10027514521553, "grad_norm": 0.21777160465717316, "learning_rate": 2.62528130440321e-05, "loss": 0.0686, "step": 16683 }, { "epoch": 5.100580862121675, "grad_norm": 0.30111634731292725, "learning_rate": 2.625238843361216e-05, "loss": 0.0616, "step": 16684 }, { "epoch": 5.10088657902782, "grad_norm": 0.8372818231582642, "learning_rate": 2.625196382319222e-05, "loss": 0.0741, "step": 16685 }, { "epoch": 5.1011922959339655, "grad_norm": 0.20191729068756104, "learning_rate": 2.6251539212772282e-05, "loss": 0.046, "step": 16686 }, { "epoch": 5.10149801284011, "grad_norm": 0.7989339828491211, "learning_rate": 2.625111460235234e-05, "loss": 0.0496, "step": 16687 }, { "epoch": 5.101803729746255, "grad_norm": 0.3893676698207855, "learning_rate": 2.6250689991932403e-05, "loss": 0.0587, "step": 16688 }, { "epoch": 5.1021094466524, "grad_norm": 0.4809211790561676, "learning_rate": 2.625026538151246e-05, "loss": 0.0604, "step": 16689 }, { "epoch": 5.102415163558545, "grad_norm": 0.5480928421020508, "learning_rate": 2.6249840771092524e-05, "loss": 0.0742, "step": 16690 }, { "epoch": 5.102720880464689, "grad_norm": 0.3305383026599884, "learning_rate": 2.6249416160672582e-05, "loss": 0.0753, "step": 16691 }, { "epoch": 5.103026597370834, "grad_norm": 0.8236255645751953, "learning_rate": 2.624899155025264e-05, "loss": 0.0924, "step": 16692 }, { "epoch": 5.1033323142769795, "grad_norm": 0.3849577307701111, "learning_rate": 2.6248566939832703e-05, "loss": 0.1026, "step": 16693 }, { "epoch": 5.103638031183125, "grad_norm": 0.7770772576332092, "learning_rate": 2.6248142329412762e-05, "loss": 0.0847, "step": 16694 }, { "epoch": 5.10394374808927, "grad_norm": 0.5513730645179749, "learning_rate": 2.6247717718992824e-05, "loss": 0.1263, "step": 16695 }, { "epoch": 5.104249464995414, "grad_norm": 0.6081442832946777, "learning_rate": 2.6247293108572883e-05, "loss": 0.1442, "step": 16696 }, { "epoch": 5.104555181901559, "grad_norm": 1.2986211776733398, "learning_rate": 2.6246868498152945e-05, "loss": 0.155, "step": 16697 }, { "epoch": 5.104860898807704, "grad_norm": 0.652581512928009, "learning_rate": 2.6246443887733003e-05, "loss": 0.1159, "step": 16698 }, { "epoch": 5.105166615713849, "grad_norm": 0.9870818853378296, "learning_rate": 2.6246019277313065e-05, "loss": 0.1588, "step": 16699 }, { "epoch": 5.105472332619994, "grad_norm": 0.7319527864456177, "learning_rate": 2.6245594666893124e-05, "loss": 0.2031, "step": 16700 }, { "epoch": 5.105778049526139, "grad_norm": 0.8973028063774109, "learning_rate": 2.6245170056473186e-05, "loss": 0.1628, "step": 16701 }, { "epoch": 5.106083766432284, "grad_norm": 1.0799553394317627, "learning_rate": 2.6244745446053245e-05, "loss": 0.2017, "step": 16702 }, { "epoch": 5.106389483338429, "grad_norm": 2.079648017883301, "learning_rate": 2.6244320835633307e-05, "loss": 0.1925, "step": 16703 }, { "epoch": 5.106695200244573, "grad_norm": 0.7949920892715454, "learning_rate": 2.624389622521337e-05, "loss": 0.1783, "step": 16704 }, { "epoch": 5.107000917150718, "grad_norm": 1.9063783884048462, "learning_rate": 2.6243471614793428e-05, "loss": 0.2284, "step": 16705 }, { "epoch": 5.107306634056863, "grad_norm": 0.33947423100471497, "learning_rate": 2.624304700437349e-05, "loss": 0.1326, "step": 16706 }, { "epoch": 5.1076123509630085, "grad_norm": 0.3529576361179352, "learning_rate": 2.624262239395355e-05, "loss": 0.097, "step": 16707 }, { "epoch": 5.107918067869154, "grad_norm": 0.22005821764469147, "learning_rate": 2.624219778353361e-05, "loss": 0.075, "step": 16708 }, { "epoch": 5.108223784775298, "grad_norm": 0.4871126413345337, "learning_rate": 2.624177317311367e-05, "loss": 0.0682, "step": 16709 }, { "epoch": 5.108529501681443, "grad_norm": 0.1821487843990326, "learning_rate": 2.624134856269373e-05, "loss": 0.0465, "step": 16710 }, { "epoch": 5.108835218587588, "grad_norm": 0.4399604797363281, "learning_rate": 2.624092395227379e-05, "loss": 0.0445, "step": 16711 }, { "epoch": 5.109140935493733, "grad_norm": 0.35768210887908936, "learning_rate": 2.6240499341853852e-05, "loss": 0.0773, "step": 16712 }, { "epoch": 5.109446652399877, "grad_norm": 0.47841939330101013, "learning_rate": 2.624007473143391e-05, "loss": 0.042, "step": 16713 }, { "epoch": 5.1097523693060225, "grad_norm": 0.4686196446418762, "learning_rate": 2.6239650121013973e-05, "loss": 0.0981, "step": 16714 }, { "epoch": 5.110058086212168, "grad_norm": 0.2282872349023819, "learning_rate": 2.623922551059403e-05, "loss": 0.065, "step": 16715 }, { "epoch": 5.110363803118313, "grad_norm": 0.6683689951896667, "learning_rate": 2.6238800900174094e-05, "loss": 0.0849, "step": 16716 }, { "epoch": 5.110669520024457, "grad_norm": 0.346249520778656, "learning_rate": 2.6238376289754152e-05, "loss": 0.0798, "step": 16717 }, { "epoch": 5.110975236930602, "grad_norm": 0.3119552731513977, "learning_rate": 2.623795167933421e-05, "loss": 0.1038, "step": 16718 }, { "epoch": 5.111280953836747, "grad_norm": 0.5282664895057678, "learning_rate": 2.6237527068914273e-05, "loss": 0.116, "step": 16719 }, { "epoch": 5.111586670742892, "grad_norm": 0.4153136610984802, "learning_rate": 2.6237102458494332e-05, "loss": 0.1219, "step": 16720 }, { "epoch": 5.111892387649037, "grad_norm": 0.7254997491836548, "learning_rate": 2.6236677848074394e-05, "loss": 0.1299, "step": 16721 }, { "epoch": 5.112198104555182, "grad_norm": 0.664996325969696, "learning_rate": 2.6236253237654453e-05, "loss": 0.156, "step": 16722 }, { "epoch": 5.112503821461327, "grad_norm": 0.6199843287467957, "learning_rate": 2.6235828627234515e-05, "loss": 0.1652, "step": 16723 }, { "epoch": 5.112809538367472, "grad_norm": 0.4211265742778778, "learning_rate": 2.6235404016814574e-05, "loss": 0.1379, "step": 16724 }, { "epoch": 5.113115255273617, "grad_norm": 0.6741346716880798, "learning_rate": 2.6234979406394636e-05, "loss": 0.1867, "step": 16725 }, { "epoch": 5.113420972179761, "grad_norm": 0.8080139756202698, "learning_rate": 2.6234554795974694e-05, "loss": 0.182, "step": 16726 }, { "epoch": 5.113726689085906, "grad_norm": 0.7262703776359558, "learning_rate": 2.6234130185554756e-05, "loss": 0.168, "step": 16727 }, { "epoch": 5.1140324059920514, "grad_norm": 2.0479736328125, "learning_rate": 2.6233705575134815e-05, "loss": 0.2384, "step": 16728 }, { "epoch": 5.114338122898197, "grad_norm": 1.275508999824524, "learning_rate": 2.6233280964714877e-05, "loss": 0.2598, "step": 16729 }, { "epoch": 5.114643839804341, "grad_norm": 1.31483793258667, "learning_rate": 2.6232856354294936e-05, "loss": 0.2449, "step": 16730 }, { "epoch": 5.114949556710486, "grad_norm": 0.4201337695121765, "learning_rate": 2.6232431743874995e-05, "loss": 0.1437, "step": 16731 }, { "epoch": 5.115255273616631, "grad_norm": 0.2574135661125183, "learning_rate": 2.6232007133455057e-05, "loss": 0.0984, "step": 16732 }, { "epoch": 5.115560990522776, "grad_norm": 0.2733038663864136, "learning_rate": 2.6231582523035115e-05, "loss": 0.0577, "step": 16733 }, { "epoch": 5.115866707428921, "grad_norm": 0.1982709765434265, "learning_rate": 2.6231157912615177e-05, "loss": 0.0835, "step": 16734 }, { "epoch": 5.1161724243350655, "grad_norm": 0.3433452844619751, "learning_rate": 2.6230733302195236e-05, "loss": 0.0761, "step": 16735 }, { "epoch": 5.116478141241211, "grad_norm": 0.15467260777950287, "learning_rate": 2.6230308691775298e-05, "loss": 0.0488, "step": 16736 }, { "epoch": 5.116783858147356, "grad_norm": 0.28930506110191345, "learning_rate": 2.6229884081355357e-05, "loss": 0.0535, "step": 16737 }, { "epoch": 5.117089575053501, "grad_norm": 0.3444804251194, "learning_rate": 2.622945947093542e-05, "loss": 0.0888, "step": 16738 }, { "epoch": 5.117395291959645, "grad_norm": 0.18698380887508392, "learning_rate": 2.6229034860515478e-05, "loss": 0.0484, "step": 16739 }, { "epoch": 5.11770100886579, "grad_norm": 0.21832041442394257, "learning_rate": 2.622861025009554e-05, "loss": 0.0415, "step": 16740 }, { "epoch": 5.118006725771935, "grad_norm": 0.3345864415168762, "learning_rate": 2.62281856396756e-05, "loss": 0.0887, "step": 16741 }, { "epoch": 5.11831244267808, "grad_norm": 0.44758880138397217, "learning_rate": 2.622776102925566e-05, "loss": 0.0831, "step": 16742 }, { "epoch": 5.118618159584225, "grad_norm": 0.7839491963386536, "learning_rate": 2.622733641883572e-05, "loss": 0.1164, "step": 16743 }, { "epoch": 5.11892387649037, "grad_norm": 0.5331063270568848, "learning_rate": 2.6226911808415778e-05, "loss": 0.1113, "step": 16744 }, { "epoch": 5.119229593396515, "grad_norm": 0.5169556736946106, "learning_rate": 2.622648719799584e-05, "loss": 0.1146, "step": 16745 }, { "epoch": 5.11953531030266, "grad_norm": 0.3681056499481201, "learning_rate": 2.62260625875759e-05, "loss": 0.1474, "step": 16746 }, { "epoch": 5.119841027208805, "grad_norm": 2.3679769039154053, "learning_rate": 2.622563797715596e-05, "loss": 0.1682, "step": 16747 }, { "epoch": 5.120146744114949, "grad_norm": 0.6755548715591431, "learning_rate": 2.622521336673602e-05, "loss": 0.179, "step": 16748 }, { "epoch": 5.120452461021094, "grad_norm": 0.6262083649635315, "learning_rate": 2.622478875631608e-05, "loss": 0.184, "step": 16749 }, { "epoch": 5.1207581779272395, "grad_norm": 0.6508714556694031, "learning_rate": 2.622436414589614e-05, "loss": 0.1574, "step": 16750 }, { "epoch": 5.121063894833385, "grad_norm": 0.6899085640907288, "learning_rate": 2.6223939535476202e-05, "loss": 0.1554, "step": 16751 }, { "epoch": 5.121369611739529, "grad_norm": 1.0508657693862915, "learning_rate": 2.622351492505626e-05, "loss": 0.1558, "step": 16752 }, { "epoch": 5.121675328645674, "grad_norm": 0.9647064208984375, "learning_rate": 2.6223090314636323e-05, "loss": 0.1947, "step": 16753 }, { "epoch": 5.121981045551819, "grad_norm": 1.3785629272460938, "learning_rate": 2.6222665704216382e-05, "loss": 0.2324, "step": 16754 }, { "epoch": 5.122286762457964, "grad_norm": 1.44075608253479, "learning_rate": 2.6222241093796444e-05, "loss": 0.2569, "step": 16755 }, { "epoch": 5.122592479364108, "grad_norm": 0.35795941948890686, "learning_rate": 2.6221816483376503e-05, "loss": 0.1552, "step": 16756 }, { "epoch": 5.122898196270254, "grad_norm": 0.6252706050872803, "learning_rate": 2.622139187295656e-05, "loss": 0.0842, "step": 16757 }, { "epoch": 5.123203913176399, "grad_norm": 0.16684432327747345, "learning_rate": 2.6220967262536624e-05, "loss": 0.0606, "step": 16758 }, { "epoch": 5.123509630082544, "grad_norm": 0.8272101283073425, "learning_rate": 2.6220542652116682e-05, "loss": 0.0955, "step": 16759 }, { "epoch": 5.123815346988689, "grad_norm": 0.301944762468338, "learning_rate": 2.6220118041696744e-05, "loss": 0.0663, "step": 16760 }, { "epoch": 5.124121063894833, "grad_norm": 0.17421430349349976, "learning_rate": 2.6219693431276803e-05, "loss": 0.049, "step": 16761 }, { "epoch": 5.124426780800978, "grad_norm": 0.14025399088859558, "learning_rate": 2.6219268820856865e-05, "loss": 0.04, "step": 16762 }, { "epoch": 5.124732497707123, "grad_norm": 0.22687269747257233, "learning_rate": 2.6218844210436924e-05, "loss": 0.0845, "step": 16763 }, { "epoch": 5.1250382146132685, "grad_norm": 0.3288164436817169, "learning_rate": 2.6218419600016986e-05, "loss": 0.046, "step": 16764 }, { "epoch": 5.125343931519413, "grad_norm": 0.42633605003356934, "learning_rate": 2.6217994989597045e-05, "loss": 0.0665, "step": 16765 }, { "epoch": 5.125649648425558, "grad_norm": 0.4169326424598694, "learning_rate": 2.6217570379177107e-05, "loss": 0.0777, "step": 16766 }, { "epoch": 5.125955365331703, "grad_norm": 0.4272092580795288, "learning_rate": 2.6217145768757165e-05, "loss": 0.0703, "step": 16767 }, { "epoch": 5.126261082237848, "grad_norm": 0.424666166305542, "learning_rate": 2.6216721158337224e-05, "loss": 0.0836, "step": 16768 }, { "epoch": 5.126566799143992, "grad_norm": 0.4195452034473419, "learning_rate": 2.6216296547917286e-05, "loss": 0.107, "step": 16769 }, { "epoch": 5.126872516050137, "grad_norm": 0.6069584488868713, "learning_rate": 2.6215871937497345e-05, "loss": 0.1257, "step": 16770 }, { "epoch": 5.1271782329562825, "grad_norm": 0.5580390095710754, "learning_rate": 2.6215447327077407e-05, "loss": 0.1718, "step": 16771 }, { "epoch": 5.127483949862428, "grad_norm": 0.811248779296875, "learning_rate": 2.6215022716657466e-05, "loss": 0.1321, "step": 16772 }, { "epoch": 5.127789666768573, "grad_norm": 0.43086567521095276, "learning_rate": 2.6214598106237528e-05, "loss": 0.1532, "step": 16773 }, { "epoch": 5.128095383674717, "grad_norm": 1.1207998991012573, "learning_rate": 2.6214173495817586e-05, "loss": 0.1761, "step": 16774 }, { "epoch": 5.128401100580862, "grad_norm": 1.1678358316421509, "learning_rate": 2.621374888539765e-05, "loss": 0.1696, "step": 16775 }, { "epoch": 5.128706817487007, "grad_norm": 0.9384748339653015, "learning_rate": 2.6213324274977707e-05, "loss": 0.1834, "step": 16776 }, { "epoch": 5.129012534393152, "grad_norm": 0.8024607300758362, "learning_rate": 2.621289966455777e-05, "loss": 0.1761, "step": 16777 }, { "epoch": 5.1293182512992965, "grad_norm": 1.0693086385726929, "learning_rate": 2.6212475054137828e-05, "loss": 0.1563, "step": 16778 }, { "epoch": 5.129623968205442, "grad_norm": 1.0531105995178223, "learning_rate": 2.621205044371789e-05, "loss": 0.1921, "step": 16779 }, { "epoch": 5.129929685111587, "grad_norm": 1.0274244546890259, "learning_rate": 2.621162583329795e-05, "loss": 0.1988, "step": 16780 }, { "epoch": 5.130235402017732, "grad_norm": 0.6928637623786926, "learning_rate": 2.6211201222878008e-05, "loss": 0.187, "step": 16781 }, { "epoch": 5.130541118923876, "grad_norm": 0.3711897134780884, "learning_rate": 2.621077661245807e-05, "loss": 0.0682, "step": 16782 }, { "epoch": 5.130846835830021, "grad_norm": 0.22645960748195648, "learning_rate": 2.621035200203813e-05, "loss": 0.0803, "step": 16783 }, { "epoch": 5.131152552736166, "grad_norm": 0.4681661128997803, "learning_rate": 2.620992739161819e-05, "loss": 0.0849, "step": 16784 }, { "epoch": 5.131458269642311, "grad_norm": 0.19198893010616302, "learning_rate": 2.620950278119825e-05, "loss": 0.0361, "step": 16785 }, { "epoch": 5.131763986548457, "grad_norm": 0.22184152901172638, "learning_rate": 2.620907817077831e-05, "loss": 0.0687, "step": 16786 }, { "epoch": 5.132069703454601, "grad_norm": 0.4538893699645996, "learning_rate": 2.620865356035837e-05, "loss": 0.0396, "step": 16787 }, { "epoch": 5.132375420360746, "grad_norm": 0.29064205288887024, "learning_rate": 2.6208228949938432e-05, "loss": 0.0643, "step": 16788 }, { "epoch": 5.132681137266891, "grad_norm": 0.3447053134441376, "learning_rate": 2.620780433951849e-05, "loss": 0.1065, "step": 16789 }, { "epoch": 5.132986854173036, "grad_norm": 0.38856279850006104, "learning_rate": 2.6207379729098553e-05, "loss": 0.0602, "step": 16790 }, { "epoch": 5.13329257107918, "grad_norm": 0.4907699227333069, "learning_rate": 2.620695511867861e-05, "loss": 0.0683, "step": 16791 }, { "epoch": 5.1335982879853255, "grad_norm": 0.2041686773300171, "learning_rate": 2.6206530508258674e-05, "loss": 0.0625, "step": 16792 }, { "epoch": 5.133904004891471, "grad_norm": 0.3589651584625244, "learning_rate": 2.6206105897838732e-05, "loss": 0.0978, "step": 16793 }, { "epoch": 5.134209721797616, "grad_norm": 0.4887858033180237, "learning_rate": 2.620568128741879e-05, "loss": 0.0972, "step": 16794 }, { "epoch": 5.13451543870376, "grad_norm": 0.6020526885986328, "learning_rate": 2.6205256676998853e-05, "loss": 0.1196, "step": 16795 }, { "epoch": 5.134821155609905, "grad_norm": 1.5288838148117065, "learning_rate": 2.6204832066578912e-05, "loss": 0.1399, "step": 16796 }, { "epoch": 5.13512687251605, "grad_norm": 2.5859031677246094, "learning_rate": 2.6204407456158974e-05, "loss": 0.1168, "step": 16797 }, { "epoch": 5.135432589422195, "grad_norm": 1.2796900272369385, "learning_rate": 2.6203982845739033e-05, "loss": 0.158, "step": 16798 }, { "epoch": 5.13573830632834, "grad_norm": 0.8416755795478821, "learning_rate": 2.6203558235319095e-05, "loss": 0.1728, "step": 16799 }, { "epoch": 5.136044023234485, "grad_norm": 1.2723414897918701, "learning_rate": 2.6203133624899153e-05, "loss": 0.1381, "step": 16800 }, { "epoch": 5.13634974014063, "grad_norm": 1.1134322881698608, "learning_rate": 2.6202709014479215e-05, "loss": 0.1624, "step": 16801 }, { "epoch": 5.136655457046775, "grad_norm": 1.0639007091522217, "learning_rate": 2.6202284404059274e-05, "loss": 0.168, "step": 16802 }, { "epoch": 5.13696117395292, "grad_norm": 0.7173783779144287, "learning_rate": 2.6201859793639336e-05, "loss": 0.2022, "step": 16803 }, { "epoch": 5.137266890859064, "grad_norm": 0.7509580850601196, "learning_rate": 2.6201435183219395e-05, "loss": 0.2037, "step": 16804 }, { "epoch": 5.137572607765209, "grad_norm": 1.4366101026535034, "learning_rate": 2.6201010572799457e-05, "loss": 0.2456, "step": 16805 }, { "epoch": 5.137878324671354, "grad_norm": 0.529866635799408, "learning_rate": 2.620058596237952e-05, "loss": 0.1562, "step": 16806 }, { "epoch": 5.1381840415774995, "grad_norm": 0.27667614817619324, "learning_rate": 2.6200161351959578e-05, "loss": 0.0755, "step": 16807 }, { "epoch": 5.138489758483644, "grad_norm": 0.2593812048435211, "learning_rate": 2.619973674153964e-05, "loss": 0.0719, "step": 16808 }, { "epoch": 5.138795475389789, "grad_norm": 0.5107375383377075, "learning_rate": 2.61993121311197e-05, "loss": 0.0649, "step": 16809 }, { "epoch": 5.139101192295934, "grad_norm": 0.19018816947937012, "learning_rate": 2.619888752069976e-05, "loss": 0.0597, "step": 16810 }, { "epoch": 5.139406909202079, "grad_norm": 0.1565730720758438, "learning_rate": 2.619846291027982e-05, "loss": 0.0507, "step": 16811 }, { "epoch": 5.139712626108224, "grad_norm": 0.30241507291793823, "learning_rate": 2.619803829985988e-05, "loss": 0.0541, "step": 16812 }, { "epoch": 5.140018343014368, "grad_norm": 0.2494114637374878, "learning_rate": 2.619761368943994e-05, "loss": 0.0552, "step": 16813 }, { "epoch": 5.1403240599205136, "grad_norm": 0.8318573236465454, "learning_rate": 2.6197189079020002e-05, "loss": 0.0871, "step": 16814 }, { "epoch": 5.140629776826659, "grad_norm": 0.2787289619445801, "learning_rate": 2.619676446860006e-05, "loss": 0.0603, "step": 16815 }, { "epoch": 5.140935493732804, "grad_norm": 0.45657479763031006, "learning_rate": 2.6196339858180123e-05, "loss": 0.0941, "step": 16816 }, { "epoch": 5.141241210638948, "grad_norm": 0.4018072485923767, "learning_rate": 2.6195915247760182e-05, "loss": 0.086, "step": 16817 }, { "epoch": 5.141546927545093, "grad_norm": 0.8288427591323853, "learning_rate": 2.6195490637340244e-05, "loss": 0.1011, "step": 16818 }, { "epoch": 5.141852644451238, "grad_norm": 0.49745768308639526, "learning_rate": 2.6195066026920302e-05, "loss": 0.1057, "step": 16819 }, { "epoch": 5.142158361357383, "grad_norm": 3.225329637527466, "learning_rate": 2.619464141650036e-05, "loss": 0.1169, "step": 16820 }, { "epoch": 5.142464078263528, "grad_norm": 0.6348628401756287, "learning_rate": 2.6194216806080423e-05, "loss": 0.1259, "step": 16821 }, { "epoch": 5.142769795169673, "grad_norm": 1.4580150842666626, "learning_rate": 2.6193792195660482e-05, "loss": 0.1707, "step": 16822 }, { "epoch": 5.143075512075818, "grad_norm": 0.7013120055198669, "learning_rate": 2.6193367585240544e-05, "loss": 0.1511, "step": 16823 }, { "epoch": 5.143381228981963, "grad_norm": 1.7555310726165771, "learning_rate": 2.6192942974820603e-05, "loss": 0.1673, "step": 16824 }, { "epoch": 5.143686945888108, "grad_norm": 1.0665032863616943, "learning_rate": 2.6192518364400665e-05, "loss": 0.1603, "step": 16825 }, { "epoch": 5.143992662794252, "grad_norm": 0.5705925226211548, "learning_rate": 2.6192093753980724e-05, "loss": 0.1499, "step": 16826 }, { "epoch": 5.144298379700397, "grad_norm": 12.788030624389648, "learning_rate": 2.6191669143560786e-05, "loss": 0.1764, "step": 16827 }, { "epoch": 5.1446040966065425, "grad_norm": 1.1387652158737183, "learning_rate": 2.6191244533140844e-05, "loss": 0.177, "step": 16828 }, { "epoch": 5.144909813512688, "grad_norm": 1.3677341938018799, "learning_rate": 2.6190819922720906e-05, "loss": 0.1647, "step": 16829 }, { "epoch": 5.145215530418832, "grad_norm": 1.7758127450942993, "learning_rate": 2.6190395312300965e-05, "loss": 0.1968, "step": 16830 }, { "epoch": 5.145521247324977, "grad_norm": 0.583168625831604, "learning_rate": 2.6189970701881027e-05, "loss": 0.1265, "step": 16831 }, { "epoch": 5.145826964231122, "grad_norm": 0.2884281277656555, "learning_rate": 2.6189546091461086e-05, "loss": 0.0768, "step": 16832 }, { "epoch": 5.146132681137267, "grad_norm": 0.2193293571472168, "learning_rate": 2.6189121481041145e-05, "loss": 0.0767, "step": 16833 }, { "epoch": 5.146438398043411, "grad_norm": 0.29244133830070496, "learning_rate": 2.6188696870621207e-05, "loss": 0.0551, "step": 16834 }, { "epoch": 5.1467441149495565, "grad_norm": 0.16441528499126434, "learning_rate": 2.6188272260201265e-05, "loss": 0.0423, "step": 16835 }, { "epoch": 5.147049831855702, "grad_norm": 0.3397209346294403, "learning_rate": 2.6187847649781328e-05, "loss": 0.0533, "step": 16836 }, { "epoch": 5.147355548761847, "grad_norm": 0.2568510174751282, "learning_rate": 2.6187423039361386e-05, "loss": 0.0698, "step": 16837 }, { "epoch": 5.147661265667992, "grad_norm": 0.31085628271102905, "learning_rate": 2.6186998428941448e-05, "loss": 0.0453, "step": 16838 }, { "epoch": 5.147966982574136, "grad_norm": 0.43656349182128906, "learning_rate": 2.6186573818521507e-05, "loss": 0.0745, "step": 16839 }, { "epoch": 5.148272699480281, "grad_norm": 0.23907840251922607, "learning_rate": 2.618614920810157e-05, "loss": 0.0578, "step": 16840 }, { "epoch": 5.148578416386426, "grad_norm": 0.3232944905757904, "learning_rate": 2.6185724597681628e-05, "loss": 0.1112, "step": 16841 }, { "epoch": 5.148884133292571, "grad_norm": 0.24007488787174225, "learning_rate": 2.618529998726169e-05, "loss": 0.0777, "step": 16842 }, { "epoch": 5.149189850198716, "grad_norm": 0.3981797993183136, "learning_rate": 2.618487537684175e-05, "loss": 0.0864, "step": 16843 }, { "epoch": 5.149495567104861, "grad_norm": 0.48110994696617126, "learning_rate": 2.618445076642181e-05, "loss": 0.0998, "step": 16844 }, { "epoch": 5.149801284011006, "grad_norm": 0.4156735837459564, "learning_rate": 2.618402615600187e-05, "loss": 0.1427, "step": 16845 }, { "epoch": 5.150107000917151, "grad_norm": 0.6003925800323486, "learning_rate": 2.6183601545581928e-05, "loss": 0.1164, "step": 16846 }, { "epoch": 5.150412717823295, "grad_norm": 0.46255287528038025, "learning_rate": 2.618317693516199e-05, "loss": 0.1241, "step": 16847 }, { "epoch": 5.15071843472944, "grad_norm": 0.4299468696117401, "learning_rate": 2.618275232474205e-05, "loss": 0.1243, "step": 16848 }, { "epoch": 5.1510241516355855, "grad_norm": 0.5798346400260925, "learning_rate": 2.618232771432211e-05, "loss": 0.1331, "step": 16849 }, { "epoch": 5.151329868541731, "grad_norm": 1.3458205461502075, "learning_rate": 2.618190310390217e-05, "loss": 0.1932, "step": 16850 }, { "epoch": 5.151635585447876, "grad_norm": 3.0872862339019775, "learning_rate": 2.6181478493482232e-05, "loss": 0.1731, "step": 16851 }, { "epoch": 5.15194130235402, "grad_norm": 1.6227812767028809, "learning_rate": 2.618105388306229e-05, "loss": 0.1608, "step": 16852 }, { "epoch": 5.152247019260165, "grad_norm": 3.385836362838745, "learning_rate": 2.6180629272642353e-05, "loss": 0.1769, "step": 16853 }, { "epoch": 5.15255273616631, "grad_norm": 2.9214072227478027, "learning_rate": 2.618020466222241e-05, "loss": 0.1858, "step": 16854 }, { "epoch": 5.152858453072455, "grad_norm": 1.155037760734558, "learning_rate": 2.6179780051802473e-05, "loss": 0.2264, "step": 16855 }, { "epoch": 5.1531641699785995, "grad_norm": 0.3373599052429199, "learning_rate": 2.6179355441382532e-05, "loss": 0.1109, "step": 16856 }, { "epoch": 5.153469886884745, "grad_norm": 0.24492037296295166, "learning_rate": 2.6178930830962594e-05, "loss": 0.0746, "step": 16857 }, { "epoch": 5.15377560379089, "grad_norm": 0.19454927742481232, "learning_rate": 2.6178506220542653e-05, "loss": 0.0608, "step": 16858 }, { "epoch": 5.154081320697035, "grad_norm": 0.3962819278240204, "learning_rate": 2.617808161012271e-05, "loss": 0.07, "step": 16859 }, { "epoch": 5.154387037603179, "grad_norm": 0.25729432702064514, "learning_rate": 2.6177656999702774e-05, "loss": 0.0525, "step": 16860 }, { "epoch": 5.154692754509324, "grad_norm": 0.22803710401058197, "learning_rate": 2.6177232389282832e-05, "loss": 0.072, "step": 16861 }, { "epoch": 5.154998471415469, "grad_norm": 0.20339612662792206, "learning_rate": 2.6176807778862894e-05, "loss": 0.0575, "step": 16862 }, { "epoch": 5.155304188321614, "grad_norm": 0.43626946210861206, "learning_rate": 2.6176383168442953e-05, "loss": 0.0555, "step": 16863 }, { "epoch": 5.1556099052277595, "grad_norm": 0.26710015535354614, "learning_rate": 2.6175958558023015e-05, "loss": 0.0538, "step": 16864 }, { "epoch": 5.155915622133904, "grad_norm": 0.4323484003543854, "learning_rate": 2.6175533947603074e-05, "loss": 0.0756, "step": 16865 }, { "epoch": 5.156221339040049, "grad_norm": 0.6659135222434998, "learning_rate": 2.6175109337183136e-05, "loss": 0.0794, "step": 16866 }, { "epoch": 5.156527055946194, "grad_norm": 0.20239800214767456, "learning_rate": 2.6174684726763195e-05, "loss": 0.0667, "step": 16867 }, { "epoch": 5.156832772852339, "grad_norm": 0.333680659532547, "learning_rate": 2.6174260116343257e-05, "loss": 0.0867, "step": 16868 }, { "epoch": 5.157138489758483, "grad_norm": 0.3469800055027008, "learning_rate": 2.6173835505923315e-05, "loss": 0.1529, "step": 16869 }, { "epoch": 5.157444206664628, "grad_norm": 0.5727220773696899, "learning_rate": 2.6173410895503378e-05, "loss": 0.1209, "step": 16870 }, { "epoch": 5.1577499235707736, "grad_norm": 0.44612035155296326, "learning_rate": 2.6172986285083436e-05, "loss": 0.1078, "step": 16871 }, { "epoch": 5.158055640476919, "grad_norm": 0.5471586585044861, "learning_rate": 2.6172561674663495e-05, "loss": 0.1371, "step": 16872 }, { "epoch": 5.158361357383063, "grad_norm": 0.4438742995262146, "learning_rate": 2.6172137064243557e-05, "loss": 0.1505, "step": 16873 }, { "epoch": 5.158667074289208, "grad_norm": 0.8351784348487854, "learning_rate": 2.6171712453823616e-05, "loss": 0.1779, "step": 16874 }, { "epoch": 5.158972791195353, "grad_norm": 1.6869258880615234, "learning_rate": 2.6171287843403678e-05, "loss": 0.1691, "step": 16875 }, { "epoch": 5.159278508101498, "grad_norm": 0.9334918856620789, "learning_rate": 2.6170863232983736e-05, "loss": 0.1665, "step": 16876 }, { "epoch": 5.159584225007643, "grad_norm": 0.5144487023353577, "learning_rate": 2.61704386225638e-05, "loss": 0.1676, "step": 16877 }, { "epoch": 5.159889941913788, "grad_norm": 1.0443432331085205, "learning_rate": 2.6170014012143857e-05, "loss": 0.1681, "step": 16878 }, { "epoch": 5.160195658819933, "grad_norm": 1.4257978200912476, "learning_rate": 2.616958940172392e-05, "loss": 0.1646, "step": 16879 }, { "epoch": 5.160501375726078, "grad_norm": 3.552338123321533, "learning_rate": 2.6169164791303978e-05, "loss": 0.2422, "step": 16880 }, { "epoch": 5.160807092632223, "grad_norm": 0.3799952566623688, "learning_rate": 2.616874018088404e-05, "loss": 0.168, "step": 16881 }, { "epoch": 5.161112809538367, "grad_norm": 0.5557749271392822, "learning_rate": 2.61683155704641e-05, "loss": 0.0947, "step": 16882 }, { "epoch": 5.161418526444512, "grad_norm": 0.22231914103031158, "learning_rate": 2.6167890960044158e-05, "loss": 0.0892, "step": 16883 }, { "epoch": 5.161724243350657, "grad_norm": 0.22168171405792236, "learning_rate": 2.616746634962422e-05, "loss": 0.047, "step": 16884 }, { "epoch": 5.1620299602568025, "grad_norm": 0.1694391816854477, "learning_rate": 2.616704173920428e-05, "loss": 0.0538, "step": 16885 }, { "epoch": 5.162335677162947, "grad_norm": 0.22036725282669067, "learning_rate": 2.616661712878434e-05, "loss": 0.071, "step": 16886 }, { "epoch": 5.162641394069092, "grad_norm": 0.4096008837223053, "learning_rate": 2.61661925183644e-05, "loss": 0.0461, "step": 16887 }, { "epoch": 5.162947110975237, "grad_norm": 0.2336195558309555, "learning_rate": 2.616576790794446e-05, "loss": 0.0736, "step": 16888 }, { "epoch": 5.163252827881382, "grad_norm": 1.5247132778167725, "learning_rate": 2.616534329752452e-05, "loss": 0.0558, "step": 16889 }, { "epoch": 5.163558544787527, "grad_norm": 0.2525153160095215, "learning_rate": 2.6164918687104582e-05, "loss": 0.0651, "step": 16890 }, { "epoch": 5.163864261693671, "grad_norm": 0.4316021203994751, "learning_rate": 2.616449407668464e-05, "loss": 0.0726, "step": 16891 }, { "epoch": 5.1641699785998165, "grad_norm": 6.545958518981934, "learning_rate": 2.6164069466264703e-05, "loss": 0.0719, "step": 16892 }, { "epoch": 5.164475695505962, "grad_norm": 0.27365946769714355, "learning_rate": 2.616364485584476e-05, "loss": 0.1263, "step": 16893 }, { "epoch": 5.164781412412107, "grad_norm": 0.4729461669921875, "learning_rate": 2.6163220245424824e-05, "loss": 0.1138, "step": 16894 }, { "epoch": 5.165087129318251, "grad_norm": 0.713455080986023, "learning_rate": 2.6162795635004882e-05, "loss": 0.1122, "step": 16895 }, { "epoch": 5.165392846224396, "grad_norm": 0.4011973440647125, "learning_rate": 2.616237102458494e-05, "loss": 0.1062, "step": 16896 }, { "epoch": 5.165698563130541, "grad_norm": 0.6319785714149475, "learning_rate": 2.6161946414165003e-05, "loss": 0.1328, "step": 16897 }, { "epoch": 5.166004280036686, "grad_norm": 0.5535654425621033, "learning_rate": 2.6161521803745062e-05, "loss": 0.1887, "step": 16898 }, { "epoch": 5.1663099969428306, "grad_norm": 0.7565202116966248, "learning_rate": 2.6161097193325124e-05, "loss": 0.1519, "step": 16899 }, { "epoch": 5.166615713848976, "grad_norm": 0.6322049498558044, "learning_rate": 2.6160672582905183e-05, "loss": 0.1793, "step": 16900 }, { "epoch": 5.166921430755121, "grad_norm": 0.8369337916374207, "learning_rate": 2.6160247972485245e-05, "loss": 0.1617, "step": 16901 }, { "epoch": 5.167227147661266, "grad_norm": 1.0530232191085815, "learning_rate": 2.6159823362065303e-05, "loss": 0.1886, "step": 16902 }, { "epoch": 5.167532864567411, "grad_norm": 1.3118261098861694, "learning_rate": 2.6159398751645365e-05, "loss": 0.2047, "step": 16903 }, { "epoch": 5.167838581473555, "grad_norm": 0.7179185152053833, "learning_rate": 2.6158974141225424e-05, "loss": 0.1949, "step": 16904 }, { "epoch": 5.1681442983797, "grad_norm": 2.3610525131225586, "learning_rate": 2.6158549530805486e-05, "loss": 0.1725, "step": 16905 }, { "epoch": 5.1684500152858455, "grad_norm": 0.26937025785446167, "learning_rate": 2.6158124920385545e-05, "loss": 0.1229, "step": 16906 }, { "epoch": 5.168755732191991, "grad_norm": 0.3028858006000519, "learning_rate": 2.6157700309965607e-05, "loss": 0.0782, "step": 16907 }, { "epoch": 5.169061449098135, "grad_norm": 0.3587085008621216, "learning_rate": 2.615727569954567e-05, "loss": 0.0709, "step": 16908 }, { "epoch": 5.16936716600428, "grad_norm": 0.18767020106315613, "learning_rate": 2.6156851089125728e-05, "loss": 0.0552, "step": 16909 }, { "epoch": 5.169672882910425, "grad_norm": 0.20928974449634552, "learning_rate": 2.615642647870579e-05, "loss": 0.0486, "step": 16910 }, { "epoch": 5.16997859981657, "grad_norm": 0.15703293681144714, "learning_rate": 2.615600186828585e-05, "loss": 0.0431, "step": 16911 }, { "epoch": 5.170284316722714, "grad_norm": 0.2539346516132355, "learning_rate": 2.615557725786591e-05, "loss": 0.07, "step": 16912 }, { "epoch": 5.1705900336288595, "grad_norm": 0.5431164503097534, "learning_rate": 2.615515264744597e-05, "loss": 0.0558, "step": 16913 }, { "epoch": 5.170895750535005, "grad_norm": 0.27502962946891785, "learning_rate": 2.615472803702603e-05, "loss": 0.0702, "step": 16914 }, { "epoch": 5.17120146744115, "grad_norm": 0.26462092995643616, "learning_rate": 2.615430342660609e-05, "loss": 0.0685, "step": 16915 }, { "epoch": 5.171507184347295, "grad_norm": 0.24564941227436066, "learning_rate": 2.6153878816186152e-05, "loss": 0.1001, "step": 16916 }, { "epoch": 5.171812901253439, "grad_norm": 0.45104992389678955, "learning_rate": 2.615345420576621e-05, "loss": 0.0845, "step": 16917 }, { "epoch": 5.172118618159584, "grad_norm": 3.0868422985076904, "learning_rate": 2.6153029595346273e-05, "loss": 0.0865, "step": 16918 }, { "epoch": 5.172424335065729, "grad_norm": 0.44217467308044434, "learning_rate": 2.6152604984926332e-05, "loss": 0.1326, "step": 16919 }, { "epoch": 5.172730051971874, "grad_norm": 0.7014266848564148, "learning_rate": 2.6152180374506394e-05, "loss": 0.0989, "step": 16920 }, { "epoch": 5.173035768878019, "grad_norm": 0.6723583340644836, "learning_rate": 2.6151755764086453e-05, "loss": 0.1479, "step": 16921 }, { "epoch": 5.173341485784164, "grad_norm": 7.2451171875, "learning_rate": 2.615133115366651e-05, "loss": 0.1784, "step": 16922 }, { "epoch": 5.173647202690309, "grad_norm": 0.3786543905735016, "learning_rate": 2.6150906543246573e-05, "loss": 0.1566, "step": 16923 }, { "epoch": 5.173952919596454, "grad_norm": 0.5562983751296997, "learning_rate": 2.6150481932826632e-05, "loss": 0.1665, "step": 16924 }, { "epoch": 5.174258636502598, "grad_norm": 0.8975144028663635, "learning_rate": 2.6150057322406694e-05, "loss": 0.1603, "step": 16925 }, { "epoch": 5.174564353408743, "grad_norm": 3.6429529190063477, "learning_rate": 2.6149632711986753e-05, "loss": 0.1814, "step": 16926 }, { "epoch": 5.174870070314888, "grad_norm": 0.9929733872413635, "learning_rate": 2.6149208101566815e-05, "loss": 0.1578, "step": 16927 }, { "epoch": 5.1751757872210336, "grad_norm": 1.0302854776382446, "learning_rate": 2.6148783491146874e-05, "loss": 0.1513, "step": 16928 }, { "epoch": 5.175481504127179, "grad_norm": 1.0736396312713623, "learning_rate": 2.6148358880726936e-05, "loss": 0.1926, "step": 16929 }, { "epoch": 5.175787221033323, "grad_norm": 1.6320469379425049, "learning_rate": 2.6147934270306994e-05, "loss": 0.1935, "step": 16930 }, { "epoch": 5.176092937939468, "grad_norm": 1.3052235841751099, "learning_rate": 2.6147509659887056e-05, "loss": 0.1349, "step": 16931 }, { "epoch": 5.176398654845613, "grad_norm": 0.6789337992668152, "learning_rate": 2.6147085049467115e-05, "loss": 0.0785, "step": 16932 }, { "epoch": 5.176704371751758, "grad_norm": 0.46218910813331604, "learning_rate": 2.6146660439047177e-05, "loss": 0.0734, "step": 16933 }, { "epoch": 5.1770100886579025, "grad_norm": 0.22012034058570862, "learning_rate": 2.6146235828627236e-05, "loss": 0.0478, "step": 16934 }, { "epoch": 5.177315805564048, "grad_norm": 0.23578347265720367, "learning_rate": 2.6145811218207295e-05, "loss": 0.0419, "step": 16935 }, { "epoch": 5.177621522470193, "grad_norm": 1.2356555461883545, "learning_rate": 2.6145386607787357e-05, "loss": 0.0649, "step": 16936 }, { "epoch": 5.177927239376338, "grad_norm": 0.3874727487564087, "learning_rate": 2.6144961997367415e-05, "loss": 0.0471, "step": 16937 }, { "epoch": 5.178232956282482, "grad_norm": 0.6896043419837952, "learning_rate": 2.6144537386947478e-05, "loss": 0.0653, "step": 16938 }, { "epoch": 5.178538673188627, "grad_norm": 0.8112092614173889, "learning_rate": 2.6144112776527536e-05, "loss": 0.0831, "step": 16939 }, { "epoch": 5.178844390094772, "grad_norm": 0.45350679755210876, "learning_rate": 2.61436881661076e-05, "loss": 0.0711, "step": 16940 }, { "epoch": 5.179150107000917, "grad_norm": 1.4165873527526855, "learning_rate": 2.6143263555687657e-05, "loss": 0.0973, "step": 16941 }, { "epoch": 5.1794558239070625, "grad_norm": 0.6647264361381531, "learning_rate": 2.614283894526772e-05, "loss": 0.0634, "step": 16942 }, { "epoch": 5.179761540813207, "grad_norm": 0.8899637460708618, "learning_rate": 2.6142414334847778e-05, "loss": 0.0993, "step": 16943 }, { "epoch": 5.180067257719352, "grad_norm": 1.4926029443740845, "learning_rate": 2.614198972442784e-05, "loss": 0.1301, "step": 16944 }, { "epoch": 5.180372974625497, "grad_norm": 0.6135749220848083, "learning_rate": 2.61415651140079e-05, "loss": 0.1587, "step": 16945 }, { "epoch": 5.180678691531642, "grad_norm": 0.8462659120559692, "learning_rate": 2.614114050358796e-05, "loss": 0.15, "step": 16946 }, { "epoch": 5.180984408437786, "grad_norm": 1.1835358142852783, "learning_rate": 2.614071589316802e-05, "loss": 0.1494, "step": 16947 }, { "epoch": 5.181290125343931, "grad_norm": 0.8606416583061218, "learning_rate": 2.6140291282748078e-05, "loss": 0.1721, "step": 16948 }, { "epoch": 5.1815958422500765, "grad_norm": 0.8310849666595459, "learning_rate": 2.613986667232814e-05, "loss": 0.159, "step": 16949 }, { "epoch": 5.181901559156222, "grad_norm": 1.214922547340393, "learning_rate": 2.61394420619082e-05, "loss": 0.1595, "step": 16950 }, { "epoch": 5.182207276062366, "grad_norm": 10.961921691894531, "learning_rate": 2.613901745148826e-05, "loss": 0.1736, "step": 16951 }, { "epoch": 5.182512992968511, "grad_norm": 0.6809720396995544, "learning_rate": 2.613859284106832e-05, "loss": 0.1597, "step": 16952 }, { "epoch": 5.182818709874656, "grad_norm": 1.3250495195388794, "learning_rate": 2.6138168230648382e-05, "loss": 0.1807, "step": 16953 }, { "epoch": 5.183124426780801, "grad_norm": 1.0524958372116089, "learning_rate": 2.613774362022844e-05, "loss": 0.1662, "step": 16954 }, { "epoch": 5.183430143686946, "grad_norm": 2.1401660442352295, "learning_rate": 2.6137319009808503e-05, "loss": 0.2429, "step": 16955 }, { "epoch": 5.1837358605930905, "grad_norm": 0.6269713640213013, "learning_rate": 2.613689439938856e-05, "loss": 0.1409, "step": 16956 }, { "epoch": 5.184041577499236, "grad_norm": 0.5498161911964417, "learning_rate": 2.6136469788968623e-05, "loss": 0.091, "step": 16957 }, { "epoch": 5.184347294405381, "grad_norm": 0.18192759156227112, "learning_rate": 2.6136045178548682e-05, "loss": 0.0756, "step": 16958 }, { "epoch": 5.184653011311526, "grad_norm": 0.16490939259529114, "learning_rate": 2.6135620568128744e-05, "loss": 0.0434, "step": 16959 }, { "epoch": 5.18495872821767, "grad_norm": 0.30743834376335144, "learning_rate": 2.6135195957708803e-05, "loss": 0.0406, "step": 16960 }, { "epoch": 5.185264445123815, "grad_norm": 0.21214307844638824, "learning_rate": 2.613477134728886e-05, "loss": 0.0427, "step": 16961 }, { "epoch": 5.18557016202996, "grad_norm": 0.3593412935733795, "learning_rate": 2.6134346736868924e-05, "loss": 0.0694, "step": 16962 }, { "epoch": 5.1858758789361055, "grad_norm": 0.35203590989112854, "learning_rate": 2.6133922126448982e-05, "loss": 0.0551, "step": 16963 }, { "epoch": 5.18618159584225, "grad_norm": 0.16775096952915192, "learning_rate": 2.6133497516029044e-05, "loss": 0.0379, "step": 16964 }, { "epoch": 5.186487312748395, "grad_norm": 0.8918443918228149, "learning_rate": 2.6133072905609103e-05, "loss": 0.0849, "step": 16965 }, { "epoch": 5.18679302965454, "grad_norm": 1.7471051216125488, "learning_rate": 2.6132648295189165e-05, "loss": 0.0773, "step": 16966 }, { "epoch": 5.187098746560685, "grad_norm": 0.3241240084171295, "learning_rate": 2.6132223684769224e-05, "loss": 0.0895, "step": 16967 }, { "epoch": 5.18740446346683, "grad_norm": 1.5235614776611328, "learning_rate": 2.6131799074349286e-05, "loss": 0.0984, "step": 16968 }, { "epoch": 5.187710180372974, "grad_norm": 0.4613686501979828, "learning_rate": 2.6131374463929345e-05, "loss": 0.0919, "step": 16969 }, { "epoch": 5.1880158972791195, "grad_norm": 0.39312100410461426, "learning_rate": 2.6130949853509407e-05, "loss": 0.1169, "step": 16970 }, { "epoch": 5.188321614185265, "grad_norm": 0.6429910063743591, "learning_rate": 2.6130525243089465e-05, "loss": 0.1358, "step": 16971 }, { "epoch": 5.18862733109141, "grad_norm": 0.4895278513431549, "learning_rate": 2.6130100632669528e-05, "loss": 0.1694, "step": 16972 }, { "epoch": 5.188933047997554, "grad_norm": 1.2082911729812622, "learning_rate": 2.6129676022249586e-05, "loss": 0.1475, "step": 16973 }, { "epoch": 5.189238764903699, "grad_norm": 0.927025318145752, "learning_rate": 2.6129251411829645e-05, "loss": 0.1653, "step": 16974 }, { "epoch": 5.189544481809844, "grad_norm": 0.9603756666183472, "learning_rate": 2.6128826801409707e-05, "loss": 0.1573, "step": 16975 }, { "epoch": 5.189850198715989, "grad_norm": 1.4875503778457642, "learning_rate": 2.6128402190989766e-05, "loss": 0.1792, "step": 16976 }, { "epoch": 5.1901559156221335, "grad_norm": 1.3839737176895142, "learning_rate": 2.6127977580569828e-05, "loss": 0.1729, "step": 16977 }, { "epoch": 5.190461632528279, "grad_norm": 0.6470495462417603, "learning_rate": 2.6127552970149887e-05, "loss": 0.1672, "step": 16978 }, { "epoch": 5.190767349434424, "grad_norm": 1.6120057106018066, "learning_rate": 2.612712835972995e-05, "loss": 0.2346, "step": 16979 }, { "epoch": 5.191073066340569, "grad_norm": 1.5041766166687012, "learning_rate": 2.6126703749310007e-05, "loss": 0.2764, "step": 16980 }, { "epoch": 5.191378783246714, "grad_norm": 0.32318153977394104, "learning_rate": 2.612627913889007e-05, "loss": 0.1385, "step": 16981 }, { "epoch": 5.191684500152858, "grad_norm": 0.2337397336959839, "learning_rate": 2.6125854528470128e-05, "loss": 0.0666, "step": 16982 }, { "epoch": 5.191990217059003, "grad_norm": 2.032656192779541, "learning_rate": 2.612542991805019e-05, "loss": 0.0763, "step": 16983 }, { "epoch": 5.192295933965148, "grad_norm": 0.22368377447128296, "learning_rate": 2.612500530763025e-05, "loss": 0.0646, "step": 16984 }, { "epoch": 5.1926016508712936, "grad_norm": 0.1337159126996994, "learning_rate": 2.612458069721031e-05, "loss": 0.0382, "step": 16985 }, { "epoch": 5.192907367777438, "grad_norm": 0.19565719366073608, "learning_rate": 2.612415608679037e-05, "loss": 0.0543, "step": 16986 }, { "epoch": 5.193213084683583, "grad_norm": 0.6047672629356384, "learning_rate": 2.612373147637043e-05, "loss": 0.0439, "step": 16987 }, { "epoch": 5.193518801589728, "grad_norm": 0.5108151435852051, "learning_rate": 2.612330686595049e-05, "loss": 0.0605, "step": 16988 }, { "epoch": 5.193824518495873, "grad_norm": 0.39825427532196045, "learning_rate": 2.612288225553055e-05, "loss": 0.0465, "step": 16989 }, { "epoch": 5.194130235402017, "grad_norm": 0.6162153482437134, "learning_rate": 2.612245764511061e-05, "loss": 0.0558, "step": 16990 }, { "epoch": 5.1944359523081625, "grad_norm": 0.5806321501731873, "learning_rate": 2.612203303469067e-05, "loss": 0.1271, "step": 16991 }, { "epoch": 5.194741669214308, "grad_norm": 0.3921952545642853, "learning_rate": 2.6121608424270732e-05, "loss": 0.076, "step": 16992 }, { "epoch": 5.195047386120453, "grad_norm": 0.6310101747512817, "learning_rate": 2.612118381385079e-05, "loss": 0.0828, "step": 16993 }, { "epoch": 5.195353103026598, "grad_norm": 0.35069093108177185, "learning_rate": 2.6120759203430853e-05, "loss": 0.101, "step": 16994 }, { "epoch": 5.195658819932742, "grad_norm": 0.43293634057044983, "learning_rate": 2.612033459301091e-05, "loss": 0.1231, "step": 16995 }, { "epoch": 5.195964536838887, "grad_norm": 0.9305558800697327, "learning_rate": 2.6119909982590974e-05, "loss": 0.13, "step": 16996 }, { "epoch": 5.196270253745032, "grad_norm": 0.678790807723999, "learning_rate": 2.6119485372171032e-05, "loss": 0.1782, "step": 16997 }, { "epoch": 5.196575970651177, "grad_norm": 0.9294718503952026, "learning_rate": 2.6119060761751094e-05, "loss": 0.1243, "step": 16998 }, { "epoch": 5.196881687557322, "grad_norm": 0.7797172665596008, "learning_rate": 2.6118636151331153e-05, "loss": 0.1641, "step": 16999 }, { "epoch": 5.197187404463467, "grad_norm": 0.9051293134689331, "learning_rate": 2.6118211540911212e-05, "loss": 0.1721, "step": 17000 }, { "epoch": 5.197187404463467, "eval_cer": 0.18930095032350278, "eval_loss": 0.23064684867858887, "eval_runtime": 19.0591, "eval_samples_per_second": 238.101, "eval_steps_per_second": 0.787, "eval_wer": 0.33043769727102595, "step": 17000 }, { "epoch": 5.197493121369612, "grad_norm": 0.8413543105125427, "learning_rate": 2.6117786930491274e-05, "loss": 0.1518, "step": 17001 }, { "epoch": 5.197798838275757, "grad_norm": 1.5813815593719482, "learning_rate": 2.6117362320071333e-05, "loss": 0.17, "step": 17002 }, { "epoch": 5.198104555181901, "grad_norm": 0.625190019607544, "learning_rate": 2.6116937709651395e-05, "loss": 0.1886, "step": 17003 }, { "epoch": 5.198410272088046, "grad_norm": 1.0677759647369385, "learning_rate": 2.6116513099231453e-05, "loss": 0.1656, "step": 17004 }, { "epoch": 5.198715988994191, "grad_norm": 1.355506181716919, "learning_rate": 2.6116088488811515e-05, "loss": 0.2944, "step": 17005 }, { "epoch": 5.1990217059003365, "grad_norm": 0.4086611866950989, "learning_rate": 2.6115663878391574e-05, "loss": 0.1526, "step": 17006 }, { "epoch": 5.199327422806482, "grad_norm": 0.30071645975112915, "learning_rate": 2.6115239267971636e-05, "loss": 0.0837, "step": 17007 }, { "epoch": 5.199633139712626, "grad_norm": 0.2001948356628418, "learning_rate": 2.6114814657551695e-05, "loss": 0.0564, "step": 17008 }, { "epoch": 5.199938856618771, "grad_norm": 0.5245619416236877, "learning_rate": 2.6114390047131757e-05, "loss": 0.0583, "step": 17009 }, { "epoch": 5.200244573524916, "grad_norm": 0.2671698033809662, "learning_rate": 2.611396543671182e-05, "loss": 0.0492, "step": 17010 }, { "epoch": 5.200550290431061, "grad_norm": 0.24810215830802917, "learning_rate": 2.6113540826291878e-05, "loss": 0.0636, "step": 17011 }, { "epoch": 5.200856007337205, "grad_norm": 0.4107024371623993, "learning_rate": 2.611311621587194e-05, "loss": 0.0692, "step": 17012 }, { "epoch": 5.2011617242433505, "grad_norm": 0.3328718841075897, "learning_rate": 2.6112691605452e-05, "loss": 0.0604, "step": 17013 }, { "epoch": 5.201467441149496, "grad_norm": 0.18924878537654877, "learning_rate": 2.611226699503206e-05, "loss": 0.0672, "step": 17014 }, { "epoch": 5.201773158055641, "grad_norm": 0.8975359797477722, "learning_rate": 2.611184238461212e-05, "loss": 0.0761, "step": 17015 }, { "epoch": 5.202078874961785, "grad_norm": 0.6646851897239685, "learning_rate": 2.611141777419218e-05, "loss": 0.1279, "step": 17016 }, { "epoch": 5.20238459186793, "grad_norm": 0.5715751647949219, "learning_rate": 2.611099316377224e-05, "loss": 0.0825, "step": 17017 }, { "epoch": 5.202690308774075, "grad_norm": 0.390480637550354, "learning_rate": 2.6110568553352302e-05, "loss": 0.0814, "step": 17018 }, { "epoch": 5.20299602568022, "grad_norm": 0.4645673930644989, "learning_rate": 2.611014394293236e-05, "loss": 0.143, "step": 17019 }, { "epoch": 5.2033017425863655, "grad_norm": 0.6357869505882263, "learning_rate": 2.6109719332512423e-05, "loss": 0.1018, "step": 17020 }, { "epoch": 5.20360745949251, "grad_norm": 0.7585084438323975, "learning_rate": 2.6109294722092482e-05, "loss": 0.1266, "step": 17021 }, { "epoch": 5.203913176398655, "grad_norm": 0.6580197215080261, "learning_rate": 2.6108870111672544e-05, "loss": 0.1385, "step": 17022 }, { "epoch": 5.2042188933048, "grad_norm": 0.4852772355079651, "learning_rate": 2.6108445501252603e-05, "loss": 0.1537, "step": 17023 }, { "epoch": 5.204524610210945, "grad_norm": 0.5349514484405518, "learning_rate": 2.610802089083266e-05, "loss": 0.1646, "step": 17024 }, { "epoch": 5.204830327117089, "grad_norm": 1.8412518501281738, "learning_rate": 2.6107596280412723e-05, "loss": 0.1584, "step": 17025 }, { "epoch": 5.205136044023234, "grad_norm": 0.6206215023994446, "learning_rate": 2.6107171669992782e-05, "loss": 0.1671, "step": 17026 }, { "epoch": 5.2054417609293795, "grad_norm": 1.6372798681259155, "learning_rate": 2.6106747059572844e-05, "loss": 0.1913, "step": 17027 }, { "epoch": 5.205747477835525, "grad_norm": 1.5756028890609741, "learning_rate": 2.6106322449152903e-05, "loss": 0.1948, "step": 17028 }, { "epoch": 5.206053194741669, "grad_norm": 1.7904974222183228, "learning_rate": 2.6105897838732965e-05, "loss": 0.1928, "step": 17029 }, { "epoch": 5.206358911647814, "grad_norm": 0.7874795198440552, "learning_rate": 2.6105473228313024e-05, "loss": 0.2255, "step": 17030 }, { "epoch": 5.206664628553959, "grad_norm": 0.44373807311058044, "learning_rate": 2.6105048617893086e-05, "loss": 0.1204, "step": 17031 }, { "epoch": 5.206970345460104, "grad_norm": 0.3730107545852661, "learning_rate": 2.6104624007473144e-05, "loss": 0.0681, "step": 17032 }, { "epoch": 5.207276062366249, "grad_norm": 0.35062214732170105, "learning_rate": 2.6104199397053206e-05, "loss": 0.0628, "step": 17033 }, { "epoch": 5.2075817792723935, "grad_norm": 0.29060402512550354, "learning_rate": 2.6103774786633265e-05, "loss": 0.0598, "step": 17034 }, { "epoch": 5.207887496178539, "grad_norm": 0.22962519526481628, "learning_rate": 2.6103350176213327e-05, "loss": 0.0521, "step": 17035 }, { "epoch": 5.208193213084684, "grad_norm": 1.4900420904159546, "learning_rate": 2.6102925565793386e-05, "loss": 0.0925, "step": 17036 }, { "epoch": 5.208498929990829, "grad_norm": 0.2910684049129486, "learning_rate": 2.6102500955373445e-05, "loss": 0.0542, "step": 17037 }, { "epoch": 5.208804646896973, "grad_norm": 0.7917945981025696, "learning_rate": 2.6102076344953507e-05, "loss": 0.0624, "step": 17038 }, { "epoch": 5.209110363803118, "grad_norm": 0.3801807761192322, "learning_rate": 2.6101651734533565e-05, "loss": 0.0844, "step": 17039 }, { "epoch": 5.209416080709263, "grad_norm": 0.4603117108345032, "learning_rate": 2.6101227124113628e-05, "loss": 0.0704, "step": 17040 }, { "epoch": 5.209721797615408, "grad_norm": 0.45561668276786804, "learning_rate": 2.6100802513693686e-05, "loss": 0.0897, "step": 17041 }, { "epoch": 5.210027514521553, "grad_norm": 0.7238538265228271, "learning_rate": 2.610037790327375e-05, "loss": 0.0782, "step": 17042 }, { "epoch": 5.210333231427698, "grad_norm": 1.4124280214309692, "learning_rate": 2.6099953292853807e-05, "loss": 0.0802, "step": 17043 }, { "epoch": 5.210638948333843, "grad_norm": 0.6368075013160706, "learning_rate": 2.609952868243387e-05, "loss": 0.1305, "step": 17044 }, { "epoch": 5.210944665239988, "grad_norm": 0.38712289929389954, "learning_rate": 2.6099104072013928e-05, "loss": 0.1038, "step": 17045 }, { "epoch": 5.211250382146133, "grad_norm": 0.6398399472236633, "learning_rate": 2.609867946159399e-05, "loss": 0.1138, "step": 17046 }, { "epoch": 5.211556099052277, "grad_norm": 0.4912562072277069, "learning_rate": 2.609825485117405e-05, "loss": 0.1553, "step": 17047 }, { "epoch": 5.2118618159584225, "grad_norm": 0.5253143310546875, "learning_rate": 2.609783024075411e-05, "loss": 0.1523, "step": 17048 }, { "epoch": 5.212167532864568, "grad_norm": 0.6082462668418884, "learning_rate": 2.609740563033417e-05, "loss": 0.1419, "step": 17049 }, { "epoch": 5.212473249770713, "grad_norm": 0.9789995551109314, "learning_rate": 2.6096981019914228e-05, "loss": 0.1602, "step": 17050 }, { "epoch": 5.212778966676857, "grad_norm": 0.7661843299865723, "learning_rate": 2.609655640949429e-05, "loss": 0.1717, "step": 17051 }, { "epoch": 5.213084683583002, "grad_norm": 1.3957805633544922, "learning_rate": 2.609613179907435e-05, "loss": 0.1602, "step": 17052 }, { "epoch": 5.213390400489147, "grad_norm": 0.8787436485290527, "learning_rate": 2.609570718865441e-05, "loss": 0.162, "step": 17053 }, { "epoch": 5.213696117395292, "grad_norm": 0.878352165222168, "learning_rate": 2.609528257823447e-05, "loss": 0.2136, "step": 17054 }, { "epoch": 5.2140018343014365, "grad_norm": 2.193553924560547, "learning_rate": 2.6094857967814532e-05, "loss": 0.1906, "step": 17055 }, { "epoch": 5.214307551207582, "grad_norm": 0.3772686719894409, "learning_rate": 2.609443335739459e-05, "loss": 0.1277, "step": 17056 }, { "epoch": 5.214613268113727, "grad_norm": 0.909527063369751, "learning_rate": 2.6094008746974653e-05, "loss": 0.0724, "step": 17057 }, { "epoch": 5.214918985019872, "grad_norm": 0.3390888273715973, "learning_rate": 2.609358413655471e-05, "loss": 0.0726, "step": 17058 }, { "epoch": 5.215224701926017, "grad_norm": 0.27633529901504517, "learning_rate": 2.6093159526134773e-05, "loss": 0.056, "step": 17059 }, { "epoch": 5.215530418832161, "grad_norm": 0.394270658493042, "learning_rate": 2.6092734915714832e-05, "loss": 0.0758, "step": 17060 }, { "epoch": 5.215836135738306, "grad_norm": 0.23831534385681152, "learning_rate": 2.6092310305294894e-05, "loss": 0.05, "step": 17061 }, { "epoch": 5.216141852644451, "grad_norm": 0.27010467648506165, "learning_rate": 2.6091885694874953e-05, "loss": 0.0501, "step": 17062 }, { "epoch": 5.2164475695505965, "grad_norm": 0.15806157886981964, "learning_rate": 2.609146108445501e-05, "loss": 0.0542, "step": 17063 }, { "epoch": 5.216753286456741, "grad_norm": 0.22794708609580994, "learning_rate": 2.6091036474035074e-05, "loss": 0.0694, "step": 17064 }, { "epoch": 5.217059003362886, "grad_norm": 0.44551435112953186, "learning_rate": 2.6090611863615132e-05, "loss": 0.0513, "step": 17065 }, { "epoch": 5.217364720269031, "grad_norm": 0.398704469203949, "learning_rate": 2.6090187253195194e-05, "loss": 0.1064, "step": 17066 }, { "epoch": 5.217670437175176, "grad_norm": 0.36894550919532776, "learning_rate": 2.6089762642775253e-05, "loss": 0.0632, "step": 17067 }, { "epoch": 5.21797615408132, "grad_norm": 0.3543678820133209, "learning_rate": 2.6089338032355315e-05, "loss": 0.0737, "step": 17068 }, { "epoch": 5.218281870987465, "grad_norm": 0.6698466539382935, "learning_rate": 2.6088913421935374e-05, "loss": 0.1059, "step": 17069 }, { "epoch": 5.2185875878936105, "grad_norm": 0.39309802651405334, "learning_rate": 2.6088488811515436e-05, "loss": 0.0889, "step": 17070 }, { "epoch": 5.218893304799756, "grad_norm": 0.8927958607673645, "learning_rate": 2.6088064201095495e-05, "loss": 0.1661, "step": 17071 }, { "epoch": 5.219199021705901, "grad_norm": 1.176974892616272, "learning_rate": 2.6087639590675557e-05, "loss": 0.157, "step": 17072 }, { "epoch": 5.219504738612045, "grad_norm": 0.4703321158885956, "learning_rate": 2.6087214980255615e-05, "loss": 0.1558, "step": 17073 }, { "epoch": 5.21981045551819, "grad_norm": 0.8148415684700012, "learning_rate": 2.6086790369835678e-05, "loss": 0.1448, "step": 17074 }, { "epoch": 5.220116172424335, "grad_norm": 0.44749340415000916, "learning_rate": 2.6086365759415736e-05, "loss": 0.142, "step": 17075 }, { "epoch": 5.22042188933048, "grad_norm": 0.6776039600372314, "learning_rate": 2.6085941148995795e-05, "loss": 0.1388, "step": 17076 }, { "epoch": 5.220727606236625, "grad_norm": 0.6460184454917908, "learning_rate": 2.6085516538575857e-05, "loss": 0.1914, "step": 17077 }, { "epoch": 5.22103332314277, "grad_norm": 0.714772641658783, "learning_rate": 2.6085091928155916e-05, "loss": 0.1474, "step": 17078 }, { "epoch": 5.221339040048915, "grad_norm": 1.0902379751205444, "learning_rate": 2.6084667317735978e-05, "loss": 0.2108, "step": 17079 }, { "epoch": 5.22164475695506, "grad_norm": 4.763638496398926, "learning_rate": 2.6084242707316037e-05, "loss": 0.2064, "step": 17080 }, { "epoch": 5.221950473861204, "grad_norm": 0.5963332056999207, "learning_rate": 2.60838180968961e-05, "loss": 0.1511, "step": 17081 }, { "epoch": 5.222256190767349, "grad_norm": 0.5272235870361328, "learning_rate": 2.6083393486476157e-05, "loss": 0.0775, "step": 17082 }, { "epoch": 5.222561907673494, "grad_norm": 0.4176841974258423, "learning_rate": 2.608296887605622e-05, "loss": 0.0483, "step": 17083 }, { "epoch": 5.2228676245796395, "grad_norm": 0.23837456107139587, "learning_rate": 2.6082544265636278e-05, "loss": 0.0483, "step": 17084 }, { "epoch": 5.223173341485785, "grad_norm": 0.2491115927696228, "learning_rate": 2.608211965521634e-05, "loss": 0.066, "step": 17085 }, { "epoch": 5.223479058391929, "grad_norm": 0.4224886894226074, "learning_rate": 2.60816950447964e-05, "loss": 0.072, "step": 17086 }, { "epoch": 5.223784775298074, "grad_norm": 0.37702277302742004, "learning_rate": 2.608127043437646e-05, "loss": 0.0591, "step": 17087 }, { "epoch": 5.224090492204219, "grad_norm": 0.2386436015367508, "learning_rate": 2.608084582395652e-05, "loss": 0.044, "step": 17088 }, { "epoch": 5.224396209110364, "grad_norm": 0.3675117790699005, "learning_rate": 2.608042121353658e-05, "loss": 0.0709, "step": 17089 }, { "epoch": 5.224701926016508, "grad_norm": 0.44574543833732605, "learning_rate": 2.607999660311664e-05, "loss": 0.0541, "step": 17090 }, { "epoch": 5.2250076429226535, "grad_norm": 0.3424935042858124, "learning_rate": 2.60795719926967e-05, "loss": 0.0805, "step": 17091 }, { "epoch": 5.225313359828799, "grad_norm": 0.3476708233356476, "learning_rate": 2.607914738227676e-05, "loss": 0.0893, "step": 17092 }, { "epoch": 5.225619076734944, "grad_norm": 0.49808838963508606, "learning_rate": 2.607872277185682e-05, "loss": 0.0908, "step": 17093 }, { "epoch": 5.225924793641088, "grad_norm": 1.62803053855896, "learning_rate": 2.6078298161436882e-05, "loss": 0.1213, "step": 17094 }, { "epoch": 5.226230510547233, "grad_norm": 0.46561214327812195, "learning_rate": 2.607787355101694e-05, "loss": 0.1018, "step": 17095 }, { "epoch": 5.226536227453378, "grad_norm": 0.8491675853729248, "learning_rate": 2.6077448940597003e-05, "loss": 0.1132, "step": 17096 }, { "epoch": 5.226841944359523, "grad_norm": 0.7997792363166809, "learning_rate": 2.607702433017706e-05, "loss": 0.1475, "step": 17097 }, { "epoch": 5.227147661265668, "grad_norm": 0.6897121667861938, "learning_rate": 2.6076599719757124e-05, "loss": 0.1757, "step": 17098 }, { "epoch": 5.227453378171813, "grad_norm": 0.6885817646980286, "learning_rate": 2.6076175109337182e-05, "loss": 0.1491, "step": 17099 }, { "epoch": 5.227759095077958, "grad_norm": 0.5073511004447937, "learning_rate": 2.6075750498917244e-05, "loss": 0.1801, "step": 17100 }, { "epoch": 5.228064811984103, "grad_norm": 0.5853029489517212, "learning_rate": 2.6075325888497303e-05, "loss": 0.1748, "step": 17101 }, { "epoch": 5.228370528890248, "grad_norm": 0.5412883162498474, "learning_rate": 2.6074901278077362e-05, "loss": 0.1575, "step": 17102 }, { "epoch": 5.228676245796392, "grad_norm": 0.9837409853935242, "learning_rate": 2.6074476667657424e-05, "loss": 0.2003, "step": 17103 }, { "epoch": 5.228981962702537, "grad_norm": 1.4915127754211426, "learning_rate": 2.6074052057237483e-05, "loss": 0.1848, "step": 17104 }, { "epoch": 5.2292876796086825, "grad_norm": 2.5727245807647705, "learning_rate": 2.6073627446817545e-05, "loss": 0.2521, "step": 17105 }, { "epoch": 5.229593396514828, "grad_norm": 0.44885244965553284, "learning_rate": 2.6073202836397603e-05, "loss": 0.1332, "step": 17106 }, { "epoch": 5.229899113420972, "grad_norm": 0.23900370299816132, "learning_rate": 2.6072778225977665e-05, "loss": 0.063, "step": 17107 }, { "epoch": 5.230204830327117, "grad_norm": 0.29764324426651, "learning_rate": 2.6072353615557724e-05, "loss": 0.0453, "step": 17108 }, { "epoch": 5.230510547233262, "grad_norm": 0.24207410216331482, "learning_rate": 2.6071929005137786e-05, "loss": 0.0701, "step": 17109 }, { "epoch": 5.230816264139407, "grad_norm": 0.4479823708534241, "learning_rate": 2.6071504394717845e-05, "loss": 0.0472, "step": 17110 }, { "epoch": 5.231121981045552, "grad_norm": 0.8939133286476135, "learning_rate": 2.6071079784297907e-05, "loss": 0.0801, "step": 17111 }, { "epoch": 5.2314276979516965, "grad_norm": 0.9156277775764465, "learning_rate": 2.607065517387797e-05, "loss": 0.062, "step": 17112 }, { "epoch": 5.231733414857842, "grad_norm": 0.8447320461273193, "learning_rate": 2.607023056345803e-05, "loss": 0.0634, "step": 17113 }, { "epoch": 5.232039131763987, "grad_norm": 0.2500327229499817, "learning_rate": 2.606980595303809e-05, "loss": 0.0558, "step": 17114 }, { "epoch": 5.232344848670132, "grad_norm": 0.664632260799408, "learning_rate": 2.606938134261815e-05, "loss": 0.0589, "step": 17115 }, { "epoch": 5.232650565576276, "grad_norm": 0.7040178179740906, "learning_rate": 2.606895673219821e-05, "loss": 0.0914, "step": 17116 }, { "epoch": 5.232956282482421, "grad_norm": 0.6449639797210693, "learning_rate": 2.606853212177827e-05, "loss": 0.0885, "step": 17117 }, { "epoch": 5.233261999388566, "grad_norm": 0.3917717933654785, "learning_rate": 2.606810751135833e-05, "loss": 0.0945, "step": 17118 }, { "epoch": 5.233567716294711, "grad_norm": 0.959564208984375, "learning_rate": 2.606768290093839e-05, "loss": 0.0813, "step": 17119 }, { "epoch": 5.233873433200856, "grad_norm": 0.31328868865966797, "learning_rate": 2.6067258290518452e-05, "loss": 0.1126, "step": 17120 }, { "epoch": 5.234179150107001, "grad_norm": 0.5630320310592651, "learning_rate": 2.606683368009851e-05, "loss": 0.1253, "step": 17121 }, { "epoch": 5.234484867013146, "grad_norm": 0.7183599472045898, "learning_rate": 2.6066409069678573e-05, "loss": 0.168, "step": 17122 }, { "epoch": 5.234790583919291, "grad_norm": 0.42600587010383606, "learning_rate": 2.6065984459258632e-05, "loss": 0.1504, "step": 17123 }, { "epoch": 5.235096300825436, "grad_norm": 1.5021591186523438, "learning_rate": 2.6065559848838694e-05, "loss": 0.2003, "step": 17124 }, { "epoch": 5.23540201773158, "grad_norm": 1.0501168966293335, "learning_rate": 2.6065135238418753e-05, "loss": 0.1555, "step": 17125 }, { "epoch": 5.235707734637725, "grad_norm": 0.9235765337944031, "learning_rate": 2.606471062799881e-05, "loss": 0.1746, "step": 17126 }, { "epoch": 5.2360134515438705, "grad_norm": 0.8801589012145996, "learning_rate": 2.6064286017578873e-05, "loss": 0.1985, "step": 17127 }, { "epoch": 5.236319168450016, "grad_norm": 0.6045496463775635, "learning_rate": 2.6063861407158932e-05, "loss": 0.1788, "step": 17128 }, { "epoch": 5.23662488535616, "grad_norm": 0.7732682228088379, "learning_rate": 2.6063436796738994e-05, "loss": 0.2177, "step": 17129 }, { "epoch": 5.236930602262305, "grad_norm": 0.9052548408508301, "learning_rate": 2.6063012186319053e-05, "loss": 0.2537, "step": 17130 }, { "epoch": 5.23723631916845, "grad_norm": 0.5484800934791565, "learning_rate": 2.6062587575899115e-05, "loss": 0.1446, "step": 17131 }, { "epoch": 5.237542036074595, "grad_norm": 0.3970649242401123, "learning_rate": 2.6062162965479174e-05, "loss": 0.0729, "step": 17132 }, { "epoch": 5.2378477529807395, "grad_norm": 0.46443840861320496, "learning_rate": 2.6061738355059236e-05, "loss": 0.0632, "step": 17133 }, { "epoch": 5.238153469886885, "grad_norm": 0.1972033679485321, "learning_rate": 2.6061313744639294e-05, "loss": 0.0574, "step": 17134 }, { "epoch": 5.23845918679303, "grad_norm": 0.2778644263744354, "learning_rate": 2.6060889134219357e-05, "loss": 0.0541, "step": 17135 }, { "epoch": 5.238764903699175, "grad_norm": 1.6976767778396606, "learning_rate": 2.6060464523799415e-05, "loss": 0.0506, "step": 17136 }, { "epoch": 5.23907062060532, "grad_norm": 0.28037723898887634, "learning_rate": 2.6060039913379477e-05, "loss": 0.0647, "step": 17137 }, { "epoch": 5.239376337511464, "grad_norm": 2.234689235687256, "learning_rate": 2.6059615302959536e-05, "loss": 0.1141, "step": 17138 }, { "epoch": 5.239682054417609, "grad_norm": 0.3497505187988281, "learning_rate": 2.6059190692539595e-05, "loss": 0.0679, "step": 17139 }, { "epoch": 5.239987771323754, "grad_norm": 0.3169878125190735, "learning_rate": 2.6058766082119657e-05, "loss": 0.0856, "step": 17140 }, { "epoch": 5.2402934882298995, "grad_norm": 0.2677208483219147, "learning_rate": 2.6058341471699716e-05, "loss": 0.0748, "step": 17141 }, { "epoch": 5.240599205136044, "grad_norm": 0.8994413614273071, "learning_rate": 2.6057916861279778e-05, "loss": 0.0608, "step": 17142 }, { "epoch": 5.240904922042189, "grad_norm": 0.6407151222229004, "learning_rate": 2.6057492250859836e-05, "loss": 0.0904, "step": 17143 }, { "epoch": 5.241210638948334, "grad_norm": 0.6572180390357971, "learning_rate": 2.60570676404399e-05, "loss": 0.1056, "step": 17144 }, { "epoch": 5.241516355854479, "grad_norm": 1.2466291189193726, "learning_rate": 2.6056643030019957e-05, "loss": 0.1265, "step": 17145 }, { "epoch": 5.241822072760623, "grad_norm": 1.779950737953186, "learning_rate": 2.605621841960002e-05, "loss": 0.1609, "step": 17146 }, { "epoch": 5.242127789666768, "grad_norm": 1.7527027130126953, "learning_rate": 2.6055793809180078e-05, "loss": 0.1375, "step": 17147 }, { "epoch": 5.2424335065729135, "grad_norm": 0.7809727191925049, "learning_rate": 2.605536919876014e-05, "loss": 0.161, "step": 17148 }, { "epoch": 5.242739223479059, "grad_norm": 0.4589511454105377, "learning_rate": 2.60549445883402e-05, "loss": 0.1486, "step": 17149 }, { "epoch": 5.243044940385204, "grad_norm": 1.1547434329986572, "learning_rate": 2.605451997792026e-05, "loss": 0.1694, "step": 17150 }, { "epoch": 5.243350657291348, "grad_norm": 1.1958062648773193, "learning_rate": 2.605409536750032e-05, "loss": 0.1665, "step": 17151 }, { "epoch": 5.243656374197493, "grad_norm": 0.6589462757110596, "learning_rate": 2.6053670757080378e-05, "loss": 0.1673, "step": 17152 }, { "epoch": 5.243962091103638, "grad_norm": 0.9556612968444824, "learning_rate": 2.605324614666044e-05, "loss": 0.2036, "step": 17153 }, { "epoch": 5.244267808009783, "grad_norm": 1.2362257242202759, "learning_rate": 2.60528215362405e-05, "loss": 0.2268, "step": 17154 }, { "epoch": 5.2445735249159275, "grad_norm": 2.016751766204834, "learning_rate": 2.605239692582056e-05, "loss": 0.2875, "step": 17155 }, { "epoch": 5.244879241822073, "grad_norm": 0.4670041799545288, "learning_rate": 2.605197231540062e-05, "loss": 0.1637, "step": 17156 }, { "epoch": 5.245184958728218, "grad_norm": 0.3563551902770996, "learning_rate": 2.6051547704980682e-05, "loss": 0.0969, "step": 17157 }, { "epoch": 5.245490675634363, "grad_norm": 0.35574355721473694, "learning_rate": 2.605112309456074e-05, "loss": 0.0626, "step": 17158 }, { "epoch": 5.245796392540507, "grad_norm": 0.27543964982032776, "learning_rate": 2.6050698484140803e-05, "loss": 0.065, "step": 17159 }, { "epoch": 5.246102109446652, "grad_norm": 0.3493461608886719, "learning_rate": 2.605027387372086e-05, "loss": 0.0593, "step": 17160 }, { "epoch": 5.246407826352797, "grad_norm": 0.32883843779563904, "learning_rate": 2.6049849263300923e-05, "loss": 0.0544, "step": 17161 }, { "epoch": 5.2467135432589425, "grad_norm": 0.32210201025009155, "learning_rate": 2.6049424652880982e-05, "loss": 0.0618, "step": 17162 }, { "epoch": 5.247019260165088, "grad_norm": 0.512829065322876, "learning_rate": 2.6049000042461044e-05, "loss": 0.0434, "step": 17163 }, { "epoch": 5.247324977071232, "grad_norm": 0.5337696075439453, "learning_rate": 2.6048575432041103e-05, "loss": 0.079, "step": 17164 }, { "epoch": 5.247630693977377, "grad_norm": 2.536505937576294, "learning_rate": 2.604815082162116e-05, "loss": 0.0729, "step": 17165 }, { "epoch": 5.247936410883522, "grad_norm": 0.9277153611183167, "learning_rate": 2.6047726211201224e-05, "loss": 0.0788, "step": 17166 }, { "epoch": 5.248242127789667, "grad_norm": 0.3962959945201874, "learning_rate": 2.6047301600781282e-05, "loss": 0.0815, "step": 17167 }, { "epoch": 5.248547844695811, "grad_norm": 0.45308056473731995, "learning_rate": 2.6046876990361344e-05, "loss": 0.0744, "step": 17168 }, { "epoch": 5.2488535616019565, "grad_norm": 1.4351154565811157, "learning_rate": 2.6046452379941403e-05, "loss": 0.1155, "step": 17169 }, { "epoch": 5.249159278508102, "grad_norm": 0.43905335664749146, "learning_rate": 2.6046027769521465e-05, "loss": 0.1078, "step": 17170 }, { "epoch": 5.249464995414247, "grad_norm": 0.6177271008491516, "learning_rate": 2.6045603159101524e-05, "loss": 0.1713, "step": 17171 }, { "epoch": 5.249770712320391, "grad_norm": 0.5411296486854553, "learning_rate": 2.6045178548681586e-05, "loss": 0.1747, "step": 17172 }, { "epoch": 5.250076429226536, "grad_norm": 0.4707507789134979, "learning_rate": 2.6044753938261645e-05, "loss": 0.1613, "step": 17173 }, { "epoch": 5.250382146132681, "grad_norm": 0.953357994556427, "learning_rate": 2.6044329327841707e-05, "loss": 0.1582, "step": 17174 }, { "epoch": 5.250687863038826, "grad_norm": 0.9560423493385315, "learning_rate": 2.6043904717421766e-05, "loss": 0.177, "step": 17175 }, { "epoch": 5.250993579944971, "grad_norm": 0.5654507279396057, "learning_rate": 2.6043480107001828e-05, "loss": 0.1678, "step": 17176 }, { "epoch": 5.251299296851116, "grad_norm": 1.4232200384140015, "learning_rate": 2.6043055496581886e-05, "loss": 0.1666, "step": 17177 }, { "epoch": 5.251605013757261, "grad_norm": 0.766558825969696, "learning_rate": 2.6042630886161945e-05, "loss": 0.198, "step": 17178 }, { "epoch": 5.251910730663406, "grad_norm": 1.0041773319244385, "learning_rate": 2.6042206275742007e-05, "loss": 0.2218, "step": 17179 }, { "epoch": 5.252216447569551, "grad_norm": 1.7791972160339355, "learning_rate": 2.6041781665322066e-05, "loss": 0.2015, "step": 17180 }, { "epoch": 5.252522164475695, "grad_norm": 0.4961565434932709, "learning_rate": 2.6041357054902128e-05, "loss": 0.1483, "step": 17181 }, { "epoch": 5.25282788138184, "grad_norm": 0.4302699863910675, "learning_rate": 2.6040932444482187e-05, "loss": 0.0784, "step": 17182 }, { "epoch": 5.253133598287985, "grad_norm": 0.4155394732952118, "learning_rate": 2.604050783406225e-05, "loss": 0.0837, "step": 17183 }, { "epoch": 5.2534393151941305, "grad_norm": 0.4033893942832947, "learning_rate": 2.6040083223642307e-05, "loss": 0.0586, "step": 17184 }, { "epoch": 5.253745032100275, "grad_norm": 0.38707855343818665, "learning_rate": 2.603965861322237e-05, "loss": 0.0457, "step": 17185 }, { "epoch": 5.25405074900642, "grad_norm": 0.6979387998580933, "learning_rate": 2.6039234002802428e-05, "loss": 0.0573, "step": 17186 }, { "epoch": 5.254356465912565, "grad_norm": 0.22843392193317413, "learning_rate": 2.603880939238249e-05, "loss": 0.0533, "step": 17187 }, { "epoch": 5.25466218281871, "grad_norm": 0.5077956914901733, "learning_rate": 2.603838478196255e-05, "loss": 0.0587, "step": 17188 }, { "epoch": 5.254967899724855, "grad_norm": 0.3902338147163391, "learning_rate": 2.603796017154261e-05, "loss": 0.0726, "step": 17189 }, { "epoch": 5.2552736166309995, "grad_norm": 0.4844670593738556, "learning_rate": 2.603753556112267e-05, "loss": 0.0445, "step": 17190 }, { "epoch": 5.255579333537145, "grad_norm": 1.4803105592727661, "learning_rate": 2.603711095070273e-05, "loss": 0.0959, "step": 17191 }, { "epoch": 5.25588505044329, "grad_norm": 0.6733037233352661, "learning_rate": 2.603668634028279e-05, "loss": 0.0925, "step": 17192 }, { "epoch": 5.256190767349435, "grad_norm": 0.5551050901412964, "learning_rate": 2.603626172986285e-05, "loss": 0.0838, "step": 17193 }, { "epoch": 5.256496484255579, "grad_norm": 0.5808466076850891, "learning_rate": 2.603583711944291e-05, "loss": 0.1542, "step": 17194 }, { "epoch": 5.256802201161724, "grad_norm": 0.6103107929229736, "learning_rate": 2.603541250902297e-05, "loss": 0.1373, "step": 17195 }, { "epoch": 5.257107918067869, "grad_norm": 0.9395822286605835, "learning_rate": 2.6034987898603032e-05, "loss": 0.1313, "step": 17196 }, { "epoch": 5.257413634974014, "grad_norm": 0.532863199710846, "learning_rate": 2.603456328818309e-05, "loss": 0.1251, "step": 17197 }, { "epoch": 5.257719351880159, "grad_norm": 0.7362218499183655, "learning_rate": 2.6034138677763153e-05, "loss": 0.1463, "step": 17198 }, { "epoch": 5.258025068786304, "grad_norm": 1.3059393167495728, "learning_rate": 2.603371406734321e-05, "loss": 0.2072, "step": 17199 }, { "epoch": 5.258330785692449, "grad_norm": 4.0272698402404785, "learning_rate": 2.6033289456923274e-05, "loss": 0.174, "step": 17200 }, { "epoch": 5.258636502598594, "grad_norm": 1.2369298934936523, "learning_rate": 2.6032864846503332e-05, "loss": 0.1824, "step": 17201 }, { "epoch": 5.258942219504739, "grad_norm": 0.7370820641517639, "learning_rate": 2.6032440236083394e-05, "loss": 0.1814, "step": 17202 }, { "epoch": 5.259247936410883, "grad_norm": 1.4216235876083374, "learning_rate": 2.6032015625663453e-05, "loss": 0.1671, "step": 17203 }, { "epoch": 5.259553653317028, "grad_norm": 1.4005155563354492, "learning_rate": 2.6031591015243512e-05, "loss": 0.2119, "step": 17204 }, { "epoch": 5.2598593702231735, "grad_norm": 1.2369505167007446, "learning_rate": 2.6031166404823574e-05, "loss": 0.2115, "step": 17205 }, { "epoch": 5.260165087129319, "grad_norm": 0.3303484618663788, "learning_rate": 2.6030741794403633e-05, "loss": 0.1144, "step": 17206 }, { "epoch": 5.260470804035463, "grad_norm": 0.3570588231086731, "learning_rate": 2.6030317183983695e-05, "loss": 0.0872, "step": 17207 }, { "epoch": 5.260776520941608, "grad_norm": 0.31281962990760803, "learning_rate": 2.6029892573563753e-05, "loss": 0.0774, "step": 17208 }, { "epoch": 5.261082237847753, "grad_norm": 0.15285535156726837, "learning_rate": 2.6029467963143816e-05, "loss": 0.0428, "step": 17209 }, { "epoch": 5.261387954753898, "grad_norm": 0.3094017803668976, "learning_rate": 2.6029043352723874e-05, "loss": 0.0678, "step": 17210 }, { "epoch": 5.261693671660042, "grad_norm": 0.3275777995586395, "learning_rate": 2.6028618742303936e-05, "loss": 0.0748, "step": 17211 }, { "epoch": 5.2619993885661875, "grad_norm": 0.39929166436195374, "learning_rate": 2.6028194131883995e-05, "loss": 0.0616, "step": 17212 }, { "epoch": 5.262305105472333, "grad_norm": 0.4670945107936859, "learning_rate": 2.6027769521464057e-05, "loss": 0.0609, "step": 17213 }, { "epoch": 5.262610822378478, "grad_norm": 1.7056195735931396, "learning_rate": 2.602734491104412e-05, "loss": 0.0683, "step": 17214 }, { "epoch": 5.262916539284623, "grad_norm": 0.31235161423683167, "learning_rate": 2.602692030062418e-05, "loss": 0.077, "step": 17215 }, { "epoch": 5.263222256190767, "grad_norm": 0.38519105315208435, "learning_rate": 2.602649569020424e-05, "loss": 0.0956, "step": 17216 }, { "epoch": 5.263527973096912, "grad_norm": 0.33440011739730835, "learning_rate": 2.60260710797843e-05, "loss": 0.0909, "step": 17217 }, { "epoch": 5.263833690003057, "grad_norm": 0.6007805466651917, "learning_rate": 2.602564646936436e-05, "loss": 0.0943, "step": 17218 }, { "epoch": 5.2641394069092025, "grad_norm": 0.544255256652832, "learning_rate": 2.602522185894442e-05, "loss": 0.1183, "step": 17219 }, { "epoch": 5.264445123815347, "grad_norm": 0.36643359065055847, "learning_rate": 2.602479724852448e-05, "loss": 0.105, "step": 17220 }, { "epoch": 5.264750840721492, "grad_norm": 0.4963669180870056, "learning_rate": 2.602437263810454e-05, "loss": 0.1237, "step": 17221 }, { "epoch": 5.265056557627637, "grad_norm": 0.8021164536476135, "learning_rate": 2.6023948027684602e-05, "loss": 0.1386, "step": 17222 }, { "epoch": 5.265362274533782, "grad_norm": 0.46744436025619507, "learning_rate": 2.602352341726466e-05, "loss": 0.1531, "step": 17223 }, { "epoch": 5.265667991439926, "grad_norm": 0.5686823725700378, "learning_rate": 2.6023098806844723e-05, "loss": 0.1727, "step": 17224 }, { "epoch": 5.265973708346071, "grad_norm": 0.5567758083343506, "learning_rate": 2.6022674196424782e-05, "loss": 0.1431, "step": 17225 }, { "epoch": 5.2662794252522165, "grad_norm": 1.5232049226760864, "learning_rate": 2.6022249586004844e-05, "loss": 0.1853, "step": 17226 }, { "epoch": 5.266585142158362, "grad_norm": 0.7006996273994446, "learning_rate": 2.6021824975584903e-05, "loss": 0.18, "step": 17227 }, { "epoch": 5.266890859064507, "grad_norm": 0.5871265530586243, "learning_rate": 2.6021400365164965e-05, "loss": 0.1915, "step": 17228 }, { "epoch": 5.267196575970651, "grad_norm": 0.9886505603790283, "learning_rate": 2.6020975754745023e-05, "loss": 0.1687, "step": 17229 }, { "epoch": 5.267502292876796, "grad_norm": 1.6441172361373901, "learning_rate": 2.6020551144325082e-05, "loss": 0.2247, "step": 17230 }, { "epoch": 5.267808009782941, "grad_norm": 0.3439050614833832, "learning_rate": 2.6020126533905144e-05, "loss": 0.143, "step": 17231 }, { "epoch": 5.268113726689086, "grad_norm": 0.28478437662124634, "learning_rate": 2.6019701923485203e-05, "loss": 0.067, "step": 17232 }, { "epoch": 5.2684194435952305, "grad_norm": 0.27702683210372925, "learning_rate": 2.6019277313065265e-05, "loss": 0.0452, "step": 17233 }, { "epoch": 5.268725160501376, "grad_norm": 0.16727256774902344, "learning_rate": 2.6018852702645324e-05, "loss": 0.0571, "step": 17234 }, { "epoch": 5.269030877407521, "grad_norm": 0.2677350342273712, "learning_rate": 2.6018428092225386e-05, "loss": 0.0545, "step": 17235 }, { "epoch": 5.269336594313666, "grad_norm": 0.21428196132183075, "learning_rate": 2.6018003481805444e-05, "loss": 0.0614, "step": 17236 }, { "epoch": 5.26964231121981, "grad_norm": 0.21090549230575562, "learning_rate": 2.6017578871385507e-05, "loss": 0.0574, "step": 17237 }, { "epoch": 5.269948028125955, "grad_norm": 0.30823075771331787, "learning_rate": 2.6017154260965565e-05, "loss": 0.0644, "step": 17238 }, { "epoch": 5.2702537450321, "grad_norm": 0.2613312005996704, "learning_rate": 2.6016729650545627e-05, "loss": 0.0581, "step": 17239 }, { "epoch": 5.270559461938245, "grad_norm": 0.2530074715614319, "learning_rate": 2.6016305040125686e-05, "loss": 0.0852, "step": 17240 }, { "epoch": 5.2708651788443905, "grad_norm": 0.33697253465652466, "learning_rate": 2.6015880429705745e-05, "loss": 0.0797, "step": 17241 }, { "epoch": 5.271170895750535, "grad_norm": 0.5063658952713013, "learning_rate": 2.6015455819285807e-05, "loss": 0.073, "step": 17242 }, { "epoch": 5.27147661265668, "grad_norm": 0.3126816153526306, "learning_rate": 2.6015031208865866e-05, "loss": 0.1249, "step": 17243 }, { "epoch": 5.271782329562825, "grad_norm": 0.9730883836746216, "learning_rate": 2.6014606598445928e-05, "loss": 0.0833, "step": 17244 }, { "epoch": 5.27208804646897, "grad_norm": 0.39581018686294556, "learning_rate": 2.6014181988025986e-05, "loss": 0.1053, "step": 17245 }, { "epoch": 5.272393763375114, "grad_norm": 0.5055803656578064, "learning_rate": 2.601375737760605e-05, "loss": 0.1671, "step": 17246 }, { "epoch": 5.2726994802812595, "grad_norm": 1.126314401626587, "learning_rate": 2.6013332767186107e-05, "loss": 0.1604, "step": 17247 }, { "epoch": 5.273005197187405, "grad_norm": 0.47635579109191895, "learning_rate": 2.601290815676617e-05, "loss": 0.1548, "step": 17248 }, { "epoch": 5.27331091409355, "grad_norm": 0.39993491768836975, "learning_rate": 2.6012483546346228e-05, "loss": 0.1467, "step": 17249 }, { "epoch": 5.273616630999694, "grad_norm": 0.6744846701622009, "learning_rate": 2.601205893592629e-05, "loss": 0.1614, "step": 17250 }, { "epoch": 5.273922347905839, "grad_norm": 0.5040090680122375, "learning_rate": 2.601163432550635e-05, "loss": 0.2055, "step": 17251 }, { "epoch": 5.274228064811984, "grad_norm": 0.8020264506340027, "learning_rate": 2.601120971508641e-05, "loss": 0.2096, "step": 17252 }, { "epoch": 5.274533781718129, "grad_norm": 0.6218999624252319, "learning_rate": 2.601078510466647e-05, "loss": 0.2166, "step": 17253 }, { "epoch": 5.274839498624274, "grad_norm": 1.106251835823059, "learning_rate": 2.6010360494246528e-05, "loss": 0.1798, "step": 17254 }, { "epoch": 5.275145215530419, "grad_norm": 1.346760630607605, "learning_rate": 2.600993588382659e-05, "loss": 0.2524, "step": 17255 }, { "epoch": 5.275450932436564, "grad_norm": 0.6719956398010254, "learning_rate": 2.600951127340665e-05, "loss": 0.1238, "step": 17256 }, { "epoch": 5.275756649342709, "grad_norm": 0.6009935140609741, "learning_rate": 2.600908666298671e-05, "loss": 0.1142, "step": 17257 }, { "epoch": 5.276062366248854, "grad_norm": 0.24464759230613708, "learning_rate": 2.600866205256677e-05, "loss": 0.0945, "step": 17258 }, { "epoch": 5.276368083154998, "grad_norm": 0.228623166680336, "learning_rate": 2.6008237442146832e-05, "loss": 0.0763, "step": 17259 }, { "epoch": 5.276673800061143, "grad_norm": 0.15820106863975525, "learning_rate": 2.600781283172689e-05, "loss": 0.0513, "step": 17260 }, { "epoch": 5.276979516967288, "grad_norm": 0.25959089398384094, "learning_rate": 2.6007388221306953e-05, "loss": 0.0621, "step": 17261 }, { "epoch": 5.2772852338734335, "grad_norm": 0.4203749895095825, "learning_rate": 2.600696361088701e-05, "loss": 0.0616, "step": 17262 }, { "epoch": 5.277590950779578, "grad_norm": 1.241800308227539, "learning_rate": 2.6006539000467073e-05, "loss": 0.0669, "step": 17263 }, { "epoch": 5.277896667685723, "grad_norm": 0.8202168941497803, "learning_rate": 2.6006114390047132e-05, "loss": 0.0946, "step": 17264 }, { "epoch": 5.278202384591868, "grad_norm": 0.7856286764144897, "learning_rate": 2.6005689779627194e-05, "loss": 0.0688, "step": 17265 }, { "epoch": 5.278508101498013, "grad_norm": 0.9621418714523315, "learning_rate": 2.6005265169207253e-05, "loss": 0.0798, "step": 17266 }, { "epoch": 5.278813818404158, "grad_norm": 0.43488556146621704, "learning_rate": 2.600484055878731e-05, "loss": 0.0868, "step": 17267 }, { "epoch": 5.279119535310302, "grad_norm": 0.2936912477016449, "learning_rate": 2.6004415948367374e-05, "loss": 0.0729, "step": 17268 }, { "epoch": 5.2794252522164475, "grad_norm": 0.43014803528785706, "learning_rate": 2.6003991337947432e-05, "loss": 0.1397, "step": 17269 }, { "epoch": 5.279730969122593, "grad_norm": 0.7926280498504639, "learning_rate": 2.6003566727527494e-05, "loss": 0.1335, "step": 17270 }, { "epoch": 5.280036686028738, "grad_norm": 0.8460874557495117, "learning_rate": 2.6003142117107553e-05, "loss": 0.1522, "step": 17271 }, { "epoch": 5.280342402934882, "grad_norm": 0.731002926826477, "learning_rate": 2.6002717506687615e-05, "loss": 0.136, "step": 17272 }, { "epoch": 5.280648119841027, "grad_norm": 0.47834861278533936, "learning_rate": 2.6002292896267674e-05, "loss": 0.1739, "step": 17273 }, { "epoch": 5.280953836747172, "grad_norm": 0.6096823215484619, "learning_rate": 2.6001868285847736e-05, "loss": 0.1896, "step": 17274 }, { "epoch": 5.281259553653317, "grad_norm": 0.8816190361976624, "learning_rate": 2.6001443675427795e-05, "loss": 0.1597, "step": 17275 }, { "epoch": 5.281565270559462, "grad_norm": 2.9597699642181396, "learning_rate": 2.6001019065007857e-05, "loss": 0.1636, "step": 17276 }, { "epoch": 5.281870987465607, "grad_norm": 2.5121657848358154, "learning_rate": 2.6000594454587916e-05, "loss": 0.2024, "step": 17277 }, { "epoch": 5.282176704371752, "grad_norm": 0.8050329089164734, "learning_rate": 2.6000169844167978e-05, "loss": 0.2123, "step": 17278 }, { "epoch": 5.282482421277897, "grad_norm": 1.2476513385772705, "learning_rate": 2.5999745233748036e-05, "loss": 0.2522, "step": 17279 }, { "epoch": 5.282788138184042, "grad_norm": 1.6332908868789673, "learning_rate": 2.5999320623328095e-05, "loss": 0.2215, "step": 17280 }, { "epoch": 5.283093855090186, "grad_norm": 0.44571974873542786, "learning_rate": 2.5998896012908157e-05, "loss": 0.1615, "step": 17281 }, { "epoch": 5.283399571996331, "grad_norm": 0.4092874825000763, "learning_rate": 2.5998471402488216e-05, "loss": 0.0739, "step": 17282 }, { "epoch": 5.2837052889024765, "grad_norm": 3.3886170387268066, "learning_rate": 2.5998046792068278e-05, "loss": 0.0771, "step": 17283 }, { "epoch": 5.284011005808622, "grad_norm": 0.69985032081604, "learning_rate": 2.5997622181648337e-05, "loss": 0.0515, "step": 17284 }, { "epoch": 5.284316722714766, "grad_norm": 0.45458751916885376, "learning_rate": 2.59971975712284e-05, "loss": 0.052, "step": 17285 }, { "epoch": 5.284622439620911, "grad_norm": 0.9507390856742859, "learning_rate": 2.5996772960808457e-05, "loss": 0.0555, "step": 17286 }, { "epoch": 5.284928156527056, "grad_norm": 0.40967321395874023, "learning_rate": 2.599634835038852e-05, "loss": 0.0516, "step": 17287 }, { "epoch": 5.285233873433201, "grad_norm": 0.5870233178138733, "learning_rate": 2.5995923739968578e-05, "loss": 0.0622, "step": 17288 }, { "epoch": 5.285539590339345, "grad_norm": 0.4609624445438385, "learning_rate": 2.599549912954864e-05, "loss": 0.0785, "step": 17289 }, { "epoch": 5.2858453072454905, "grad_norm": 0.33990681171417236, "learning_rate": 2.59950745191287e-05, "loss": 0.0831, "step": 17290 }, { "epoch": 5.286151024151636, "grad_norm": 0.5933921933174133, "learning_rate": 2.599464990870876e-05, "loss": 0.0981, "step": 17291 }, { "epoch": 5.286456741057781, "grad_norm": 0.3558615446090698, "learning_rate": 2.599422529828882e-05, "loss": 0.075, "step": 17292 }, { "epoch": 5.286762457963926, "grad_norm": 0.44001996517181396, "learning_rate": 2.599380068786888e-05, "loss": 0.074, "step": 17293 }, { "epoch": 5.28706817487007, "grad_norm": 1.8426579236984253, "learning_rate": 2.599337607744894e-05, "loss": 0.1142, "step": 17294 }, { "epoch": 5.287373891776215, "grad_norm": 1.2937966585159302, "learning_rate": 2.5992951467029e-05, "loss": 0.1414, "step": 17295 }, { "epoch": 5.28767960868236, "grad_norm": 0.5763744115829468, "learning_rate": 2.599252685660906e-05, "loss": 0.1559, "step": 17296 }, { "epoch": 5.287985325588505, "grad_norm": 1.401045799255371, "learning_rate": 2.599210224618912e-05, "loss": 0.1897, "step": 17297 }, { "epoch": 5.28829104249465, "grad_norm": 4.105606555938721, "learning_rate": 2.5991677635769182e-05, "loss": 0.1484, "step": 17298 }, { "epoch": 5.288596759400795, "grad_norm": 0.8096147775650024, "learning_rate": 2.599125302534924e-05, "loss": 0.1583, "step": 17299 }, { "epoch": 5.28890247630694, "grad_norm": 1.8954200744628906, "learning_rate": 2.5990828414929303e-05, "loss": 0.1604, "step": 17300 }, { "epoch": 5.289208193213085, "grad_norm": 1.5278277397155762, "learning_rate": 2.599040380450936e-05, "loss": 0.1809, "step": 17301 }, { "epoch": 5.289513910119229, "grad_norm": 1.457716464996338, "learning_rate": 2.5989979194089424e-05, "loss": 0.1721, "step": 17302 }, { "epoch": 5.289819627025374, "grad_norm": 0.9752690196037292, "learning_rate": 2.5989554583669482e-05, "loss": 0.1773, "step": 17303 }, { "epoch": 5.2901253439315195, "grad_norm": 1.5574485063552856, "learning_rate": 2.5989129973249544e-05, "loss": 0.1784, "step": 17304 }, { "epoch": 5.290431060837665, "grad_norm": 5.237147808074951, "learning_rate": 2.5988705362829603e-05, "loss": 0.2233, "step": 17305 }, { "epoch": 5.29073677774381, "grad_norm": 0.4066639840602875, "learning_rate": 2.5988280752409662e-05, "loss": 0.1523, "step": 17306 }, { "epoch": 5.291042494649954, "grad_norm": 0.5286878347396851, "learning_rate": 2.5987856141989724e-05, "loss": 0.0894, "step": 17307 }, { "epoch": 5.291348211556099, "grad_norm": 0.26321765780448914, "learning_rate": 2.5987431531569783e-05, "loss": 0.075, "step": 17308 }, { "epoch": 5.291653928462244, "grad_norm": 0.219460129737854, "learning_rate": 2.5987006921149845e-05, "loss": 0.0514, "step": 17309 }, { "epoch": 5.291959645368389, "grad_norm": 0.2584701180458069, "learning_rate": 2.5986582310729903e-05, "loss": 0.0564, "step": 17310 }, { "epoch": 5.2922653622745335, "grad_norm": 0.3969259262084961, "learning_rate": 2.5986157700309966e-05, "loss": 0.0684, "step": 17311 }, { "epoch": 5.292571079180679, "grad_norm": 0.27650028467178345, "learning_rate": 2.5985733089890024e-05, "loss": 0.0747, "step": 17312 }, { "epoch": 5.292876796086824, "grad_norm": 0.3834875226020813, "learning_rate": 2.5985308479470086e-05, "loss": 0.0868, "step": 17313 }, { "epoch": 5.293182512992969, "grad_norm": 0.23372012376785278, "learning_rate": 2.5984883869050145e-05, "loss": 0.0671, "step": 17314 }, { "epoch": 5.293488229899113, "grad_norm": 0.5836880207061768, "learning_rate": 2.5984459258630207e-05, "loss": 0.0499, "step": 17315 }, { "epoch": 5.293793946805258, "grad_norm": 0.3614482581615448, "learning_rate": 2.598403464821027e-05, "loss": 0.1061, "step": 17316 }, { "epoch": 5.294099663711403, "grad_norm": 0.245935320854187, "learning_rate": 2.598361003779033e-05, "loss": 0.073, "step": 17317 }, { "epoch": 5.294405380617548, "grad_norm": 0.5125656127929688, "learning_rate": 2.598318542737039e-05, "loss": 0.0947, "step": 17318 }, { "epoch": 5.2947110975236935, "grad_norm": 0.8183099031448364, "learning_rate": 2.598276081695045e-05, "loss": 0.1437, "step": 17319 }, { "epoch": 5.295016814429838, "grad_norm": 0.7026053667068481, "learning_rate": 2.598233620653051e-05, "loss": 0.1075, "step": 17320 }, { "epoch": 5.295322531335983, "grad_norm": 0.8232913017272949, "learning_rate": 2.598191159611057e-05, "loss": 0.1668, "step": 17321 }, { "epoch": 5.295628248242128, "grad_norm": 0.8139181733131409, "learning_rate": 2.598148698569063e-05, "loss": 0.1443, "step": 17322 }, { "epoch": 5.295933965148273, "grad_norm": 0.4871729016304016, "learning_rate": 2.598106237527069e-05, "loss": 0.1588, "step": 17323 }, { "epoch": 5.296239682054417, "grad_norm": 1.7474123239517212, "learning_rate": 2.5980637764850752e-05, "loss": 0.1571, "step": 17324 }, { "epoch": 5.296545398960562, "grad_norm": 2.8019235134124756, "learning_rate": 2.598021315443081e-05, "loss": 0.1843, "step": 17325 }, { "epoch": 5.2968511158667075, "grad_norm": 0.43262195587158203, "learning_rate": 2.5979788544010873e-05, "loss": 0.1526, "step": 17326 }, { "epoch": 5.297156832772853, "grad_norm": 1.2314540147781372, "learning_rate": 2.5979363933590932e-05, "loss": 0.1662, "step": 17327 }, { "epoch": 5.297462549678997, "grad_norm": 1.4232938289642334, "learning_rate": 2.5978939323170994e-05, "loss": 0.1957, "step": 17328 }, { "epoch": 5.297768266585142, "grad_norm": 4.744178771972656, "learning_rate": 2.5978514712751053e-05, "loss": 0.1773, "step": 17329 }, { "epoch": 5.298073983491287, "grad_norm": 1.9780455827713013, "learning_rate": 2.5978090102331115e-05, "loss": 0.243, "step": 17330 }, { "epoch": 5.298379700397432, "grad_norm": 0.32747676968574524, "learning_rate": 2.5977665491911173e-05, "loss": 0.1605, "step": 17331 }, { "epoch": 5.298685417303577, "grad_norm": 0.26165780425071716, "learning_rate": 2.5977240881491232e-05, "loss": 0.0946, "step": 17332 }, { "epoch": 5.298991134209722, "grad_norm": 0.829535186290741, "learning_rate": 2.5976816271071294e-05, "loss": 0.0671, "step": 17333 }, { "epoch": 5.299296851115867, "grad_norm": 0.40366220474243164, "learning_rate": 2.5976391660651353e-05, "loss": 0.0503, "step": 17334 }, { "epoch": 5.299602568022012, "grad_norm": 0.7793444395065308, "learning_rate": 2.5975967050231415e-05, "loss": 0.0681, "step": 17335 }, { "epoch": 5.299908284928157, "grad_norm": 0.3614521026611328, "learning_rate": 2.5975542439811474e-05, "loss": 0.0669, "step": 17336 }, { "epoch": 5.300214001834301, "grad_norm": 0.3222470283508301, "learning_rate": 2.5975117829391536e-05, "loss": 0.0565, "step": 17337 }, { "epoch": 5.300519718740446, "grad_norm": 0.5457300543785095, "learning_rate": 2.5974693218971594e-05, "loss": 0.0745, "step": 17338 }, { "epoch": 5.300825435646591, "grad_norm": 0.333141028881073, "learning_rate": 2.5974268608551657e-05, "loss": 0.066, "step": 17339 }, { "epoch": 5.3011311525527365, "grad_norm": 0.3645116686820984, "learning_rate": 2.5973843998131715e-05, "loss": 0.0755, "step": 17340 }, { "epoch": 5.301436869458881, "grad_norm": 0.5593229532241821, "learning_rate": 2.5973419387711777e-05, "loss": 0.0905, "step": 17341 }, { "epoch": 5.301742586365026, "grad_norm": 1.2781566381454468, "learning_rate": 2.5972994777291836e-05, "loss": 0.0755, "step": 17342 }, { "epoch": 5.302048303271171, "grad_norm": 0.46636897325515747, "learning_rate": 2.5972570166871898e-05, "loss": 0.0831, "step": 17343 }, { "epoch": 5.302354020177316, "grad_norm": 2.1506054401397705, "learning_rate": 2.5972145556451957e-05, "loss": 0.1363, "step": 17344 }, { "epoch": 5.302659737083461, "grad_norm": 0.44152069091796875, "learning_rate": 2.5971720946032016e-05, "loss": 0.1098, "step": 17345 }, { "epoch": 5.302965453989605, "grad_norm": 0.8622969388961792, "learning_rate": 2.5971296335612078e-05, "loss": 0.1213, "step": 17346 }, { "epoch": 5.3032711708957505, "grad_norm": 0.7351446151733398, "learning_rate": 2.5970871725192136e-05, "loss": 0.1669, "step": 17347 }, { "epoch": 5.303576887801896, "grad_norm": 1.6256924867630005, "learning_rate": 2.59704471147722e-05, "loss": 0.1353, "step": 17348 }, { "epoch": 5.303882604708041, "grad_norm": 0.4934519827365875, "learning_rate": 2.5970022504352257e-05, "loss": 0.1929, "step": 17349 }, { "epoch": 5.304188321614185, "grad_norm": 0.8993542790412903, "learning_rate": 2.596959789393232e-05, "loss": 0.1827, "step": 17350 }, { "epoch": 5.30449403852033, "grad_norm": 1.2997190952301025, "learning_rate": 2.5969173283512378e-05, "loss": 0.1701, "step": 17351 }, { "epoch": 5.304799755426475, "grad_norm": 2.047340154647827, "learning_rate": 2.596874867309244e-05, "loss": 0.1537, "step": 17352 }, { "epoch": 5.30510547233262, "grad_norm": 1.0968362092971802, "learning_rate": 2.59683240626725e-05, "loss": 0.1746, "step": 17353 }, { "epoch": 5.3054111892387645, "grad_norm": 0.8943732976913452, "learning_rate": 2.596789945225256e-05, "loss": 0.2111, "step": 17354 }, { "epoch": 5.30571690614491, "grad_norm": 1.2364776134490967, "learning_rate": 2.596747484183262e-05, "loss": 0.2071, "step": 17355 }, { "epoch": 5.306022623051055, "grad_norm": 0.8191848397254944, "learning_rate": 2.5967050231412678e-05, "loss": 0.1506, "step": 17356 }, { "epoch": 5.3063283399572, "grad_norm": 0.32629773020744324, "learning_rate": 2.596662562099274e-05, "loss": 0.0621, "step": 17357 }, { "epoch": 5.306634056863345, "grad_norm": 0.3109055757522583, "learning_rate": 2.59662010105728e-05, "loss": 0.0584, "step": 17358 }, { "epoch": 5.306939773769489, "grad_norm": 0.26789429783821106, "learning_rate": 2.596577640015286e-05, "loss": 0.0586, "step": 17359 }, { "epoch": 5.307245490675634, "grad_norm": 0.8307021856307983, "learning_rate": 2.596535178973292e-05, "loss": 0.0425, "step": 17360 }, { "epoch": 5.3075512075817795, "grad_norm": 0.44943034648895264, "learning_rate": 2.5964927179312982e-05, "loss": 0.0434, "step": 17361 }, { "epoch": 5.307856924487925, "grad_norm": 0.6218191981315613, "learning_rate": 2.596450256889304e-05, "loss": 0.0675, "step": 17362 }, { "epoch": 5.308162641394069, "grad_norm": 0.38206157088279724, "learning_rate": 2.5964077958473103e-05, "loss": 0.0557, "step": 17363 }, { "epoch": 5.308468358300214, "grad_norm": 0.3280293047428131, "learning_rate": 2.596365334805316e-05, "loss": 0.0771, "step": 17364 }, { "epoch": 5.308774075206359, "grad_norm": 0.30513545870780945, "learning_rate": 2.5963228737633223e-05, "loss": 0.0395, "step": 17365 }, { "epoch": 5.309079792112504, "grad_norm": 0.3037160038948059, "learning_rate": 2.5962804127213282e-05, "loss": 0.0877, "step": 17366 }, { "epoch": 5.309385509018648, "grad_norm": 0.2099308967590332, "learning_rate": 2.5962379516793344e-05, "loss": 0.0546, "step": 17367 }, { "epoch": 5.3096912259247935, "grad_norm": 0.3891242444515228, "learning_rate": 2.5961954906373403e-05, "loss": 0.0813, "step": 17368 }, { "epoch": 5.309996942830939, "grad_norm": 0.3551992177963257, "learning_rate": 2.596153029595346e-05, "loss": 0.1047, "step": 17369 }, { "epoch": 5.310302659737084, "grad_norm": 0.5575762391090393, "learning_rate": 2.5961105685533524e-05, "loss": 0.1393, "step": 17370 }, { "epoch": 5.310608376643229, "grad_norm": 0.7046176195144653, "learning_rate": 2.5960681075113582e-05, "loss": 0.1395, "step": 17371 }, { "epoch": 5.310914093549373, "grad_norm": 0.7109256982803345, "learning_rate": 2.5960256464693645e-05, "loss": 0.1462, "step": 17372 }, { "epoch": 5.311219810455518, "grad_norm": 0.4580579102039337, "learning_rate": 2.5959831854273703e-05, "loss": 0.1491, "step": 17373 }, { "epoch": 5.311525527361663, "grad_norm": 0.962948203086853, "learning_rate": 2.5959407243853765e-05, "loss": 0.1504, "step": 17374 }, { "epoch": 5.311831244267808, "grad_norm": 0.5743358731269836, "learning_rate": 2.5958982633433824e-05, "loss": 0.1614, "step": 17375 }, { "epoch": 5.312136961173953, "grad_norm": 0.552081286907196, "learning_rate": 2.5958558023013886e-05, "loss": 0.1708, "step": 17376 }, { "epoch": 5.312442678080098, "grad_norm": 0.7679868340492249, "learning_rate": 2.5958133412593945e-05, "loss": 0.1632, "step": 17377 }, { "epoch": 5.312748394986243, "grad_norm": 0.7319192886352539, "learning_rate": 2.5957708802174007e-05, "loss": 0.1974, "step": 17378 }, { "epoch": 5.313054111892388, "grad_norm": 1.1067944765090942, "learning_rate": 2.5957284191754066e-05, "loss": 0.1958, "step": 17379 }, { "epoch": 5.313359828798532, "grad_norm": 1.100860357284546, "learning_rate": 2.5956859581334128e-05, "loss": 0.2498, "step": 17380 }, { "epoch": 5.313665545704677, "grad_norm": 0.47977784276008606, "learning_rate": 2.5956434970914186e-05, "loss": 0.1591, "step": 17381 }, { "epoch": 5.313971262610822, "grad_norm": 0.3415321707725525, "learning_rate": 2.5956010360494245e-05, "loss": 0.0673, "step": 17382 }, { "epoch": 5.3142769795169675, "grad_norm": 0.4972634017467499, "learning_rate": 2.5955585750074307e-05, "loss": 0.0617, "step": 17383 }, { "epoch": 5.314582696423113, "grad_norm": 0.31220683455467224, "learning_rate": 2.5955161139654366e-05, "loss": 0.0907, "step": 17384 }, { "epoch": 5.314888413329257, "grad_norm": 0.21393296122550964, "learning_rate": 2.5954736529234428e-05, "loss": 0.0662, "step": 17385 }, { "epoch": 5.315194130235402, "grad_norm": 0.351020872592926, "learning_rate": 2.5954311918814487e-05, "loss": 0.057, "step": 17386 }, { "epoch": 5.315499847141547, "grad_norm": 0.17539171874523163, "learning_rate": 2.595388730839455e-05, "loss": 0.0436, "step": 17387 }, { "epoch": 5.315805564047692, "grad_norm": 0.3022637963294983, "learning_rate": 2.5953462697974607e-05, "loss": 0.0621, "step": 17388 }, { "epoch": 5.3161112809538364, "grad_norm": 0.27210450172424316, "learning_rate": 2.595303808755467e-05, "loss": 0.1018, "step": 17389 }, { "epoch": 5.316416997859982, "grad_norm": 2.1902315616607666, "learning_rate": 2.5952613477134728e-05, "loss": 0.0514, "step": 17390 }, { "epoch": 5.316722714766127, "grad_norm": 0.9180057644844055, "learning_rate": 2.595218886671479e-05, "loss": 0.0859, "step": 17391 }, { "epoch": 5.317028431672272, "grad_norm": 0.2894631028175354, "learning_rate": 2.595176425629485e-05, "loss": 0.0881, "step": 17392 }, { "epoch": 5.317334148578416, "grad_norm": 0.5945976376533508, "learning_rate": 2.595133964587491e-05, "loss": 0.0934, "step": 17393 }, { "epoch": 5.317639865484561, "grad_norm": 0.40354084968566895, "learning_rate": 2.595091503545497e-05, "loss": 0.0986, "step": 17394 }, { "epoch": 5.317945582390706, "grad_norm": 0.7662529945373535, "learning_rate": 2.595049042503503e-05, "loss": 0.1007, "step": 17395 }, { "epoch": 5.318251299296851, "grad_norm": 0.6297411918640137, "learning_rate": 2.595006581461509e-05, "loss": 0.128, "step": 17396 }, { "epoch": 5.3185570162029965, "grad_norm": 0.8485533595085144, "learning_rate": 2.594964120419515e-05, "loss": 0.1382, "step": 17397 }, { "epoch": 5.318862733109141, "grad_norm": 0.7497985363006592, "learning_rate": 2.594921659377521e-05, "loss": 0.1396, "step": 17398 }, { "epoch": 5.319168450015286, "grad_norm": 0.8798383474349976, "learning_rate": 2.594879198335527e-05, "loss": 0.1837, "step": 17399 }, { "epoch": 5.319474166921431, "grad_norm": 0.3978791832923889, "learning_rate": 2.5948367372935332e-05, "loss": 0.1718, "step": 17400 }, { "epoch": 5.319779883827576, "grad_norm": 1.0597656965255737, "learning_rate": 2.594794276251539e-05, "loss": 0.1777, "step": 17401 }, { "epoch": 5.32008560073372, "grad_norm": 0.8462112545967102, "learning_rate": 2.5947518152095453e-05, "loss": 0.2321, "step": 17402 }, { "epoch": 5.320391317639865, "grad_norm": 0.8579172492027283, "learning_rate": 2.594709354167551e-05, "loss": 0.1596, "step": 17403 }, { "epoch": 5.3206970345460105, "grad_norm": 0.7216127514839172, "learning_rate": 2.5946668931255574e-05, "loss": 0.2171, "step": 17404 }, { "epoch": 5.321002751452156, "grad_norm": 1.707323670387268, "learning_rate": 2.5946244320835632e-05, "loss": 0.2271, "step": 17405 }, { "epoch": 5.3213084683583, "grad_norm": 0.4871456027030945, "learning_rate": 2.5945819710415695e-05, "loss": 0.1593, "step": 17406 }, { "epoch": 5.321614185264445, "grad_norm": 0.17900077998638153, "learning_rate": 2.5945395099995753e-05, "loss": 0.0698, "step": 17407 }, { "epoch": 5.32191990217059, "grad_norm": 0.22153933346271515, "learning_rate": 2.5944970489575812e-05, "loss": 0.0806, "step": 17408 }, { "epoch": 5.322225619076735, "grad_norm": 0.22563311457633972, "learning_rate": 2.5944545879155874e-05, "loss": 0.0665, "step": 17409 }, { "epoch": 5.32253133598288, "grad_norm": 0.19180814921855927, "learning_rate": 2.5944121268735933e-05, "loss": 0.0789, "step": 17410 }, { "epoch": 5.3228370528890245, "grad_norm": 0.21868157386779785, "learning_rate": 2.5943696658315995e-05, "loss": 0.0345, "step": 17411 }, { "epoch": 5.32314276979517, "grad_norm": 0.21627368032932281, "learning_rate": 2.5943272047896053e-05, "loss": 0.0513, "step": 17412 }, { "epoch": 5.323448486701315, "grad_norm": 0.30907300114631653, "learning_rate": 2.5942847437476116e-05, "loss": 0.0704, "step": 17413 }, { "epoch": 5.32375420360746, "grad_norm": 0.23946785926818848, "learning_rate": 2.5942422827056174e-05, "loss": 0.0467, "step": 17414 }, { "epoch": 5.324059920513604, "grad_norm": 0.4162997901439667, "learning_rate": 2.5941998216636236e-05, "loss": 0.0504, "step": 17415 }, { "epoch": 5.324365637419749, "grad_norm": 0.4081616997718811, "learning_rate": 2.5941573606216295e-05, "loss": 0.1062, "step": 17416 }, { "epoch": 5.324671354325894, "grad_norm": 0.37132692337036133, "learning_rate": 2.5941148995796357e-05, "loss": 0.0683, "step": 17417 }, { "epoch": 5.3249770712320394, "grad_norm": 0.5934848189353943, "learning_rate": 2.5940724385376416e-05, "loss": 0.1254, "step": 17418 }, { "epoch": 5.325282788138184, "grad_norm": 0.3193126618862152, "learning_rate": 2.594029977495648e-05, "loss": 0.1387, "step": 17419 }, { "epoch": 5.325588505044329, "grad_norm": 0.3541451096534729, "learning_rate": 2.593987516453654e-05, "loss": 0.1206, "step": 17420 }, { "epoch": 5.325894221950474, "grad_norm": 0.4985538125038147, "learning_rate": 2.59394505541166e-05, "loss": 0.1279, "step": 17421 }, { "epoch": 5.326199938856619, "grad_norm": 0.5543042421340942, "learning_rate": 2.593902594369666e-05, "loss": 0.1421, "step": 17422 }, { "epoch": 5.326505655762764, "grad_norm": 0.8864631056785583, "learning_rate": 2.593860133327672e-05, "loss": 0.1763, "step": 17423 }, { "epoch": 5.326811372668908, "grad_norm": 0.6449218988418579, "learning_rate": 2.593817672285678e-05, "loss": 0.1908, "step": 17424 }, { "epoch": 5.3271170895750535, "grad_norm": 0.6614227294921875, "learning_rate": 2.593775211243684e-05, "loss": 0.1651, "step": 17425 }, { "epoch": 5.327422806481199, "grad_norm": 1.197358250617981, "learning_rate": 2.5937327502016902e-05, "loss": 0.1654, "step": 17426 }, { "epoch": 5.327728523387344, "grad_norm": 2.403608798980713, "learning_rate": 2.593690289159696e-05, "loss": 0.1888, "step": 17427 }, { "epoch": 5.328034240293488, "grad_norm": 2.8901383876800537, "learning_rate": 2.5936478281177023e-05, "loss": 0.258, "step": 17428 }, { "epoch": 5.328339957199633, "grad_norm": 4.140195846557617, "learning_rate": 2.5936053670757082e-05, "loss": 0.1903, "step": 17429 }, { "epoch": 5.328645674105778, "grad_norm": 2.233199119567871, "learning_rate": 2.5935629060337144e-05, "loss": 0.2268, "step": 17430 }, { "epoch": 5.328951391011923, "grad_norm": 0.6595281958580017, "learning_rate": 2.5935204449917203e-05, "loss": 0.1423, "step": 17431 }, { "epoch": 5.3292571079180675, "grad_norm": 0.6264349222183228, "learning_rate": 2.5934779839497265e-05, "loss": 0.0726, "step": 17432 }, { "epoch": 5.329562824824213, "grad_norm": 0.39046186208724976, "learning_rate": 2.5934355229077323e-05, "loss": 0.0475, "step": 17433 }, { "epoch": 5.329868541730358, "grad_norm": 0.20229856669902802, "learning_rate": 2.5933930618657382e-05, "loss": 0.0663, "step": 17434 }, { "epoch": 5.330174258636503, "grad_norm": 0.2820141613483429, "learning_rate": 2.5933506008237444e-05, "loss": 0.0604, "step": 17435 }, { "epoch": 5.330479975542648, "grad_norm": 0.18941202759742737, "learning_rate": 2.5933081397817503e-05, "loss": 0.0575, "step": 17436 }, { "epoch": 5.330785692448792, "grad_norm": 0.20571403205394745, "learning_rate": 2.5932656787397565e-05, "loss": 0.081, "step": 17437 }, { "epoch": 5.331091409354937, "grad_norm": 0.5634913444519043, "learning_rate": 2.5932232176977624e-05, "loss": 0.0498, "step": 17438 }, { "epoch": 5.331397126261082, "grad_norm": 0.3898214101791382, "learning_rate": 2.5931807566557686e-05, "loss": 0.0945, "step": 17439 }, { "epoch": 5.3317028431672275, "grad_norm": 2.9749221801757812, "learning_rate": 2.5931382956137745e-05, "loss": 0.0637, "step": 17440 }, { "epoch": 5.332008560073372, "grad_norm": 0.1829776018857956, "learning_rate": 2.5930958345717807e-05, "loss": 0.0525, "step": 17441 }, { "epoch": 5.332314276979517, "grad_norm": 0.801478922367096, "learning_rate": 2.5930533735297865e-05, "loss": 0.0879, "step": 17442 }, { "epoch": 5.332619993885662, "grad_norm": 0.2820027470588684, "learning_rate": 2.5930109124877927e-05, "loss": 0.1263, "step": 17443 }, { "epoch": 5.332925710791807, "grad_norm": 0.9314470291137695, "learning_rate": 2.5929684514457986e-05, "loss": 0.1043, "step": 17444 }, { "epoch": 5.333231427697951, "grad_norm": 0.4619143009185791, "learning_rate": 2.5929259904038048e-05, "loss": 0.1508, "step": 17445 }, { "epoch": 5.3335371446040964, "grad_norm": 0.6063992977142334, "learning_rate": 2.5928835293618107e-05, "loss": 0.134, "step": 17446 }, { "epoch": 5.333842861510242, "grad_norm": 0.6292773485183716, "learning_rate": 2.5928410683198166e-05, "loss": 0.161, "step": 17447 }, { "epoch": 5.334148578416387, "grad_norm": 1.5845324993133545, "learning_rate": 2.5927986072778228e-05, "loss": 0.1584, "step": 17448 }, { "epoch": 5.334454295322532, "grad_norm": 0.5995978116989136, "learning_rate": 2.5927561462358286e-05, "loss": 0.1468, "step": 17449 }, { "epoch": 5.334760012228676, "grad_norm": 1.1493276357650757, "learning_rate": 2.592713685193835e-05, "loss": 0.1649, "step": 17450 }, { "epoch": 5.335065729134821, "grad_norm": 0.4762287735939026, "learning_rate": 2.5926712241518407e-05, "loss": 0.1576, "step": 17451 }, { "epoch": 5.335371446040966, "grad_norm": 0.6663997173309326, "learning_rate": 2.592628763109847e-05, "loss": 0.1748, "step": 17452 }, { "epoch": 5.335677162947111, "grad_norm": 0.5730801224708557, "learning_rate": 2.5925863020678528e-05, "loss": 0.1786, "step": 17453 }, { "epoch": 5.335982879853256, "grad_norm": 0.9712129235267639, "learning_rate": 2.592543841025859e-05, "loss": 0.244, "step": 17454 }, { "epoch": 5.336288596759401, "grad_norm": 0.9940794706344604, "learning_rate": 2.592501379983865e-05, "loss": 0.2374, "step": 17455 }, { "epoch": 5.336594313665546, "grad_norm": 0.3819144368171692, "learning_rate": 2.592458918941871e-05, "loss": 0.1377, "step": 17456 }, { "epoch": 5.336900030571691, "grad_norm": 0.24364708364009857, "learning_rate": 2.592416457899877e-05, "loss": 0.0722, "step": 17457 }, { "epoch": 5.337205747477835, "grad_norm": 0.25737351179122925, "learning_rate": 2.592373996857883e-05, "loss": 0.0872, "step": 17458 }, { "epoch": 5.33751146438398, "grad_norm": 0.28755393624305725, "learning_rate": 2.592331535815889e-05, "loss": 0.0838, "step": 17459 }, { "epoch": 5.337817181290125, "grad_norm": 0.45376813411712646, "learning_rate": 2.592289074773895e-05, "loss": 0.0536, "step": 17460 }, { "epoch": 5.3381228981962705, "grad_norm": 0.4518970847129822, "learning_rate": 2.592246613731901e-05, "loss": 0.0415, "step": 17461 }, { "epoch": 5.338428615102416, "grad_norm": 0.6104654669761658, "learning_rate": 2.592204152689907e-05, "loss": 0.0542, "step": 17462 }, { "epoch": 5.33873433200856, "grad_norm": 0.3692210614681244, "learning_rate": 2.5921616916479132e-05, "loss": 0.0711, "step": 17463 }, { "epoch": 5.339040048914705, "grad_norm": 0.41792139410972595, "learning_rate": 2.592119230605919e-05, "loss": 0.0604, "step": 17464 }, { "epoch": 5.33934576582085, "grad_norm": 0.3310199975967407, "learning_rate": 2.5920767695639253e-05, "loss": 0.0648, "step": 17465 }, { "epoch": 5.339651482726995, "grad_norm": 0.3717235326766968, "learning_rate": 2.592034308521931e-05, "loss": 0.0785, "step": 17466 }, { "epoch": 5.339957199633139, "grad_norm": 0.5348508358001709, "learning_rate": 2.5919918474799373e-05, "loss": 0.1237, "step": 17467 }, { "epoch": 5.3402629165392845, "grad_norm": 0.358508437871933, "learning_rate": 2.5919493864379432e-05, "loss": 0.0958, "step": 17468 }, { "epoch": 5.34056863344543, "grad_norm": 0.7156733274459839, "learning_rate": 2.5919069253959494e-05, "loss": 0.0983, "step": 17469 }, { "epoch": 5.340874350351575, "grad_norm": 0.3521808385848999, "learning_rate": 2.5918644643539553e-05, "loss": 0.1123, "step": 17470 }, { "epoch": 5.341180067257719, "grad_norm": 1.0991475582122803, "learning_rate": 2.5918220033119615e-05, "loss": 0.1442, "step": 17471 }, { "epoch": 5.341485784163864, "grad_norm": 0.5815557241439819, "learning_rate": 2.5917795422699674e-05, "loss": 0.1643, "step": 17472 }, { "epoch": 5.341791501070009, "grad_norm": 0.8540669083595276, "learning_rate": 2.5917370812279732e-05, "loss": 0.1658, "step": 17473 }, { "epoch": 5.342097217976154, "grad_norm": 0.6200188994407654, "learning_rate": 2.5916946201859795e-05, "loss": 0.1523, "step": 17474 }, { "epoch": 5.3424029348822994, "grad_norm": 0.6407157778739929, "learning_rate": 2.5916521591439853e-05, "loss": 0.1586, "step": 17475 }, { "epoch": 5.342708651788444, "grad_norm": 1.2162878513336182, "learning_rate": 2.5916096981019915e-05, "loss": 0.1961, "step": 17476 }, { "epoch": 5.343014368694589, "grad_norm": 0.8302090167999268, "learning_rate": 2.5915672370599974e-05, "loss": 0.1903, "step": 17477 }, { "epoch": 5.343320085600734, "grad_norm": 0.7557647824287415, "learning_rate": 2.5915247760180036e-05, "loss": 0.1991, "step": 17478 }, { "epoch": 5.343625802506879, "grad_norm": 0.6856497526168823, "learning_rate": 2.5914823149760095e-05, "loss": 0.1972, "step": 17479 }, { "epoch": 5.343931519413023, "grad_norm": 1.1442700624465942, "learning_rate": 2.5914398539340157e-05, "loss": 0.255, "step": 17480 }, { "epoch": 5.344237236319168, "grad_norm": 0.8059082627296448, "learning_rate": 2.5913973928920216e-05, "loss": 0.1765, "step": 17481 }, { "epoch": 5.3445429532253135, "grad_norm": 0.2400123029947281, "learning_rate": 2.5913549318500278e-05, "loss": 0.0925, "step": 17482 }, { "epoch": 5.344848670131459, "grad_norm": 0.5005520582199097, "learning_rate": 2.5913124708080336e-05, "loss": 0.0943, "step": 17483 }, { "epoch": 5.345154387037603, "grad_norm": 0.2210826873779297, "learning_rate": 2.5912700097660395e-05, "loss": 0.0663, "step": 17484 }, { "epoch": 5.345460103943748, "grad_norm": 0.3590957522392273, "learning_rate": 2.5912275487240457e-05, "loss": 0.0607, "step": 17485 }, { "epoch": 5.345765820849893, "grad_norm": 0.21954567730426788, "learning_rate": 2.5911850876820516e-05, "loss": 0.061, "step": 17486 }, { "epoch": 5.346071537756038, "grad_norm": 0.17073491215705872, "learning_rate": 2.5911426266400578e-05, "loss": 0.0535, "step": 17487 }, { "epoch": 5.346377254662183, "grad_norm": 1.3246979713439941, "learning_rate": 2.5911001655980637e-05, "loss": 0.0435, "step": 17488 }, { "epoch": 5.3466829715683275, "grad_norm": 0.7503647208213806, "learning_rate": 2.59105770455607e-05, "loss": 0.0828, "step": 17489 }, { "epoch": 5.346988688474473, "grad_norm": 0.2529979348182678, "learning_rate": 2.5910152435140757e-05, "loss": 0.0787, "step": 17490 }, { "epoch": 5.347294405380618, "grad_norm": 0.36763009428977966, "learning_rate": 2.590972782472082e-05, "loss": 0.0961, "step": 17491 }, { "epoch": 5.347600122286763, "grad_norm": 0.32145029306411743, "learning_rate": 2.5909303214300878e-05, "loss": 0.0848, "step": 17492 }, { "epoch": 5.347905839192907, "grad_norm": 0.3380149006843567, "learning_rate": 2.590887860388094e-05, "loss": 0.1174, "step": 17493 }, { "epoch": 5.348211556099052, "grad_norm": 0.40349531173706055, "learning_rate": 2.5908453993461e-05, "loss": 0.1271, "step": 17494 }, { "epoch": 5.348517273005197, "grad_norm": 0.7018424272537231, "learning_rate": 2.590802938304106e-05, "loss": 0.1128, "step": 17495 }, { "epoch": 5.348822989911342, "grad_norm": 0.3442157804965973, "learning_rate": 2.590760477262112e-05, "loss": 0.1185, "step": 17496 }, { "epoch": 5.349128706817487, "grad_norm": 0.822137176990509, "learning_rate": 2.590718016220118e-05, "loss": 0.1363, "step": 17497 }, { "epoch": 5.349434423723632, "grad_norm": 0.6066516041755676, "learning_rate": 2.590675555178124e-05, "loss": 0.171, "step": 17498 }, { "epoch": 5.349740140629777, "grad_norm": 0.4280405640602112, "learning_rate": 2.59063309413613e-05, "loss": 0.1317, "step": 17499 }, { "epoch": 5.350045857535922, "grad_norm": 0.46434471011161804, "learning_rate": 2.590590633094136e-05, "loss": 0.1983, "step": 17500 }, { "epoch": 5.350351574442067, "grad_norm": 1.293434977531433, "learning_rate": 2.590548172052142e-05, "loss": 0.1815, "step": 17501 }, { "epoch": 5.350657291348211, "grad_norm": 0.9794960021972656, "learning_rate": 2.5905057110101482e-05, "loss": 0.1756, "step": 17502 }, { "epoch": 5.3509630082543564, "grad_norm": 0.8945972323417664, "learning_rate": 2.590463249968154e-05, "loss": 0.1554, "step": 17503 }, { "epoch": 5.351268725160502, "grad_norm": 1.8548918962478638, "learning_rate": 2.5904207889261603e-05, "loss": 0.2065, "step": 17504 }, { "epoch": 5.351574442066647, "grad_norm": 0.8631103038787842, "learning_rate": 2.590378327884166e-05, "loss": 0.2229, "step": 17505 }, { "epoch": 5.351880158972791, "grad_norm": 0.427567720413208, "learning_rate": 2.5903358668421724e-05, "loss": 0.1377, "step": 17506 }, { "epoch": 5.352185875878936, "grad_norm": 0.36581555008888245, "learning_rate": 2.5902934058001782e-05, "loss": 0.0962, "step": 17507 }, { "epoch": 5.352491592785081, "grad_norm": 0.24086828529834747, "learning_rate": 2.5902509447581845e-05, "loss": 0.0613, "step": 17508 }, { "epoch": 5.352797309691226, "grad_norm": 0.2448224127292633, "learning_rate": 2.5902084837161903e-05, "loss": 0.0588, "step": 17509 }, { "epoch": 5.3531030265973705, "grad_norm": 0.2818410098552704, "learning_rate": 2.5901660226741962e-05, "loss": 0.0456, "step": 17510 }, { "epoch": 5.353408743503516, "grad_norm": 0.24995854496955872, "learning_rate": 2.5901235616322024e-05, "loss": 0.0601, "step": 17511 }, { "epoch": 5.353714460409661, "grad_norm": 0.22241264581680298, "learning_rate": 2.5900811005902083e-05, "loss": 0.0474, "step": 17512 }, { "epoch": 5.354020177315806, "grad_norm": 0.44681620597839355, "learning_rate": 2.5900386395482145e-05, "loss": 0.0832, "step": 17513 }, { "epoch": 5.354325894221951, "grad_norm": 0.9255015850067139, "learning_rate": 2.5899961785062204e-05, "loss": 0.0816, "step": 17514 }, { "epoch": 5.354631611128095, "grad_norm": 0.2576883137226105, "learning_rate": 2.5899537174642266e-05, "loss": 0.0687, "step": 17515 }, { "epoch": 5.35493732803424, "grad_norm": 0.43476808071136475, "learning_rate": 2.5899112564222324e-05, "loss": 0.1308, "step": 17516 }, { "epoch": 5.355243044940385, "grad_norm": 0.486181378364563, "learning_rate": 2.5898687953802386e-05, "loss": 0.0789, "step": 17517 }, { "epoch": 5.3555487618465305, "grad_norm": 0.300533264875412, "learning_rate": 2.5898263343382445e-05, "loss": 0.0785, "step": 17518 }, { "epoch": 5.355854478752675, "grad_norm": 0.6220220327377319, "learning_rate": 2.5897838732962507e-05, "loss": 0.1204, "step": 17519 }, { "epoch": 5.35616019565882, "grad_norm": 0.6288091540336609, "learning_rate": 2.5897414122542566e-05, "loss": 0.1087, "step": 17520 }, { "epoch": 5.356465912564965, "grad_norm": 0.3650303781032562, "learning_rate": 2.5896989512122628e-05, "loss": 0.1278, "step": 17521 }, { "epoch": 5.35677162947111, "grad_norm": 0.6349318027496338, "learning_rate": 2.589656490170269e-05, "loss": 0.1793, "step": 17522 }, { "epoch": 5.357077346377254, "grad_norm": 0.5342428684234619, "learning_rate": 2.589614029128275e-05, "loss": 0.1567, "step": 17523 }, { "epoch": 5.357383063283399, "grad_norm": 0.5735058784484863, "learning_rate": 2.589571568086281e-05, "loss": 0.2267, "step": 17524 }, { "epoch": 5.3576887801895445, "grad_norm": 0.8771612048149109, "learning_rate": 2.589529107044287e-05, "loss": 0.1703, "step": 17525 }, { "epoch": 5.35799449709569, "grad_norm": 0.9110962152481079, "learning_rate": 2.589486646002293e-05, "loss": 0.1609, "step": 17526 }, { "epoch": 5.358300214001835, "grad_norm": 0.7600893378257751, "learning_rate": 2.589444184960299e-05, "loss": 0.1964, "step": 17527 }, { "epoch": 5.358605930907979, "grad_norm": 5.209876537322998, "learning_rate": 2.5894017239183052e-05, "loss": 0.184, "step": 17528 }, { "epoch": 5.358911647814124, "grad_norm": 1.0662846565246582, "learning_rate": 2.589359262876311e-05, "loss": 0.1874, "step": 17529 }, { "epoch": 5.359217364720269, "grad_norm": 1.008230447769165, "learning_rate": 2.5893168018343173e-05, "loss": 0.2281, "step": 17530 }, { "epoch": 5.359523081626414, "grad_norm": 0.28672897815704346, "learning_rate": 2.5892743407923232e-05, "loss": 0.1409, "step": 17531 }, { "epoch": 5.359828798532559, "grad_norm": 0.20743893086910248, "learning_rate": 2.5892318797503294e-05, "loss": 0.0759, "step": 17532 }, { "epoch": 5.360134515438704, "grad_norm": 0.3011041581630707, "learning_rate": 2.5891894187083353e-05, "loss": 0.062, "step": 17533 }, { "epoch": 5.360440232344849, "grad_norm": 0.19206468760967255, "learning_rate": 2.5891469576663415e-05, "loss": 0.0461, "step": 17534 }, { "epoch": 5.360745949250994, "grad_norm": 0.20739169418811798, "learning_rate": 2.5891044966243473e-05, "loss": 0.0474, "step": 17535 }, { "epoch": 5.361051666157138, "grad_norm": 0.20410506427288055, "learning_rate": 2.5890620355823532e-05, "loss": 0.0529, "step": 17536 }, { "epoch": 5.361357383063283, "grad_norm": 0.2134535163640976, "learning_rate": 2.5890195745403594e-05, "loss": 0.0598, "step": 17537 }, { "epoch": 5.361663099969428, "grad_norm": 0.3487726151943207, "learning_rate": 2.5889771134983653e-05, "loss": 0.0527, "step": 17538 }, { "epoch": 5.3619688168755735, "grad_norm": 0.3002113401889801, "learning_rate": 2.5889346524563715e-05, "loss": 0.0796, "step": 17539 }, { "epoch": 5.362274533781719, "grad_norm": 0.4071653187274933, "learning_rate": 2.5888921914143774e-05, "loss": 0.0542, "step": 17540 }, { "epoch": 5.362580250687863, "grad_norm": 0.41409122943878174, "learning_rate": 2.5888497303723836e-05, "loss": 0.112, "step": 17541 }, { "epoch": 5.362885967594008, "grad_norm": 0.45894452929496765, "learning_rate": 2.5888072693303895e-05, "loss": 0.0637, "step": 17542 }, { "epoch": 5.363191684500153, "grad_norm": 0.33362042903900146, "learning_rate": 2.5887648082883957e-05, "loss": 0.0628, "step": 17543 }, { "epoch": 5.363497401406298, "grad_norm": 0.3234081268310547, "learning_rate": 2.5887223472464015e-05, "loss": 0.1085, "step": 17544 }, { "epoch": 5.363803118312442, "grad_norm": 1.486631155014038, "learning_rate": 2.5886798862044077e-05, "loss": 0.0974, "step": 17545 }, { "epoch": 5.3641088352185875, "grad_norm": 0.5247377157211304, "learning_rate": 2.5886374251624136e-05, "loss": 0.1385, "step": 17546 }, { "epoch": 5.364414552124733, "grad_norm": 1.3916223049163818, "learning_rate": 2.5885949641204198e-05, "loss": 0.1472, "step": 17547 }, { "epoch": 5.364720269030878, "grad_norm": 1.071075439453125, "learning_rate": 2.5885525030784257e-05, "loss": 0.2065, "step": 17548 }, { "epoch": 5.365025985937022, "grad_norm": 0.759652316570282, "learning_rate": 2.5885100420364316e-05, "loss": 0.1777, "step": 17549 }, { "epoch": 5.365331702843167, "grad_norm": 0.5031685829162598, "learning_rate": 2.5884675809944378e-05, "loss": 0.1739, "step": 17550 }, { "epoch": 5.365637419749312, "grad_norm": 0.6499749422073364, "learning_rate": 2.5884251199524436e-05, "loss": 0.1798, "step": 17551 }, { "epoch": 5.365943136655457, "grad_norm": 0.7080777287483215, "learning_rate": 2.58838265891045e-05, "loss": 0.1639, "step": 17552 }, { "epoch": 5.366248853561602, "grad_norm": 0.529785692691803, "learning_rate": 2.5883401978684557e-05, "loss": 0.1661, "step": 17553 }, { "epoch": 5.366554570467747, "grad_norm": 0.9307445883750916, "learning_rate": 2.588297736826462e-05, "loss": 0.1937, "step": 17554 }, { "epoch": 5.366860287373892, "grad_norm": 1.4002957344055176, "learning_rate": 2.5882552757844678e-05, "loss": 0.2236, "step": 17555 }, { "epoch": 5.367166004280037, "grad_norm": 0.2603761851787567, "learning_rate": 2.588212814742474e-05, "loss": 0.1264, "step": 17556 }, { "epoch": 5.367471721186182, "grad_norm": 0.24693770706653595, "learning_rate": 2.58817035370048e-05, "loss": 0.0766, "step": 17557 }, { "epoch": 5.367777438092326, "grad_norm": 0.25822821259498596, "learning_rate": 2.588127892658486e-05, "loss": 0.076, "step": 17558 }, { "epoch": 5.368083154998471, "grad_norm": 1.1055548191070557, "learning_rate": 2.588085431616492e-05, "loss": 0.0616, "step": 17559 }, { "epoch": 5.368388871904616, "grad_norm": 0.2059692144393921, "learning_rate": 2.588042970574498e-05, "loss": 0.0525, "step": 17560 }, { "epoch": 5.368694588810762, "grad_norm": 0.24122171103954315, "learning_rate": 2.588000509532504e-05, "loss": 0.0488, "step": 17561 }, { "epoch": 5.369000305716906, "grad_norm": 0.2433559149503708, "learning_rate": 2.58795804849051e-05, "loss": 0.0691, "step": 17562 }, { "epoch": 5.369306022623051, "grad_norm": 0.4048289358615875, "learning_rate": 2.587915587448516e-05, "loss": 0.0748, "step": 17563 }, { "epoch": 5.369611739529196, "grad_norm": 0.2854553461074829, "learning_rate": 2.587873126406522e-05, "loss": 0.0919, "step": 17564 }, { "epoch": 5.369917456435341, "grad_norm": 0.32545384764671326, "learning_rate": 2.5878306653645282e-05, "loss": 0.0698, "step": 17565 }, { "epoch": 5.370223173341486, "grad_norm": 0.38549166917800903, "learning_rate": 2.587788204322534e-05, "loss": 0.1033, "step": 17566 }, { "epoch": 5.3705288902476305, "grad_norm": 0.2945159077644348, "learning_rate": 2.5877457432805403e-05, "loss": 0.0743, "step": 17567 }, { "epoch": 5.370834607153776, "grad_norm": 0.3570353090763092, "learning_rate": 2.587703282238546e-05, "loss": 0.1075, "step": 17568 }, { "epoch": 5.371140324059921, "grad_norm": 1.2092959880828857, "learning_rate": 2.5876608211965524e-05, "loss": 0.1112, "step": 17569 }, { "epoch": 5.371446040966066, "grad_norm": 0.3170117735862732, "learning_rate": 2.5876183601545582e-05, "loss": 0.1207, "step": 17570 }, { "epoch": 5.37175175787221, "grad_norm": 0.4448407292366028, "learning_rate": 2.5875758991125644e-05, "loss": 0.1439, "step": 17571 }, { "epoch": 5.372057474778355, "grad_norm": 1.2718228101730347, "learning_rate": 2.5875334380705703e-05, "loss": 0.1581, "step": 17572 }, { "epoch": 5.3723631916845, "grad_norm": 0.6574974060058594, "learning_rate": 2.5874909770285765e-05, "loss": 0.1615, "step": 17573 }, { "epoch": 5.372668908590645, "grad_norm": 0.5816260576248169, "learning_rate": 2.5874485159865824e-05, "loss": 0.1645, "step": 17574 }, { "epoch": 5.37297462549679, "grad_norm": 0.5456323623657227, "learning_rate": 2.5874060549445882e-05, "loss": 0.1376, "step": 17575 }, { "epoch": 5.373280342402935, "grad_norm": 1.196677803993225, "learning_rate": 2.5873635939025945e-05, "loss": 0.165, "step": 17576 }, { "epoch": 5.37358605930908, "grad_norm": 0.7324596047401428, "learning_rate": 2.5873211328606003e-05, "loss": 0.1731, "step": 17577 }, { "epoch": 5.373891776215225, "grad_norm": 4.480225563049316, "learning_rate": 2.5872786718186065e-05, "loss": 0.2067, "step": 17578 }, { "epoch": 5.37419749312137, "grad_norm": 1.266359567642212, "learning_rate": 2.5872362107766124e-05, "loss": 0.1963, "step": 17579 }, { "epoch": 5.374503210027514, "grad_norm": 1.8538053035736084, "learning_rate": 2.5871937497346186e-05, "loss": 0.268, "step": 17580 }, { "epoch": 5.374808926933659, "grad_norm": 0.7440856099128723, "learning_rate": 2.5871512886926245e-05, "loss": 0.1424, "step": 17581 }, { "epoch": 5.3751146438398045, "grad_norm": 0.4674694538116455, "learning_rate": 2.5871088276506307e-05, "loss": 0.0721, "step": 17582 }, { "epoch": 5.37542036074595, "grad_norm": 0.40858200192451477, "learning_rate": 2.5870663666086366e-05, "loss": 0.0565, "step": 17583 }, { "epoch": 5.375726077652094, "grad_norm": 0.4021449387073517, "learning_rate": 2.5870239055666428e-05, "loss": 0.0664, "step": 17584 }, { "epoch": 5.376031794558239, "grad_norm": 0.4128209948539734, "learning_rate": 2.5869814445246486e-05, "loss": 0.0686, "step": 17585 }, { "epoch": 5.376337511464384, "grad_norm": 0.44828999042510986, "learning_rate": 2.586938983482655e-05, "loss": 0.0368, "step": 17586 }, { "epoch": 5.376643228370529, "grad_norm": 0.2948751747608185, "learning_rate": 2.5868965224406607e-05, "loss": 0.0578, "step": 17587 }, { "epoch": 5.376948945276673, "grad_norm": 0.46475833654403687, "learning_rate": 2.5868540613986666e-05, "loss": 0.0513, "step": 17588 }, { "epoch": 5.377254662182819, "grad_norm": 0.30156049132347107, "learning_rate": 2.5868116003566728e-05, "loss": 0.0634, "step": 17589 }, { "epoch": 5.377560379088964, "grad_norm": 0.2717687487602234, "learning_rate": 2.5867691393146787e-05, "loss": 0.0666, "step": 17590 }, { "epoch": 5.377866095995109, "grad_norm": 0.3768424689769745, "learning_rate": 2.586726678272685e-05, "loss": 0.0789, "step": 17591 }, { "epoch": 5.378171812901254, "grad_norm": 0.5207379460334778, "learning_rate": 2.5866842172306907e-05, "loss": 0.0967, "step": 17592 }, { "epoch": 5.378477529807398, "grad_norm": 0.412354052066803, "learning_rate": 2.586641756188697e-05, "loss": 0.0973, "step": 17593 }, { "epoch": 5.378783246713543, "grad_norm": 0.7933188080787659, "learning_rate": 2.5865992951467028e-05, "loss": 0.1181, "step": 17594 }, { "epoch": 5.379088963619688, "grad_norm": 0.5205254554748535, "learning_rate": 2.586556834104709e-05, "loss": 0.1186, "step": 17595 }, { "epoch": 5.3793946805258335, "grad_norm": 0.6317648887634277, "learning_rate": 2.586514373062715e-05, "loss": 0.143, "step": 17596 }, { "epoch": 5.379700397431978, "grad_norm": 0.6802299618721008, "learning_rate": 2.586471912020721e-05, "loss": 0.1601, "step": 17597 }, { "epoch": 5.380006114338123, "grad_norm": 0.5327808260917664, "learning_rate": 2.586429450978727e-05, "loss": 0.1937, "step": 17598 }, { "epoch": 5.380311831244268, "grad_norm": 0.591751217842102, "learning_rate": 2.586386989936733e-05, "loss": 0.1749, "step": 17599 }, { "epoch": 5.380617548150413, "grad_norm": 1.2447450160980225, "learning_rate": 2.586344528894739e-05, "loss": 0.193, "step": 17600 }, { "epoch": 5.380923265056557, "grad_norm": 0.7073903679847717, "learning_rate": 2.586302067852745e-05, "loss": 0.1764, "step": 17601 }, { "epoch": 5.381228981962702, "grad_norm": 0.5932996273040771, "learning_rate": 2.586259606810751e-05, "loss": 0.1637, "step": 17602 }, { "epoch": 5.3815346988688475, "grad_norm": 1.8451441526412964, "learning_rate": 2.586217145768757e-05, "loss": 0.1985, "step": 17603 }, { "epoch": 5.381840415774993, "grad_norm": 0.7339709401130676, "learning_rate": 2.5861746847267632e-05, "loss": 0.176, "step": 17604 }, { "epoch": 5.382146132681138, "grad_norm": 0.8656896352767944, "learning_rate": 2.586132223684769e-05, "loss": 0.2108, "step": 17605 }, { "epoch": 5.382451849587282, "grad_norm": 0.7133968472480774, "learning_rate": 2.5860897626427753e-05, "loss": 0.1267, "step": 17606 }, { "epoch": 5.382757566493427, "grad_norm": 0.3889751434326172, "learning_rate": 2.5860473016007812e-05, "loss": 0.0885, "step": 17607 }, { "epoch": 5.383063283399572, "grad_norm": 0.7575603723526001, "learning_rate": 2.5860048405587874e-05, "loss": 0.0559, "step": 17608 }, { "epoch": 5.383369000305717, "grad_norm": 0.191242054104805, "learning_rate": 2.5859623795167932e-05, "loss": 0.0578, "step": 17609 }, { "epoch": 5.3836747172118615, "grad_norm": 0.41504716873168945, "learning_rate": 2.5859199184747995e-05, "loss": 0.0818, "step": 17610 }, { "epoch": 5.383980434118007, "grad_norm": 0.5372995138168335, "learning_rate": 2.5858774574328053e-05, "loss": 0.0464, "step": 17611 }, { "epoch": 5.384286151024152, "grad_norm": 0.37171587347984314, "learning_rate": 2.5858349963908112e-05, "loss": 0.0638, "step": 17612 }, { "epoch": 5.384591867930297, "grad_norm": 0.39093443751335144, "learning_rate": 2.5857925353488174e-05, "loss": 0.0683, "step": 17613 }, { "epoch": 5.384897584836441, "grad_norm": 1.9665447473526, "learning_rate": 2.5857500743068233e-05, "loss": 0.0468, "step": 17614 }, { "epoch": 5.385203301742586, "grad_norm": 2.442437171936035, "learning_rate": 2.5857076132648295e-05, "loss": 0.0568, "step": 17615 }, { "epoch": 5.385509018648731, "grad_norm": 0.7182384133338928, "learning_rate": 2.5856651522228354e-05, "loss": 0.0767, "step": 17616 }, { "epoch": 5.385814735554876, "grad_norm": 0.3654138147830963, "learning_rate": 2.5856226911808416e-05, "loss": 0.0887, "step": 17617 }, { "epoch": 5.386120452461022, "grad_norm": 1.568679690361023, "learning_rate": 2.5855802301388474e-05, "loss": 0.0913, "step": 17618 }, { "epoch": 5.386426169367166, "grad_norm": 0.5278741121292114, "learning_rate": 2.5855377690968536e-05, "loss": 0.1065, "step": 17619 }, { "epoch": 5.386731886273311, "grad_norm": 0.37319961190223694, "learning_rate": 2.5854953080548595e-05, "loss": 0.1258, "step": 17620 }, { "epoch": 5.387037603179456, "grad_norm": 0.46501046419143677, "learning_rate": 2.5854528470128657e-05, "loss": 0.1667, "step": 17621 }, { "epoch": 5.387343320085601, "grad_norm": 0.5686343908309937, "learning_rate": 2.5854103859708716e-05, "loss": 0.1188, "step": 17622 }, { "epoch": 5.387649036991745, "grad_norm": 1.3113631010055542, "learning_rate": 2.5853679249288778e-05, "loss": 0.1446, "step": 17623 }, { "epoch": 5.3879547538978905, "grad_norm": 1.4609112739562988, "learning_rate": 2.585325463886884e-05, "loss": 0.1444, "step": 17624 }, { "epoch": 5.388260470804036, "grad_norm": 0.6446533799171448, "learning_rate": 2.58528300284489e-05, "loss": 0.1563, "step": 17625 }, { "epoch": 5.388566187710181, "grad_norm": 0.5992265343666077, "learning_rate": 2.585240541802896e-05, "loss": 0.1587, "step": 17626 }, { "epoch": 5.388871904616325, "grad_norm": 2.2051048278808594, "learning_rate": 2.585198080760902e-05, "loss": 0.1491, "step": 17627 }, { "epoch": 5.38917762152247, "grad_norm": 0.7952269911766052, "learning_rate": 2.585155619718908e-05, "loss": 0.1825, "step": 17628 }, { "epoch": 5.389483338428615, "grad_norm": 1.5679454803466797, "learning_rate": 2.585113158676914e-05, "loss": 0.1919, "step": 17629 }, { "epoch": 5.38978905533476, "grad_norm": 2.8537654876708984, "learning_rate": 2.5850706976349202e-05, "loss": 0.2293, "step": 17630 }, { "epoch": 5.390094772240905, "grad_norm": 0.3709367513656616, "learning_rate": 2.585028236592926e-05, "loss": 0.1424, "step": 17631 }, { "epoch": 5.39040048914705, "grad_norm": 0.24559636414051056, "learning_rate": 2.5849857755509323e-05, "loss": 0.0788, "step": 17632 }, { "epoch": 5.390706206053195, "grad_norm": 0.26136577129364014, "learning_rate": 2.5849433145089382e-05, "loss": 0.0662, "step": 17633 }, { "epoch": 5.39101192295934, "grad_norm": 1.3731276988983154, "learning_rate": 2.5849008534669444e-05, "loss": 0.0749, "step": 17634 }, { "epoch": 5.391317639865485, "grad_norm": 0.3053025007247925, "learning_rate": 2.5848583924249503e-05, "loss": 0.057, "step": 17635 }, { "epoch": 5.391623356771629, "grad_norm": 0.30179429054260254, "learning_rate": 2.5848159313829565e-05, "loss": 0.0595, "step": 17636 }, { "epoch": 5.391929073677774, "grad_norm": 0.8926263451576233, "learning_rate": 2.5847734703409624e-05, "loss": 0.0621, "step": 17637 }, { "epoch": 5.392234790583919, "grad_norm": 0.8301391005516052, "learning_rate": 2.5847310092989682e-05, "loss": 0.0845, "step": 17638 }, { "epoch": 5.3925405074900645, "grad_norm": 1.1822646856307983, "learning_rate": 2.5846885482569744e-05, "loss": 0.0524, "step": 17639 }, { "epoch": 5.392846224396209, "grad_norm": 0.8352319002151489, "learning_rate": 2.5846460872149803e-05, "loss": 0.0555, "step": 17640 }, { "epoch": 5.393151941302354, "grad_norm": 0.3536536395549774, "learning_rate": 2.5846036261729865e-05, "loss": 0.0974, "step": 17641 }, { "epoch": 5.393457658208499, "grad_norm": 0.2846263647079468, "learning_rate": 2.5845611651309924e-05, "loss": 0.0887, "step": 17642 }, { "epoch": 5.393763375114644, "grad_norm": 0.5282399654388428, "learning_rate": 2.5845187040889986e-05, "loss": 0.1111, "step": 17643 }, { "epoch": 5.394069092020789, "grad_norm": 0.4303502142429352, "learning_rate": 2.5844762430470045e-05, "loss": 0.0983, "step": 17644 }, { "epoch": 5.394374808926933, "grad_norm": 0.44202733039855957, "learning_rate": 2.5844337820050107e-05, "loss": 0.1301, "step": 17645 }, { "epoch": 5.3946805258330786, "grad_norm": 0.3952205181121826, "learning_rate": 2.5843913209630165e-05, "loss": 0.1328, "step": 17646 }, { "epoch": 5.394986242739224, "grad_norm": 0.4753582179546356, "learning_rate": 2.5843488599210227e-05, "loss": 0.141, "step": 17647 }, { "epoch": 5.395291959645369, "grad_norm": 0.7217050194740295, "learning_rate": 2.5843063988790286e-05, "loss": 0.1986, "step": 17648 }, { "epoch": 5.395597676551513, "grad_norm": 0.46928441524505615, "learning_rate": 2.5842639378370348e-05, "loss": 0.1805, "step": 17649 }, { "epoch": 5.395903393457658, "grad_norm": 0.5933506488800049, "learning_rate": 2.5842214767950407e-05, "loss": 0.1622, "step": 17650 }, { "epoch": 5.396209110363803, "grad_norm": 0.6453065276145935, "learning_rate": 2.5841790157530466e-05, "loss": 0.1865, "step": 17651 }, { "epoch": 5.396514827269948, "grad_norm": 1.516277551651001, "learning_rate": 2.5841365547110528e-05, "loss": 0.1626, "step": 17652 }, { "epoch": 5.396820544176093, "grad_norm": 3.1464688777923584, "learning_rate": 2.5840940936690586e-05, "loss": 0.2126, "step": 17653 }, { "epoch": 5.397126261082238, "grad_norm": 0.6883322596549988, "learning_rate": 2.584051632627065e-05, "loss": 0.2178, "step": 17654 }, { "epoch": 5.397431977988383, "grad_norm": 1.6219468116760254, "learning_rate": 2.5840091715850707e-05, "loss": 0.195, "step": 17655 }, { "epoch": 5.397737694894528, "grad_norm": 0.5190029144287109, "learning_rate": 2.583966710543077e-05, "loss": 0.1411, "step": 17656 }, { "epoch": 5.398043411800673, "grad_norm": 0.5458866953849792, "learning_rate": 2.5839242495010828e-05, "loss": 0.0748, "step": 17657 }, { "epoch": 5.398349128706817, "grad_norm": 0.3408924639225006, "learning_rate": 2.583881788459089e-05, "loss": 0.0814, "step": 17658 }, { "epoch": 5.398654845612962, "grad_norm": 0.33499211072921753, "learning_rate": 2.583839327417095e-05, "loss": 0.0679, "step": 17659 }, { "epoch": 5.3989605625191075, "grad_norm": 0.46444374322891235, "learning_rate": 2.583796866375101e-05, "loss": 0.0588, "step": 17660 }, { "epoch": 5.399266279425253, "grad_norm": 0.23896624147891998, "learning_rate": 2.583754405333107e-05, "loss": 0.0745, "step": 17661 }, { "epoch": 5.399571996331397, "grad_norm": 0.40026524662971497, "learning_rate": 2.583711944291113e-05, "loss": 0.0567, "step": 17662 }, { "epoch": 5.399877713237542, "grad_norm": 0.1925583779811859, "learning_rate": 2.583669483249119e-05, "loss": 0.0508, "step": 17663 }, { "epoch": 5.400183430143687, "grad_norm": 0.3033418655395508, "learning_rate": 2.583627022207125e-05, "loss": 0.0866, "step": 17664 }, { "epoch": 5.400489147049832, "grad_norm": 0.29217472672462463, "learning_rate": 2.583584561165131e-05, "loss": 0.0641, "step": 17665 }, { "epoch": 5.400794863955976, "grad_norm": 0.3566412329673767, "learning_rate": 2.583542100123137e-05, "loss": 0.0844, "step": 17666 }, { "epoch": 5.4011005808621215, "grad_norm": 0.6568431854248047, "learning_rate": 2.5834996390811432e-05, "loss": 0.0861, "step": 17667 }, { "epoch": 5.401406297768267, "grad_norm": 0.4382776916027069, "learning_rate": 2.583457178039149e-05, "loss": 0.0782, "step": 17668 }, { "epoch": 5.401712014674412, "grad_norm": 0.6361920833587646, "learning_rate": 2.5834147169971553e-05, "loss": 0.1284, "step": 17669 }, { "epoch": 5.402017731580557, "grad_norm": 0.6018670201301575, "learning_rate": 2.583372255955161e-05, "loss": 0.1474, "step": 17670 }, { "epoch": 5.402323448486701, "grad_norm": 0.9352192878723145, "learning_rate": 2.5833297949131674e-05, "loss": 0.138, "step": 17671 }, { "epoch": 5.402629165392846, "grad_norm": 0.5986952185630798, "learning_rate": 2.5832873338711732e-05, "loss": 0.1392, "step": 17672 }, { "epoch": 5.402934882298991, "grad_norm": 0.8120054602622986, "learning_rate": 2.5832448728291794e-05, "loss": 0.1617, "step": 17673 }, { "epoch": 5.403240599205136, "grad_norm": 1.1330946683883667, "learning_rate": 2.5832024117871853e-05, "loss": 0.1799, "step": 17674 }, { "epoch": 5.403546316111281, "grad_norm": 0.6539401412010193, "learning_rate": 2.5831599507451915e-05, "loss": 0.1614, "step": 17675 }, { "epoch": 5.403852033017426, "grad_norm": 2.041297197341919, "learning_rate": 2.5831174897031974e-05, "loss": 0.1723, "step": 17676 }, { "epoch": 5.404157749923571, "grad_norm": 1.2727270126342773, "learning_rate": 2.5830750286612033e-05, "loss": 0.1947, "step": 17677 }, { "epoch": 5.404463466829716, "grad_norm": 0.6622243523597717, "learning_rate": 2.5830325676192095e-05, "loss": 0.1721, "step": 17678 }, { "epoch": 5.40476918373586, "grad_norm": 1.367349624633789, "learning_rate": 2.5829901065772153e-05, "loss": 0.2246, "step": 17679 }, { "epoch": 5.405074900642005, "grad_norm": 6.031641006469727, "learning_rate": 2.5829476455352215e-05, "loss": 0.2124, "step": 17680 }, { "epoch": 5.4053806175481505, "grad_norm": 0.4979099631309509, "learning_rate": 2.5829051844932274e-05, "loss": 0.1299, "step": 17681 }, { "epoch": 5.405686334454296, "grad_norm": 0.3282302916049957, "learning_rate": 2.5828627234512336e-05, "loss": 0.0948, "step": 17682 }, { "epoch": 5.405992051360441, "grad_norm": 0.2801792323589325, "learning_rate": 2.5828202624092395e-05, "loss": 0.0618, "step": 17683 }, { "epoch": 5.406297768266585, "grad_norm": 0.30579474568367004, "learning_rate": 2.5827778013672457e-05, "loss": 0.0688, "step": 17684 }, { "epoch": 5.40660348517273, "grad_norm": 0.30891892313957214, "learning_rate": 2.5827353403252516e-05, "loss": 0.0453, "step": 17685 }, { "epoch": 5.406909202078875, "grad_norm": 1.2044200897216797, "learning_rate": 2.5826928792832578e-05, "loss": 0.0597, "step": 17686 }, { "epoch": 5.40721491898502, "grad_norm": 1.5818454027175903, "learning_rate": 2.5826504182412636e-05, "loss": 0.0519, "step": 17687 }, { "epoch": 5.4075206358911645, "grad_norm": 0.3284049332141876, "learning_rate": 2.58260795719927e-05, "loss": 0.0778, "step": 17688 }, { "epoch": 5.40782635279731, "grad_norm": 0.2843709886074066, "learning_rate": 2.5825654961572757e-05, "loss": 0.0484, "step": 17689 }, { "epoch": 5.408132069703455, "grad_norm": 0.3756740093231201, "learning_rate": 2.5825230351152816e-05, "loss": 0.0695, "step": 17690 }, { "epoch": 5.4084377866096, "grad_norm": 0.893212616443634, "learning_rate": 2.5824805740732878e-05, "loss": 0.1122, "step": 17691 }, { "epoch": 5.408743503515744, "grad_norm": 0.3838440477848053, "learning_rate": 2.5824381130312937e-05, "loss": 0.0782, "step": 17692 }, { "epoch": 5.409049220421889, "grad_norm": 0.5050315260887146, "learning_rate": 2.5823956519893e-05, "loss": 0.1133, "step": 17693 }, { "epoch": 5.409354937328034, "grad_norm": 0.5545922517776489, "learning_rate": 2.5823531909473058e-05, "loss": 0.1033, "step": 17694 }, { "epoch": 5.409660654234179, "grad_norm": 0.6125749349594116, "learning_rate": 2.582310729905312e-05, "loss": 0.1088, "step": 17695 }, { "epoch": 5.4099663711403245, "grad_norm": 0.49854564666748047, "learning_rate": 2.5822682688633178e-05, "loss": 0.1039, "step": 17696 }, { "epoch": 5.410272088046469, "grad_norm": 0.6312681436538696, "learning_rate": 2.582225807821324e-05, "loss": 0.1426, "step": 17697 }, { "epoch": 5.410577804952614, "grad_norm": 0.47877800464630127, "learning_rate": 2.58218334677933e-05, "loss": 0.1398, "step": 17698 }, { "epoch": 5.410883521858759, "grad_norm": 0.5242709517478943, "learning_rate": 2.582140885737336e-05, "loss": 0.161, "step": 17699 }, { "epoch": 5.411189238764904, "grad_norm": 1.411246657371521, "learning_rate": 2.582098424695342e-05, "loss": 0.1917, "step": 17700 }, { "epoch": 5.411494955671048, "grad_norm": 1.569435954093933, "learning_rate": 2.5820559636533482e-05, "loss": 0.2057, "step": 17701 }, { "epoch": 5.411800672577193, "grad_norm": 0.6436520218849182, "learning_rate": 2.582013502611354e-05, "loss": 0.1657, "step": 17702 }, { "epoch": 5.4121063894833386, "grad_norm": 2.132234573364258, "learning_rate": 2.58197104156936e-05, "loss": 0.1769, "step": 17703 }, { "epoch": 5.412412106389484, "grad_norm": 1.1163458824157715, "learning_rate": 2.581928580527366e-05, "loss": 0.1853, "step": 17704 }, { "epoch": 5.412717823295628, "grad_norm": 0.9533193111419678, "learning_rate": 2.581886119485372e-05, "loss": 0.2139, "step": 17705 }, { "epoch": 5.413023540201773, "grad_norm": 0.5878692865371704, "learning_rate": 2.5818436584433782e-05, "loss": 0.1259, "step": 17706 }, { "epoch": 5.413329257107918, "grad_norm": 0.20892149209976196, "learning_rate": 2.581801197401384e-05, "loss": 0.0573, "step": 17707 }, { "epoch": 5.413634974014063, "grad_norm": 2.2802176475524902, "learning_rate": 2.5817587363593903e-05, "loss": 0.0592, "step": 17708 }, { "epoch": 5.413940690920208, "grad_norm": 1.588739275932312, "learning_rate": 2.5817162753173962e-05, "loss": 0.0547, "step": 17709 }, { "epoch": 5.414246407826353, "grad_norm": 0.28576555848121643, "learning_rate": 2.5816738142754024e-05, "loss": 0.0627, "step": 17710 }, { "epoch": 5.414552124732498, "grad_norm": 0.559400200843811, "learning_rate": 2.5816313532334083e-05, "loss": 0.0445, "step": 17711 }, { "epoch": 5.414857841638643, "grad_norm": 0.8519971370697021, "learning_rate": 2.5815888921914145e-05, "loss": 0.0793, "step": 17712 }, { "epoch": 5.415163558544788, "grad_norm": 0.47179052233695984, "learning_rate": 2.5815464311494203e-05, "loss": 0.0619, "step": 17713 }, { "epoch": 5.415469275450932, "grad_norm": 0.4654650390148163, "learning_rate": 2.5815039701074262e-05, "loss": 0.077, "step": 17714 }, { "epoch": 5.415774992357077, "grad_norm": 0.28495487570762634, "learning_rate": 2.5814615090654324e-05, "loss": 0.0681, "step": 17715 }, { "epoch": 5.416080709263222, "grad_norm": 0.4225354790687561, "learning_rate": 2.5814190480234383e-05, "loss": 0.0723, "step": 17716 }, { "epoch": 5.4163864261693675, "grad_norm": 0.5927707552909851, "learning_rate": 2.5813765869814445e-05, "loss": 0.0941, "step": 17717 }, { "epoch": 5.416692143075512, "grad_norm": 0.7989650964736938, "learning_rate": 2.5813341259394504e-05, "loss": 0.0978, "step": 17718 }, { "epoch": 5.416997859981657, "grad_norm": 0.42591190338134766, "learning_rate": 2.5812916648974566e-05, "loss": 0.1507, "step": 17719 }, { "epoch": 5.417303576887802, "grad_norm": 1.1248356103897095, "learning_rate": 2.5812492038554624e-05, "loss": 0.1179, "step": 17720 }, { "epoch": 5.417609293793947, "grad_norm": 0.5716126561164856, "learning_rate": 2.5812067428134686e-05, "loss": 0.168, "step": 17721 }, { "epoch": 5.417915010700092, "grad_norm": 1.4119412899017334, "learning_rate": 2.5811642817714745e-05, "loss": 0.1458, "step": 17722 }, { "epoch": 5.418220727606236, "grad_norm": 0.7743752002716064, "learning_rate": 2.5811218207294807e-05, "loss": 0.148, "step": 17723 }, { "epoch": 5.4185264445123815, "grad_norm": 1.230774998664856, "learning_rate": 2.5810793596874866e-05, "loss": 0.1915, "step": 17724 }, { "epoch": 5.418832161418527, "grad_norm": 2.353522777557373, "learning_rate": 2.5810368986454928e-05, "loss": 0.1951, "step": 17725 }, { "epoch": 5.419137878324672, "grad_norm": 1.1838774681091309, "learning_rate": 2.580994437603499e-05, "loss": 0.2072, "step": 17726 }, { "epoch": 5.419443595230816, "grad_norm": 1.024107813835144, "learning_rate": 2.580951976561505e-05, "loss": 0.1946, "step": 17727 }, { "epoch": 5.419749312136961, "grad_norm": 2.7565107345581055, "learning_rate": 2.580909515519511e-05, "loss": 0.1888, "step": 17728 }, { "epoch": 5.420055029043106, "grad_norm": 1.787035584449768, "learning_rate": 2.580867054477517e-05, "loss": 0.1912, "step": 17729 }, { "epoch": 5.420360745949251, "grad_norm": 2.2928855419158936, "learning_rate": 2.5808245934355232e-05, "loss": 0.2059, "step": 17730 }, { "epoch": 5.4206664628553956, "grad_norm": 0.46662288904190063, "learning_rate": 2.580782132393529e-05, "loss": 0.1238, "step": 17731 }, { "epoch": 5.420972179761541, "grad_norm": 0.40266671776771545, "learning_rate": 2.5807396713515352e-05, "loss": 0.0891, "step": 17732 }, { "epoch": 5.421277896667686, "grad_norm": 0.2953406870365143, "learning_rate": 2.580697210309541e-05, "loss": 0.1071, "step": 17733 }, { "epoch": 5.421583613573831, "grad_norm": 0.3023248314857483, "learning_rate": 2.5806547492675473e-05, "loss": 0.0554, "step": 17734 }, { "epoch": 5.421889330479976, "grad_norm": 0.42883172631263733, "learning_rate": 2.5806122882255532e-05, "loss": 0.0628, "step": 17735 }, { "epoch": 5.42219504738612, "grad_norm": 0.37349358201026917, "learning_rate": 2.5805698271835594e-05, "loss": 0.0764, "step": 17736 }, { "epoch": 5.422500764292265, "grad_norm": 0.28531956672668457, "learning_rate": 2.5805273661415653e-05, "loss": 0.0615, "step": 17737 }, { "epoch": 5.4228064811984105, "grad_norm": 0.2592794895172119, "learning_rate": 2.5804849050995715e-05, "loss": 0.0539, "step": 17738 }, { "epoch": 5.423112198104556, "grad_norm": 0.35326555371284485, "learning_rate": 2.5804424440575774e-05, "loss": 0.0749, "step": 17739 }, { "epoch": 5.4234179150107, "grad_norm": 0.3551596999168396, "learning_rate": 2.5803999830155832e-05, "loss": 0.0556, "step": 17740 }, { "epoch": 5.423723631916845, "grad_norm": 0.6437387466430664, "learning_rate": 2.5803575219735894e-05, "loss": 0.0763, "step": 17741 }, { "epoch": 5.42402934882299, "grad_norm": 0.4277696907520294, "learning_rate": 2.5803150609315953e-05, "loss": 0.0768, "step": 17742 }, { "epoch": 5.424335065729135, "grad_norm": 0.6327899694442749, "learning_rate": 2.5802725998896015e-05, "loss": 0.0824, "step": 17743 }, { "epoch": 5.424640782635279, "grad_norm": 0.6572338938713074, "learning_rate": 2.5802301388476074e-05, "loss": 0.1411, "step": 17744 }, { "epoch": 5.4249464995414245, "grad_norm": 0.5701612830162048, "learning_rate": 2.5801876778056136e-05, "loss": 0.1248, "step": 17745 }, { "epoch": 5.42525221644757, "grad_norm": 0.5128480792045593, "learning_rate": 2.5801452167636195e-05, "loss": 0.1361, "step": 17746 }, { "epoch": 5.425557933353715, "grad_norm": 1.3085055351257324, "learning_rate": 2.5801027557216257e-05, "loss": 0.1992, "step": 17747 }, { "epoch": 5.42586365025986, "grad_norm": 0.720861554145813, "learning_rate": 2.5800602946796315e-05, "loss": 0.157, "step": 17748 }, { "epoch": 5.426169367166004, "grad_norm": 1.686551809310913, "learning_rate": 2.5800178336376377e-05, "loss": 0.1857, "step": 17749 }, { "epoch": 5.426475084072149, "grad_norm": 1.2294373512268066, "learning_rate": 2.5799753725956436e-05, "loss": 0.1787, "step": 17750 }, { "epoch": 5.426780800978294, "grad_norm": 0.8188624978065491, "learning_rate": 2.5799329115536498e-05, "loss": 0.1592, "step": 17751 }, { "epoch": 5.427086517884439, "grad_norm": 0.8868386745452881, "learning_rate": 2.5798904505116557e-05, "loss": 0.1834, "step": 17752 }, { "epoch": 5.427392234790584, "grad_norm": 1.2276134490966797, "learning_rate": 2.5798479894696616e-05, "loss": 0.2009, "step": 17753 }, { "epoch": 5.427697951696729, "grad_norm": 1.3868283033370972, "learning_rate": 2.5798055284276678e-05, "loss": 0.2178, "step": 17754 }, { "epoch": 5.428003668602874, "grad_norm": 1.8991283178329468, "learning_rate": 2.5797630673856736e-05, "loss": 0.3059, "step": 17755 }, { "epoch": 5.428309385509019, "grad_norm": 0.5193761587142944, "learning_rate": 2.57972060634368e-05, "loss": 0.1266, "step": 17756 }, { "epoch": 5.428615102415163, "grad_norm": 0.35480284690856934, "learning_rate": 2.5796781453016857e-05, "loss": 0.1036, "step": 17757 }, { "epoch": 5.428920819321308, "grad_norm": 0.9832378625869751, "learning_rate": 2.579635684259692e-05, "loss": 0.0767, "step": 17758 }, { "epoch": 5.429226536227453, "grad_norm": 0.4544336199760437, "learning_rate": 2.5795932232176978e-05, "loss": 0.0649, "step": 17759 }, { "epoch": 5.4295322531335986, "grad_norm": 0.2562420964241028, "learning_rate": 2.579550762175704e-05, "loss": 0.049, "step": 17760 }, { "epoch": 5.429837970039744, "grad_norm": 0.2965337634086609, "learning_rate": 2.57950830113371e-05, "loss": 0.0618, "step": 17761 }, { "epoch": 5.430143686945888, "grad_norm": 0.9886170625686646, "learning_rate": 2.579465840091716e-05, "loss": 0.0586, "step": 17762 }, { "epoch": 5.430449403852033, "grad_norm": 0.434605211019516, "learning_rate": 2.579423379049722e-05, "loss": 0.0542, "step": 17763 }, { "epoch": 5.430755120758178, "grad_norm": 0.19999822974205017, "learning_rate": 2.5793809180077282e-05, "loss": 0.0615, "step": 17764 }, { "epoch": 5.431060837664323, "grad_norm": 1.374260663986206, "learning_rate": 2.579338456965734e-05, "loss": 0.0779, "step": 17765 }, { "epoch": 5.4313665545704675, "grad_norm": 0.4892440736293793, "learning_rate": 2.57929599592374e-05, "loss": 0.1022, "step": 17766 }, { "epoch": 5.431672271476613, "grad_norm": 0.43651869893074036, "learning_rate": 2.579253534881746e-05, "loss": 0.1061, "step": 17767 }, { "epoch": 5.431977988382758, "grad_norm": 0.3871915936470032, "learning_rate": 2.579211073839752e-05, "loss": 0.0896, "step": 17768 }, { "epoch": 5.432283705288903, "grad_norm": 0.8812291026115417, "learning_rate": 2.5791686127977582e-05, "loss": 0.1377, "step": 17769 }, { "epoch": 5.432589422195047, "grad_norm": 0.711980938911438, "learning_rate": 2.579126151755764e-05, "loss": 0.1152, "step": 17770 }, { "epoch": 5.432895139101192, "grad_norm": 0.58348149061203, "learning_rate": 2.5790836907137703e-05, "loss": 0.1482, "step": 17771 }, { "epoch": 5.433200856007337, "grad_norm": 0.985148549079895, "learning_rate": 2.579041229671776e-05, "loss": 0.1652, "step": 17772 }, { "epoch": 5.433506572913482, "grad_norm": 3.192687749862671, "learning_rate": 2.5789987686297824e-05, "loss": 0.17, "step": 17773 }, { "epoch": 5.4338122898196275, "grad_norm": 1.0520174503326416, "learning_rate": 2.5789563075877882e-05, "loss": 0.1808, "step": 17774 }, { "epoch": 5.434118006725772, "grad_norm": 1.4559433460235596, "learning_rate": 2.5789138465457944e-05, "loss": 0.1983, "step": 17775 }, { "epoch": 5.434423723631917, "grad_norm": 1.5598466396331787, "learning_rate": 2.5788713855038003e-05, "loss": 0.179, "step": 17776 }, { "epoch": 5.434729440538062, "grad_norm": NaN, "learning_rate": 2.5788713855038003e-05, "loss": 0.1567, "step": 17777 }, { "epoch": 5.435035157444207, "grad_norm": 1.7302592992782593, "learning_rate": 2.5788289244618065e-05, "loss": 0.2066, "step": 17778 }, { "epoch": 5.435340874350351, "grad_norm": 1.290088176727295, "learning_rate": 2.5787864634198124e-05, "loss": 0.224, "step": 17779 }, { "epoch": 5.435646591256496, "grad_norm": 1.942000389099121, "learning_rate": 2.5787440023778183e-05, "loss": 0.2568, "step": 17780 }, { "epoch": 5.4359523081626415, "grad_norm": 0.6466100811958313, "learning_rate": 2.5787015413358245e-05, "loss": 0.1551, "step": 17781 }, { "epoch": 5.436258025068787, "grad_norm": 0.3554234206676483, "learning_rate": 2.5786590802938303e-05, "loss": 0.0615, "step": 17782 }, { "epoch": 5.436563741974931, "grad_norm": 0.7312273383140564, "learning_rate": 2.5786166192518365e-05, "loss": 0.0701, "step": 17783 }, { "epoch": 5.436869458881076, "grad_norm": 0.2200843244791031, "learning_rate": 2.5785741582098424e-05, "loss": 0.0738, "step": 17784 }, { "epoch": 5.437175175787221, "grad_norm": 0.35626110434532166, "learning_rate": 2.5785316971678486e-05, "loss": 0.0609, "step": 17785 }, { "epoch": 5.437480892693366, "grad_norm": 0.500328779220581, "learning_rate": 2.5784892361258545e-05, "loss": 0.0696, "step": 17786 }, { "epoch": 5.437786609599511, "grad_norm": 0.16322065889835358, "learning_rate": 2.5784467750838607e-05, "loss": 0.0691, "step": 17787 }, { "epoch": 5.4380923265056555, "grad_norm": 0.2685287296772003, "learning_rate": 2.5784043140418666e-05, "loss": 0.0571, "step": 17788 }, { "epoch": 5.438398043411801, "grad_norm": 0.2237055003643036, "learning_rate": 2.5783618529998728e-05, "loss": 0.0656, "step": 17789 }, { "epoch": 5.438703760317946, "grad_norm": 0.236003115773201, "learning_rate": 2.5783193919578786e-05, "loss": 0.0617, "step": 17790 }, { "epoch": 5.439009477224091, "grad_norm": 1.7509368658065796, "learning_rate": 2.578276930915885e-05, "loss": 0.0846, "step": 17791 }, { "epoch": 5.439315194130235, "grad_norm": 0.45074090361595154, "learning_rate": 2.5782344698738907e-05, "loss": 0.081, "step": 17792 }, { "epoch": 5.43962091103638, "grad_norm": 2.16957950592041, "learning_rate": 2.5781920088318966e-05, "loss": 0.0959, "step": 17793 }, { "epoch": 5.439926627942525, "grad_norm": 0.6634747982025146, "learning_rate": 2.5781495477899028e-05, "loss": 0.0892, "step": 17794 }, { "epoch": 5.4402323448486705, "grad_norm": 0.713859498500824, "learning_rate": 2.5781070867479087e-05, "loss": 0.1568, "step": 17795 }, { "epoch": 5.440538061754815, "grad_norm": 0.5095196962356567, "learning_rate": 2.578064625705915e-05, "loss": 0.1219, "step": 17796 }, { "epoch": 5.44084377866096, "grad_norm": 0.945172131061554, "learning_rate": 2.5780221646639208e-05, "loss": 0.1532, "step": 17797 }, { "epoch": 5.441149495567105, "grad_norm": 1.1653478145599365, "learning_rate": 2.577979703621927e-05, "loss": 0.144, "step": 17798 }, { "epoch": 5.44145521247325, "grad_norm": 0.5836684703826904, "learning_rate": 2.577937242579933e-05, "loss": 0.1616, "step": 17799 }, { "epoch": 5.441760929379395, "grad_norm": 1.7120652198791504, "learning_rate": 2.577894781537939e-05, "loss": 0.1538, "step": 17800 }, { "epoch": 5.442066646285539, "grad_norm": 1.3369680643081665, "learning_rate": 2.577852320495945e-05, "loss": 0.1777, "step": 17801 }, { "epoch": 5.4423723631916845, "grad_norm": 1.2605267763137817, "learning_rate": 2.577809859453951e-05, "loss": 0.2057, "step": 17802 }, { "epoch": 5.44267808009783, "grad_norm": 9.51916790008545, "learning_rate": 2.577767398411957e-05, "loss": 0.1792, "step": 17803 }, { "epoch": 5.442983797003975, "grad_norm": 1.9914902448654175, "learning_rate": 2.5777249373699632e-05, "loss": 0.2397, "step": 17804 }, { "epoch": 5.443289513910119, "grad_norm": 1.6253575086593628, "learning_rate": 2.577682476327969e-05, "loss": 0.239, "step": 17805 }, { "epoch": 5.443595230816264, "grad_norm": 0.5637826323509216, "learning_rate": 2.577640015285975e-05, "loss": 0.1337, "step": 17806 }, { "epoch": 5.443900947722409, "grad_norm": 0.5678479075431824, "learning_rate": 2.577597554243981e-05, "loss": 0.0702, "step": 17807 }, { "epoch": 5.444206664628554, "grad_norm": 0.25310033559799194, "learning_rate": 2.577555093201987e-05, "loss": 0.077, "step": 17808 }, { "epoch": 5.4445123815346985, "grad_norm": 0.4972168505191803, "learning_rate": 2.5775126321599932e-05, "loss": 0.0591, "step": 17809 }, { "epoch": 5.444818098440844, "grad_norm": 0.18574345111846924, "learning_rate": 2.577470171117999e-05, "loss": 0.0577, "step": 17810 }, { "epoch": 5.445123815346989, "grad_norm": 0.5883479118347168, "learning_rate": 2.5774277100760053e-05, "loss": 0.0508, "step": 17811 }, { "epoch": 5.445429532253134, "grad_norm": 0.3062955141067505, "learning_rate": 2.5773852490340112e-05, "loss": 0.0593, "step": 17812 }, { "epoch": 5.445735249159279, "grad_norm": 0.3509267568588257, "learning_rate": 2.5773427879920174e-05, "loss": 0.0463, "step": 17813 }, { "epoch": 5.446040966065423, "grad_norm": 0.3737746775150299, "learning_rate": 2.5773003269500233e-05, "loss": 0.1088, "step": 17814 }, { "epoch": 5.446346682971568, "grad_norm": 0.5459342002868652, "learning_rate": 2.5772578659080295e-05, "loss": 0.0519, "step": 17815 }, { "epoch": 5.446652399877713, "grad_norm": 0.4371106028556824, "learning_rate": 2.5772154048660353e-05, "loss": 0.0888, "step": 17816 }, { "epoch": 5.4469581167838586, "grad_norm": 0.8115184307098389, "learning_rate": 2.5771729438240415e-05, "loss": 0.0888, "step": 17817 }, { "epoch": 5.447263833690003, "grad_norm": 0.2570803463459015, "learning_rate": 2.5771304827820474e-05, "loss": 0.0833, "step": 17818 }, { "epoch": 5.447569550596148, "grad_norm": 0.5013136863708496, "learning_rate": 2.5770880217400533e-05, "loss": 0.1021, "step": 17819 }, { "epoch": 5.447875267502293, "grad_norm": 1.0097758769989014, "learning_rate": 2.5770455606980595e-05, "loss": 0.1056, "step": 17820 }, { "epoch": 5.448180984408438, "grad_norm": 0.6034950613975525, "learning_rate": 2.5770030996560654e-05, "loss": 0.1744, "step": 17821 }, { "epoch": 5.448486701314582, "grad_norm": 0.9176740050315857, "learning_rate": 2.5769606386140716e-05, "loss": 0.1598, "step": 17822 }, { "epoch": 5.4487924182207275, "grad_norm": 0.9445605874061584, "learning_rate": 2.5769181775720774e-05, "loss": 0.1988, "step": 17823 }, { "epoch": 5.449098135126873, "grad_norm": 0.5561435222625732, "learning_rate": 2.5768757165300836e-05, "loss": 0.1896, "step": 17824 }, { "epoch": 5.449403852033018, "grad_norm": 2.309534788131714, "learning_rate": 2.5768332554880895e-05, "loss": 0.1621, "step": 17825 }, { "epoch": 5.449709568939163, "grad_norm": 1.0407261848449707, "learning_rate": 2.5767907944460957e-05, "loss": 0.18, "step": 17826 }, { "epoch": 5.450015285845307, "grad_norm": 0.5222876667976379, "learning_rate": 2.5767483334041016e-05, "loss": 0.172, "step": 17827 }, { "epoch": 5.450321002751452, "grad_norm": 0.9374191761016846, "learning_rate": 2.5767058723621078e-05, "loss": 0.2083, "step": 17828 }, { "epoch": 5.450626719657597, "grad_norm": 0.6938203573226929, "learning_rate": 2.576663411320114e-05, "loss": 0.2093, "step": 17829 }, { "epoch": 5.450932436563742, "grad_norm": 2.5641679763793945, "learning_rate": 2.57662095027812e-05, "loss": 0.248, "step": 17830 }, { "epoch": 5.451238153469887, "grad_norm": 0.37219178676605225, "learning_rate": 2.576578489236126e-05, "loss": 0.1583, "step": 17831 }, { "epoch": 5.451543870376032, "grad_norm": 0.7019333243370056, "learning_rate": 2.576536028194132e-05, "loss": 0.0795, "step": 17832 }, { "epoch": 5.451849587282177, "grad_norm": 0.2851669490337372, "learning_rate": 2.5764935671521382e-05, "loss": 0.0557, "step": 17833 }, { "epoch": 5.452155304188322, "grad_norm": 0.32095539569854736, "learning_rate": 2.576451106110144e-05, "loss": 0.0593, "step": 17834 }, { "epoch": 5.452461021094466, "grad_norm": 0.29924342036247253, "learning_rate": 2.5764086450681503e-05, "loss": 0.0428, "step": 17835 }, { "epoch": 5.452766738000611, "grad_norm": 0.19267645478248596, "learning_rate": 2.576366184026156e-05, "loss": 0.0656, "step": 17836 }, { "epoch": 5.453072454906756, "grad_norm": 0.35147616267204285, "learning_rate": 2.5763237229841623e-05, "loss": 0.0447, "step": 17837 }, { "epoch": 5.4533781718129015, "grad_norm": 0.4746277630329132, "learning_rate": 2.5762812619421682e-05, "loss": 0.0701, "step": 17838 }, { "epoch": 5.453683888719046, "grad_norm": 0.5283970236778259, "learning_rate": 2.5762388009001744e-05, "loss": 0.0498, "step": 17839 }, { "epoch": 5.453989605625191, "grad_norm": 0.49816539883613586, "learning_rate": 2.5761963398581803e-05, "loss": 0.0805, "step": 17840 }, { "epoch": 5.454295322531336, "grad_norm": 0.33008289337158203, "learning_rate": 2.5761538788161865e-05, "loss": 0.1058, "step": 17841 }, { "epoch": 5.454601039437481, "grad_norm": 0.21536628901958466, "learning_rate": 2.5761114177741924e-05, "loss": 0.0518, "step": 17842 }, { "epoch": 5.454906756343626, "grad_norm": 0.40220242738723755, "learning_rate": 2.5760689567321982e-05, "loss": 0.0827, "step": 17843 }, { "epoch": 5.45521247324977, "grad_norm": 1.160796046257019, "learning_rate": 2.5760264956902044e-05, "loss": 0.1209, "step": 17844 }, { "epoch": 5.4555181901559155, "grad_norm": 1.2322337627410889, "learning_rate": 2.5759840346482103e-05, "loss": 0.1536, "step": 17845 }, { "epoch": 5.455823907062061, "grad_norm": 0.9983041286468506, "learning_rate": 2.5759415736062165e-05, "loss": 0.1731, "step": 17846 }, { "epoch": 5.456129623968206, "grad_norm": 0.48293444514274597, "learning_rate": 2.5758991125642224e-05, "loss": 0.1611, "step": 17847 }, { "epoch": 5.45643534087435, "grad_norm": 1.4989060163497925, "learning_rate": 2.5758566515222286e-05, "loss": 0.1496, "step": 17848 }, { "epoch": 5.456741057780495, "grad_norm": 0.8192698955535889, "learning_rate": 2.5758141904802345e-05, "loss": 0.1667, "step": 17849 }, { "epoch": 5.45704677468664, "grad_norm": 0.5911040306091309, "learning_rate": 2.5757717294382407e-05, "loss": 0.1901, "step": 17850 }, { "epoch": 5.457352491592785, "grad_norm": 0.8569478988647461, "learning_rate": 2.5757292683962465e-05, "loss": 0.166, "step": 17851 }, { "epoch": 5.45765820849893, "grad_norm": 2.0481362342834473, "learning_rate": 2.5756868073542528e-05, "loss": 0.1599, "step": 17852 }, { "epoch": 5.457963925405075, "grad_norm": 0.7978664040565491, "learning_rate": 2.5756443463122586e-05, "loss": 0.1812, "step": 17853 }, { "epoch": 5.45826964231122, "grad_norm": 1.3279223442077637, "learning_rate": 2.575601885270265e-05, "loss": 0.2074, "step": 17854 }, { "epoch": 5.458575359217365, "grad_norm": 1.1623378992080688, "learning_rate": 2.5755594242282707e-05, "loss": 0.2273, "step": 17855 }, { "epoch": 5.45888107612351, "grad_norm": 0.3307773768901825, "learning_rate": 2.5755169631862766e-05, "loss": 0.1469, "step": 17856 }, { "epoch": 5.459186793029654, "grad_norm": 0.6072080135345459, "learning_rate": 2.5754745021442828e-05, "loss": 0.0974, "step": 17857 }, { "epoch": 5.459492509935799, "grad_norm": 0.2480347901582718, "learning_rate": 2.5754320411022886e-05, "loss": 0.0739, "step": 17858 }, { "epoch": 5.4597982268419445, "grad_norm": 0.32724568247795105, "learning_rate": 2.575389580060295e-05, "loss": 0.0736, "step": 17859 }, { "epoch": 5.46010394374809, "grad_norm": 0.17984004318714142, "learning_rate": 2.5753471190183007e-05, "loss": 0.0487, "step": 17860 }, { "epoch": 5.460409660654234, "grad_norm": 0.18378418684005737, "learning_rate": 2.575304657976307e-05, "loss": 0.0607, "step": 17861 }, { "epoch": 5.460715377560379, "grad_norm": 0.22201569378376007, "learning_rate": 2.5752621969343128e-05, "loss": 0.0802, "step": 17862 }, { "epoch": 5.461021094466524, "grad_norm": 0.29437434673309326, "learning_rate": 2.575219735892319e-05, "loss": 0.0511, "step": 17863 }, { "epoch": 5.461326811372669, "grad_norm": 0.31922289729118347, "learning_rate": 2.575177274850325e-05, "loss": 0.056, "step": 17864 }, { "epoch": 5.461632528278813, "grad_norm": 0.28558850288391113, "learning_rate": 2.575134813808331e-05, "loss": 0.084, "step": 17865 }, { "epoch": 5.4619382451849585, "grad_norm": 0.8771422505378723, "learning_rate": 2.575092352766337e-05, "loss": 0.0935, "step": 17866 }, { "epoch": 5.462243962091104, "grad_norm": 0.39268532395362854, "learning_rate": 2.5750498917243432e-05, "loss": 0.0819, "step": 17867 }, { "epoch": 5.462549678997249, "grad_norm": 0.27527761459350586, "learning_rate": 2.575007430682349e-05, "loss": 0.1025, "step": 17868 }, { "epoch": 5.462855395903394, "grad_norm": 0.9179679751396179, "learning_rate": 2.574964969640355e-05, "loss": 0.138, "step": 17869 }, { "epoch": 5.463161112809538, "grad_norm": 0.4506025016307831, "learning_rate": 2.574922508598361e-05, "loss": 0.1033, "step": 17870 }, { "epoch": 5.463466829715683, "grad_norm": 0.7292141318321228, "learning_rate": 2.574880047556367e-05, "loss": 0.1535, "step": 17871 }, { "epoch": 5.463772546621828, "grad_norm": 0.4963727295398712, "learning_rate": 2.5748375865143732e-05, "loss": 0.1458, "step": 17872 }, { "epoch": 5.464078263527973, "grad_norm": 0.7824214696884155, "learning_rate": 2.574795125472379e-05, "loss": 0.1318, "step": 17873 }, { "epoch": 5.464383980434118, "grad_norm": 0.8217960000038147, "learning_rate": 2.5747526644303853e-05, "loss": 0.1484, "step": 17874 }, { "epoch": 5.464689697340263, "grad_norm": 0.5186364650726318, "learning_rate": 2.574710203388391e-05, "loss": 0.1721, "step": 17875 }, { "epoch": 5.464995414246408, "grad_norm": 0.6725145578384399, "learning_rate": 2.5746677423463974e-05, "loss": 0.1828, "step": 17876 }, { "epoch": 5.465301131152553, "grad_norm": 0.5884656310081482, "learning_rate": 2.5746252813044032e-05, "loss": 0.1939, "step": 17877 }, { "epoch": 5.465606848058697, "grad_norm": 1.1644023656845093, "learning_rate": 2.5745828202624094e-05, "loss": 0.21, "step": 17878 }, { "epoch": 5.465912564964842, "grad_norm": 1.3123244047164917, "learning_rate": 2.5745403592204153e-05, "loss": 0.1667, "step": 17879 }, { "epoch": 5.4662182818709875, "grad_norm": 3.2687807083129883, "learning_rate": 2.5744978981784215e-05, "loss": 0.2146, "step": 17880 }, { "epoch": 5.466523998777133, "grad_norm": 0.6401085257530212, "learning_rate": 2.5744554371364274e-05, "loss": 0.128, "step": 17881 }, { "epoch": 5.466829715683278, "grad_norm": 0.21842451393604279, "learning_rate": 2.5744129760944333e-05, "loss": 0.0744, "step": 17882 }, { "epoch": 5.467135432589422, "grad_norm": 0.5760063529014587, "learning_rate": 2.5743705150524395e-05, "loss": 0.0703, "step": 17883 }, { "epoch": 5.467441149495567, "grad_norm": 0.25595325231552124, "learning_rate": 2.5743280540104453e-05, "loss": 0.0706, "step": 17884 }, { "epoch": 5.467746866401712, "grad_norm": 0.2537398934364319, "learning_rate": 2.5742855929684515e-05, "loss": 0.0472, "step": 17885 }, { "epoch": 5.468052583307857, "grad_norm": 0.2519209086894989, "learning_rate": 2.5742431319264574e-05, "loss": 0.062, "step": 17886 }, { "epoch": 5.4683583002140015, "grad_norm": 0.3385803997516632, "learning_rate": 2.5742006708844636e-05, "loss": 0.0568, "step": 17887 }, { "epoch": 5.468664017120147, "grad_norm": 1.4112073183059692, "learning_rate": 2.5741582098424695e-05, "loss": 0.0562, "step": 17888 }, { "epoch": 5.468969734026292, "grad_norm": 0.4135737419128418, "learning_rate": 2.5741157488004757e-05, "loss": 0.0647, "step": 17889 }, { "epoch": 5.469275450932437, "grad_norm": 0.294258713722229, "learning_rate": 2.5740732877584816e-05, "loss": 0.056, "step": 17890 }, { "epoch": 5.469581167838581, "grad_norm": 0.23282390832901, "learning_rate": 2.5740308267164878e-05, "loss": 0.0749, "step": 17891 }, { "epoch": 5.469886884744726, "grad_norm": 0.4785754084587097, "learning_rate": 2.5739883656744937e-05, "loss": 0.0742, "step": 17892 }, { "epoch": 5.470192601650871, "grad_norm": 0.4704841077327728, "learning_rate": 2.5739459046325e-05, "loss": 0.092, "step": 17893 }, { "epoch": 5.470498318557016, "grad_norm": 0.7514201998710632, "learning_rate": 2.5739034435905057e-05, "loss": 0.0981, "step": 17894 }, { "epoch": 5.4708040354631615, "grad_norm": 0.6733801364898682, "learning_rate": 2.5738609825485116e-05, "loss": 0.1639, "step": 17895 }, { "epoch": 5.471109752369306, "grad_norm": 0.3422427475452423, "learning_rate": 2.5738185215065178e-05, "loss": 0.1339, "step": 17896 }, { "epoch": 5.471415469275451, "grad_norm": 0.32212039828300476, "learning_rate": 2.5737760604645237e-05, "loss": 0.1327, "step": 17897 }, { "epoch": 5.471721186181596, "grad_norm": 1.3130426406860352, "learning_rate": 2.57373359942253e-05, "loss": 0.1536, "step": 17898 }, { "epoch": 5.472026903087741, "grad_norm": 1.2300153970718384, "learning_rate": 2.5736911383805358e-05, "loss": 0.1994, "step": 17899 }, { "epoch": 5.472332619993885, "grad_norm": 0.5897002816200256, "learning_rate": 2.573648677338542e-05, "loss": 0.1775, "step": 17900 }, { "epoch": 5.47263833690003, "grad_norm": 1.2561157941818237, "learning_rate": 2.573606216296548e-05, "loss": 0.1948, "step": 17901 }, { "epoch": 5.4729440538061755, "grad_norm": 0.685992419719696, "learning_rate": 2.573563755254554e-05, "loss": 0.1803, "step": 17902 }, { "epoch": 5.473249770712321, "grad_norm": 1.315086007118225, "learning_rate": 2.57352129421256e-05, "loss": 0.218, "step": 17903 }, { "epoch": 5.473555487618465, "grad_norm": 0.8976170420646667, "learning_rate": 2.573478833170566e-05, "loss": 0.1717, "step": 17904 }, { "epoch": 5.47386120452461, "grad_norm": 0.9299706220626831, "learning_rate": 2.573436372128572e-05, "loss": 0.2844, "step": 17905 }, { "epoch": 5.474166921430755, "grad_norm": 0.4857066869735718, "learning_rate": 2.5733939110865782e-05, "loss": 0.1513, "step": 17906 }, { "epoch": 5.4744726383369, "grad_norm": 0.3851558566093445, "learning_rate": 2.573351450044584e-05, "loss": 0.0888, "step": 17907 }, { "epoch": 5.474778355243045, "grad_norm": 0.41809648275375366, "learning_rate": 2.57330898900259e-05, "loss": 0.0716, "step": 17908 }, { "epoch": 5.47508407214919, "grad_norm": 0.1671319603919983, "learning_rate": 2.573266527960596e-05, "loss": 0.0507, "step": 17909 }, { "epoch": 5.475389789055335, "grad_norm": 0.4472218155860901, "learning_rate": 2.573224066918602e-05, "loss": 0.0718, "step": 17910 }, { "epoch": 5.47569550596148, "grad_norm": 0.2999393343925476, "learning_rate": 2.5731816058766082e-05, "loss": 0.0536, "step": 17911 }, { "epoch": 5.476001222867625, "grad_norm": 0.19108489155769348, "learning_rate": 2.573139144834614e-05, "loss": 0.047, "step": 17912 }, { "epoch": 5.476306939773769, "grad_norm": 0.5545867681503296, "learning_rate": 2.5730966837926203e-05, "loss": 0.0768, "step": 17913 }, { "epoch": 5.476612656679914, "grad_norm": 0.2680521607398987, "learning_rate": 2.5730542227506262e-05, "loss": 0.0595, "step": 17914 }, { "epoch": 5.476918373586059, "grad_norm": 0.3152911365032196, "learning_rate": 2.5730117617086324e-05, "loss": 0.0763, "step": 17915 }, { "epoch": 5.4772240904922045, "grad_norm": 0.3449380695819855, "learning_rate": 2.5729693006666383e-05, "loss": 0.0943, "step": 17916 }, { "epoch": 5.477529807398349, "grad_norm": 0.2861163020133972, "learning_rate": 2.5729268396246445e-05, "loss": 0.0852, "step": 17917 }, { "epoch": 5.477835524304494, "grad_norm": 0.6900731921195984, "learning_rate": 2.5728843785826503e-05, "loss": 0.1033, "step": 17918 }, { "epoch": 5.478141241210639, "grad_norm": 0.6244825124740601, "learning_rate": 2.5728419175406565e-05, "loss": 0.1462, "step": 17919 }, { "epoch": 5.478446958116784, "grad_norm": 0.30325794219970703, "learning_rate": 2.5727994564986624e-05, "loss": 0.0928, "step": 17920 }, { "epoch": 5.478752675022929, "grad_norm": 0.3862561881542206, "learning_rate": 2.5727569954566683e-05, "loss": 0.1388, "step": 17921 }, { "epoch": 5.479058391929073, "grad_norm": 0.43946072459220886, "learning_rate": 2.5727145344146745e-05, "loss": 0.1549, "step": 17922 }, { "epoch": 5.4793641088352185, "grad_norm": 0.860380232334137, "learning_rate": 2.5726720733726804e-05, "loss": 0.1705, "step": 17923 }, { "epoch": 5.479669825741364, "grad_norm": 0.8287355303764343, "learning_rate": 2.5726296123306866e-05, "loss": 0.152, "step": 17924 }, { "epoch": 5.479975542647509, "grad_norm": 1.619492530822754, "learning_rate": 2.5725871512886924e-05, "loss": 0.1848, "step": 17925 }, { "epoch": 5.480281259553653, "grad_norm": 2.8244659900665283, "learning_rate": 2.5725446902466987e-05, "loss": 0.1667, "step": 17926 }, { "epoch": 5.480586976459798, "grad_norm": 1.565324306488037, "learning_rate": 2.5725022292047045e-05, "loss": 0.1675, "step": 17927 }, { "epoch": 5.480892693365943, "grad_norm": 0.7465373873710632, "learning_rate": 2.5724597681627107e-05, "loss": 0.1706, "step": 17928 }, { "epoch": 5.481198410272088, "grad_norm": 0.8872531652450562, "learning_rate": 2.5724173071207166e-05, "loss": 0.1895, "step": 17929 }, { "epoch": 5.4815041271782325, "grad_norm": 1.728035569190979, "learning_rate": 2.5723748460787228e-05, "loss": 0.2419, "step": 17930 }, { "epoch": 5.481809844084378, "grad_norm": 1.3466180562973022, "learning_rate": 2.572332385036729e-05, "loss": 0.1437, "step": 17931 }, { "epoch": 5.482115560990523, "grad_norm": 0.9332375526428223, "learning_rate": 2.5722899239947352e-05, "loss": 0.0889, "step": 17932 }, { "epoch": 5.482421277896668, "grad_norm": 0.25682690739631653, "learning_rate": 2.572247462952741e-05, "loss": 0.0558, "step": 17933 }, { "epoch": 5.482726994802813, "grad_norm": 0.33719635009765625, "learning_rate": 2.572205001910747e-05, "loss": 0.0657, "step": 17934 }, { "epoch": 5.483032711708957, "grad_norm": 0.285445898771286, "learning_rate": 2.5721625408687532e-05, "loss": 0.0476, "step": 17935 }, { "epoch": 5.483338428615102, "grad_norm": 0.21773262321949005, "learning_rate": 2.572120079826759e-05, "loss": 0.0615, "step": 17936 }, { "epoch": 5.4836441455212475, "grad_norm": 1.9207193851470947, "learning_rate": 2.5720776187847653e-05, "loss": 0.0381, "step": 17937 }, { "epoch": 5.483949862427393, "grad_norm": 0.3880285322666168, "learning_rate": 2.572035157742771e-05, "loss": 0.0618, "step": 17938 }, { "epoch": 5.484255579333537, "grad_norm": 0.3643975555896759, "learning_rate": 2.5719926967007773e-05, "loss": 0.0703, "step": 17939 }, { "epoch": 5.484561296239682, "grad_norm": 0.2682657241821289, "learning_rate": 2.5719502356587832e-05, "loss": 0.0567, "step": 17940 }, { "epoch": 5.484867013145827, "grad_norm": 0.5408663749694824, "learning_rate": 2.5719077746167894e-05, "loss": 0.0927, "step": 17941 }, { "epoch": 5.485172730051972, "grad_norm": 0.2938993275165558, "learning_rate": 2.5718653135747953e-05, "loss": 0.0795, "step": 17942 }, { "epoch": 5.485478446958116, "grad_norm": 0.3331314027309418, "learning_rate": 2.5718228525328015e-05, "loss": 0.0911, "step": 17943 }, { "epoch": 5.4857841638642615, "grad_norm": 0.4607211947441101, "learning_rate": 2.5717803914908074e-05, "loss": 0.117, "step": 17944 }, { "epoch": 5.486089880770407, "grad_norm": 0.6406403183937073, "learning_rate": 2.5717379304488132e-05, "loss": 0.1705, "step": 17945 }, { "epoch": 5.486395597676552, "grad_norm": 1.08026123046875, "learning_rate": 2.5716954694068194e-05, "loss": 0.135, "step": 17946 }, { "epoch": 5.486701314582697, "grad_norm": 1.0211955308914185, "learning_rate": 2.5716530083648253e-05, "loss": 0.1407, "step": 17947 }, { "epoch": 5.487007031488841, "grad_norm": 0.4607406258583069, "learning_rate": 2.5716105473228315e-05, "loss": 0.1552, "step": 17948 }, { "epoch": 5.487312748394986, "grad_norm": 1.9993420839309692, "learning_rate": 2.5715680862808374e-05, "loss": 0.1468, "step": 17949 }, { "epoch": 5.487618465301131, "grad_norm": 1.4075638055801392, "learning_rate": 2.5715256252388436e-05, "loss": 0.1657, "step": 17950 }, { "epoch": 5.487924182207276, "grad_norm": 0.6828092932701111, "learning_rate": 2.5714831641968495e-05, "loss": 0.154, "step": 17951 }, { "epoch": 5.488229899113421, "grad_norm": 0.9647865295410156, "learning_rate": 2.5714407031548557e-05, "loss": 0.1767, "step": 17952 }, { "epoch": 5.488535616019566, "grad_norm": 1.5772550106048584, "learning_rate": 2.5713982421128615e-05, "loss": 0.1541, "step": 17953 }, { "epoch": 5.488841332925711, "grad_norm": 0.7171614170074463, "learning_rate": 2.5713557810708678e-05, "loss": 0.1763, "step": 17954 }, { "epoch": 5.489147049831856, "grad_norm": 15.90996265411377, "learning_rate": 2.5713133200288736e-05, "loss": 0.2915, "step": 17955 }, { "epoch": 5.489452766738, "grad_norm": 0.2997455894947052, "learning_rate": 2.57127085898688e-05, "loss": 0.1281, "step": 17956 }, { "epoch": 5.489758483644145, "grad_norm": 0.17056308686733246, "learning_rate": 2.5712283979448857e-05, "loss": 0.0753, "step": 17957 }, { "epoch": 5.49006420055029, "grad_norm": 0.4078495502471924, "learning_rate": 2.5711859369028916e-05, "loss": 0.0725, "step": 17958 }, { "epoch": 5.4903699174564355, "grad_norm": 0.1958223134279251, "learning_rate": 2.5711434758608978e-05, "loss": 0.0527, "step": 17959 }, { "epoch": 5.490675634362581, "grad_norm": 0.1767519861459732, "learning_rate": 2.5711010148189037e-05, "loss": 0.0443, "step": 17960 }, { "epoch": 5.490981351268725, "grad_norm": 0.15656016767024994, "learning_rate": 2.57105855377691e-05, "loss": 0.0398, "step": 17961 }, { "epoch": 5.49128706817487, "grad_norm": 0.6143420338630676, "learning_rate": 2.5710160927349157e-05, "loss": 0.0537, "step": 17962 }, { "epoch": 5.491592785081015, "grad_norm": 0.48772621154785156, "learning_rate": 2.570973631692922e-05, "loss": 0.0754, "step": 17963 }, { "epoch": 5.49189850198716, "grad_norm": 0.701438307762146, "learning_rate": 2.5709311706509278e-05, "loss": 0.0758, "step": 17964 }, { "epoch": 5.4922042188933045, "grad_norm": 0.23345650732517242, "learning_rate": 2.570888709608934e-05, "loss": 0.064, "step": 17965 }, { "epoch": 5.49250993579945, "grad_norm": 0.25750914216041565, "learning_rate": 2.57084624856694e-05, "loss": 0.0864, "step": 17966 }, { "epoch": 5.492815652705595, "grad_norm": 0.3201320469379425, "learning_rate": 2.570803787524946e-05, "loss": 0.0777, "step": 17967 }, { "epoch": 5.49312136961174, "grad_norm": 1.2221693992614746, "learning_rate": 2.570761326482952e-05, "loss": 0.0953, "step": 17968 }, { "epoch": 5.493427086517884, "grad_norm": 0.36956122517585754, "learning_rate": 2.5707188654409582e-05, "loss": 0.093, "step": 17969 }, { "epoch": 5.493732803424029, "grad_norm": 0.9189037084579468, "learning_rate": 2.570676404398964e-05, "loss": 0.1244, "step": 17970 }, { "epoch": 5.494038520330174, "grad_norm": 0.2695816159248352, "learning_rate": 2.57063394335697e-05, "loss": 0.1179, "step": 17971 }, { "epoch": 5.494344237236319, "grad_norm": 1.001640796661377, "learning_rate": 2.570591482314976e-05, "loss": 0.1557, "step": 17972 }, { "epoch": 5.4946499541424645, "grad_norm": 1.3354674577713013, "learning_rate": 2.570549021272982e-05, "loss": 0.1708, "step": 17973 }, { "epoch": 5.494955671048609, "grad_norm": 0.6881787180900574, "learning_rate": 2.5705065602309882e-05, "loss": 0.179, "step": 17974 }, { "epoch": 5.495261387954754, "grad_norm": 2.2737510204315186, "learning_rate": 2.570464099188994e-05, "loss": 0.1498, "step": 17975 }, { "epoch": 5.495567104860899, "grad_norm": 0.7350075840950012, "learning_rate": 2.5704216381470003e-05, "loss": 0.1822, "step": 17976 }, { "epoch": 5.495872821767044, "grad_norm": 0.716929018497467, "learning_rate": 2.570379177105006e-05, "loss": 0.2028, "step": 17977 }, { "epoch": 5.496178538673188, "grad_norm": 0.9242053627967834, "learning_rate": 2.5703367160630124e-05, "loss": 0.1735, "step": 17978 }, { "epoch": 5.496484255579333, "grad_norm": 1.0835851430892944, "learning_rate": 2.5702942550210182e-05, "loss": 0.1761, "step": 17979 }, { "epoch": 5.4967899724854785, "grad_norm": 7.688479900360107, "learning_rate": 2.5702517939790244e-05, "loss": 0.244, "step": 17980 }, { "epoch": 5.497095689391624, "grad_norm": 1.1996625661849976, "learning_rate": 2.5702093329370303e-05, "loss": 0.1309, "step": 17981 }, { "epoch": 5.497401406297768, "grad_norm": 0.37689611315727234, "learning_rate": 2.5701668718950365e-05, "loss": 0.0848, "step": 17982 }, { "epoch": 5.497707123203913, "grad_norm": 0.47367623448371887, "learning_rate": 2.5701244108530424e-05, "loss": 0.0861, "step": 17983 }, { "epoch": 5.498012840110058, "grad_norm": 0.200336754322052, "learning_rate": 2.5700819498110483e-05, "loss": 0.0639, "step": 17984 }, { "epoch": 5.498318557016203, "grad_norm": 0.29526233673095703, "learning_rate": 2.5700394887690545e-05, "loss": 0.0516, "step": 17985 }, { "epoch": 5.498624273922348, "grad_norm": 0.23640300333499908, "learning_rate": 2.5699970277270603e-05, "loss": 0.0552, "step": 17986 }, { "epoch": 5.4989299908284925, "grad_norm": 0.24052345752716064, "learning_rate": 2.5699545666850665e-05, "loss": 0.0449, "step": 17987 }, { "epoch": 5.499235707734638, "grad_norm": 0.3726736903190613, "learning_rate": 2.5699121056430724e-05, "loss": 0.0728, "step": 17988 }, { "epoch": 5.499541424640783, "grad_norm": 0.596839189529419, "learning_rate": 2.5698696446010786e-05, "loss": 0.1038, "step": 17989 }, { "epoch": 5.499847141546928, "grad_norm": 0.2804550230503082, "learning_rate": 2.5698271835590845e-05, "loss": 0.0746, "step": 17990 }, { "epoch": 5.500152858453072, "grad_norm": 0.8150602579116821, "learning_rate": 2.5697847225170907e-05, "loss": 0.0581, "step": 17991 }, { "epoch": 5.500458575359217, "grad_norm": 0.3968225419521332, "learning_rate": 2.5697422614750966e-05, "loss": 0.0797, "step": 17992 }, { "epoch": 5.500764292265362, "grad_norm": 0.543877899646759, "learning_rate": 2.5696998004331028e-05, "loss": 0.1013, "step": 17993 }, { "epoch": 5.5010700091715075, "grad_norm": 0.6455498337745667, "learning_rate": 2.5696573393911087e-05, "loss": 0.1374, "step": 17994 }, { "epoch": 5.501375726077653, "grad_norm": 0.42560887336730957, "learning_rate": 2.569614878349115e-05, "loss": 0.1516, "step": 17995 }, { "epoch": 5.501681442983797, "grad_norm": 0.6599999070167542, "learning_rate": 2.5695724173071207e-05, "loss": 0.1152, "step": 17996 }, { "epoch": 5.501987159889942, "grad_norm": 5.0673723220825195, "learning_rate": 2.5695299562651266e-05, "loss": 0.1608, "step": 17997 }, { "epoch": 5.502292876796087, "grad_norm": 1.5063166618347168, "learning_rate": 2.5694874952231328e-05, "loss": 0.1648, "step": 17998 }, { "epoch": 5.502598593702231, "grad_norm": 0.5931892991065979, "learning_rate": 2.5694450341811387e-05, "loss": 0.1772, "step": 17999 }, { "epoch": 5.502904310608376, "grad_norm": 2.8192436695098877, "learning_rate": 2.569402573139145e-05, "loss": 0.1974, "step": 18000 }, { "epoch": 5.502904310608376, "eval_cer": 0.18989558443258814, "eval_loss": 0.23962095379829407, "eval_runtime": 18.9961, "eval_samples_per_second": 238.892, "eval_steps_per_second": 0.79, "eval_wer": 0.3332309153375604, "step": 18000 }, { "epoch": 5.5032100275145215, "grad_norm": 0.6267180442810059, "learning_rate": 2.5693601120971508e-05, "loss": 0.1852, "step": 18001 }, { "epoch": 5.503515744420667, "grad_norm": 0.8950520157814026, "learning_rate": 2.569317651055157e-05, "loss": 0.1748, "step": 18002 }, { "epoch": 5.503821461326812, "grad_norm": 1.757005214691162, "learning_rate": 2.569275190013163e-05, "loss": 0.1935, "step": 18003 }, { "epoch": 5.504127178232956, "grad_norm": 1.2944791316986084, "learning_rate": 2.569232728971169e-05, "loss": 0.2222, "step": 18004 }, { "epoch": 5.504432895139101, "grad_norm": 1.018923282623291, "learning_rate": 2.569190267929175e-05, "loss": 0.2365, "step": 18005 }, { "epoch": 5.504738612045246, "grad_norm": 0.831725001335144, "learning_rate": 2.569147806887181e-05, "loss": 0.139, "step": 18006 }, { "epoch": 5.505044328951391, "grad_norm": 0.23841066658496857, "learning_rate": 2.569105345845187e-05, "loss": 0.0684, "step": 18007 }, { "epoch": 5.505350045857536, "grad_norm": 0.2837525010108948, "learning_rate": 2.5690628848031932e-05, "loss": 0.0846, "step": 18008 }, { "epoch": 5.505655762763681, "grad_norm": 0.15218950808048248, "learning_rate": 2.569020423761199e-05, "loss": 0.0465, "step": 18009 }, { "epoch": 5.505961479669826, "grad_norm": 0.22203975915908813, "learning_rate": 2.568977962719205e-05, "loss": 0.0752, "step": 18010 }, { "epoch": 5.506267196575971, "grad_norm": 0.38195931911468506, "learning_rate": 2.568935501677211e-05, "loss": 0.08, "step": 18011 }, { "epoch": 5.506572913482115, "grad_norm": 1.2909266948699951, "learning_rate": 2.568893040635217e-05, "loss": 0.0509, "step": 18012 }, { "epoch": 5.50687863038826, "grad_norm": 0.520631730556488, "learning_rate": 2.5688505795932232e-05, "loss": 0.0574, "step": 18013 }, { "epoch": 5.507184347294405, "grad_norm": 0.41378307342529297, "learning_rate": 2.568808118551229e-05, "loss": 0.0573, "step": 18014 }, { "epoch": 5.50749006420055, "grad_norm": 0.33442947268486023, "learning_rate": 2.5687656575092353e-05, "loss": 0.077, "step": 18015 }, { "epoch": 5.5077957811066955, "grad_norm": 0.3143943250179291, "learning_rate": 2.5687231964672412e-05, "loss": 0.1072, "step": 18016 }, { "epoch": 5.50810149801284, "grad_norm": 1.1245054006576538, "learning_rate": 2.5686807354252474e-05, "loss": 0.0659, "step": 18017 }, { "epoch": 5.508407214918985, "grad_norm": 0.7692553400993347, "learning_rate": 2.5686382743832533e-05, "loss": 0.1176, "step": 18018 }, { "epoch": 5.50871293182513, "grad_norm": 2.027956962585449, "learning_rate": 2.5685958133412595e-05, "loss": 0.101, "step": 18019 }, { "epoch": 5.509018648731275, "grad_norm": 0.2993135452270508, "learning_rate": 2.5685533522992653e-05, "loss": 0.1103, "step": 18020 }, { "epoch": 5.50932436563742, "grad_norm": 0.6649914979934692, "learning_rate": 2.5685108912572715e-05, "loss": 0.132, "step": 18021 }, { "epoch": 5.5096300825435645, "grad_norm": 1.448671817779541, "learning_rate": 2.5684684302152774e-05, "loss": 0.1556, "step": 18022 }, { "epoch": 5.50993579944971, "grad_norm": 0.44917911291122437, "learning_rate": 2.5684259691732833e-05, "loss": 0.1351, "step": 18023 }, { "epoch": 5.510241516355855, "grad_norm": 0.7281275987625122, "learning_rate": 2.5683835081312895e-05, "loss": 0.1399, "step": 18024 }, { "epoch": 5.510547233261999, "grad_norm": 7.623199939727783, "learning_rate": 2.5683410470892954e-05, "loss": 0.1423, "step": 18025 }, { "epoch": 5.510852950168144, "grad_norm": 0.9083811640739441, "learning_rate": 2.5682985860473016e-05, "loss": 0.2289, "step": 18026 }, { "epoch": 5.511158667074289, "grad_norm": 0.5946227312088013, "learning_rate": 2.5682561250053074e-05, "loss": 0.1593, "step": 18027 }, { "epoch": 5.511464383980434, "grad_norm": 5.159407138824463, "learning_rate": 2.5682136639633137e-05, "loss": 0.186, "step": 18028 }, { "epoch": 5.511770100886579, "grad_norm": 1.0201890468597412, "learning_rate": 2.5681712029213195e-05, "loss": 0.1917, "step": 18029 }, { "epoch": 5.512075817792724, "grad_norm": 2.83345365524292, "learning_rate": 2.5681287418793257e-05, "loss": 0.2159, "step": 18030 }, { "epoch": 5.512381534698869, "grad_norm": 0.8982387781143188, "learning_rate": 2.5680862808373316e-05, "loss": 0.1249, "step": 18031 }, { "epoch": 5.512687251605014, "grad_norm": 0.20579230785369873, "learning_rate": 2.5680438197953378e-05, "loss": 0.0493, "step": 18032 }, { "epoch": 5.512992968511159, "grad_norm": 0.22487123310565948, "learning_rate": 2.568001358753344e-05, "loss": 0.0502, "step": 18033 }, { "epoch": 5.513298685417304, "grad_norm": 1.0245338678359985, "learning_rate": 2.5679588977113502e-05, "loss": 0.0392, "step": 18034 }, { "epoch": 5.513604402323448, "grad_norm": 0.38566508889198303, "learning_rate": 2.567916436669356e-05, "loss": 0.062, "step": 18035 }, { "epoch": 5.513910119229593, "grad_norm": 0.2567332088947296, "learning_rate": 2.567873975627362e-05, "loss": 0.0534, "step": 18036 }, { "epoch": 5.5142158361357385, "grad_norm": 0.18243053555488586, "learning_rate": 2.5678315145853682e-05, "loss": 0.0533, "step": 18037 }, { "epoch": 5.514521553041883, "grad_norm": 0.24854175746440887, "learning_rate": 2.567789053543374e-05, "loss": 0.0549, "step": 18038 }, { "epoch": 5.514827269948028, "grad_norm": 0.4536980986595154, "learning_rate": 2.5677465925013803e-05, "loss": 0.1048, "step": 18039 }, { "epoch": 5.515132986854173, "grad_norm": 0.25529298186302185, "learning_rate": 2.567704131459386e-05, "loss": 0.0808, "step": 18040 }, { "epoch": 5.515438703760318, "grad_norm": 1.3842926025390625, "learning_rate": 2.5676616704173923e-05, "loss": 0.1117, "step": 18041 }, { "epoch": 5.515744420666463, "grad_norm": 0.6016749739646912, "learning_rate": 2.5676192093753982e-05, "loss": 0.12, "step": 18042 }, { "epoch": 5.516050137572607, "grad_norm": 0.2978745400905609, "learning_rate": 2.5675767483334044e-05, "loss": 0.076, "step": 18043 }, { "epoch": 5.5163558544787525, "grad_norm": 0.3759467303752899, "learning_rate": 2.5675342872914103e-05, "loss": 0.126, "step": 18044 }, { "epoch": 5.516661571384898, "grad_norm": 1.0639989376068115, "learning_rate": 2.5674918262494165e-05, "loss": 0.1309, "step": 18045 }, { "epoch": 5.516967288291043, "grad_norm": 0.4187204837799072, "learning_rate": 2.5674493652074224e-05, "loss": 0.1424, "step": 18046 }, { "epoch": 5.517273005197188, "grad_norm": 0.40534666180610657, "learning_rate": 2.5674069041654286e-05, "loss": 0.1514, "step": 18047 }, { "epoch": 5.517578722103332, "grad_norm": 1.7538563013076782, "learning_rate": 2.5673644431234344e-05, "loss": 0.1564, "step": 18048 }, { "epoch": 5.517884439009477, "grad_norm": 0.867701530456543, "learning_rate": 2.5673219820814403e-05, "loss": 0.1404, "step": 18049 }, { "epoch": 5.518190155915622, "grad_norm": 1.5014516115188599, "learning_rate": 2.5672795210394465e-05, "loss": 0.1561, "step": 18050 }, { "epoch": 5.518495872821767, "grad_norm": 0.8398255109786987, "learning_rate": 2.5672370599974524e-05, "loss": 0.1907, "step": 18051 }, { "epoch": 5.518801589727912, "grad_norm": 0.6833816766738892, "learning_rate": 2.5671945989554586e-05, "loss": 0.1951, "step": 18052 }, { "epoch": 5.519107306634057, "grad_norm": 0.609977662563324, "learning_rate": 2.5671521379134645e-05, "loss": 0.1496, "step": 18053 }, { "epoch": 5.519413023540202, "grad_norm": 1.7919082641601562, "learning_rate": 2.5671096768714707e-05, "loss": 0.2006, "step": 18054 }, { "epoch": 5.519718740446347, "grad_norm": 1.4415675401687622, "learning_rate": 2.5670672158294765e-05, "loss": 0.2466, "step": 18055 }, { "epoch": 5.520024457352491, "grad_norm": 0.34602227807044983, "learning_rate": 2.5670247547874828e-05, "loss": 0.1456, "step": 18056 }, { "epoch": 5.520330174258636, "grad_norm": 0.23795053362846375, "learning_rate": 2.5669822937454886e-05, "loss": 0.0836, "step": 18057 }, { "epoch": 5.5206358911647815, "grad_norm": 0.29530754685401917, "learning_rate": 2.566939832703495e-05, "loss": 0.0732, "step": 18058 }, { "epoch": 5.520941608070927, "grad_norm": 0.22950813174247742, "learning_rate": 2.5668973716615007e-05, "loss": 0.051, "step": 18059 }, { "epoch": 5.521247324977072, "grad_norm": 0.6639609932899475, "learning_rate": 2.566854910619507e-05, "loss": 0.0552, "step": 18060 }, { "epoch": 5.521553041883216, "grad_norm": 0.22328715026378632, "learning_rate": 2.5668124495775128e-05, "loss": 0.0642, "step": 18061 }, { "epoch": 5.521858758789361, "grad_norm": 0.5070368051528931, "learning_rate": 2.5667699885355187e-05, "loss": 0.0455, "step": 18062 }, { "epoch": 5.522164475695506, "grad_norm": 0.3414435088634491, "learning_rate": 2.566727527493525e-05, "loss": 0.0512, "step": 18063 }, { "epoch": 5.52247019260165, "grad_norm": 0.7999484539031982, "learning_rate": 2.5666850664515307e-05, "loss": 0.0785, "step": 18064 }, { "epoch": 5.5227759095077955, "grad_norm": 0.2931734621524811, "learning_rate": 2.566642605409537e-05, "loss": 0.0812, "step": 18065 }, { "epoch": 5.523081626413941, "grad_norm": 0.3243177831172943, "learning_rate": 2.5666001443675428e-05, "loss": 0.0695, "step": 18066 }, { "epoch": 5.523387343320086, "grad_norm": 2.7243237495422363, "learning_rate": 2.566557683325549e-05, "loss": 0.0804, "step": 18067 }, { "epoch": 5.523693060226231, "grad_norm": 0.3809860944747925, "learning_rate": 2.566515222283555e-05, "loss": 0.0992, "step": 18068 }, { "epoch": 5.523998777132375, "grad_norm": 0.32526862621307373, "learning_rate": 2.566472761241561e-05, "loss": 0.0891, "step": 18069 }, { "epoch": 5.52430449403852, "grad_norm": 0.3083510100841522, "learning_rate": 2.566430300199567e-05, "loss": 0.1216, "step": 18070 }, { "epoch": 5.524610210944665, "grad_norm": 0.48087623715400696, "learning_rate": 2.5663878391575732e-05, "loss": 0.1484, "step": 18071 }, { "epoch": 5.52491592785081, "grad_norm": 1.0334500074386597, "learning_rate": 2.566345378115579e-05, "loss": 0.1534, "step": 18072 }, { "epoch": 5.5252216447569555, "grad_norm": 0.7263479232788086, "learning_rate": 2.566302917073585e-05, "loss": 0.1743, "step": 18073 }, { "epoch": 5.5255273616631, "grad_norm": 0.7074320912361145, "learning_rate": 2.566260456031591e-05, "loss": 0.1774, "step": 18074 }, { "epoch": 5.525833078569245, "grad_norm": 1.872473955154419, "learning_rate": 2.566217994989597e-05, "loss": 0.1943, "step": 18075 }, { "epoch": 5.52613879547539, "grad_norm": 0.6966062188148499, "learning_rate": 2.5661755339476032e-05, "loss": 0.1512, "step": 18076 }, { "epoch": 5.526444512381534, "grad_norm": 4.572185516357422, "learning_rate": 2.566133072905609e-05, "loss": 0.2025, "step": 18077 }, { "epoch": 5.526750229287679, "grad_norm": 1.2679247856140137, "learning_rate": 2.5660906118636153e-05, "loss": 0.1533, "step": 18078 }, { "epoch": 5.5270559461938245, "grad_norm": 0.8264580965042114, "learning_rate": 2.566048150821621e-05, "loss": 0.2304, "step": 18079 }, { "epoch": 5.52736166309997, "grad_norm": 1.1976076364517212, "learning_rate": 2.5660056897796274e-05, "loss": 0.2008, "step": 18080 }, { "epoch": 5.527667380006115, "grad_norm": 1.0164154767990112, "learning_rate": 2.5659632287376332e-05, "loss": 0.1443, "step": 18081 }, { "epoch": 5.527973096912259, "grad_norm": 0.6286493539810181, "learning_rate": 2.5659207676956394e-05, "loss": 0.0996, "step": 18082 }, { "epoch": 5.528278813818404, "grad_norm": 0.5151241421699524, "learning_rate": 2.5658783066536453e-05, "loss": 0.0771, "step": 18083 }, { "epoch": 5.528584530724549, "grad_norm": 0.18052320182323456, "learning_rate": 2.5658358456116515e-05, "loss": 0.0566, "step": 18084 }, { "epoch": 5.528890247630694, "grad_norm": 0.17068856954574585, "learning_rate": 2.5657933845696574e-05, "loss": 0.0587, "step": 18085 }, { "epoch": 5.529195964536839, "grad_norm": 0.13805927336215973, "learning_rate": 2.5657509235276633e-05, "loss": 0.0495, "step": 18086 }, { "epoch": 5.529501681442984, "grad_norm": 0.6500966548919678, "learning_rate": 2.5657084624856695e-05, "loss": 0.0746, "step": 18087 }, { "epoch": 5.529807398349129, "grad_norm": 0.3060779273509979, "learning_rate": 2.5656660014436753e-05, "loss": 0.0571, "step": 18088 }, { "epoch": 5.530113115255274, "grad_norm": 0.6952208876609802, "learning_rate": 2.5656235404016815e-05, "loss": 0.0773, "step": 18089 }, { "epoch": 5.530418832161418, "grad_norm": 0.24278368055820465, "learning_rate": 2.5655810793596874e-05, "loss": 0.0554, "step": 18090 }, { "epoch": 5.530724549067563, "grad_norm": 0.4738689959049225, "learning_rate": 2.5655386183176936e-05, "loss": 0.1074, "step": 18091 }, { "epoch": 5.531030265973708, "grad_norm": 0.25881439447402954, "learning_rate": 2.5654961572756995e-05, "loss": 0.0703, "step": 18092 }, { "epoch": 5.531335982879853, "grad_norm": 0.5086950659751892, "learning_rate": 2.5654536962337057e-05, "loss": 0.0767, "step": 18093 }, { "epoch": 5.5316416997859985, "grad_norm": 0.37594130635261536, "learning_rate": 2.5654112351917116e-05, "loss": 0.1246, "step": 18094 }, { "epoch": 5.531947416692143, "grad_norm": 0.44853153824806213, "learning_rate": 2.5653687741497178e-05, "loss": 0.1142, "step": 18095 }, { "epoch": 5.532253133598288, "grad_norm": 0.7862557172775269, "learning_rate": 2.5653263131077237e-05, "loss": 0.1422, "step": 18096 }, { "epoch": 5.532558850504433, "grad_norm": 0.4674795866012573, "learning_rate": 2.56528385206573e-05, "loss": 0.1287, "step": 18097 }, { "epoch": 5.532864567410578, "grad_norm": 1.563585877418518, "learning_rate": 2.5652413910237357e-05, "loss": 0.1471, "step": 18098 }, { "epoch": 5.533170284316723, "grad_norm": 0.3706517815589905, "learning_rate": 2.5651989299817416e-05, "loss": 0.1776, "step": 18099 }, { "epoch": 5.533476001222867, "grad_norm": 0.5871512293815613, "learning_rate": 2.5651564689397478e-05, "loss": 0.1741, "step": 18100 }, { "epoch": 5.5337817181290125, "grad_norm": 0.7111625075340271, "learning_rate": 2.5651140078977537e-05, "loss": 0.1772, "step": 18101 }, { "epoch": 5.534087435035158, "grad_norm": 0.9615498185157776, "learning_rate": 2.56507154685576e-05, "loss": 0.168, "step": 18102 }, { "epoch": 5.534393151941302, "grad_norm": 1.6901288032531738, "learning_rate": 2.5650290858137658e-05, "loss": 0.1846, "step": 18103 }, { "epoch": 5.534698868847447, "grad_norm": 3.1088223457336426, "learning_rate": 2.564986624771772e-05, "loss": 0.1998, "step": 18104 }, { "epoch": 5.535004585753592, "grad_norm": 1.8949530124664307, "learning_rate": 2.564944163729778e-05, "loss": 0.1899, "step": 18105 }, { "epoch": 5.535310302659737, "grad_norm": 0.4656505286693573, "learning_rate": 2.564901702687784e-05, "loss": 0.1547, "step": 18106 }, { "epoch": 5.535616019565882, "grad_norm": 0.2301201969385147, "learning_rate": 2.56485924164579e-05, "loss": 0.0854, "step": 18107 }, { "epoch": 5.535921736472027, "grad_norm": 0.3392500877380371, "learning_rate": 2.564816780603796e-05, "loss": 0.0673, "step": 18108 }, { "epoch": 5.536227453378172, "grad_norm": 0.4396427273750305, "learning_rate": 2.564774319561802e-05, "loss": 0.0666, "step": 18109 }, { "epoch": 5.536533170284317, "grad_norm": 0.39685386419296265, "learning_rate": 2.5647318585198082e-05, "loss": 0.0588, "step": 18110 }, { "epoch": 5.536838887190462, "grad_norm": 0.46919816732406616, "learning_rate": 2.564689397477814e-05, "loss": 0.0709, "step": 18111 }, { "epoch": 5.537144604096607, "grad_norm": 0.4767204821109772, "learning_rate": 2.56464693643582e-05, "loss": 0.0544, "step": 18112 }, { "epoch": 5.537450321002751, "grad_norm": 3.6391921043395996, "learning_rate": 2.564604475393826e-05, "loss": 0.0405, "step": 18113 }, { "epoch": 5.537756037908896, "grad_norm": 0.40291157364845276, "learning_rate": 2.564562014351832e-05, "loss": 0.1056, "step": 18114 }, { "epoch": 5.5380617548150415, "grad_norm": 0.3516983389854431, "learning_rate": 2.5645195533098382e-05, "loss": 0.0659, "step": 18115 }, { "epoch": 5.538367471721186, "grad_norm": 0.3845501244068146, "learning_rate": 2.564477092267844e-05, "loss": 0.0962, "step": 18116 }, { "epoch": 5.538673188627331, "grad_norm": 0.8781955242156982, "learning_rate": 2.5644346312258503e-05, "loss": 0.0762, "step": 18117 }, { "epoch": 5.538978905533476, "grad_norm": 0.3925141990184784, "learning_rate": 2.5643921701838562e-05, "loss": 0.0876, "step": 18118 }, { "epoch": 5.539284622439621, "grad_norm": 0.8153861165046692, "learning_rate": 2.5643497091418624e-05, "loss": 0.1066, "step": 18119 }, { "epoch": 5.539590339345766, "grad_norm": 0.8738040328025818, "learning_rate": 2.5643072480998683e-05, "loss": 0.1075, "step": 18120 }, { "epoch": 5.53989605625191, "grad_norm": 0.6454999446868896, "learning_rate": 2.5642647870578745e-05, "loss": 0.1591, "step": 18121 }, { "epoch": 5.5402017731580555, "grad_norm": 1.7337932586669922, "learning_rate": 2.5642223260158803e-05, "loss": 0.1591, "step": 18122 }, { "epoch": 5.540507490064201, "grad_norm": 1.2042385339736938, "learning_rate": 2.5641798649738866e-05, "loss": 0.1411, "step": 18123 }, { "epoch": 5.540813206970346, "grad_norm": 1.15369713306427, "learning_rate": 2.5641374039318924e-05, "loss": 0.1444, "step": 18124 }, { "epoch": 5.541118923876491, "grad_norm": 2.5777924060821533, "learning_rate": 2.5640949428898983e-05, "loss": 0.153, "step": 18125 }, { "epoch": 5.541424640782635, "grad_norm": 0.7406261563301086, "learning_rate": 2.5640524818479045e-05, "loss": 0.1802, "step": 18126 }, { "epoch": 5.54173035768878, "grad_norm": 0.9185925126075745, "learning_rate": 2.5640100208059104e-05, "loss": 0.1865, "step": 18127 }, { "epoch": 5.542036074594925, "grad_norm": 1.5099009275436401, "learning_rate": 2.5639675597639166e-05, "loss": 0.1821, "step": 18128 }, { "epoch": 5.5423417915010695, "grad_norm": 1.722994089126587, "learning_rate": 2.5639250987219224e-05, "loss": 0.1972, "step": 18129 }, { "epoch": 5.542647508407215, "grad_norm": 0.7694106698036194, "learning_rate": 2.5638826376799287e-05, "loss": 0.2084, "step": 18130 }, { "epoch": 5.54295322531336, "grad_norm": 0.43223753571510315, "learning_rate": 2.5638401766379345e-05, "loss": 0.1382, "step": 18131 }, { "epoch": 5.543258942219505, "grad_norm": 0.24549680948257446, "learning_rate": 2.5637977155959407e-05, "loss": 0.0939, "step": 18132 }, { "epoch": 5.54356465912565, "grad_norm": 0.39897915720939636, "learning_rate": 2.5637552545539466e-05, "loss": 0.0766, "step": 18133 }, { "epoch": 5.543870376031794, "grad_norm": 0.2753201723098755, "learning_rate": 2.5637127935119528e-05, "loss": 0.045, "step": 18134 }, { "epoch": 5.544176092937939, "grad_norm": 0.2942052185535431, "learning_rate": 2.563670332469959e-05, "loss": 0.0634, "step": 18135 }, { "epoch": 5.5444818098440845, "grad_norm": 0.1728944033384323, "learning_rate": 2.5636278714279652e-05, "loss": 0.058, "step": 18136 }, { "epoch": 5.54478752675023, "grad_norm": 0.24700243771076202, "learning_rate": 2.563585410385971e-05, "loss": 0.0686, "step": 18137 }, { "epoch": 5.545093243656375, "grad_norm": 0.2830154299736023, "learning_rate": 2.563542949343977e-05, "loss": 0.0828, "step": 18138 }, { "epoch": 5.545398960562519, "grad_norm": 0.2615518271923065, "learning_rate": 2.5635004883019832e-05, "loss": 0.0582, "step": 18139 }, { "epoch": 5.545704677468664, "grad_norm": 0.2825852036476135, "learning_rate": 2.563458027259989e-05, "loss": 0.0701, "step": 18140 }, { "epoch": 5.546010394374809, "grad_norm": 0.406435489654541, "learning_rate": 2.5634155662179953e-05, "loss": 0.0825, "step": 18141 }, { "epoch": 5.546316111280953, "grad_norm": 0.3649517595767975, "learning_rate": 2.563373105176001e-05, "loss": 0.0737, "step": 18142 }, { "epoch": 5.5466218281870985, "grad_norm": 0.3940063714981079, "learning_rate": 2.5633306441340073e-05, "loss": 0.1203, "step": 18143 }, { "epoch": 5.546927545093244, "grad_norm": 0.3809981346130371, "learning_rate": 2.5632881830920132e-05, "loss": 0.0851, "step": 18144 }, { "epoch": 5.547233261999389, "grad_norm": 0.47912654280662537, "learning_rate": 2.5632457220500194e-05, "loss": 0.1341, "step": 18145 }, { "epoch": 5.547538978905534, "grad_norm": 0.6931911706924438, "learning_rate": 2.5632032610080253e-05, "loss": 0.1418, "step": 18146 }, { "epoch": 5.547844695811678, "grad_norm": 0.5588856935501099, "learning_rate": 2.5631607999660315e-05, "loss": 0.1784, "step": 18147 }, { "epoch": 5.548150412717823, "grad_norm": 1.7161896228790283, "learning_rate": 2.5631183389240374e-05, "loss": 0.163, "step": 18148 }, { "epoch": 5.548456129623968, "grad_norm": 1.1216284036636353, "learning_rate": 2.5630758778820436e-05, "loss": 0.1778, "step": 18149 }, { "epoch": 5.548761846530113, "grad_norm": 1.3785226345062256, "learning_rate": 2.5630334168400494e-05, "loss": 0.1598, "step": 18150 }, { "epoch": 5.5490675634362585, "grad_norm": 0.674984335899353, "learning_rate": 2.5629909557980553e-05, "loss": 0.193, "step": 18151 }, { "epoch": 5.549373280342403, "grad_norm": 0.8464537262916565, "learning_rate": 2.5629484947560615e-05, "loss": 0.1876, "step": 18152 }, { "epoch": 5.549678997248548, "grad_norm": 0.7279518246650696, "learning_rate": 2.5629060337140674e-05, "loss": 0.203, "step": 18153 }, { "epoch": 5.549984714154693, "grad_norm": 1.1861811876296997, "learning_rate": 2.5628635726720736e-05, "loss": 0.2007, "step": 18154 }, { "epoch": 5.550290431060837, "grad_norm": 0.9227790832519531, "learning_rate": 2.5628211116300795e-05, "loss": 0.279, "step": 18155 }, { "epoch": 5.550596147966982, "grad_norm": 0.3477182984352112, "learning_rate": 2.5627786505880857e-05, "loss": 0.1334, "step": 18156 }, { "epoch": 5.550901864873127, "grad_norm": 0.37942013144493103, "learning_rate": 2.5627361895460916e-05, "loss": 0.0721, "step": 18157 }, { "epoch": 5.5512075817792725, "grad_norm": 0.4355303943157196, "learning_rate": 2.5626937285040978e-05, "loss": 0.0651, "step": 18158 }, { "epoch": 5.551513298685418, "grad_norm": 0.22595307230949402, "learning_rate": 2.5626512674621036e-05, "loss": 0.0409, "step": 18159 }, { "epoch": 5.551819015591562, "grad_norm": 0.19690658152103424, "learning_rate": 2.56260880642011e-05, "loss": 0.0739, "step": 18160 }, { "epoch": 5.552124732497707, "grad_norm": 0.6557488441467285, "learning_rate": 2.5625663453781157e-05, "loss": 0.0552, "step": 18161 }, { "epoch": 5.552430449403852, "grad_norm": 0.4158211052417755, "learning_rate": 2.562523884336122e-05, "loss": 0.051, "step": 18162 }, { "epoch": 5.552736166309997, "grad_norm": 0.1860416978597641, "learning_rate": 2.5624814232941278e-05, "loss": 0.0688, "step": 18163 }, { "epoch": 5.553041883216142, "grad_norm": 0.5398805141448975, "learning_rate": 2.5624389622521337e-05, "loss": 0.1066, "step": 18164 }, { "epoch": 5.553347600122287, "grad_norm": 0.4735147953033447, "learning_rate": 2.56239650121014e-05, "loss": 0.0655, "step": 18165 }, { "epoch": 5.553653317028432, "grad_norm": 0.4435317814350128, "learning_rate": 2.5623540401681457e-05, "loss": 0.1246, "step": 18166 }, { "epoch": 5.553959033934577, "grad_norm": 1.1839826107025146, "learning_rate": 2.562311579126152e-05, "loss": 0.0704, "step": 18167 }, { "epoch": 5.554264750840721, "grad_norm": 0.5247227549552917, "learning_rate": 2.5622691180841578e-05, "loss": 0.1093, "step": 18168 }, { "epoch": 5.554570467746866, "grad_norm": 0.39008384943008423, "learning_rate": 2.562226657042164e-05, "loss": 0.0744, "step": 18169 }, { "epoch": 5.554876184653011, "grad_norm": 0.5741655230522156, "learning_rate": 2.56218419600017e-05, "loss": 0.1407, "step": 18170 }, { "epoch": 5.555181901559156, "grad_norm": 1.3828297853469849, "learning_rate": 2.562141734958176e-05, "loss": 0.1236, "step": 18171 }, { "epoch": 5.5554876184653015, "grad_norm": 0.7518048882484436, "learning_rate": 2.562099273916182e-05, "loss": 0.153, "step": 18172 }, { "epoch": 5.555793335371446, "grad_norm": 1.3250856399536133, "learning_rate": 2.5620568128741882e-05, "loss": 0.1593, "step": 18173 }, { "epoch": 5.556099052277591, "grad_norm": 3.0665807723999023, "learning_rate": 2.562014351832194e-05, "loss": 0.1602, "step": 18174 }, { "epoch": 5.556404769183736, "grad_norm": 1.013611912727356, "learning_rate": 2.5619718907902003e-05, "loss": 0.131, "step": 18175 }, { "epoch": 5.556710486089881, "grad_norm": 0.7416409850120544, "learning_rate": 2.561929429748206e-05, "loss": 0.1879, "step": 18176 }, { "epoch": 5.557016202996026, "grad_norm": 1.2573846578598022, "learning_rate": 2.561886968706212e-05, "loss": 0.1539, "step": 18177 }, { "epoch": 5.55732191990217, "grad_norm": 4.374913692474365, "learning_rate": 2.5618445076642182e-05, "loss": 0.176, "step": 18178 }, { "epoch": 5.5576276368083155, "grad_norm": 1.3068124055862427, "learning_rate": 2.561802046622224e-05, "loss": 0.2087, "step": 18179 }, { "epoch": 5.557933353714461, "grad_norm": 1.4363025426864624, "learning_rate": 2.5617595855802303e-05, "loss": 0.2584, "step": 18180 }, { "epoch": 5.558239070620605, "grad_norm": 0.6346430778503418, "learning_rate": 2.561717124538236e-05, "loss": 0.117, "step": 18181 }, { "epoch": 5.55854478752675, "grad_norm": 0.7972927093505859, "learning_rate": 2.5616746634962424e-05, "loss": 0.0777, "step": 18182 }, { "epoch": 5.558850504432895, "grad_norm": 0.4103338420391083, "learning_rate": 2.5616322024542482e-05, "loss": 0.0679, "step": 18183 }, { "epoch": 5.55915622133904, "grad_norm": 0.2286425530910492, "learning_rate": 2.5615897414122544e-05, "loss": 0.0601, "step": 18184 }, { "epoch": 5.559461938245185, "grad_norm": 0.18531754612922668, "learning_rate": 2.5615472803702603e-05, "loss": 0.056, "step": 18185 }, { "epoch": 5.5597676551513295, "grad_norm": 0.5745859146118164, "learning_rate": 2.5615048193282665e-05, "loss": 0.0689, "step": 18186 }, { "epoch": 5.560073372057475, "grad_norm": 0.3150327205657959, "learning_rate": 2.5614623582862724e-05, "loss": 0.0752, "step": 18187 }, { "epoch": 5.56037908896362, "grad_norm": 0.41964492201805115, "learning_rate": 2.5614198972442783e-05, "loss": 0.0691, "step": 18188 }, { "epoch": 5.560684805869765, "grad_norm": 0.4148467183113098, "learning_rate": 2.5613774362022845e-05, "loss": 0.0624, "step": 18189 }, { "epoch": 5.56099052277591, "grad_norm": 0.2686937749385834, "learning_rate": 2.5613349751602903e-05, "loss": 0.0712, "step": 18190 }, { "epoch": 5.561296239682054, "grad_norm": 0.2350790649652481, "learning_rate": 2.5612925141182966e-05, "loss": 0.0802, "step": 18191 }, { "epoch": 5.561601956588199, "grad_norm": 0.8980087637901306, "learning_rate": 2.5612500530763024e-05, "loss": 0.0735, "step": 18192 }, { "epoch": 5.5619076734943445, "grad_norm": 0.7887543439865112, "learning_rate": 2.5612075920343086e-05, "loss": 0.0819, "step": 18193 }, { "epoch": 5.562213390400489, "grad_norm": 0.3130825459957123, "learning_rate": 2.5611651309923145e-05, "loss": 0.104, "step": 18194 }, { "epoch": 5.562519107306634, "grad_norm": 0.9381800889968872, "learning_rate": 2.5611226699503207e-05, "loss": 0.1428, "step": 18195 }, { "epoch": 5.562824824212779, "grad_norm": 0.3894016444683075, "learning_rate": 2.5610802089083266e-05, "loss": 0.1276, "step": 18196 }, { "epoch": 5.563130541118924, "grad_norm": 2.601618766784668, "learning_rate": 2.5610377478663328e-05, "loss": 0.145, "step": 18197 }, { "epoch": 5.563436258025069, "grad_norm": 0.5394982695579529, "learning_rate": 2.5609952868243387e-05, "loss": 0.1611, "step": 18198 }, { "epoch": 5.563741974931213, "grad_norm": 1.7035691738128662, "learning_rate": 2.560952825782345e-05, "loss": 0.1557, "step": 18199 }, { "epoch": 5.5640476918373585, "grad_norm": 0.8955191373825073, "learning_rate": 2.5609103647403507e-05, "loss": 0.1687, "step": 18200 }, { "epoch": 5.564353408743504, "grad_norm": 1.7379570007324219, "learning_rate": 2.5608679036983566e-05, "loss": 0.1445, "step": 18201 }, { "epoch": 5.564659125649649, "grad_norm": 0.6403531432151794, "learning_rate": 2.5608254426563628e-05, "loss": 0.1853, "step": 18202 }, { "epoch": 5.564964842555794, "grad_norm": 1.2355467081069946, "learning_rate": 2.5607829816143687e-05, "loss": 0.1599, "step": 18203 }, { "epoch": 5.565270559461938, "grad_norm": 1.2689508199691772, "learning_rate": 2.560740520572375e-05, "loss": 0.1917, "step": 18204 }, { "epoch": 5.565576276368083, "grad_norm": 1.5791460275650024, "learning_rate": 2.5606980595303808e-05, "loss": 0.2209, "step": 18205 }, { "epoch": 5.565881993274228, "grad_norm": 0.3659871816635132, "learning_rate": 2.560655598488387e-05, "loss": 0.136, "step": 18206 }, { "epoch": 5.5661877101803725, "grad_norm": 0.36038991808891296, "learning_rate": 2.560613137446393e-05, "loss": 0.072, "step": 18207 }, { "epoch": 5.566493427086518, "grad_norm": 0.17764714360237122, "learning_rate": 2.560570676404399e-05, "loss": 0.0637, "step": 18208 }, { "epoch": 5.566799143992663, "grad_norm": 0.3920097351074219, "learning_rate": 2.560528215362405e-05, "loss": 0.066, "step": 18209 }, { "epoch": 5.567104860898808, "grad_norm": 0.18096420168876648, "learning_rate": 2.560485754320411e-05, "loss": 0.0389, "step": 18210 }, { "epoch": 5.567410577804953, "grad_norm": 0.33687111735343933, "learning_rate": 2.560443293278417e-05, "loss": 0.0408, "step": 18211 }, { "epoch": 5.567716294711097, "grad_norm": 0.6991496682167053, "learning_rate": 2.5604008322364232e-05, "loss": 0.0611, "step": 18212 }, { "epoch": 5.568022011617242, "grad_norm": 0.34299975633621216, "learning_rate": 2.560358371194429e-05, "loss": 0.0695, "step": 18213 }, { "epoch": 5.568327728523387, "grad_norm": 0.3086771070957184, "learning_rate": 2.560315910152435e-05, "loss": 0.0552, "step": 18214 }, { "epoch": 5.5686334454295325, "grad_norm": 0.31989291310310364, "learning_rate": 2.560273449110441e-05, "loss": 0.0511, "step": 18215 }, { "epoch": 5.568939162335678, "grad_norm": 0.3949809968471527, "learning_rate": 2.560230988068447e-05, "loss": 0.0949, "step": 18216 }, { "epoch": 5.569244879241822, "grad_norm": 2.0121452808380127, "learning_rate": 2.5601885270264532e-05, "loss": 0.0774, "step": 18217 }, { "epoch": 5.569550596147967, "grad_norm": 0.3369629979133606, "learning_rate": 2.560146065984459e-05, "loss": 0.0813, "step": 18218 }, { "epoch": 5.569856313054112, "grad_norm": 0.7477700710296631, "learning_rate": 2.5601036049424653e-05, "loss": 0.131, "step": 18219 }, { "epoch": 5.570162029960256, "grad_norm": 0.38101184368133545, "learning_rate": 2.5600611439004712e-05, "loss": 0.0824, "step": 18220 }, { "epoch": 5.5704677468664014, "grad_norm": 0.9335916042327881, "learning_rate": 2.5600186828584774e-05, "loss": 0.1357, "step": 18221 }, { "epoch": 5.570773463772547, "grad_norm": 2.0911591053009033, "learning_rate": 2.5599762218164833e-05, "loss": 0.1458, "step": 18222 }, { "epoch": 5.571079180678692, "grad_norm": 0.4694939851760864, "learning_rate": 2.5599337607744895e-05, "loss": 0.1669, "step": 18223 }, { "epoch": 5.571384897584837, "grad_norm": 0.7713999152183533, "learning_rate": 2.5598912997324953e-05, "loss": 0.1523, "step": 18224 }, { "epoch": 5.571690614490981, "grad_norm": 0.9502610564231873, "learning_rate": 2.5598488386905016e-05, "loss": 0.1562, "step": 18225 }, { "epoch": 5.571996331397126, "grad_norm": 0.6290639638900757, "learning_rate": 2.5598063776485074e-05, "loss": 0.1824, "step": 18226 }, { "epoch": 5.572302048303271, "grad_norm": 1.2107418775558472, "learning_rate": 2.5597639166065133e-05, "loss": 0.1603, "step": 18227 }, { "epoch": 5.572607765209416, "grad_norm": 1.7002454996109009, "learning_rate": 2.5597214555645195e-05, "loss": 0.1561, "step": 18228 }, { "epoch": 5.5729134821155615, "grad_norm": 1.1432609558105469, "learning_rate": 2.5596789945225254e-05, "loss": 0.2145, "step": 18229 }, { "epoch": 5.573219199021706, "grad_norm": 1.21875, "learning_rate": 2.5596365334805316e-05, "loss": 0.233, "step": 18230 }, { "epoch": 5.573524915927851, "grad_norm": 0.36984407901763916, "learning_rate": 2.5595940724385375e-05, "loss": 0.1348, "step": 18231 }, { "epoch": 5.573830632833996, "grad_norm": 0.3420976400375366, "learning_rate": 2.5595516113965437e-05, "loss": 0.0971, "step": 18232 }, { "epoch": 5.57413634974014, "grad_norm": 0.4986873269081116, "learning_rate": 2.5595091503545495e-05, "loss": 0.076, "step": 18233 }, { "epoch": 5.574442066646285, "grad_norm": 0.3662385940551758, "learning_rate": 2.5594666893125557e-05, "loss": 0.067, "step": 18234 }, { "epoch": 5.57474778355243, "grad_norm": 0.359659343957901, "learning_rate": 2.5594242282705616e-05, "loss": 0.0563, "step": 18235 }, { "epoch": 5.5750535004585755, "grad_norm": 0.23621471226215363, "learning_rate": 2.5593817672285678e-05, "loss": 0.0425, "step": 18236 }, { "epoch": 5.575359217364721, "grad_norm": 0.27636727690696716, "learning_rate": 2.559339306186574e-05, "loss": 0.0484, "step": 18237 }, { "epoch": 5.575664934270865, "grad_norm": 0.26008129119873047, "learning_rate": 2.5592968451445802e-05, "loss": 0.0917, "step": 18238 }, { "epoch": 5.57597065117701, "grad_norm": 0.24376484751701355, "learning_rate": 2.559254384102586e-05, "loss": 0.0693, "step": 18239 }, { "epoch": 5.576276368083155, "grad_norm": 0.2924080193042755, "learning_rate": 2.559211923060592e-05, "loss": 0.0846, "step": 18240 }, { "epoch": 5.5765820849893, "grad_norm": 0.872961163520813, "learning_rate": 2.5591694620185982e-05, "loss": 0.0862, "step": 18241 }, { "epoch": 5.576887801895445, "grad_norm": 0.6587720513343811, "learning_rate": 2.559127000976604e-05, "loss": 0.1012, "step": 18242 }, { "epoch": 5.5771935188015895, "grad_norm": 0.32475802302360535, "learning_rate": 2.5590845399346103e-05, "loss": 0.0847, "step": 18243 }, { "epoch": 5.577499235707735, "grad_norm": 0.5134047269821167, "learning_rate": 2.559042078892616e-05, "loss": 0.1006, "step": 18244 }, { "epoch": 5.57780495261388, "grad_norm": 0.5361547470092773, "learning_rate": 2.5589996178506223e-05, "loss": 0.135, "step": 18245 }, { "epoch": 5.578110669520024, "grad_norm": 0.663695216178894, "learning_rate": 2.5589571568086282e-05, "loss": 0.1536, "step": 18246 }, { "epoch": 5.578416386426169, "grad_norm": 0.5147190093994141, "learning_rate": 2.5589146957666344e-05, "loss": 0.1477, "step": 18247 }, { "epoch": 5.578722103332314, "grad_norm": 1.0885143280029297, "learning_rate": 2.5588722347246403e-05, "loss": 0.1332, "step": 18248 }, { "epoch": 5.579027820238459, "grad_norm": 1.2082918882369995, "learning_rate": 2.5588297736826465e-05, "loss": 0.1898, "step": 18249 }, { "epoch": 5.5793335371446044, "grad_norm": 0.6588470935821533, "learning_rate": 2.5587873126406524e-05, "loss": 0.1885, "step": 18250 }, { "epoch": 5.579639254050749, "grad_norm": 1.3274037837982178, "learning_rate": 2.5587448515986586e-05, "loss": 0.1531, "step": 18251 }, { "epoch": 5.579944970956894, "grad_norm": 0.7022039890289307, "learning_rate": 2.5587023905566644e-05, "loss": 0.179, "step": 18252 }, { "epoch": 5.580250687863039, "grad_norm": 1.2852256298065186, "learning_rate": 2.5586599295146703e-05, "loss": 0.1493, "step": 18253 }, { "epoch": 5.580556404769184, "grad_norm": 2.819136381149292, "learning_rate": 2.5586174684726765e-05, "loss": 0.1954, "step": 18254 }, { "epoch": 5.580862121675329, "grad_norm": 2.9819889068603516, "learning_rate": 2.5585750074306824e-05, "loss": 0.2869, "step": 18255 }, { "epoch": 5.581167838581473, "grad_norm": 0.5199511647224426, "learning_rate": 2.5585325463886886e-05, "loss": 0.1285, "step": 18256 }, { "epoch": 5.5814735554876185, "grad_norm": 0.48873084783554077, "learning_rate": 2.5584900853466945e-05, "loss": 0.1076, "step": 18257 }, { "epoch": 5.581779272393764, "grad_norm": 0.3225087523460388, "learning_rate": 2.5584476243047007e-05, "loss": 0.0657, "step": 18258 }, { "epoch": 5.582084989299908, "grad_norm": 0.2614062428474426, "learning_rate": 2.5584051632627066e-05, "loss": 0.0955, "step": 18259 }, { "epoch": 5.582390706206053, "grad_norm": 0.18758346140384674, "learning_rate": 2.5583627022207128e-05, "loss": 0.0497, "step": 18260 }, { "epoch": 5.582696423112198, "grad_norm": 0.4030674695968628, "learning_rate": 2.5583202411787186e-05, "loss": 0.0576, "step": 18261 }, { "epoch": 5.583002140018343, "grad_norm": 0.445995956659317, "learning_rate": 2.558277780136725e-05, "loss": 0.0542, "step": 18262 }, { "epoch": 5.583307856924488, "grad_norm": 0.2416725754737854, "learning_rate": 2.5582353190947307e-05, "loss": 0.0488, "step": 18263 }, { "epoch": 5.5836135738306325, "grad_norm": 0.3997728228569031, "learning_rate": 2.558192858052737e-05, "loss": 0.0793, "step": 18264 }, { "epoch": 5.583919290736778, "grad_norm": 0.48270997405052185, "learning_rate": 2.5581503970107428e-05, "loss": 0.0449, "step": 18265 }, { "epoch": 5.584225007642923, "grad_norm": 0.29373639822006226, "learning_rate": 2.5581079359687487e-05, "loss": 0.0957, "step": 18266 }, { "epoch": 5.584530724549068, "grad_norm": 0.7496426701545715, "learning_rate": 2.558065474926755e-05, "loss": 0.0734, "step": 18267 }, { "epoch": 5.584836441455213, "grad_norm": 0.7271338701248169, "learning_rate": 2.5580230138847607e-05, "loss": 0.0836, "step": 18268 }, { "epoch": 5.585142158361357, "grad_norm": 0.35986825823783875, "learning_rate": 2.557980552842767e-05, "loss": 0.1048, "step": 18269 }, { "epoch": 5.585447875267502, "grad_norm": 0.759092390537262, "learning_rate": 2.5579380918007728e-05, "loss": 0.1097, "step": 18270 }, { "epoch": 5.585753592173647, "grad_norm": 1.737236738204956, "learning_rate": 2.557895630758779e-05, "loss": 0.1599, "step": 18271 }, { "epoch": 5.586059309079792, "grad_norm": 0.5426335334777832, "learning_rate": 2.557853169716785e-05, "loss": 0.1099, "step": 18272 }, { "epoch": 5.586365025985937, "grad_norm": NaN, "learning_rate": 2.557853169716785e-05, "loss": 0.1856, "step": 18273 }, { "epoch": 5.586670742892082, "grad_norm": 0.49117082357406616, "learning_rate": 2.557810708674791e-05, "loss": 0.1707, "step": 18274 }, { "epoch": 5.586976459798227, "grad_norm": 0.7507224678993225, "learning_rate": 2.557768247632797e-05, "loss": 0.1684, "step": 18275 }, { "epoch": 5.587282176704372, "grad_norm": 0.6961421370506287, "learning_rate": 2.5577257865908032e-05, "loss": 0.1353, "step": 18276 }, { "epoch": 5.587587893610516, "grad_norm": 0.6474846005439758, "learning_rate": 2.557683325548809e-05, "loss": 0.1456, "step": 18277 }, { "epoch": 5.5878936105166614, "grad_norm": 1.1997066736221313, "learning_rate": 2.5576408645068153e-05, "loss": 0.2155, "step": 18278 }, { "epoch": 5.588199327422807, "grad_norm": 0.915921688079834, "learning_rate": 2.557598403464821e-05, "loss": 0.1997, "step": 18279 }, { "epoch": 5.588505044328952, "grad_norm": 1.0445983409881592, "learning_rate": 2.557555942422827e-05, "loss": 0.2111, "step": 18280 }, { "epoch": 5.588810761235097, "grad_norm": 0.5158277750015259, "learning_rate": 2.5575134813808332e-05, "loss": 0.1476, "step": 18281 }, { "epoch": 5.589116478141241, "grad_norm": 0.2924502491950989, "learning_rate": 2.557471020338839e-05, "loss": 0.0751, "step": 18282 }, { "epoch": 5.589422195047386, "grad_norm": 0.30518248677253723, "learning_rate": 2.5574285592968453e-05, "loss": 0.0539, "step": 18283 }, { "epoch": 5.589727911953531, "grad_norm": 0.28949207067489624, "learning_rate": 2.557386098254851e-05, "loss": 0.0664, "step": 18284 }, { "epoch": 5.5900336288596755, "grad_norm": 0.30180981755256653, "learning_rate": 2.5573436372128574e-05, "loss": 0.068, "step": 18285 }, { "epoch": 5.590339345765821, "grad_norm": 0.13290192186832428, "learning_rate": 2.5573011761708632e-05, "loss": 0.0367, "step": 18286 }, { "epoch": 5.590645062671966, "grad_norm": 0.2858189642429352, "learning_rate": 2.5572587151288694e-05, "loss": 0.0618, "step": 18287 }, { "epoch": 5.590950779578111, "grad_norm": 0.6638586521148682, "learning_rate": 2.5572162540868753e-05, "loss": 0.0541, "step": 18288 }, { "epoch": 5.591256496484256, "grad_norm": 0.5358924269676208, "learning_rate": 2.5571737930448815e-05, "loss": 0.0708, "step": 18289 }, { "epoch": 5.5915622133904, "grad_norm": 0.39520156383514404, "learning_rate": 2.5571313320028874e-05, "loss": 0.0663, "step": 18290 }, { "epoch": 5.591867930296545, "grad_norm": 0.3666377663612366, "learning_rate": 2.5570888709608936e-05, "loss": 0.0947, "step": 18291 }, { "epoch": 5.59217364720269, "grad_norm": 0.2805826961994171, "learning_rate": 2.5570464099188995e-05, "loss": 0.0743, "step": 18292 }, { "epoch": 5.5924793641088355, "grad_norm": 0.2662874460220337, "learning_rate": 2.5570039488769053e-05, "loss": 0.0899, "step": 18293 }, { "epoch": 5.592785081014981, "grad_norm": 0.43489620089530945, "learning_rate": 2.5569614878349116e-05, "loss": 0.1027, "step": 18294 }, { "epoch": 5.593090797921125, "grad_norm": 0.45221394300460815, "learning_rate": 2.5569190267929174e-05, "loss": 0.1545, "step": 18295 }, { "epoch": 5.59339651482727, "grad_norm": 0.4465176463127136, "learning_rate": 2.5568765657509236e-05, "loss": 0.1243, "step": 18296 }, { "epoch": 5.593702231733415, "grad_norm": 2.5913608074188232, "learning_rate": 2.5568341047089295e-05, "loss": 0.189, "step": 18297 }, { "epoch": 5.594007948639559, "grad_norm": 0.9094834923744202, "learning_rate": 2.5567916436669357e-05, "loss": 0.1542, "step": 18298 }, { "epoch": 5.594313665545704, "grad_norm": 0.4633389413356781, "learning_rate": 2.5567491826249416e-05, "loss": 0.1738, "step": 18299 }, { "epoch": 5.5946193824518495, "grad_norm": 0.5642329454421997, "learning_rate": 2.5567067215829478e-05, "loss": 0.1534, "step": 18300 }, { "epoch": 5.594925099357995, "grad_norm": 0.5438848733901978, "learning_rate": 2.5566642605409537e-05, "loss": 0.1533, "step": 18301 }, { "epoch": 5.59523081626414, "grad_norm": 1.0531153678894043, "learning_rate": 2.55662179949896e-05, "loss": 0.169, "step": 18302 }, { "epoch": 5.595536533170284, "grad_norm": 1.092287302017212, "learning_rate": 2.5565793384569657e-05, "loss": 0.1932, "step": 18303 }, { "epoch": 5.595842250076429, "grad_norm": 0.9711857438087463, "learning_rate": 2.5565368774149716e-05, "loss": 0.1669, "step": 18304 }, { "epoch": 5.596147966982574, "grad_norm": 2.179421901702881, "learning_rate": 2.5564944163729778e-05, "loss": 0.2212, "step": 18305 }, { "epoch": 5.596453683888719, "grad_norm": 0.3701043725013733, "learning_rate": 2.5564519553309837e-05, "loss": 0.1173, "step": 18306 }, { "epoch": 5.5967594007948644, "grad_norm": 0.5180609226226807, "learning_rate": 2.55640949428899e-05, "loss": 0.0649, "step": 18307 }, { "epoch": 5.597065117701009, "grad_norm": 0.5908294916152954, "learning_rate": 2.5563670332469958e-05, "loss": 0.0932, "step": 18308 }, { "epoch": 5.597370834607154, "grad_norm": 0.2464517503976822, "learning_rate": 2.556324572205002e-05, "loss": 0.0414, "step": 18309 }, { "epoch": 5.597676551513299, "grad_norm": 0.3050457835197449, "learning_rate": 2.556282111163008e-05, "loss": 0.0555, "step": 18310 }, { "epoch": 5.597982268419443, "grad_norm": 3.5179948806762695, "learning_rate": 2.556239650121014e-05, "loss": 0.0938, "step": 18311 }, { "epoch": 5.598287985325588, "grad_norm": 0.45646238327026367, "learning_rate": 2.55619718907902e-05, "loss": 0.0548, "step": 18312 }, { "epoch": 5.598593702231733, "grad_norm": 0.46691322326660156, "learning_rate": 2.556154728037026e-05, "loss": 0.0657, "step": 18313 }, { "epoch": 5.5988994191378785, "grad_norm": 0.36241424083709717, "learning_rate": 2.556112266995032e-05, "loss": 0.0566, "step": 18314 }, { "epoch": 5.599205136044024, "grad_norm": 0.4105997085571289, "learning_rate": 2.5560698059530382e-05, "loss": 0.074, "step": 18315 }, { "epoch": 5.599510852950168, "grad_norm": 0.8527687191963196, "learning_rate": 2.556027344911044e-05, "loss": 0.1143, "step": 18316 }, { "epoch": 5.599816569856313, "grad_norm": 0.37523311376571655, "learning_rate": 2.55598488386905e-05, "loss": 0.0777, "step": 18317 }, { "epoch": 5.600122286762458, "grad_norm": 0.7474581003189087, "learning_rate": 2.555942422827056e-05, "loss": 0.1002, "step": 18318 }, { "epoch": 5.600428003668603, "grad_norm": 0.6421859264373779, "learning_rate": 2.555899961785062e-05, "loss": 0.123, "step": 18319 }, { "epoch": 5.600733720574748, "grad_norm": 0.4499695301055908, "learning_rate": 2.5558575007430682e-05, "loss": 0.1767, "step": 18320 }, { "epoch": 5.6010394374808925, "grad_norm": 1.2326194047927856, "learning_rate": 2.555815039701074e-05, "loss": 0.1281, "step": 18321 }, { "epoch": 5.601345154387038, "grad_norm": 0.5783007740974426, "learning_rate": 2.5557725786590803e-05, "loss": 0.1583, "step": 18322 }, { "epoch": 5.601650871293183, "grad_norm": 0.6018354892730713, "learning_rate": 2.5557301176170862e-05, "loss": 0.1774, "step": 18323 }, { "epoch": 5.601956588199327, "grad_norm": 0.6259329915046692, "learning_rate": 2.5556876565750924e-05, "loss": 0.1687, "step": 18324 }, { "epoch": 5.602262305105472, "grad_norm": 1.1085996627807617, "learning_rate": 2.5556451955330983e-05, "loss": 0.1305, "step": 18325 }, { "epoch": 5.602568022011617, "grad_norm": 1.0240309238433838, "learning_rate": 2.5556027344911045e-05, "loss": 0.1693, "step": 18326 }, { "epoch": 5.602873738917762, "grad_norm": 1.677309274673462, "learning_rate": 2.5555602734491103e-05, "loss": 0.1946, "step": 18327 }, { "epoch": 5.603179455823907, "grad_norm": 1.1795698404312134, "learning_rate": 2.5555178124071166e-05, "loss": 0.1978, "step": 18328 }, { "epoch": 5.603485172730052, "grad_norm": 0.7014297842979431, "learning_rate": 2.5554753513651224e-05, "loss": 0.164, "step": 18329 }, { "epoch": 5.603790889636197, "grad_norm": 1.456209421157837, "learning_rate": 2.5554328903231283e-05, "loss": 0.221, "step": 18330 }, { "epoch": 5.604096606542342, "grad_norm": 0.6593865156173706, "learning_rate": 2.5553904292811345e-05, "loss": 0.134, "step": 18331 }, { "epoch": 5.604402323448487, "grad_norm": 0.556879460811615, "learning_rate": 2.5553479682391404e-05, "loss": 0.0909, "step": 18332 }, { "epoch": 5.604708040354632, "grad_norm": 0.2810749113559723, "learning_rate": 2.5553055071971466e-05, "loss": 0.0636, "step": 18333 }, { "epoch": 5.605013757260776, "grad_norm": 0.19630540907382965, "learning_rate": 2.5552630461551525e-05, "loss": 0.044, "step": 18334 }, { "epoch": 5.6053194741669214, "grad_norm": 0.21681417524814606, "learning_rate": 2.5552205851131587e-05, "loss": 0.072, "step": 18335 }, { "epoch": 5.605625191073067, "grad_norm": 0.25979942083358765, "learning_rate": 2.5551781240711645e-05, "loss": 0.0381, "step": 18336 }, { "epoch": 5.605930907979211, "grad_norm": 0.2653813064098358, "learning_rate": 2.5551356630291707e-05, "loss": 0.0555, "step": 18337 }, { "epoch": 5.606236624885356, "grad_norm": 0.25605547428131104, "learning_rate": 2.5550932019871766e-05, "loss": 0.0535, "step": 18338 }, { "epoch": 5.606542341791501, "grad_norm": 0.2844046652317047, "learning_rate": 2.5550507409451828e-05, "loss": 0.0731, "step": 18339 }, { "epoch": 5.606848058697646, "grad_norm": 0.2867510914802551, "learning_rate": 2.5550082799031887e-05, "loss": 0.0396, "step": 18340 }, { "epoch": 5.607153775603791, "grad_norm": 0.2644846737384796, "learning_rate": 2.554965818861195e-05, "loss": 0.0684, "step": 18341 }, { "epoch": 5.6074594925099355, "grad_norm": 0.3620111644268036, "learning_rate": 2.554923357819201e-05, "loss": 0.0711, "step": 18342 }, { "epoch": 5.607765209416081, "grad_norm": 0.7335001230239868, "learning_rate": 2.554880896777207e-05, "loss": 0.1287, "step": 18343 }, { "epoch": 5.608070926322226, "grad_norm": 0.6635366082191467, "learning_rate": 2.5548384357352132e-05, "loss": 0.1478, "step": 18344 }, { "epoch": 5.608376643228371, "grad_norm": 0.7507050037384033, "learning_rate": 2.554795974693219e-05, "loss": 0.1211, "step": 18345 }, { "epoch": 5.608682360134516, "grad_norm": 0.4637993574142456, "learning_rate": 2.5547535136512253e-05, "loss": 0.126, "step": 18346 }, { "epoch": 5.60898807704066, "grad_norm": 1.0918055772781372, "learning_rate": 2.554711052609231e-05, "loss": 0.1478, "step": 18347 }, { "epoch": 5.609293793946805, "grad_norm": 0.43471693992614746, "learning_rate": 2.5546685915672373e-05, "loss": 0.1589, "step": 18348 }, { "epoch": 5.60959951085295, "grad_norm": 0.6311132907867432, "learning_rate": 2.5546261305252432e-05, "loss": 0.2077, "step": 18349 }, { "epoch": 5.609905227759095, "grad_norm": 2.1593711376190186, "learning_rate": 2.5545836694832494e-05, "loss": 0.1843, "step": 18350 }, { "epoch": 5.61021094466524, "grad_norm": 0.800590455532074, "learning_rate": 2.5545412084412553e-05, "loss": 0.17, "step": 18351 }, { "epoch": 5.610516661571385, "grad_norm": 0.9671453237533569, "learning_rate": 2.5544987473992615e-05, "loss": 0.1841, "step": 18352 }, { "epoch": 5.61082237847753, "grad_norm": 0.575558066368103, "learning_rate": 2.5544562863572674e-05, "loss": 0.1729, "step": 18353 }, { "epoch": 5.611128095383675, "grad_norm": 1.407077431678772, "learning_rate": 2.5544138253152736e-05, "loss": 0.2113, "step": 18354 }, { "epoch": 5.611433812289819, "grad_norm": 2.2530078887939453, "learning_rate": 2.5543713642732795e-05, "loss": 0.2013, "step": 18355 }, { "epoch": 5.611739529195964, "grad_norm": 1.2347095012664795, "learning_rate": 2.5543289032312853e-05, "loss": 0.1365, "step": 18356 }, { "epoch": 5.6120452461021095, "grad_norm": 0.27907806634902954, "learning_rate": 2.5542864421892915e-05, "loss": 0.0951, "step": 18357 }, { "epoch": 5.612350963008255, "grad_norm": 0.2175348848104477, "learning_rate": 2.5542439811472974e-05, "loss": 0.0609, "step": 18358 }, { "epoch": 5.6126566799144, "grad_norm": 0.5466992855072021, "learning_rate": 2.5542015201053036e-05, "loss": 0.1114, "step": 18359 }, { "epoch": 5.612962396820544, "grad_norm": 0.3007045388221741, "learning_rate": 2.5541590590633095e-05, "loss": 0.0623, "step": 18360 }, { "epoch": 5.613268113726689, "grad_norm": 0.3950837552547455, "learning_rate": 2.5541165980213157e-05, "loss": 0.0497, "step": 18361 }, { "epoch": 5.613573830632834, "grad_norm": 0.3152276575565338, "learning_rate": 2.5540741369793216e-05, "loss": 0.0488, "step": 18362 }, { "epoch": 5.613879547538978, "grad_norm": 0.2755364179611206, "learning_rate": 2.5540316759373278e-05, "loss": 0.0672, "step": 18363 }, { "epoch": 5.614185264445124, "grad_norm": 0.3574415147304535, "learning_rate": 2.5539892148953336e-05, "loss": 0.0401, "step": 18364 }, { "epoch": 5.614490981351269, "grad_norm": 0.3049468994140625, "learning_rate": 2.55394675385334e-05, "loss": 0.0602, "step": 18365 }, { "epoch": 5.614796698257414, "grad_norm": 0.32711291313171387, "learning_rate": 2.5539042928113457e-05, "loss": 0.0876, "step": 18366 }, { "epoch": 5.615102415163559, "grad_norm": 0.2703985273838043, "learning_rate": 2.553861831769352e-05, "loss": 0.082, "step": 18367 }, { "epoch": 5.615408132069703, "grad_norm": 0.6479877829551697, "learning_rate": 2.5538193707273578e-05, "loss": 0.0897, "step": 18368 }, { "epoch": 5.615713848975848, "grad_norm": 0.5852622389793396, "learning_rate": 2.5537769096853637e-05, "loss": 0.1142, "step": 18369 }, { "epoch": 5.616019565881993, "grad_norm": 0.7199486494064331, "learning_rate": 2.55373444864337e-05, "loss": 0.1224, "step": 18370 }, { "epoch": 5.6163252827881385, "grad_norm": 1.0010203123092651, "learning_rate": 2.5536919876013757e-05, "loss": 0.1378, "step": 18371 }, { "epoch": 5.616630999694284, "grad_norm": 0.5241647362709045, "learning_rate": 2.553649526559382e-05, "loss": 0.1366, "step": 18372 }, { "epoch": 5.616936716600428, "grad_norm": 1.2921569347381592, "learning_rate": 2.5536070655173878e-05, "loss": 0.1675, "step": 18373 }, { "epoch": 5.617242433506573, "grad_norm": 0.7712802290916443, "learning_rate": 2.553564604475394e-05, "loss": 0.165, "step": 18374 }, { "epoch": 5.617548150412718, "grad_norm": 0.6041282415390015, "learning_rate": 2.5535221434334e-05, "loss": 0.1565, "step": 18375 }, { "epoch": 5.617853867318862, "grad_norm": 0.6444793343544006, "learning_rate": 2.553479682391406e-05, "loss": 0.1993, "step": 18376 }, { "epoch": 5.618159584225007, "grad_norm": 1.4104039669036865, "learning_rate": 2.553437221349412e-05, "loss": 0.1833, "step": 18377 }, { "epoch": 5.6184653011311525, "grad_norm": 0.7025769352912903, "learning_rate": 2.5533947603074182e-05, "loss": 0.1817, "step": 18378 }, { "epoch": 5.618771018037298, "grad_norm": 1.3158737421035767, "learning_rate": 2.553352299265424e-05, "loss": 0.2198, "step": 18379 }, { "epoch": 5.619076734943443, "grad_norm": 1.2085230350494385, "learning_rate": 2.5533098382234303e-05, "loss": 0.1984, "step": 18380 }, { "epoch": 5.619382451849587, "grad_norm": 0.3944307565689087, "learning_rate": 2.553267377181436e-05, "loss": 0.1371, "step": 18381 }, { "epoch": 5.619688168755732, "grad_norm": 0.3044701814651489, "learning_rate": 2.553224916139442e-05, "loss": 0.0639, "step": 18382 }, { "epoch": 5.619993885661877, "grad_norm": 0.323711633682251, "learning_rate": 2.5531824550974482e-05, "loss": 0.0735, "step": 18383 }, { "epoch": 5.620299602568022, "grad_norm": 0.2655865550041199, "learning_rate": 2.553139994055454e-05, "loss": 0.055, "step": 18384 }, { "epoch": 5.620605319474167, "grad_norm": 0.1908949613571167, "learning_rate": 2.5530975330134603e-05, "loss": 0.0594, "step": 18385 }, { "epoch": 5.620911036380312, "grad_norm": 0.2946237623691559, "learning_rate": 2.553055071971466e-05, "loss": 0.0526, "step": 18386 }, { "epoch": 5.621216753286457, "grad_norm": 0.15077988803386688, "learning_rate": 2.5530126109294724e-05, "loss": 0.0506, "step": 18387 }, { "epoch": 5.621522470192602, "grad_norm": 0.18089918792247772, "learning_rate": 2.5529701498874782e-05, "loss": 0.0736, "step": 18388 }, { "epoch": 5.621828187098746, "grad_norm": 0.49957138299942017, "learning_rate": 2.5529276888454845e-05, "loss": 0.0934, "step": 18389 }, { "epoch": 5.622133904004891, "grad_norm": 0.17919307947158813, "learning_rate": 2.5528852278034903e-05, "loss": 0.0597, "step": 18390 }, { "epoch": 5.622439620911036, "grad_norm": 0.4282303750514984, "learning_rate": 2.5528427667614965e-05, "loss": 0.0724, "step": 18391 }, { "epoch": 5.622745337817181, "grad_norm": 0.3802656829357147, "learning_rate": 2.5528003057195024e-05, "loss": 0.0775, "step": 18392 }, { "epoch": 5.623051054723327, "grad_norm": 0.5588738322257996, "learning_rate": 2.5527578446775086e-05, "loss": 0.0749, "step": 18393 }, { "epoch": 5.623356771629471, "grad_norm": 0.8118077516555786, "learning_rate": 2.5527153836355145e-05, "loss": 0.1115, "step": 18394 }, { "epoch": 5.623662488535616, "grad_norm": 0.46445631980895996, "learning_rate": 2.5526729225935203e-05, "loss": 0.1114, "step": 18395 }, { "epoch": 5.623968205441761, "grad_norm": 0.5238533020019531, "learning_rate": 2.5526304615515266e-05, "loss": 0.1105, "step": 18396 }, { "epoch": 5.624273922347906, "grad_norm": 0.6106948852539062, "learning_rate": 2.5525880005095324e-05, "loss": 0.146, "step": 18397 }, { "epoch": 5.624579639254051, "grad_norm": 0.7683594822883606, "learning_rate": 2.5525455394675386e-05, "loss": 0.1471, "step": 18398 }, { "epoch": 5.6248853561601955, "grad_norm": 0.6357750296592712, "learning_rate": 2.5525030784255445e-05, "loss": 0.1861, "step": 18399 }, { "epoch": 5.625191073066341, "grad_norm": 0.7400969862937927, "learning_rate": 2.5524606173835507e-05, "loss": 0.1823, "step": 18400 }, { "epoch": 5.625496789972486, "grad_norm": 1.8920694589614868, "learning_rate": 2.5524181563415566e-05, "loss": 0.1734, "step": 18401 }, { "epoch": 5.62580250687863, "grad_norm": 0.73150235414505, "learning_rate": 2.5523756952995628e-05, "loss": 0.1563, "step": 18402 }, { "epoch": 5.626108223784775, "grad_norm": 0.7553536295890808, "learning_rate": 2.5523332342575687e-05, "loss": 0.176, "step": 18403 }, { "epoch": 5.62641394069092, "grad_norm": 0.765883207321167, "learning_rate": 2.552290773215575e-05, "loss": 0.1927, "step": 18404 }, { "epoch": 5.626719657597065, "grad_norm": 1.1407947540283203, "learning_rate": 2.5522483121735807e-05, "loss": 0.253, "step": 18405 }, { "epoch": 5.62702537450321, "grad_norm": 0.34362131357192993, "learning_rate": 2.552205851131587e-05, "loss": 0.1601, "step": 18406 }, { "epoch": 5.627331091409355, "grad_norm": 0.19805558025836945, "learning_rate": 2.5521633900895928e-05, "loss": 0.0804, "step": 18407 }, { "epoch": 5.6276368083155, "grad_norm": 0.21457865834236145, "learning_rate": 2.5521209290475987e-05, "loss": 0.0762, "step": 18408 }, { "epoch": 5.627942525221645, "grad_norm": 0.3055373728275299, "learning_rate": 2.552078468005605e-05, "loss": 0.0488, "step": 18409 }, { "epoch": 5.62824824212779, "grad_norm": 0.38694822788238525, "learning_rate": 2.5520360069636108e-05, "loss": 0.0533, "step": 18410 }, { "epoch": 5.628553959033935, "grad_norm": 0.19008947908878326, "learning_rate": 2.551993545921617e-05, "loss": 0.0496, "step": 18411 }, { "epoch": 5.628859675940079, "grad_norm": 0.6846996545791626, "learning_rate": 2.551951084879623e-05, "loss": 0.0698, "step": 18412 }, { "epoch": 5.629165392846224, "grad_norm": 0.48525184392929077, "learning_rate": 2.551908623837629e-05, "loss": 0.0675, "step": 18413 }, { "epoch": 5.6294711097523695, "grad_norm": 0.7362934947013855, "learning_rate": 2.551866162795635e-05, "loss": 0.0722, "step": 18414 }, { "epoch": 5.629776826658514, "grad_norm": 0.2001827210187912, "learning_rate": 2.551823701753641e-05, "loss": 0.051, "step": 18415 }, { "epoch": 5.630082543564659, "grad_norm": 0.42259034514427185, "learning_rate": 2.551781240711647e-05, "loss": 0.1212, "step": 18416 }, { "epoch": 5.630388260470804, "grad_norm": 0.39875084161758423, "learning_rate": 2.5517387796696532e-05, "loss": 0.0794, "step": 18417 }, { "epoch": 5.630693977376949, "grad_norm": 0.2640916705131531, "learning_rate": 2.551696318627659e-05, "loss": 0.0723, "step": 18418 }, { "epoch": 5.630999694283094, "grad_norm": 0.4708666205406189, "learning_rate": 2.551653857585665e-05, "loss": 0.1211, "step": 18419 }, { "epoch": 5.631305411189238, "grad_norm": 0.29123854637145996, "learning_rate": 2.551611396543671e-05, "loss": 0.0975, "step": 18420 }, { "epoch": 5.631611128095384, "grad_norm": 0.6214659214019775, "learning_rate": 2.551568935501677e-05, "loss": 0.1296, "step": 18421 }, { "epoch": 5.631916845001529, "grad_norm": 0.46046915650367737, "learning_rate": 2.5515264744596832e-05, "loss": 0.1602, "step": 18422 }, { "epoch": 5.632222561907674, "grad_norm": 0.44548627734184265, "learning_rate": 2.551484013417689e-05, "loss": 0.1499, "step": 18423 }, { "epoch": 5.632528278813819, "grad_norm": 0.5805967450141907, "learning_rate": 2.5514415523756953e-05, "loss": 0.1462, "step": 18424 }, { "epoch": 5.632833995719963, "grad_norm": 0.5914457440376282, "learning_rate": 2.5513990913337012e-05, "loss": 0.1652, "step": 18425 }, { "epoch": 5.633139712626108, "grad_norm": 1.0902252197265625, "learning_rate": 2.5513566302917074e-05, "loss": 0.2083, "step": 18426 }, { "epoch": 5.633445429532253, "grad_norm": 1.019568920135498, "learning_rate": 2.5513141692497133e-05, "loss": 0.177, "step": 18427 }, { "epoch": 5.633751146438398, "grad_norm": 1.4159470796585083, "learning_rate": 2.5512717082077195e-05, "loss": 0.2052, "step": 18428 }, { "epoch": 5.634056863344543, "grad_norm": 0.8630003929138184, "learning_rate": 2.5512292471657254e-05, "loss": 0.2023, "step": 18429 }, { "epoch": 5.634362580250688, "grad_norm": 1.150755524635315, "learning_rate": 2.5511867861237316e-05, "loss": 0.2236, "step": 18430 }, { "epoch": 5.634668297156833, "grad_norm": 0.5829628705978394, "learning_rate": 2.5511443250817374e-05, "loss": 0.1368, "step": 18431 }, { "epoch": 5.634974014062978, "grad_norm": 0.21245886385440826, "learning_rate": 2.5511018640397433e-05, "loss": 0.0647, "step": 18432 }, { "epoch": 5.635279730969122, "grad_norm": 0.4454851746559143, "learning_rate": 2.5510594029977495e-05, "loss": 0.077, "step": 18433 }, { "epoch": 5.635585447875267, "grad_norm": 0.23562999069690704, "learning_rate": 2.5510169419557554e-05, "loss": 0.0695, "step": 18434 }, { "epoch": 5.6358911647814125, "grad_norm": 0.1752994805574417, "learning_rate": 2.5509744809137616e-05, "loss": 0.0512, "step": 18435 }, { "epoch": 5.636196881687558, "grad_norm": 0.2186470627784729, "learning_rate": 2.5509320198717675e-05, "loss": 0.0453, "step": 18436 }, { "epoch": 5.636502598593703, "grad_norm": 0.23952725529670715, "learning_rate": 2.5508895588297737e-05, "loss": 0.0492, "step": 18437 }, { "epoch": 5.636808315499847, "grad_norm": 0.22865600883960724, "learning_rate": 2.5508470977877795e-05, "loss": 0.0842, "step": 18438 }, { "epoch": 5.637114032405992, "grad_norm": 0.47735780477523804, "learning_rate": 2.5508046367457857e-05, "loss": 0.0606, "step": 18439 }, { "epoch": 5.637419749312137, "grad_norm": 0.34648072719573975, "learning_rate": 2.5507621757037916e-05, "loss": 0.0717, "step": 18440 }, { "epoch": 5.637725466218281, "grad_norm": 0.2714444398880005, "learning_rate": 2.5507197146617978e-05, "loss": 0.0649, "step": 18441 }, { "epoch": 5.6380311831244265, "grad_norm": 0.19756640493869781, "learning_rate": 2.5506772536198037e-05, "loss": 0.0834, "step": 18442 }, { "epoch": 5.638336900030572, "grad_norm": 0.5132104158401489, "learning_rate": 2.55063479257781e-05, "loss": 0.1165, "step": 18443 }, { "epoch": 5.638642616936717, "grad_norm": 0.8439040780067444, "learning_rate": 2.550592331535816e-05, "loss": 0.1164, "step": 18444 }, { "epoch": 5.638948333842862, "grad_norm": 0.6721707582473755, "learning_rate": 2.550549870493822e-05, "loss": 0.1436, "step": 18445 }, { "epoch": 5.639254050749006, "grad_norm": 0.5008426308631897, "learning_rate": 2.5505074094518282e-05, "loss": 0.1098, "step": 18446 }, { "epoch": 5.639559767655151, "grad_norm": 0.817044198513031, "learning_rate": 2.550464948409834e-05, "loss": 0.1896, "step": 18447 }, { "epoch": 5.639865484561296, "grad_norm": 0.6587046384811401, "learning_rate": 2.5504224873678403e-05, "loss": 0.1488, "step": 18448 }, { "epoch": 5.640171201467441, "grad_norm": 0.6782678365707397, "learning_rate": 2.550380026325846e-05, "loss": 0.1425, "step": 18449 }, { "epoch": 5.640476918373587, "grad_norm": 0.6864480376243591, "learning_rate": 2.5503375652838523e-05, "loss": 0.1576, "step": 18450 }, { "epoch": 5.640782635279731, "grad_norm": 0.4903359115123749, "learning_rate": 2.5502951042418582e-05, "loss": 0.1782, "step": 18451 }, { "epoch": 5.641088352185876, "grad_norm": 0.5138401389122009, "learning_rate": 2.5502526431998644e-05, "loss": 0.1797, "step": 18452 }, { "epoch": 5.641394069092021, "grad_norm": 1.057532787322998, "learning_rate": 2.5502101821578703e-05, "loss": 0.1836, "step": 18453 }, { "epoch": 5.641699785998165, "grad_norm": 1.3012136220932007, "learning_rate": 2.5501677211158765e-05, "loss": 0.2336, "step": 18454 }, { "epoch": 5.64200550290431, "grad_norm": 1.6566760540008545, "learning_rate": 2.5501252600738824e-05, "loss": 0.2527, "step": 18455 }, { "epoch": 5.6423112198104555, "grad_norm": 0.42375221848487854, "learning_rate": 2.5500827990318886e-05, "loss": 0.111, "step": 18456 }, { "epoch": 5.642616936716601, "grad_norm": 0.37816163897514343, "learning_rate": 2.5500403379898945e-05, "loss": 0.064, "step": 18457 }, { "epoch": 5.642922653622746, "grad_norm": 0.30621960759162903, "learning_rate": 2.5499978769479003e-05, "loss": 0.0737, "step": 18458 }, { "epoch": 5.64322837052889, "grad_norm": 0.15712840855121613, "learning_rate": 2.5499554159059065e-05, "loss": 0.0468, "step": 18459 }, { "epoch": 5.643534087435035, "grad_norm": 0.179916650056839, "learning_rate": 2.5499129548639124e-05, "loss": 0.0477, "step": 18460 }, { "epoch": 5.64383980434118, "grad_norm": 0.26511964201927185, "learning_rate": 2.5498704938219186e-05, "loss": 0.0477, "step": 18461 }, { "epoch": 5.644145521247325, "grad_norm": 0.23202064633369446, "learning_rate": 2.5498280327799245e-05, "loss": 0.0635, "step": 18462 }, { "epoch": 5.64445123815347, "grad_norm": 0.3108268678188324, "learning_rate": 2.5497855717379307e-05, "loss": 0.0617, "step": 18463 }, { "epoch": 5.644756955059615, "grad_norm": 0.2885558307170868, "learning_rate": 2.5497431106959366e-05, "loss": 0.0981, "step": 18464 }, { "epoch": 5.64506267196576, "grad_norm": 0.40167009830474854, "learning_rate": 2.5497006496539428e-05, "loss": 0.0633, "step": 18465 }, { "epoch": 5.645368388871905, "grad_norm": 0.4009724259376526, "learning_rate": 2.5496581886119486e-05, "loss": 0.1083, "step": 18466 }, { "epoch": 5.645674105778049, "grad_norm": 0.2701486349105835, "learning_rate": 2.549615727569955e-05, "loss": 0.0895, "step": 18467 }, { "epoch": 5.645979822684194, "grad_norm": 0.6698035001754761, "learning_rate": 2.5495732665279607e-05, "loss": 0.1039, "step": 18468 }, { "epoch": 5.646285539590339, "grad_norm": 2.2862470149993896, "learning_rate": 2.549530805485967e-05, "loss": 0.1279, "step": 18469 }, { "epoch": 5.646591256496484, "grad_norm": 0.7373393774032593, "learning_rate": 2.5494883444439728e-05, "loss": 0.1183, "step": 18470 }, { "epoch": 5.6468969734026295, "grad_norm": 0.30177631974220276, "learning_rate": 2.5494458834019787e-05, "loss": 0.1053, "step": 18471 }, { "epoch": 5.647202690308774, "grad_norm": 2.3086564540863037, "learning_rate": 2.549403422359985e-05, "loss": 0.1768, "step": 18472 }, { "epoch": 5.647508407214919, "grad_norm": 0.7466709017753601, "learning_rate": 2.5493609613179907e-05, "loss": 0.15, "step": 18473 }, { "epoch": 5.647814124121064, "grad_norm": 0.6714603304862976, "learning_rate": 2.549318500275997e-05, "loss": 0.1642, "step": 18474 }, { "epoch": 5.648119841027209, "grad_norm": 0.5699871182441711, "learning_rate": 2.5492760392340028e-05, "loss": 0.174, "step": 18475 }, { "epoch": 5.648425557933354, "grad_norm": 0.758994996547699, "learning_rate": 2.549233578192009e-05, "loss": 0.1866, "step": 18476 }, { "epoch": 5.648731274839498, "grad_norm": 0.5084396004676819, "learning_rate": 2.549191117150015e-05, "loss": 0.1664, "step": 18477 }, { "epoch": 5.6490369917456436, "grad_norm": 1.3609265089035034, "learning_rate": 2.549148656108021e-05, "loss": 0.1868, "step": 18478 }, { "epoch": 5.649342708651789, "grad_norm": 0.6764551401138306, "learning_rate": 2.549106195066027e-05, "loss": 0.1851, "step": 18479 }, { "epoch": 5.649648425557933, "grad_norm": 1.4693233966827393, "learning_rate": 2.5490637340240332e-05, "loss": 0.217, "step": 18480 }, { "epoch": 5.649954142464078, "grad_norm": 0.2944168746471405, "learning_rate": 2.549021272982039e-05, "loss": 0.134, "step": 18481 }, { "epoch": 5.650259859370223, "grad_norm": 0.2609524428844452, "learning_rate": 2.5489788119400453e-05, "loss": 0.0855, "step": 18482 }, { "epoch": 5.650565576276368, "grad_norm": 0.4646514356136322, "learning_rate": 2.548936350898051e-05, "loss": 0.0839, "step": 18483 }, { "epoch": 5.650871293182513, "grad_norm": 0.18474321067333221, "learning_rate": 2.548893889856057e-05, "loss": 0.0496, "step": 18484 }, { "epoch": 5.651177010088658, "grad_norm": 0.24719330668449402, "learning_rate": 2.5488514288140632e-05, "loss": 0.0419, "step": 18485 }, { "epoch": 5.651482726994803, "grad_norm": 0.4384026527404785, "learning_rate": 2.548808967772069e-05, "loss": 0.055, "step": 18486 }, { "epoch": 5.651788443900948, "grad_norm": 0.37716472148895264, "learning_rate": 2.5487665067300753e-05, "loss": 0.0434, "step": 18487 }, { "epoch": 5.652094160807093, "grad_norm": 0.8592354655265808, "learning_rate": 2.548724045688081e-05, "loss": 0.0606, "step": 18488 }, { "epoch": 5.652399877713238, "grad_norm": 0.7742878198623657, "learning_rate": 2.5486815846460874e-05, "loss": 0.0914, "step": 18489 }, { "epoch": 5.652705594619382, "grad_norm": 0.30219876766204834, "learning_rate": 2.5486391236040932e-05, "loss": 0.0722, "step": 18490 }, { "epoch": 5.653011311525527, "grad_norm": 0.22960150241851807, "learning_rate": 2.5485966625620995e-05, "loss": 0.0612, "step": 18491 }, { "epoch": 5.6533170284316725, "grad_norm": 0.27640119194984436, "learning_rate": 2.5485542015201053e-05, "loss": 0.08, "step": 18492 }, { "epoch": 5.653622745337817, "grad_norm": 0.8863904476165771, "learning_rate": 2.5485117404781115e-05, "loss": 0.1092, "step": 18493 }, { "epoch": 5.653928462243962, "grad_norm": 1.2875447273254395, "learning_rate": 2.5484692794361174e-05, "loss": 0.0959, "step": 18494 }, { "epoch": 5.654234179150107, "grad_norm": 0.6255121231079102, "learning_rate": 2.5484268183941236e-05, "loss": 0.1657, "step": 18495 }, { "epoch": 5.654539896056252, "grad_norm": 0.7239121794700623, "learning_rate": 2.5483843573521295e-05, "loss": 0.1468, "step": 18496 }, { "epoch": 5.654845612962397, "grad_norm": 0.5515616536140442, "learning_rate": 2.5483418963101354e-05, "loss": 0.1433, "step": 18497 }, { "epoch": 5.655151329868541, "grad_norm": 1.0401304960250854, "learning_rate": 2.5482994352681416e-05, "loss": 0.1582, "step": 18498 }, { "epoch": 5.6554570467746865, "grad_norm": 0.8271476626396179, "learning_rate": 2.5482569742261474e-05, "loss": 0.197, "step": 18499 }, { "epoch": 5.655762763680832, "grad_norm": 0.5947284698486328, "learning_rate": 2.5482145131841536e-05, "loss": 0.1845, "step": 18500 }, { "epoch": 5.656068480586977, "grad_norm": 1.1087075471878052, "learning_rate": 2.5481720521421595e-05, "loss": 0.1796, "step": 18501 }, { "epoch": 5.656374197493121, "grad_norm": 0.9266759753227234, "learning_rate": 2.5481295911001657e-05, "loss": 0.1806, "step": 18502 }, { "epoch": 5.656679914399266, "grad_norm": 1.1524014472961426, "learning_rate": 2.5480871300581716e-05, "loss": 0.1996, "step": 18503 }, { "epoch": 5.656985631305411, "grad_norm": 1.132796049118042, "learning_rate": 2.5480446690161778e-05, "loss": 0.1804, "step": 18504 }, { "epoch": 5.657291348211556, "grad_norm": 1.844029426574707, "learning_rate": 2.5480022079741837e-05, "loss": 0.2101, "step": 18505 }, { "epoch": 5.6575970651177006, "grad_norm": 0.3803228735923767, "learning_rate": 2.54795974693219e-05, "loss": 0.1285, "step": 18506 }, { "epoch": 5.657902782023846, "grad_norm": 0.7733468413352966, "learning_rate": 2.5479172858901957e-05, "loss": 0.1044, "step": 18507 }, { "epoch": 5.658208498929991, "grad_norm": 0.2661001980304718, "learning_rate": 2.547874824848202e-05, "loss": 0.0622, "step": 18508 }, { "epoch": 5.658514215836136, "grad_norm": 0.9077520966529846, "learning_rate": 2.5478323638062078e-05, "loss": 0.0652, "step": 18509 }, { "epoch": 5.658819932742281, "grad_norm": 0.20628884434700012, "learning_rate": 2.5477899027642137e-05, "loss": 0.0701, "step": 18510 }, { "epoch": 5.659125649648425, "grad_norm": 0.4192962348461151, "learning_rate": 2.54774744172222e-05, "loss": 0.0492, "step": 18511 }, { "epoch": 5.65943136655457, "grad_norm": 0.8188371658325195, "learning_rate": 2.5477049806802258e-05, "loss": 0.0501, "step": 18512 }, { "epoch": 5.6597370834607155, "grad_norm": 0.2415381669998169, "learning_rate": 2.547662519638232e-05, "loss": 0.0606, "step": 18513 }, { "epoch": 5.660042800366861, "grad_norm": 0.30186185240745544, "learning_rate": 2.547620058596238e-05, "loss": 0.0502, "step": 18514 }, { "epoch": 5.660348517273005, "grad_norm": 0.818983793258667, "learning_rate": 2.547577597554244e-05, "loss": 0.0911, "step": 18515 }, { "epoch": 5.66065423417915, "grad_norm": 0.35079512000083923, "learning_rate": 2.54753513651225e-05, "loss": 0.0822, "step": 18516 }, { "epoch": 5.660959951085295, "grad_norm": 1.1470630168914795, "learning_rate": 2.547492675470256e-05, "loss": 0.0636, "step": 18517 }, { "epoch": 5.66126566799144, "grad_norm": 0.6475244760513306, "learning_rate": 2.547450214428262e-05, "loss": 0.1074, "step": 18518 }, { "epoch": 5.661571384897584, "grad_norm": 0.3550567030906677, "learning_rate": 2.5474077533862682e-05, "loss": 0.1103, "step": 18519 }, { "epoch": 5.6618771018037295, "grad_norm": 0.4824056327342987, "learning_rate": 2.547365292344274e-05, "loss": 0.0918, "step": 18520 }, { "epoch": 5.662182818709875, "grad_norm": 0.43186646699905396, "learning_rate": 2.5473228313022803e-05, "loss": 0.1244, "step": 18521 }, { "epoch": 5.66248853561602, "grad_norm": 0.7859824895858765, "learning_rate": 2.547280370260286e-05, "loss": 0.1465, "step": 18522 }, { "epoch": 5.662794252522165, "grad_norm": 0.9720561504364014, "learning_rate": 2.547237909218292e-05, "loss": 0.2088, "step": 18523 }, { "epoch": 5.663099969428309, "grad_norm": 1.6055572032928467, "learning_rate": 2.5471954481762982e-05, "loss": 0.1693, "step": 18524 }, { "epoch": 5.663405686334454, "grad_norm": 0.8475479483604431, "learning_rate": 2.547152987134304e-05, "loss": 0.1617, "step": 18525 }, { "epoch": 5.663711403240599, "grad_norm": 1.2936248779296875, "learning_rate": 2.5471105260923103e-05, "loss": 0.2049, "step": 18526 }, { "epoch": 5.664017120146744, "grad_norm": 1.842755913734436, "learning_rate": 2.5470680650503162e-05, "loss": 0.1818, "step": 18527 }, { "epoch": 5.664322837052889, "grad_norm": 0.8095844388008118, "learning_rate": 2.5470256040083224e-05, "loss": 0.18, "step": 18528 }, { "epoch": 5.664628553959034, "grad_norm": 1.3398072719573975, "learning_rate": 2.5469831429663283e-05, "loss": 0.1801, "step": 18529 }, { "epoch": 5.664934270865179, "grad_norm": 1.1143486499786377, "learning_rate": 2.5469406819243345e-05, "loss": 0.2222, "step": 18530 }, { "epoch": 5.665239987771324, "grad_norm": 0.3359677493572235, "learning_rate": 2.5468982208823404e-05, "loss": 0.1328, "step": 18531 }, { "epoch": 5.665545704677468, "grad_norm": 0.22514308989048004, "learning_rate": 2.5468557598403466e-05, "loss": 0.0875, "step": 18532 }, { "epoch": 5.665851421583613, "grad_norm": 0.41745561361312866, "learning_rate": 2.5468132987983524e-05, "loss": 0.1141, "step": 18533 }, { "epoch": 5.666157138489758, "grad_norm": 0.3417200744152069, "learning_rate": 2.5467708377563586e-05, "loss": 0.0527, "step": 18534 }, { "epoch": 5.6664628553959036, "grad_norm": 0.5452728271484375, "learning_rate": 2.5467283767143645e-05, "loss": 0.0408, "step": 18535 }, { "epoch": 5.666768572302049, "grad_norm": 0.17795869708061218, "learning_rate": 2.5466859156723704e-05, "loss": 0.0504, "step": 18536 }, { "epoch": 5.667074289208193, "grad_norm": 0.5844858884811401, "learning_rate": 2.5466434546303766e-05, "loss": 0.0765, "step": 18537 }, { "epoch": 5.667380006114338, "grad_norm": 0.2314738780260086, "learning_rate": 2.5466009935883825e-05, "loss": 0.0763, "step": 18538 }, { "epoch": 5.667685723020483, "grad_norm": 0.33428487181663513, "learning_rate": 2.5465585325463887e-05, "loss": 0.0486, "step": 18539 }, { "epoch": 5.667991439926628, "grad_norm": 0.4195386469364166, "learning_rate": 2.5465160715043945e-05, "loss": 0.0661, "step": 18540 }, { "epoch": 5.6682971568327725, "grad_norm": 0.2632922828197479, "learning_rate": 2.5464736104624007e-05, "loss": 0.108, "step": 18541 }, { "epoch": 5.668602873738918, "grad_norm": 0.44749096035957336, "learning_rate": 2.5464311494204066e-05, "loss": 0.0736, "step": 18542 }, { "epoch": 5.668908590645063, "grad_norm": 0.378113329410553, "learning_rate": 2.5463886883784128e-05, "loss": 0.1179, "step": 18543 }, { "epoch": 5.669214307551208, "grad_norm": 0.5105288624763489, "learning_rate": 2.5463462273364187e-05, "loss": 0.1041, "step": 18544 }, { "epoch": 5.669520024457352, "grad_norm": 0.3515630066394806, "learning_rate": 2.546303766294425e-05, "loss": 0.1303, "step": 18545 }, { "epoch": 5.669825741363497, "grad_norm": 0.4692878723144531, "learning_rate": 2.546261305252431e-05, "loss": 0.1359, "step": 18546 }, { "epoch": 5.670131458269642, "grad_norm": 0.5738835334777832, "learning_rate": 2.546218844210437e-05, "loss": 0.1681, "step": 18547 }, { "epoch": 5.670437175175787, "grad_norm": 1.1729353666305542, "learning_rate": 2.5461763831684432e-05, "loss": 0.1861, "step": 18548 }, { "epoch": 5.6707428920819325, "grad_norm": 0.4897361099720001, "learning_rate": 2.546133922126449e-05, "loss": 0.1743, "step": 18549 }, { "epoch": 5.671048608988077, "grad_norm": 0.4255358874797821, "learning_rate": 2.5460914610844553e-05, "loss": 0.1449, "step": 18550 }, { "epoch": 5.671354325894222, "grad_norm": 3.608874559402466, "learning_rate": 2.546049000042461e-05, "loss": 0.2002, "step": 18551 }, { "epoch": 5.671660042800367, "grad_norm": 0.765680730342865, "learning_rate": 2.5460065390004674e-05, "loss": 0.1386, "step": 18552 }, { "epoch": 5.671965759706512, "grad_norm": 10.499080657958984, "learning_rate": 2.5459640779584732e-05, "loss": 0.2111, "step": 18553 }, { "epoch": 5.672271476612656, "grad_norm": 1.2929213047027588, "learning_rate": 2.5459216169164794e-05, "loss": 0.2043, "step": 18554 }, { "epoch": 5.672577193518801, "grad_norm": 3.925189256668091, "learning_rate": 2.5458791558744853e-05, "loss": 0.2327, "step": 18555 }, { "epoch": 5.6728829104249465, "grad_norm": 0.43100637197494507, "learning_rate": 2.5458366948324915e-05, "loss": 0.1294, "step": 18556 }, { "epoch": 5.673188627331092, "grad_norm": 0.2982318103313446, "learning_rate": 2.5457942337904974e-05, "loss": 0.0884, "step": 18557 }, { "epoch": 5.673494344237236, "grad_norm": 0.29726919531822205, "learning_rate": 2.5457517727485036e-05, "loss": 0.0747, "step": 18558 }, { "epoch": 5.673800061143381, "grad_norm": 0.5109903812408447, "learning_rate": 2.5457093117065095e-05, "loss": 0.0537, "step": 18559 }, { "epoch": 5.674105778049526, "grad_norm": 0.19127638638019562, "learning_rate": 2.5456668506645153e-05, "loss": 0.068, "step": 18560 }, { "epoch": 5.674411494955671, "grad_norm": 0.35755184292793274, "learning_rate": 2.5456243896225215e-05, "loss": 0.0352, "step": 18561 }, { "epoch": 5.674717211861816, "grad_norm": 0.22297129034996033, "learning_rate": 2.5455819285805274e-05, "loss": 0.0424, "step": 18562 }, { "epoch": 5.6750229287679606, "grad_norm": 0.24405062198638916, "learning_rate": 2.5455394675385336e-05, "loss": 0.0535, "step": 18563 }, { "epoch": 5.675328645674106, "grad_norm": 0.5633051991462708, "learning_rate": 2.5454970064965395e-05, "loss": 0.1001, "step": 18564 }, { "epoch": 5.675634362580251, "grad_norm": 0.18760643899440765, "learning_rate": 2.5454545454545457e-05, "loss": 0.0435, "step": 18565 }, { "epoch": 5.675940079486396, "grad_norm": 0.8420765399932861, "learning_rate": 2.5454120844125516e-05, "loss": 0.0957, "step": 18566 }, { "epoch": 5.67624579639254, "grad_norm": 0.41108912229537964, "learning_rate": 2.5453696233705578e-05, "loss": 0.0719, "step": 18567 }, { "epoch": 5.676551513298685, "grad_norm": 0.4107869267463684, "learning_rate": 2.5453271623285636e-05, "loss": 0.0905, "step": 18568 }, { "epoch": 5.67685723020483, "grad_norm": 0.7136121988296509, "learning_rate": 2.54528470128657e-05, "loss": 0.0876, "step": 18569 }, { "epoch": 5.6771629471109755, "grad_norm": 0.3498241901397705, "learning_rate": 2.5452422402445757e-05, "loss": 0.1019, "step": 18570 }, { "epoch": 5.67746866401712, "grad_norm": 1.4910794496536255, "learning_rate": 2.545199779202582e-05, "loss": 0.1369, "step": 18571 }, { "epoch": 5.677774380923265, "grad_norm": 0.4808568060398102, "learning_rate": 2.5451573181605878e-05, "loss": 0.1414, "step": 18572 }, { "epoch": 5.67808009782941, "grad_norm": 0.4789227545261383, "learning_rate": 2.5451148571185937e-05, "loss": 0.1533, "step": 18573 }, { "epoch": 5.678385814735555, "grad_norm": 0.6773920059204102, "learning_rate": 2.5450723960766e-05, "loss": 0.1701, "step": 18574 }, { "epoch": 5.6786915316417, "grad_norm": 0.4860672652721405, "learning_rate": 2.5450299350346057e-05, "loss": 0.1952, "step": 18575 }, { "epoch": 5.678997248547844, "grad_norm": 1.1697263717651367, "learning_rate": 2.544987473992612e-05, "loss": 0.1922, "step": 18576 }, { "epoch": 5.6793029654539895, "grad_norm": 1.2199746370315552, "learning_rate": 2.5449450129506178e-05, "loss": 0.2013, "step": 18577 }, { "epoch": 5.679608682360135, "grad_norm": 2.2086620330810547, "learning_rate": 2.544902551908624e-05, "loss": 0.2053, "step": 18578 }, { "epoch": 5.67991439926628, "grad_norm": 1.1960076093673706, "learning_rate": 2.54486009086663e-05, "loss": 0.2054, "step": 18579 }, { "epoch": 5.680220116172424, "grad_norm": 1.9199124574661255, "learning_rate": 2.544817629824636e-05, "loss": 0.231, "step": 18580 }, { "epoch": 5.680525833078569, "grad_norm": 0.6720374822616577, "learning_rate": 2.544775168782642e-05, "loss": 0.1529, "step": 18581 }, { "epoch": 5.680831549984714, "grad_norm": 0.2990720570087433, "learning_rate": 2.5447327077406482e-05, "loss": 0.0755, "step": 18582 }, { "epoch": 5.681137266890859, "grad_norm": 0.17702366411685944, "learning_rate": 2.544690246698654e-05, "loss": 0.0855, "step": 18583 }, { "epoch": 5.6814429837970035, "grad_norm": 0.37496694922447205, "learning_rate": 2.5446477856566603e-05, "loss": 0.0755, "step": 18584 }, { "epoch": 5.681748700703149, "grad_norm": 0.16581784188747406, "learning_rate": 2.544605324614666e-05, "loss": 0.037, "step": 18585 }, { "epoch": 5.682054417609294, "grad_norm": 0.3415208160877228, "learning_rate": 2.544562863572672e-05, "loss": 0.0513, "step": 18586 }, { "epoch": 5.682360134515439, "grad_norm": 0.39088118076324463, "learning_rate": 2.5445204025306782e-05, "loss": 0.0616, "step": 18587 }, { "epoch": 5.682665851421584, "grad_norm": 0.22201527655124664, "learning_rate": 2.544477941488684e-05, "loss": 0.0601, "step": 18588 }, { "epoch": 5.682971568327728, "grad_norm": 0.4511376917362213, "learning_rate": 2.5444354804466903e-05, "loss": 0.0815, "step": 18589 }, { "epoch": 5.683277285233873, "grad_norm": 0.3083445727825165, "learning_rate": 2.5443930194046962e-05, "loss": 0.0557, "step": 18590 }, { "epoch": 5.683583002140018, "grad_norm": 0.2649531662464142, "learning_rate": 2.5443505583627024e-05, "loss": 0.0918, "step": 18591 }, { "epoch": 5.6838887190461636, "grad_norm": 0.2095055878162384, "learning_rate": 2.5443080973207082e-05, "loss": 0.0846, "step": 18592 }, { "epoch": 5.684194435952308, "grad_norm": 0.5869274735450745, "learning_rate": 2.5442656362787145e-05, "loss": 0.0684, "step": 18593 }, { "epoch": 5.684500152858453, "grad_norm": 13.2318754196167, "learning_rate": 2.5442231752367203e-05, "loss": 0.0927, "step": 18594 }, { "epoch": 5.684805869764598, "grad_norm": 0.5029048323631287, "learning_rate": 2.5441807141947265e-05, "loss": 0.129, "step": 18595 }, { "epoch": 5.685111586670743, "grad_norm": 0.3530340790748596, "learning_rate": 2.5441382531527324e-05, "loss": 0.1165, "step": 18596 }, { "epoch": 5.685417303576887, "grad_norm": 2.0598983764648438, "learning_rate": 2.5440957921107386e-05, "loss": 0.1221, "step": 18597 }, { "epoch": 5.6857230204830325, "grad_norm": 0.8388881087303162, "learning_rate": 2.5440533310687445e-05, "loss": 0.1776, "step": 18598 }, { "epoch": 5.686028737389178, "grad_norm": 0.6919925212860107, "learning_rate": 2.5440108700267504e-05, "loss": 0.1479, "step": 18599 }, { "epoch": 5.686334454295323, "grad_norm": 0.7358434200286865, "learning_rate": 2.5439684089847566e-05, "loss": 0.1654, "step": 18600 }, { "epoch": 5.686640171201468, "grad_norm": 0.7756868004798889, "learning_rate": 2.5439259479427624e-05, "loss": 0.1792, "step": 18601 }, { "epoch": 5.686945888107612, "grad_norm": 3.5577096939086914, "learning_rate": 2.5438834869007686e-05, "loss": 0.2195, "step": 18602 }, { "epoch": 5.687251605013757, "grad_norm": 2.151115894317627, "learning_rate": 2.5438410258587745e-05, "loss": 0.1631, "step": 18603 }, { "epoch": 5.687557321919902, "grad_norm": 1.9568315744400024, "learning_rate": 2.5437985648167807e-05, "loss": 0.2031, "step": 18604 }, { "epoch": 5.687863038826047, "grad_norm": 1.1249927282333374, "learning_rate": 2.5437561037747866e-05, "loss": 0.2375, "step": 18605 }, { "epoch": 5.688168755732192, "grad_norm": 0.3670465052127838, "learning_rate": 2.5437136427327928e-05, "loss": 0.1261, "step": 18606 }, { "epoch": 5.688474472638337, "grad_norm": 0.3241949677467346, "learning_rate": 2.5436711816907987e-05, "loss": 0.0807, "step": 18607 }, { "epoch": 5.688780189544482, "grad_norm": 0.5783215761184692, "learning_rate": 2.543628720648805e-05, "loss": 0.0778, "step": 18608 }, { "epoch": 5.689085906450627, "grad_norm": 0.3223001956939697, "learning_rate": 2.5435862596068107e-05, "loss": 0.0783, "step": 18609 }, { "epoch": 5.689391623356771, "grad_norm": 0.1830097883939743, "learning_rate": 2.543543798564817e-05, "loss": 0.0369, "step": 18610 }, { "epoch": 5.689697340262916, "grad_norm": 0.43933552503585815, "learning_rate": 2.5435013375228228e-05, "loss": 0.0686, "step": 18611 }, { "epoch": 5.690003057169061, "grad_norm": 0.18558482825756073, "learning_rate": 2.5434588764808287e-05, "loss": 0.0498, "step": 18612 }, { "epoch": 5.6903087740752065, "grad_norm": 0.22324004769325256, "learning_rate": 2.543416415438835e-05, "loss": 0.0551, "step": 18613 }, { "epoch": 5.690614490981352, "grad_norm": 0.37701675295829773, "learning_rate": 2.5433739543968408e-05, "loss": 0.0967, "step": 18614 }, { "epoch": 5.690920207887496, "grad_norm": 0.4216599762439728, "learning_rate": 2.543331493354847e-05, "loss": 0.0647, "step": 18615 }, { "epoch": 5.691225924793641, "grad_norm": 0.36850157380104065, "learning_rate": 2.543289032312853e-05, "loss": 0.0703, "step": 18616 }, { "epoch": 5.691531641699786, "grad_norm": 0.31711694598197937, "learning_rate": 2.543246571270859e-05, "loss": 0.0678, "step": 18617 }, { "epoch": 5.691837358605931, "grad_norm": 0.7281199097633362, "learning_rate": 2.543204110228865e-05, "loss": 0.0631, "step": 18618 }, { "epoch": 5.692143075512075, "grad_norm": 0.4058483839035034, "learning_rate": 2.543161649186871e-05, "loss": 0.1281, "step": 18619 }, { "epoch": 5.6924487924182205, "grad_norm": 0.6919861435890198, "learning_rate": 2.543119188144877e-05, "loss": 0.1242, "step": 18620 }, { "epoch": 5.692754509324366, "grad_norm": 0.6299869418144226, "learning_rate": 2.5430767271028832e-05, "loss": 0.1498, "step": 18621 }, { "epoch": 5.693060226230511, "grad_norm": 0.64848393201828, "learning_rate": 2.543034266060889e-05, "loss": 0.1456, "step": 18622 }, { "epoch": 5.693365943136655, "grad_norm": 0.5360555648803711, "learning_rate": 2.5429918050188953e-05, "loss": 0.1361, "step": 18623 }, { "epoch": 5.6936716600428, "grad_norm": 0.9445700645446777, "learning_rate": 2.5429493439769012e-05, "loss": 0.1657, "step": 18624 }, { "epoch": 5.693977376948945, "grad_norm": 1.1106923818588257, "learning_rate": 2.542906882934907e-05, "loss": 0.1484, "step": 18625 }, { "epoch": 5.69428309385509, "grad_norm": 1.1515451669692993, "learning_rate": 2.5428644218929133e-05, "loss": 0.1669, "step": 18626 }, { "epoch": 5.6945888107612355, "grad_norm": 0.8279430866241455, "learning_rate": 2.542821960850919e-05, "loss": 0.1853, "step": 18627 }, { "epoch": 5.69489452766738, "grad_norm": 1.2030842304229736, "learning_rate": 2.5427794998089253e-05, "loss": 0.1892, "step": 18628 }, { "epoch": 5.695200244573525, "grad_norm": 1.2653671503067017, "learning_rate": 2.5427370387669312e-05, "loss": 0.1887, "step": 18629 }, { "epoch": 5.69550596147967, "grad_norm": 0.9451742172241211, "learning_rate": 2.5426945777249374e-05, "loss": 0.224, "step": 18630 }, { "epoch": 5.695811678385815, "grad_norm": 0.6105967164039612, "learning_rate": 2.5426521166829433e-05, "loss": 0.1319, "step": 18631 }, { "epoch": 5.696117395291959, "grad_norm": 0.39327967166900635, "learning_rate": 2.5426096556409495e-05, "loss": 0.0813, "step": 18632 }, { "epoch": 5.696423112198104, "grad_norm": 0.3037634491920471, "learning_rate": 2.5425671945989554e-05, "loss": 0.0656, "step": 18633 }, { "epoch": 5.6967288291042495, "grad_norm": 0.41174718737602234, "learning_rate": 2.5425247335569616e-05, "loss": 0.0665, "step": 18634 }, { "epoch": 5.697034546010395, "grad_norm": 0.18256694078445435, "learning_rate": 2.5424822725149674e-05, "loss": 0.0501, "step": 18635 }, { "epoch": 5.697340262916539, "grad_norm": 0.21399454772472382, "learning_rate": 2.5424398114729736e-05, "loss": 0.042, "step": 18636 }, { "epoch": 5.697645979822684, "grad_norm": 0.2561348080635071, "learning_rate": 2.5423973504309795e-05, "loss": 0.0718, "step": 18637 }, { "epoch": 5.697951696728829, "grad_norm": 0.5231165885925293, "learning_rate": 2.5423548893889854e-05, "loss": 0.0639, "step": 18638 }, { "epoch": 5.698257413634974, "grad_norm": 0.3665730953216553, "learning_rate": 2.5423124283469916e-05, "loss": 0.0856, "step": 18639 }, { "epoch": 5.698563130541119, "grad_norm": 0.2762523591518402, "learning_rate": 2.5422699673049975e-05, "loss": 0.0467, "step": 18640 }, { "epoch": 5.6988688474472635, "grad_norm": 0.2347649782896042, "learning_rate": 2.5422275062630037e-05, "loss": 0.0875, "step": 18641 }, { "epoch": 5.699174564353409, "grad_norm": 0.3562887907028198, "learning_rate": 2.5421850452210095e-05, "loss": 0.1273, "step": 18642 }, { "epoch": 5.699480281259554, "grad_norm": 0.4811681807041168, "learning_rate": 2.5421425841790158e-05, "loss": 0.0863, "step": 18643 }, { "epoch": 5.699785998165699, "grad_norm": 0.30141764879226685, "learning_rate": 2.5421001231370216e-05, "loss": 0.1055, "step": 18644 }, { "epoch": 5.700091715071843, "grad_norm": 0.7200740575790405, "learning_rate": 2.5420576620950278e-05, "loss": 0.1413, "step": 18645 }, { "epoch": 5.700397431977988, "grad_norm": 1.158706784248352, "learning_rate": 2.5420152010530337e-05, "loss": 0.1294, "step": 18646 }, { "epoch": 5.700703148884133, "grad_norm": 0.4484245777130127, "learning_rate": 2.54197274001104e-05, "loss": 0.1388, "step": 18647 }, { "epoch": 5.701008865790278, "grad_norm": 0.6327133774757385, "learning_rate": 2.541930278969046e-05, "loss": 0.1882, "step": 18648 }, { "epoch": 5.701314582696423, "grad_norm": 0.6131977438926697, "learning_rate": 2.5418878179270523e-05, "loss": 0.1512, "step": 18649 }, { "epoch": 5.701620299602568, "grad_norm": 1.9011907577514648, "learning_rate": 2.5418453568850582e-05, "loss": 0.1865, "step": 18650 }, { "epoch": 5.701926016508713, "grad_norm": 0.5041196346282959, "learning_rate": 2.541802895843064e-05, "loss": 0.166, "step": 18651 }, { "epoch": 5.702231733414858, "grad_norm": 0.8564692139625549, "learning_rate": 2.5417604348010703e-05, "loss": 0.2001, "step": 18652 }, { "epoch": 5.702537450321003, "grad_norm": 1.1844090223312378, "learning_rate": 2.541717973759076e-05, "loss": 0.1883, "step": 18653 }, { "epoch": 5.702843167227147, "grad_norm": 1.1220088005065918, "learning_rate": 2.5416755127170824e-05, "loss": 0.2004, "step": 18654 }, { "epoch": 5.7031488841332925, "grad_norm": 1.8786641359329224, "learning_rate": 2.5416330516750882e-05, "loss": 0.3024, "step": 18655 }, { "epoch": 5.703454601039438, "grad_norm": 0.25745877623558044, "learning_rate": 2.5415905906330944e-05, "loss": 0.1198, "step": 18656 }, { "epoch": 5.703760317945583, "grad_norm": 0.24358202517032623, "learning_rate": 2.5415481295911003e-05, "loss": 0.0645, "step": 18657 }, { "epoch": 5.704066034851727, "grad_norm": 0.24635787308216095, "learning_rate": 2.5415056685491065e-05, "loss": 0.0835, "step": 18658 }, { "epoch": 5.704371751757872, "grad_norm": 0.21362845599651337, "learning_rate": 2.5414632075071124e-05, "loss": 0.0629, "step": 18659 }, { "epoch": 5.704677468664017, "grad_norm": 0.4285033345222473, "learning_rate": 2.5414207464651186e-05, "loss": 0.0362, "step": 18660 }, { "epoch": 5.704983185570162, "grad_norm": 0.6974202990531921, "learning_rate": 2.5413782854231245e-05, "loss": 0.062, "step": 18661 }, { "epoch": 5.7052889024763065, "grad_norm": 0.16777868568897247, "learning_rate": 2.5413358243811303e-05, "loss": 0.0496, "step": 18662 }, { "epoch": 5.705594619382452, "grad_norm": 1.0884315967559814, "learning_rate": 2.5412933633391365e-05, "loss": 0.0762, "step": 18663 }, { "epoch": 5.705900336288597, "grad_norm": 0.2640117108821869, "learning_rate": 2.5412509022971424e-05, "loss": 0.0796, "step": 18664 }, { "epoch": 5.706206053194742, "grad_norm": 0.2948550879955292, "learning_rate": 2.5412084412551486e-05, "loss": 0.0711, "step": 18665 }, { "epoch": 5.706511770100887, "grad_norm": 0.9654630422592163, "learning_rate": 2.5411659802131545e-05, "loss": 0.0746, "step": 18666 }, { "epoch": 5.706817487007031, "grad_norm": 0.2683297097682953, "learning_rate": 2.5411235191711607e-05, "loss": 0.074, "step": 18667 }, { "epoch": 5.707123203913176, "grad_norm": 0.38121965527534485, "learning_rate": 2.5410810581291666e-05, "loss": 0.1009, "step": 18668 }, { "epoch": 5.707428920819321, "grad_norm": 0.40779751539230347, "learning_rate": 2.5410385970871728e-05, "loss": 0.0929, "step": 18669 }, { "epoch": 5.7077346377254665, "grad_norm": 0.46128049492836, "learning_rate": 2.5409961360451786e-05, "loss": 0.11, "step": 18670 }, { "epoch": 5.708040354631611, "grad_norm": 0.603236198425293, "learning_rate": 2.540953675003185e-05, "loss": 0.1302, "step": 18671 }, { "epoch": 5.708346071537756, "grad_norm": 0.587801992893219, "learning_rate": 2.5409112139611907e-05, "loss": 0.1665, "step": 18672 }, { "epoch": 5.708651788443901, "grad_norm": 0.4973999857902527, "learning_rate": 2.540868752919197e-05, "loss": 0.1584, "step": 18673 }, { "epoch": 5.708957505350046, "grad_norm": 1.8626434803009033, "learning_rate": 2.5408262918772028e-05, "loss": 0.1412, "step": 18674 }, { "epoch": 5.70926322225619, "grad_norm": 0.6063597798347473, "learning_rate": 2.5407838308352087e-05, "loss": 0.1659, "step": 18675 }, { "epoch": 5.709568939162335, "grad_norm": 1.5315186977386475, "learning_rate": 2.540741369793215e-05, "loss": 0.1669, "step": 18676 }, { "epoch": 5.7098746560684805, "grad_norm": 0.8186375498771667, "learning_rate": 2.5406989087512208e-05, "loss": 0.1693, "step": 18677 }, { "epoch": 5.710180372974626, "grad_norm": 0.5643807053565979, "learning_rate": 2.540656447709227e-05, "loss": 0.1701, "step": 18678 }, { "epoch": 5.710486089880771, "grad_norm": 1.621171236038208, "learning_rate": 2.5406139866672328e-05, "loss": 0.2131, "step": 18679 }, { "epoch": 5.710791806786915, "grad_norm": 1.2611528635025024, "learning_rate": 2.540571525625239e-05, "loss": 0.205, "step": 18680 }, { "epoch": 5.71109752369306, "grad_norm": 0.5052173137664795, "learning_rate": 2.540529064583245e-05, "loss": 0.1539, "step": 18681 }, { "epoch": 5.711403240599205, "grad_norm": 0.3167942464351654, "learning_rate": 2.540486603541251e-05, "loss": 0.0976, "step": 18682 }, { "epoch": 5.71170895750535, "grad_norm": 0.22291067242622375, "learning_rate": 2.540444142499257e-05, "loss": 0.0922, "step": 18683 }, { "epoch": 5.712014674411495, "grad_norm": 0.30268409848213196, "learning_rate": 2.5404016814572632e-05, "loss": 0.073, "step": 18684 }, { "epoch": 5.71232039131764, "grad_norm": 0.1562056541442871, "learning_rate": 2.540359220415269e-05, "loss": 0.0631, "step": 18685 }, { "epoch": 5.712626108223785, "grad_norm": 0.20884723961353302, "learning_rate": 2.5403167593732753e-05, "loss": 0.0474, "step": 18686 }, { "epoch": 5.71293182512993, "grad_norm": 0.2519817352294922, "learning_rate": 2.540274298331281e-05, "loss": 0.0496, "step": 18687 }, { "epoch": 5.713237542036074, "grad_norm": 0.37333133816719055, "learning_rate": 2.540231837289287e-05, "loss": 0.0739, "step": 18688 }, { "epoch": 5.713543258942219, "grad_norm": 0.22719870507717133, "learning_rate": 2.5401893762472932e-05, "loss": 0.0537, "step": 18689 }, { "epoch": 5.713848975848364, "grad_norm": 0.5826574563980103, "learning_rate": 2.540146915205299e-05, "loss": 0.0645, "step": 18690 }, { "epoch": 5.7141546927545095, "grad_norm": 0.27574047446250916, "learning_rate": 2.5401044541633053e-05, "loss": 0.0537, "step": 18691 }, { "epoch": 5.714460409660655, "grad_norm": 0.2491411417722702, "learning_rate": 2.5400619931213112e-05, "loss": 0.0737, "step": 18692 }, { "epoch": 5.714766126566799, "grad_norm": 0.42267823219299316, "learning_rate": 2.5400195320793174e-05, "loss": 0.1039, "step": 18693 }, { "epoch": 5.715071843472944, "grad_norm": 0.5404069423675537, "learning_rate": 2.5399770710373233e-05, "loss": 0.0755, "step": 18694 }, { "epoch": 5.715377560379089, "grad_norm": 0.31652402877807617, "learning_rate": 2.5399346099953295e-05, "loss": 0.111, "step": 18695 }, { "epoch": 5.715683277285234, "grad_norm": 1.0189889669418335, "learning_rate": 2.5398921489533353e-05, "loss": 0.1278, "step": 18696 }, { "epoch": 5.715988994191378, "grad_norm": 1.8136368989944458, "learning_rate": 2.5398496879113415e-05, "loss": 0.1424, "step": 18697 }, { "epoch": 5.7162947110975235, "grad_norm": 0.5367833971977234, "learning_rate": 2.5398072268693474e-05, "loss": 0.1726, "step": 18698 }, { "epoch": 5.716600428003669, "grad_norm": 0.9572295546531677, "learning_rate": 2.5397647658273536e-05, "loss": 0.1775, "step": 18699 }, { "epoch": 5.716906144909814, "grad_norm": 0.6758299469947815, "learning_rate": 2.5397223047853595e-05, "loss": 0.1692, "step": 18700 }, { "epoch": 5.717211861815958, "grad_norm": 0.7576196789741516, "learning_rate": 2.5396798437433654e-05, "loss": 0.1867, "step": 18701 }, { "epoch": 5.717517578722103, "grad_norm": 1.0111048221588135, "learning_rate": 2.5396373827013716e-05, "loss": 0.1548, "step": 18702 }, { "epoch": 5.717823295628248, "grad_norm": 0.6576048135757446, "learning_rate": 2.5395949216593774e-05, "loss": 0.1653, "step": 18703 }, { "epoch": 5.718129012534393, "grad_norm": 0.43248534202575684, "learning_rate": 2.5395524606173836e-05, "loss": 0.1732, "step": 18704 }, { "epoch": 5.718434729440538, "grad_norm": 1.8234351873397827, "learning_rate": 2.5395099995753895e-05, "loss": 0.2453, "step": 18705 }, { "epoch": 5.718740446346683, "grad_norm": 0.5962262749671936, "learning_rate": 2.5394675385333957e-05, "loss": 0.1242, "step": 18706 }, { "epoch": 5.719046163252828, "grad_norm": 0.4512355327606201, "learning_rate": 2.5394250774914016e-05, "loss": 0.0737, "step": 18707 }, { "epoch": 5.719351880158973, "grad_norm": 0.28675413131713867, "learning_rate": 2.5393826164494078e-05, "loss": 0.0555, "step": 18708 }, { "epoch": 5.719657597065118, "grad_norm": 0.4188797175884247, "learning_rate": 2.5393401554074137e-05, "loss": 0.0578, "step": 18709 }, { "epoch": 5.719963313971262, "grad_norm": 0.5232982039451599, "learning_rate": 2.53929769436542e-05, "loss": 0.0769, "step": 18710 }, { "epoch": 5.720269030877407, "grad_norm": 0.2623283267021179, "learning_rate": 2.5392552333234258e-05, "loss": 0.0558, "step": 18711 }, { "epoch": 5.7205747477835525, "grad_norm": 0.23807299137115479, "learning_rate": 2.539212772281432e-05, "loss": 0.0757, "step": 18712 }, { "epoch": 5.720880464689698, "grad_norm": 0.6867741942405701, "learning_rate": 2.539170311239438e-05, "loss": 0.0516, "step": 18713 }, { "epoch": 5.721186181595842, "grad_norm": 0.2811562418937683, "learning_rate": 2.5391278501974437e-05, "loss": 0.0822, "step": 18714 }, { "epoch": 5.721491898501987, "grad_norm": 0.604166567325592, "learning_rate": 2.53908538915545e-05, "loss": 0.0492, "step": 18715 }, { "epoch": 5.721797615408132, "grad_norm": 0.41364797949790955, "learning_rate": 2.5390429281134558e-05, "loss": 0.0744, "step": 18716 }, { "epoch": 5.722103332314277, "grad_norm": 0.30926749110221863, "learning_rate": 2.539000467071462e-05, "loss": 0.0623, "step": 18717 }, { "epoch": 5.722409049220422, "grad_norm": 0.947966456413269, "learning_rate": 2.538958006029468e-05, "loss": 0.0774, "step": 18718 }, { "epoch": 5.7227147661265665, "grad_norm": 0.4564127027988434, "learning_rate": 2.538915544987474e-05, "loss": 0.1464, "step": 18719 }, { "epoch": 5.723020483032712, "grad_norm": 0.32165905833244324, "learning_rate": 2.53887308394548e-05, "loss": 0.0809, "step": 18720 }, { "epoch": 5.723326199938857, "grad_norm": 1.3963598012924194, "learning_rate": 2.538830622903486e-05, "loss": 0.1727, "step": 18721 }, { "epoch": 5.723631916845002, "grad_norm": 1.1916682720184326, "learning_rate": 2.538788161861492e-05, "loss": 0.178, "step": 18722 }, { "epoch": 5.723937633751146, "grad_norm": 0.5648320913314819, "learning_rate": 2.5387457008194982e-05, "loss": 0.1449, "step": 18723 }, { "epoch": 5.724243350657291, "grad_norm": 2.3894715309143066, "learning_rate": 2.538703239777504e-05, "loss": 0.1614, "step": 18724 }, { "epoch": 5.724549067563436, "grad_norm": 0.750705361366272, "learning_rate": 2.5386607787355103e-05, "loss": 0.1442, "step": 18725 }, { "epoch": 5.724854784469581, "grad_norm": 0.7667519450187683, "learning_rate": 2.5386183176935162e-05, "loss": 0.1641, "step": 18726 }, { "epoch": 5.725160501375726, "grad_norm": 1.0993956327438354, "learning_rate": 2.538575856651522e-05, "loss": 0.1763, "step": 18727 }, { "epoch": 5.725466218281871, "grad_norm": 0.7744357585906982, "learning_rate": 2.5385333956095283e-05, "loss": 0.1763, "step": 18728 }, { "epoch": 5.725771935188016, "grad_norm": 1.6051397323608398, "learning_rate": 2.538490934567534e-05, "loss": 0.1675, "step": 18729 }, { "epoch": 5.726077652094161, "grad_norm": 1.3821619749069214, "learning_rate": 2.5384484735255403e-05, "loss": 0.2497, "step": 18730 }, { "epoch": 5.726383369000306, "grad_norm": 0.42585936188697815, "learning_rate": 2.5384060124835462e-05, "loss": 0.1557, "step": 18731 }, { "epoch": 5.72668908590645, "grad_norm": 0.18160048127174377, "learning_rate": 2.5383635514415524e-05, "loss": 0.069, "step": 18732 }, { "epoch": 5.726994802812595, "grad_norm": 0.3113768398761749, "learning_rate": 2.5383210903995583e-05, "loss": 0.0763, "step": 18733 }, { "epoch": 5.7273005197187405, "grad_norm": 0.3238016366958618, "learning_rate": 2.5382786293575645e-05, "loss": 0.0502, "step": 18734 }, { "epoch": 5.727606236624886, "grad_norm": 0.35528576374053955, "learning_rate": 2.5382361683155704e-05, "loss": 0.0513, "step": 18735 }, { "epoch": 5.72791195353103, "grad_norm": 0.21068421006202698, "learning_rate": 2.5381937072735766e-05, "loss": 0.0619, "step": 18736 }, { "epoch": 5.728217670437175, "grad_norm": 0.2149127870798111, "learning_rate": 2.5381512462315824e-05, "loss": 0.064, "step": 18737 }, { "epoch": 5.72852338734332, "grad_norm": 0.46439245343208313, "learning_rate": 2.5381087851895886e-05, "loss": 0.0598, "step": 18738 }, { "epoch": 5.728829104249465, "grad_norm": 0.19368156790733337, "learning_rate": 2.5380663241475945e-05, "loss": 0.0639, "step": 18739 }, { "epoch": 5.7291348211556095, "grad_norm": 0.2148621380329132, "learning_rate": 2.5380238631056004e-05, "loss": 0.0497, "step": 18740 }, { "epoch": 5.729440538061755, "grad_norm": 0.6742950677871704, "learning_rate": 2.5379814020636066e-05, "loss": 0.0869, "step": 18741 }, { "epoch": 5.7297462549679, "grad_norm": 1.2946544885635376, "learning_rate": 2.5379389410216125e-05, "loss": 0.0826, "step": 18742 }, { "epoch": 5.730051971874045, "grad_norm": 0.452360600233078, "learning_rate": 2.5378964799796187e-05, "loss": 0.0974, "step": 18743 }, { "epoch": 5.73035768878019, "grad_norm": 0.5160893797874451, "learning_rate": 2.5378540189376245e-05, "loss": 0.1487, "step": 18744 }, { "epoch": 5.730663405686334, "grad_norm": 0.5799211263656616, "learning_rate": 2.5378115578956308e-05, "loss": 0.1186, "step": 18745 }, { "epoch": 5.730969122592479, "grad_norm": 0.67337566614151, "learning_rate": 2.5377690968536366e-05, "loss": 0.1339, "step": 18746 }, { "epoch": 5.731274839498624, "grad_norm": 0.6090250015258789, "learning_rate": 2.537726635811643e-05, "loss": 0.164, "step": 18747 }, { "epoch": 5.7315805564047695, "grad_norm": 0.674278974533081, "learning_rate": 2.5376841747696487e-05, "loss": 0.1961, "step": 18748 }, { "epoch": 5.731886273310914, "grad_norm": 1.0088483095169067, "learning_rate": 2.537641713727655e-05, "loss": 0.1449, "step": 18749 }, { "epoch": 5.732191990217059, "grad_norm": 1.0332586765289307, "learning_rate": 2.537599252685661e-05, "loss": 0.1737, "step": 18750 }, { "epoch": 5.732497707123204, "grad_norm": 0.9113104343414307, "learning_rate": 2.5375567916436673e-05, "loss": 0.1816, "step": 18751 }, { "epoch": 5.732803424029349, "grad_norm": 4.21523380279541, "learning_rate": 2.5375143306016732e-05, "loss": 0.2065, "step": 18752 }, { "epoch": 5.733109140935493, "grad_norm": 0.6503276824951172, "learning_rate": 2.537471869559679e-05, "loss": 0.1604, "step": 18753 }, { "epoch": 5.733414857841638, "grad_norm": 1.099997639656067, "learning_rate": 2.5374294085176853e-05, "loss": 0.2164, "step": 18754 }, { "epoch": 5.7337205747477835, "grad_norm": 3.9489545822143555, "learning_rate": 2.537386947475691e-05, "loss": 0.222, "step": 18755 }, { "epoch": 5.734026291653929, "grad_norm": 0.7844131588935852, "learning_rate": 2.5373444864336974e-05, "loss": 0.14, "step": 18756 }, { "epoch": 5.734332008560074, "grad_norm": 0.37239331007003784, "learning_rate": 2.5373020253917032e-05, "loss": 0.0669, "step": 18757 }, { "epoch": 5.734637725466218, "grad_norm": 0.33701881766319275, "learning_rate": 2.5372595643497094e-05, "loss": 0.0733, "step": 18758 }, { "epoch": 5.734943442372363, "grad_norm": 0.5243931412696838, "learning_rate": 2.5372171033077153e-05, "loss": 0.0638, "step": 18759 }, { "epoch": 5.735249159278508, "grad_norm": 0.6349968314170837, "learning_rate": 2.5371746422657215e-05, "loss": 0.0498, "step": 18760 }, { "epoch": 5.735554876184653, "grad_norm": 0.6161239147186279, "learning_rate": 2.5371321812237274e-05, "loss": 0.051, "step": 18761 }, { "epoch": 5.7358605930907975, "grad_norm": 0.39616847038269043, "learning_rate": 2.5370897201817336e-05, "loss": 0.0917, "step": 18762 }, { "epoch": 5.736166309996943, "grad_norm": 0.3635847866535187, "learning_rate": 2.5370472591397395e-05, "loss": 0.0551, "step": 18763 }, { "epoch": 5.736472026903088, "grad_norm": 0.29994913935661316, "learning_rate": 2.5370047980977457e-05, "loss": 0.0894, "step": 18764 }, { "epoch": 5.736777743809233, "grad_norm": 0.28734880685806274, "learning_rate": 2.5369623370557515e-05, "loss": 0.0768, "step": 18765 }, { "epoch": 5.737083460715377, "grad_norm": 1.5063214302062988, "learning_rate": 2.5369198760137574e-05, "loss": 0.1448, "step": 18766 }, { "epoch": 5.737389177621522, "grad_norm": 0.3879759907722473, "learning_rate": 2.5368774149717636e-05, "loss": 0.07, "step": 18767 }, { "epoch": 5.737694894527667, "grad_norm": 0.4678099751472473, "learning_rate": 2.5368349539297695e-05, "loss": 0.0931, "step": 18768 }, { "epoch": 5.7380006114338125, "grad_norm": 0.3531857132911682, "learning_rate": 2.5367924928877757e-05, "loss": 0.1501, "step": 18769 }, { "epoch": 5.738306328339958, "grad_norm": 1.4715850353240967, "learning_rate": 2.5367500318457816e-05, "loss": 0.0979, "step": 18770 }, { "epoch": 5.738612045246102, "grad_norm": 0.44359681010246277, "learning_rate": 2.5367075708037878e-05, "loss": 0.1362, "step": 18771 }, { "epoch": 5.738917762152247, "grad_norm": 0.5759987235069275, "learning_rate": 2.5366651097617936e-05, "loss": 0.1369, "step": 18772 }, { "epoch": 5.739223479058392, "grad_norm": 0.6020252704620361, "learning_rate": 2.5366226487198e-05, "loss": 0.1847, "step": 18773 }, { "epoch": 5.739529195964537, "grad_norm": 0.7975849509239197, "learning_rate": 2.5365801876778057e-05, "loss": 0.163, "step": 18774 }, { "epoch": 5.739834912870681, "grad_norm": 0.8849524855613708, "learning_rate": 2.536537726635812e-05, "loss": 0.1832, "step": 18775 }, { "epoch": 5.7401406297768265, "grad_norm": 2.2317609786987305, "learning_rate": 2.5364952655938178e-05, "loss": 0.1721, "step": 18776 }, { "epoch": 5.740446346682972, "grad_norm": 1.1883492469787598, "learning_rate": 2.5364528045518237e-05, "loss": 0.1722, "step": 18777 }, { "epoch": 5.740752063589117, "grad_norm": 1.713629126548767, "learning_rate": 2.53641034350983e-05, "loss": 0.183, "step": 18778 }, { "epoch": 5.741057780495261, "grad_norm": 0.7916172742843628, "learning_rate": 2.5363678824678358e-05, "loss": 0.1723, "step": 18779 }, { "epoch": 5.741363497401406, "grad_norm": 0.8513553142547607, "learning_rate": 2.536325421425842e-05, "loss": 0.2762, "step": 18780 }, { "epoch": 5.741669214307551, "grad_norm": 0.4374150037765503, "learning_rate": 2.536282960383848e-05, "loss": 0.116, "step": 18781 }, { "epoch": 5.741974931213696, "grad_norm": 0.3771440386772156, "learning_rate": 2.536240499341854e-05, "loss": 0.0972, "step": 18782 }, { "epoch": 5.742280648119841, "grad_norm": 0.2502720057964325, "learning_rate": 2.53619803829986e-05, "loss": 0.0586, "step": 18783 }, { "epoch": 5.742586365025986, "grad_norm": 0.4300520718097687, "learning_rate": 2.536155577257866e-05, "loss": 0.0545, "step": 18784 }, { "epoch": 5.742892081932131, "grad_norm": 0.20979845523834229, "learning_rate": 2.536113116215872e-05, "loss": 0.0579, "step": 18785 }, { "epoch": 5.743197798838276, "grad_norm": 0.24617557227611542, "learning_rate": 2.5360706551738782e-05, "loss": 0.0546, "step": 18786 }, { "epoch": 5.743503515744421, "grad_norm": 0.23765301704406738, "learning_rate": 2.536028194131884e-05, "loss": 0.0521, "step": 18787 }, { "epoch": 5.743809232650565, "grad_norm": 0.7591007947921753, "learning_rate": 2.5359857330898903e-05, "loss": 0.0661, "step": 18788 }, { "epoch": 5.74411494955671, "grad_norm": 0.45873722434043884, "learning_rate": 2.535943272047896e-05, "loss": 0.0612, "step": 18789 }, { "epoch": 5.744420666462855, "grad_norm": 0.656512439250946, "learning_rate": 2.535900811005902e-05, "loss": 0.0555, "step": 18790 }, { "epoch": 5.7447263833690005, "grad_norm": 0.36748069524765015, "learning_rate": 2.5358583499639082e-05, "loss": 0.1042, "step": 18791 }, { "epoch": 5.745032100275145, "grad_norm": 0.25042635202407837, "learning_rate": 2.535815888921914e-05, "loss": 0.0673, "step": 18792 }, { "epoch": 5.74533781718129, "grad_norm": 0.3976706564426422, "learning_rate": 2.5357734278799203e-05, "loss": 0.1134, "step": 18793 }, { "epoch": 5.745643534087435, "grad_norm": 1.3359887599945068, "learning_rate": 2.5357309668379262e-05, "loss": 0.0953, "step": 18794 }, { "epoch": 5.74594925099358, "grad_norm": 0.5019357204437256, "learning_rate": 2.5356885057959324e-05, "loss": 0.1144, "step": 18795 }, { "epoch": 5.746254967899725, "grad_norm": 0.5447941422462463, "learning_rate": 2.5356460447539383e-05, "loss": 0.1572, "step": 18796 }, { "epoch": 5.7465606848058695, "grad_norm": 1.3143339157104492, "learning_rate": 2.5356035837119445e-05, "loss": 0.1461, "step": 18797 }, { "epoch": 5.746866401712015, "grad_norm": 0.28046002984046936, "learning_rate": 2.5355611226699503e-05, "loss": 0.128, "step": 18798 }, { "epoch": 5.74717211861816, "grad_norm": 0.45387011766433716, "learning_rate": 2.5355186616279565e-05, "loss": 0.1887, "step": 18799 }, { "epoch": 5.747477835524305, "grad_norm": 0.7848306894302368, "learning_rate": 2.5354762005859624e-05, "loss": 0.1848, "step": 18800 }, { "epoch": 5.747783552430449, "grad_norm": 0.7976605296134949, "learning_rate": 2.5354337395439686e-05, "loss": 0.1629, "step": 18801 }, { "epoch": 5.748089269336594, "grad_norm": 1.1532093286514282, "learning_rate": 2.5353912785019745e-05, "loss": 0.192, "step": 18802 }, { "epoch": 5.748394986242739, "grad_norm": 0.9307479858398438, "learning_rate": 2.5353488174599804e-05, "loss": 0.1888, "step": 18803 }, { "epoch": 5.748700703148884, "grad_norm": 0.8115317225456238, "learning_rate": 2.5353063564179866e-05, "loss": 0.1943, "step": 18804 }, { "epoch": 5.749006420055029, "grad_norm": 0.7686141133308411, "learning_rate": 2.5352638953759924e-05, "loss": 0.1928, "step": 18805 }, { "epoch": 5.749312136961174, "grad_norm": 0.32193225622177124, "learning_rate": 2.5352214343339986e-05, "loss": 0.1181, "step": 18806 }, { "epoch": 5.749617853867319, "grad_norm": 0.326969712972641, "learning_rate": 2.5351789732920045e-05, "loss": 0.0669, "step": 18807 }, { "epoch": 5.749923570773464, "grad_norm": 0.31231093406677246, "learning_rate": 2.5351365122500107e-05, "loss": 0.1215, "step": 18808 }, { "epoch": 5.750229287679609, "grad_norm": 0.23371127247810364, "learning_rate": 2.5350940512080166e-05, "loss": 0.0756, "step": 18809 }, { "epoch": 5.750535004585753, "grad_norm": 0.5120605230331421, "learning_rate": 2.5350515901660228e-05, "loss": 0.0533, "step": 18810 }, { "epoch": 5.750840721491898, "grad_norm": 0.20598354935646057, "learning_rate": 2.5350091291240287e-05, "loss": 0.0584, "step": 18811 }, { "epoch": 5.7511464383980435, "grad_norm": 0.2277086079120636, "learning_rate": 2.534966668082035e-05, "loss": 0.0712, "step": 18812 }, { "epoch": 5.751452155304189, "grad_norm": 0.15748584270477295, "learning_rate": 2.5349242070400408e-05, "loss": 0.0434, "step": 18813 }, { "epoch": 5.751757872210333, "grad_norm": 0.17930971086025238, "learning_rate": 2.534881745998047e-05, "loss": 0.0679, "step": 18814 }, { "epoch": 5.752063589116478, "grad_norm": 0.4311087131500244, "learning_rate": 2.534839284956053e-05, "loss": 0.0635, "step": 18815 }, { "epoch": 5.752369306022623, "grad_norm": 0.24363479018211365, "learning_rate": 2.5347968239140587e-05, "loss": 0.0802, "step": 18816 }, { "epoch": 5.752675022928768, "grad_norm": 0.44765928387641907, "learning_rate": 2.534754362872065e-05, "loss": 0.0959, "step": 18817 }, { "epoch": 5.752980739834912, "grad_norm": 0.2419704794883728, "learning_rate": 2.5347119018300708e-05, "loss": 0.0917, "step": 18818 }, { "epoch": 5.7532864567410575, "grad_norm": 0.3888092339038849, "learning_rate": 2.534669440788077e-05, "loss": 0.1307, "step": 18819 }, { "epoch": 5.753592173647203, "grad_norm": 0.4806535243988037, "learning_rate": 2.534626979746083e-05, "loss": 0.1343, "step": 18820 }, { "epoch": 5.753897890553348, "grad_norm": 1.5253900289535522, "learning_rate": 2.534584518704089e-05, "loss": 0.1265, "step": 18821 }, { "epoch": 5.754203607459493, "grad_norm": 0.9018454551696777, "learning_rate": 2.534542057662095e-05, "loss": 0.1438, "step": 18822 }, { "epoch": 5.754509324365637, "grad_norm": 0.4314505457878113, "learning_rate": 2.534499596620101e-05, "loss": 0.1414, "step": 18823 }, { "epoch": 5.754815041271782, "grad_norm": 0.34115785360336304, "learning_rate": 2.534457135578107e-05, "loss": 0.1191, "step": 18824 }, { "epoch": 5.755120758177927, "grad_norm": 0.6110333204269409, "learning_rate": 2.5344146745361132e-05, "loss": 0.1573, "step": 18825 }, { "epoch": 5.7554264750840725, "grad_norm": 0.7248196601867676, "learning_rate": 2.534372213494119e-05, "loss": 0.2053, "step": 18826 }, { "epoch": 5.755732191990217, "grad_norm": 1.5136675834655762, "learning_rate": 2.5343297524521253e-05, "loss": 0.1844, "step": 18827 }, { "epoch": 5.756037908896362, "grad_norm": 0.9543635845184326, "learning_rate": 2.5342872914101312e-05, "loss": 0.2148, "step": 18828 }, { "epoch": 5.756343625802507, "grad_norm": 0.7693895101547241, "learning_rate": 2.534244830368137e-05, "loss": 0.1841, "step": 18829 }, { "epoch": 5.756649342708652, "grad_norm": 3.167778968811035, "learning_rate": 2.5342023693261433e-05, "loss": 0.2716, "step": 18830 }, { "epoch": 5.756955059614796, "grad_norm": 0.8537558317184448, "learning_rate": 2.534159908284149e-05, "loss": 0.1469, "step": 18831 }, { "epoch": 5.757260776520941, "grad_norm": 0.1839110553264618, "learning_rate": 2.5341174472421553e-05, "loss": 0.0752, "step": 18832 }, { "epoch": 5.7575664934270865, "grad_norm": 0.3252720534801483, "learning_rate": 2.5340749862001612e-05, "loss": 0.0823, "step": 18833 }, { "epoch": 5.757872210333232, "grad_norm": 0.2524643838405609, "learning_rate": 2.5340325251581674e-05, "loss": 0.0427, "step": 18834 }, { "epoch": 5.758177927239377, "grad_norm": 0.21003945171833038, "learning_rate": 2.5339900641161733e-05, "loss": 0.0569, "step": 18835 }, { "epoch": 5.758483644145521, "grad_norm": 0.3303726613521576, "learning_rate": 2.5339476030741795e-05, "loss": 0.0588, "step": 18836 }, { "epoch": 5.758789361051666, "grad_norm": 0.3451198935508728, "learning_rate": 2.5339051420321854e-05, "loss": 0.0739, "step": 18837 }, { "epoch": 5.759095077957811, "grad_norm": 0.1759386956691742, "learning_rate": 2.5338626809901916e-05, "loss": 0.0565, "step": 18838 }, { "epoch": 5.759400794863956, "grad_norm": 0.2898235321044922, "learning_rate": 2.5338202199481974e-05, "loss": 0.0761, "step": 18839 }, { "epoch": 5.7597065117701005, "grad_norm": 0.3507949113845825, "learning_rate": 2.5337777589062037e-05, "loss": 0.0736, "step": 18840 }, { "epoch": 5.760012228676246, "grad_norm": 0.6107078194618225, "learning_rate": 2.5337352978642095e-05, "loss": 0.0898, "step": 18841 }, { "epoch": 5.760317945582391, "grad_norm": 0.4224831163883209, "learning_rate": 2.5336928368222154e-05, "loss": 0.0914, "step": 18842 }, { "epoch": 5.760623662488536, "grad_norm": 0.9016759395599365, "learning_rate": 2.5336503757802216e-05, "loss": 0.0823, "step": 18843 }, { "epoch": 5.76092937939468, "grad_norm": 0.4615282714366913, "learning_rate": 2.5336079147382275e-05, "loss": 0.1179, "step": 18844 }, { "epoch": 5.761235096300825, "grad_norm": 0.31622055172920227, "learning_rate": 2.5335654536962337e-05, "loss": 0.1019, "step": 18845 }, { "epoch": 5.76154081320697, "grad_norm": 0.6754903793334961, "learning_rate": 2.5335229926542395e-05, "loss": 0.1439, "step": 18846 }, { "epoch": 5.761846530113115, "grad_norm": 0.8080312013626099, "learning_rate": 2.5334805316122458e-05, "loss": 0.1365, "step": 18847 }, { "epoch": 5.7621522470192605, "grad_norm": 0.6984068751335144, "learning_rate": 2.5334380705702516e-05, "loss": 0.1641, "step": 18848 }, { "epoch": 5.762457963925405, "grad_norm": 0.48402369022369385, "learning_rate": 2.533395609528258e-05, "loss": 0.1484, "step": 18849 }, { "epoch": 5.76276368083155, "grad_norm": 0.4612075984477997, "learning_rate": 2.5333531484862637e-05, "loss": 0.1448, "step": 18850 }, { "epoch": 5.763069397737695, "grad_norm": 4.746153831481934, "learning_rate": 2.53331068744427e-05, "loss": 0.1512, "step": 18851 }, { "epoch": 5.76337511464384, "grad_norm": 0.5038236975669861, "learning_rate": 2.533268226402276e-05, "loss": 0.1877, "step": 18852 }, { "epoch": 5.763680831549984, "grad_norm": 3.492457151412964, "learning_rate": 2.5332257653602823e-05, "loss": 0.2034, "step": 18853 }, { "epoch": 5.7639865484561295, "grad_norm": 0.7143345475196838, "learning_rate": 2.5331833043182882e-05, "loss": 0.1947, "step": 18854 }, { "epoch": 5.764292265362275, "grad_norm": 0.829811692237854, "learning_rate": 2.533140843276294e-05, "loss": 0.2628, "step": 18855 }, { "epoch": 5.76459798226842, "grad_norm": 0.3414965271949768, "learning_rate": 2.5330983822343003e-05, "loss": 0.1791, "step": 18856 }, { "epoch": 5.764903699174564, "grad_norm": 0.20614656805992126, "learning_rate": 2.533055921192306e-05, "loss": 0.0816, "step": 18857 }, { "epoch": 5.765209416080709, "grad_norm": 0.2329728752374649, "learning_rate": 2.5330134601503124e-05, "loss": 0.0561, "step": 18858 }, { "epoch": 5.765515132986854, "grad_norm": 0.20066408812999725, "learning_rate": 2.5329709991083182e-05, "loss": 0.0531, "step": 18859 }, { "epoch": 5.765820849892999, "grad_norm": 0.20495189726352692, "learning_rate": 2.5329285380663244e-05, "loss": 0.0736, "step": 18860 }, { "epoch": 5.766126566799144, "grad_norm": 0.2586032450199127, "learning_rate": 2.5328860770243303e-05, "loss": 0.0932, "step": 18861 }, { "epoch": 5.766432283705289, "grad_norm": 0.31698352098464966, "learning_rate": 2.5328436159823365e-05, "loss": 0.0644, "step": 18862 }, { "epoch": 5.766738000611434, "grad_norm": 0.4558704197406769, "learning_rate": 2.5328011549403424e-05, "loss": 0.07, "step": 18863 }, { "epoch": 5.767043717517579, "grad_norm": 0.30035656690597534, "learning_rate": 2.5327586938983486e-05, "loss": 0.0581, "step": 18864 }, { "epoch": 5.767349434423724, "grad_norm": 0.5590085983276367, "learning_rate": 2.5327162328563545e-05, "loss": 0.0649, "step": 18865 }, { "epoch": 5.767655151329868, "grad_norm": 0.27112647891044617, "learning_rate": 2.5326737718143607e-05, "loss": 0.1112, "step": 18866 }, { "epoch": 5.767960868236013, "grad_norm": 0.33696043491363525, "learning_rate": 2.5326313107723665e-05, "loss": 0.0706, "step": 18867 }, { "epoch": 5.768266585142158, "grad_norm": 0.29441842436790466, "learning_rate": 2.5325888497303724e-05, "loss": 0.1095, "step": 18868 }, { "epoch": 5.7685723020483035, "grad_norm": 0.41394564509391785, "learning_rate": 2.5325463886883786e-05, "loss": 0.1204, "step": 18869 }, { "epoch": 5.768878018954448, "grad_norm": 0.43144282698631287, "learning_rate": 2.5325039276463845e-05, "loss": 0.1204, "step": 18870 }, { "epoch": 5.769183735860593, "grad_norm": 0.39037394523620605, "learning_rate": 2.5324614666043907e-05, "loss": 0.1176, "step": 18871 }, { "epoch": 5.769489452766738, "grad_norm": 0.47552451491355896, "learning_rate": 2.5324190055623966e-05, "loss": 0.1308, "step": 18872 }, { "epoch": 5.769795169672883, "grad_norm": 0.9562070965766907, "learning_rate": 2.5323765445204028e-05, "loss": 0.1599, "step": 18873 }, { "epoch": 5.770100886579028, "grad_norm": 0.8005731105804443, "learning_rate": 2.5323340834784087e-05, "loss": 0.1291, "step": 18874 }, { "epoch": 5.770406603485172, "grad_norm": 0.8874202370643616, "learning_rate": 2.532291622436415e-05, "loss": 0.1618, "step": 18875 }, { "epoch": 5.7707123203913175, "grad_norm": 0.5618204474449158, "learning_rate": 2.5322491613944207e-05, "loss": 0.1803, "step": 18876 }, { "epoch": 5.771018037297463, "grad_norm": 1.2600722312927246, "learning_rate": 2.532206700352427e-05, "loss": 0.2283, "step": 18877 }, { "epoch": 5.771323754203608, "grad_norm": 0.6381109952926636, "learning_rate": 2.5321642393104328e-05, "loss": 0.1929, "step": 18878 }, { "epoch": 5.771629471109752, "grad_norm": 0.862105131149292, "learning_rate": 2.532121778268439e-05, "loss": 0.1898, "step": 18879 }, { "epoch": 5.771935188015897, "grad_norm": 0.8265422582626343, "learning_rate": 2.532079317226445e-05, "loss": 0.2147, "step": 18880 }, { "epoch": 5.772240904922042, "grad_norm": 0.3277219831943512, "learning_rate": 2.5320368561844508e-05, "loss": 0.1355, "step": 18881 }, { "epoch": 5.772546621828187, "grad_norm": 0.25864261388778687, "learning_rate": 2.531994395142457e-05, "loss": 0.073, "step": 18882 }, { "epoch": 5.772852338734332, "grad_norm": 0.2800222933292389, "learning_rate": 2.531951934100463e-05, "loss": 0.0731, "step": 18883 }, { "epoch": 5.773158055640477, "grad_norm": 0.1677408218383789, "learning_rate": 2.531909473058469e-05, "loss": 0.0515, "step": 18884 }, { "epoch": 5.773463772546622, "grad_norm": 0.2656038701534271, "learning_rate": 2.531867012016475e-05, "loss": 0.0651, "step": 18885 }, { "epoch": 5.773769489452767, "grad_norm": 0.3900659680366516, "learning_rate": 2.531824550974481e-05, "loss": 0.0446, "step": 18886 }, { "epoch": 5.774075206358912, "grad_norm": 0.4004378616809845, "learning_rate": 2.531782089932487e-05, "loss": 0.0609, "step": 18887 }, { "epoch": 5.774380923265056, "grad_norm": 0.1667684018611908, "learning_rate": 2.5317396288904932e-05, "loss": 0.0418, "step": 18888 }, { "epoch": 5.774686640171201, "grad_norm": 0.322361558675766, "learning_rate": 2.531697167848499e-05, "loss": 0.0622, "step": 18889 }, { "epoch": 5.7749923570773465, "grad_norm": 0.5598129630088806, "learning_rate": 2.5316547068065053e-05, "loss": 0.076, "step": 18890 }, { "epoch": 5.775298073983492, "grad_norm": 1.0145994424819946, "learning_rate": 2.531612245764511e-05, "loss": 0.0791, "step": 18891 }, { "epoch": 5.775603790889636, "grad_norm": 0.6317553520202637, "learning_rate": 2.531569784722517e-05, "loss": 0.0907, "step": 18892 }, { "epoch": 5.775909507795781, "grad_norm": 0.41636255383491516, "learning_rate": 2.5315273236805232e-05, "loss": 0.0858, "step": 18893 }, { "epoch": 5.776215224701926, "grad_norm": 0.6325554251670837, "learning_rate": 2.531484862638529e-05, "loss": 0.1001, "step": 18894 }, { "epoch": 5.776520941608071, "grad_norm": 0.4705307185649872, "learning_rate": 2.5314424015965353e-05, "loss": 0.1268, "step": 18895 }, { "epoch": 5.776826658514215, "grad_norm": 0.9056005477905273, "learning_rate": 2.5313999405545412e-05, "loss": 0.1407, "step": 18896 }, { "epoch": 5.7771323754203605, "grad_norm": 0.8941135406494141, "learning_rate": 2.5313574795125474e-05, "loss": 0.156, "step": 18897 }, { "epoch": 5.777438092326506, "grad_norm": 0.38874688744544983, "learning_rate": 2.5313150184705533e-05, "loss": 0.1744, "step": 18898 }, { "epoch": 5.777743809232651, "grad_norm": 1.7190274000167847, "learning_rate": 2.5312725574285595e-05, "loss": 0.1432, "step": 18899 }, { "epoch": 5.778049526138796, "grad_norm": 0.5902360677719116, "learning_rate": 2.5312300963865653e-05, "loss": 0.1395, "step": 18900 }, { "epoch": 5.77835524304494, "grad_norm": 0.7914438247680664, "learning_rate": 2.5311876353445715e-05, "loss": 0.1733, "step": 18901 }, { "epoch": 5.778660959951085, "grad_norm": 0.8777830004692078, "learning_rate": 2.5311451743025774e-05, "loss": 0.1718, "step": 18902 }, { "epoch": 5.77896667685723, "grad_norm": 1.1420667171478271, "learning_rate": 2.5311027132605836e-05, "loss": 0.1881, "step": 18903 }, { "epoch": 5.779272393763375, "grad_norm": 1.0699865818023682, "learning_rate": 2.5310602522185895e-05, "loss": 0.1894, "step": 18904 }, { "epoch": 5.77957811066952, "grad_norm": 1.6174259185791016, "learning_rate": 2.5310177911765954e-05, "loss": 0.2152, "step": 18905 }, { "epoch": 5.779883827575665, "grad_norm": 0.670520007610321, "learning_rate": 2.5309753301346016e-05, "loss": 0.1458, "step": 18906 }, { "epoch": 5.78018954448181, "grad_norm": 0.23666508495807648, "learning_rate": 2.5309328690926074e-05, "loss": 0.0829, "step": 18907 }, { "epoch": 5.780495261387955, "grad_norm": 0.22801785171031952, "learning_rate": 2.5308904080506137e-05, "loss": 0.057, "step": 18908 }, { "epoch": 5.780800978294099, "grad_norm": 0.2504537105560303, "learning_rate": 2.5308479470086195e-05, "loss": 0.0607, "step": 18909 }, { "epoch": 5.781106695200244, "grad_norm": 0.2130390703678131, "learning_rate": 2.5308054859666257e-05, "loss": 0.0463, "step": 18910 }, { "epoch": 5.7814124121063895, "grad_norm": 0.2526482343673706, "learning_rate": 2.5307630249246316e-05, "loss": 0.0363, "step": 18911 }, { "epoch": 5.781718129012535, "grad_norm": 0.14712397754192352, "learning_rate": 2.5307205638826378e-05, "loss": 0.0537, "step": 18912 }, { "epoch": 5.78202384591868, "grad_norm": 0.3430447280406952, "learning_rate": 2.5306781028406437e-05, "loss": 0.0611, "step": 18913 }, { "epoch": 5.782329562824824, "grad_norm": 0.7500473260879517, "learning_rate": 2.53063564179865e-05, "loss": 0.0939, "step": 18914 }, { "epoch": 5.782635279730969, "grad_norm": 0.24898786842823029, "learning_rate": 2.5305931807566558e-05, "loss": 0.0667, "step": 18915 }, { "epoch": 5.782940996637114, "grad_norm": 0.4761553108692169, "learning_rate": 2.530550719714662e-05, "loss": 0.0758, "step": 18916 }, { "epoch": 5.783246713543259, "grad_norm": 0.6318238377571106, "learning_rate": 2.530508258672668e-05, "loss": 0.0572, "step": 18917 }, { "epoch": 5.7835524304494035, "grad_norm": 0.26066383719444275, "learning_rate": 2.5304657976306737e-05, "loss": 0.0963, "step": 18918 }, { "epoch": 5.783858147355549, "grad_norm": 0.3970194458961487, "learning_rate": 2.53042333658868e-05, "loss": 0.1048, "step": 18919 }, { "epoch": 5.784163864261694, "grad_norm": 0.44255340099334717, "learning_rate": 2.5303808755466858e-05, "loss": 0.1287, "step": 18920 }, { "epoch": 5.784469581167839, "grad_norm": 0.48706355690956116, "learning_rate": 2.530338414504692e-05, "loss": 0.1347, "step": 18921 }, { "epoch": 5.784775298073983, "grad_norm": 0.37410861253738403, "learning_rate": 2.530295953462698e-05, "loss": 0.1332, "step": 18922 }, { "epoch": 5.785081014980128, "grad_norm": 0.37854301929473877, "learning_rate": 2.530253492420704e-05, "loss": 0.1387, "step": 18923 }, { "epoch": 5.785386731886273, "grad_norm": 0.5308800339698792, "learning_rate": 2.53021103137871e-05, "loss": 0.2081, "step": 18924 }, { "epoch": 5.785692448792418, "grad_norm": 0.6183987855911255, "learning_rate": 2.530168570336716e-05, "loss": 0.1538, "step": 18925 }, { "epoch": 5.7859981656985635, "grad_norm": 1.0505775213241577, "learning_rate": 2.530126109294722e-05, "loss": 0.1715, "step": 18926 }, { "epoch": 5.786303882604708, "grad_norm": 1.4329453706741333, "learning_rate": 2.5300836482527282e-05, "loss": 0.2031, "step": 18927 }, { "epoch": 5.786609599510853, "grad_norm": 1.1171510219573975, "learning_rate": 2.530041187210734e-05, "loss": 0.1942, "step": 18928 }, { "epoch": 5.786915316416998, "grad_norm": 0.8478264808654785, "learning_rate": 2.5299987261687403e-05, "loss": 0.1601, "step": 18929 }, { "epoch": 5.787221033323143, "grad_norm": 1.5522254705429077, "learning_rate": 2.5299562651267462e-05, "loss": 0.2287, "step": 18930 }, { "epoch": 5.787526750229287, "grad_norm": 1.3062740564346313, "learning_rate": 2.529913804084752e-05, "loss": 0.1967, "step": 18931 }, { "epoch": 5.787832467135432, "grad_norm": 0.5778439044952393, "learning_rate": 2.5298713430427583e-05, "loss": 0.0741, "step": 18932 }, { "epoch": 5.7881381840415775, "grad_norm": 0.24916879832744598, "learning_rate": 2.529828882000764e-05, "loss": 0.0495, "step": 18933 }, { "epoch": 5.788443900947723, "grad_norm": 0.5904675722122192, "learning_rate": 2.5297864209587703e-05, "loss": 0.056, "step": 18934 }, { "epoch": 5.788749617853867, "grad_norm": 0.4407442510128021, "learning_rate": 2.5297439599167762e-05, "loss": 0.0865, "step": 18935 }, { "epoch": 5.789055334760012, "grad_norm": 0.39067524671554565, "learning_rate": 2.5297014988747824e-05, "loss": 0.0617, "step": 18936 }, { "epoch": 5.789361051666157, "grad_norm": 0.25872525572776794, "learning_rate": 2.5296590378327883e-05, "loss": 0.0517, "step": 18937 }, { "epoch": 5.789666768572302, "grad_norm": 0.173599511384964, "learning_rate": 2.5296165767907945e-05, "loss": 0.0341, "step": 18938 }, { "epoch": 5.789972485478447, "grad_norm": 0.44443702697753906, "learning_rate": 2.5295741157488004e-05, "loss": 0.0987, "step": 18939 }, { "epoch": 5.790278202384592, "grad_norm": 0.6055813431739807, "learning_rate": 2.5295316547068066e-05, "loss": 0.0506, "step": 18940 }, { "epoch": 5.790583919290737, "grad_norm": 0.3145870566368103, "learning_rate": 2.5294891936648124e-05, "loss": 0.0777, "step": 18941 }, { "epoch": 5.790889636196882, "grad_norm": 0.3207734525203705, "learning_rate": 2.5294467326228187e-05, "loss": 0.0849, "step": 18942 }, { "epoch": 5.791195353103027, "grad_norm": 0.4654087722301483, "learning_rate": 2.5294042715808245e-05, "loss": 0.1201, "step": 18943 }, { "epoch": 5.791501070009171, "grad_norm": 0.7198963761329651, "learning_rate": 2.5293618105388304e-05, "loss": 0.1442, "step": 18944 }, { "epoch": 5.791806786915316, "grad_norm": 0.7055400013923645, "learning_rate": 2.5293193494968366e-05, "loss": 0.1241, "step": 18945 }, { "epoch": 5.792112503821461, "grad_norm": 3.044607162475586, "learning_rate": 2.5292768884548425e-05, "loss": 0.169, "step": 18946 }, { "epoch": 5.7924182207276065, "grad_norm": 0.5567911863327026, "learning_rate": 2.5292344274128487e-05, "loss": 0.1544, "step": 18947 }, { "epoch": 5.792723937633751, "grad_norm": 0.4845536947250366, "learning_rate": 2.5291919663708546e-05, "loss": 0.1442, "step": 18948 }, { "epoch": 5.793029654539896, "grad_norm": 1.097844123840332, "learning_rate": 2.5291495053288608e-05, "loss": 0.1705, "step": 18949 }, { "epoch": 5.793335371446041, "grad_norm": 1.6470189094543457, "learning_rate": 2.5291070442868666e-05, "loss": 0.1614, "step": 18950 }, { "epoch": 5.793641088352186, "grad_norm": 0.5869936943054199, "learning_rate": 2.529064583244873e-05, "loss": 0.1957, "step": 18951 }, { "epoch": 5.793946805258331, "grad_norm": 0.9768716096878052, "learning_rate": 2.5290221222028787e-05, "loss": 0.1627, "step": 18952 }, { "epoch": 5.794252522164475, "grad_norm": 2.1919949054718018, "learning_rate": 2.528979661160885e-05, "loss": 0.1872, "step": 18953 }, { "epoch": 5.7945582390706205, "grad_norm": 1.1712054014205933, "learning_rate": 2.528937200118891e-05, "loss": 0.2129, "step": 18954 }, { "epoch": 5.794863955976766, "grad_norm": 4.111703872680664, "learning_rate": 2.5288947390768973e-05, "loss": 0.2477, "step": 18955 }, { "epoch": 5.795169672882911, "grad_norm": 0.38109496235847473, "learning_rate": 2.5288522780349032e-05, "loss": 0.1275, "step": 18956 }, { "epoch": 5.795475389789055, "grad_norm": 0.3775770664215088, "learning_rate": 2.528809816992909e-05, "loss": 0.0779, "step": 18957 }, { "epoch": 5.7957811066952, "grad_norm": 0.49041876196861267, "learning_rate": 2.5287673559509153e-05, "loss": 0.0703, "step": 18958 }, { "epoch": 5.796086823601345, "grad_norm": 0.21796418726444244, "learning_rate": 2.528724894908921e-05, "loss": 0.0538, "step": 18959 }, { "epoch": 5.79639254050749, "grad_norm": 0.24924777448177338, "learning_rate": 2.5286824338669274e-05, "loss": 0.0504, "step": 18960 }, { "epoch": 5.7966982574136345, "grad_norm": 0.26013675332069397, "learning_rate": 2.5286399728249332e-05, "loss": 0.0463, "step": 18961 }, { "epoch": 5.79700397431978, "grad_norm": 0.4554291367530823, "learning_rate": 2.5285975117829394e-05, "loss": 0.0871, "step": 18962 }, { "epoch": 5.797309691225925, "grad_norm": 0.3640006482601166, "learning_rate": 2.5285550507409453e-05, "loss": 0.0483, "step": 18963 }, { "epoch": 5.79761540813207, "grad_norm": 0.30304598808288574, "learning_rate": 2.5285125896989515e-05, "loss": 0.0891, "step": 18964 }, { "epoch": 5.797921125038215, "grad_norm": 0.17942440509796143, "learning_rate": 2.5284701286569574e-05, "loss": 0.0567, "step": 18965 }, { "epoch": 5.798226841944359, "grad_norm": 0.45794129371643066, "learning_rate": 2.5284276676149636e-05, "loss": 0.0621, "step": 18966 }, { "epoch": 5.798532558850504, "grad_norm": 0.5885121822357178, "learning_rate": 2.5283852065729695e-05, "loss": 0.089, "step": 18967 }, { "epoch": 5.7988382757566495, "grad_norm": 0.33122238516807556, "learning_rate": 2.5283427455309757e-05, "loss": 0.0667, "step": 18968 }, { "epoch": 5.799143992662795, "grad_norm": 0.5281667113304138, "learning_rate": 2.5283002844889815e-05, "loss": 0.0946, "step": 18969 }, { "epoch": 5.799449709568939, "grad_norm": 0.42964091897010803, "learning_rate": 2.5282578234469874e-05, "loss": 0.1127, "step": 18970 }, { "epoch": 5.799755426475084, "grad_norm": 0.41967806220054626, "learning_rate": 2.5282153624049936e-05, "loss": 0.1443, "step": 18971 }, { "epoch": 5.800061143381229, "grad_norm": 1.663765788078308, "learning_rate": 2.5281729013629995e-05, "loss": 0.1619, "step": 18972 }, { "epoch": 5.800366860287374, "grad_norm": 3.932251453399658, "learning_rate": 2.5281304403210057e-05, "loss": 0.1455, "step": 18973 }, { "epoch": 5.800672577193518, "grad_norm": 1.4487276077270508, "learning_rate": 2.5280879792790116e-05, "loss": 0.1587, "step": 18974 }, { "epoch": 5.8009782940996635, "grad_norm": 0.6820043921470642, "learning_rate": 2.5280455182370178e-05, "loss": 0.1699, "step": 18975 }, { "epoch": 5.801284011005809, "grad_norm": 0.5740697383880615, "learning_rate": 2.5280030571950237e-05, "loss": 0.1427, "step": 18976 }, { "epoch": 5.801589727911954, "grad_norm": 0.6840261816978455, "learning_rate": 2.52796059615303e-05, "loss": 0.2086, "step": 18977 }, { "epoch": 5.801895444818099, "grad_norm": 0.7194338440895081, "learning_rate": 2.5279181351110357e-05, "loss": 0.1549, "step": 18978 }, { "epoch": 5.802201161724243, "grad_norm": 1.3484821319580078, "learning_rate": 2.527875674069042e-05, "loss": 0.1725, "step": 18979 }, { "epoch": 5.802506878630388, "grad_norm": 2.3443443775177, "learning_rate": 2.5278332130270478e-05, "loss": 0.277, "step": 18980 }, { "epoch": 5.802812595536533, "grad_norm": 0.3590095043182373, "learning_rate": 2.527790751985054e-05, "loss": 0.1714, "step": 18981 }, { "epoch": 5.803118312442678, "grad_norm": 0.38969406485557556, "learning_rate": 2.52774829094306e-05, "loss": 0.075, "step": 18982 }, { "epoch": 5.803424029348823, "grad_norm": 0.21501465141773224, "learning_rate": 2.5277058299010658e-05, "loss": 0.075, "step": 18983 }, { "epoch": 5.803729746254968, "grad_norm": 0.2878636419773102, "learning_rate": 2.527663368859072e-05, "loss": 0.053, "step": 18984 }, { "epoch": 5.804035463161113, "grad_norm": 0.21308860182762146, "learning_rate": 2.527620907817078e-05, "loss": 0.0481, "step": 18985 }, { "epoch": 5.804341180067258, "grad_norm": 0.24748508632183075, "learning_rate": 2.527578446775084e-05, "loss": 0.0667, "step": 18986 }, { "epoch": 5.804646896973402, "grad_norm": 0.41843605041503906, "learning_rate": 2.52753598573309e-05, "loss": 0.0747, "step": 18987 }, { "epoch": 5.804952613879547, "grad_norm": 0.24623514711856842, "learning_rate": 2.527493524691096e-05, "loss": 0.0875, "step": 18988 }, { "epoch": 5.805258330785692, "grad_norm": 0.278006374835968, "learning_rate": 2.527451063649102e-05, "loss": 0.0518, "step": 18989 }, { "epoch": 5.8055640476918375, "grad_norm": 0.27905043959617615, "learning_rate": 2.5274086026071082e-05, "loss": 0.0656, "step": 18990 }, { "epoch": 5.805869764597983, "grad_norm": 0.37358585000038147, "learning_rate": 2.527366141565114e-05, "loss": 0.076, "step": 18991 }, { "epoch": 5.806175481504127, "grad_norm": 0.305586576461792, "learning_rate": 2.5273236805231203e-05, "loss": 0.108, "step": 18992 }, { "epoch": 5.806481198410272, "grad_norm": 0.8368573188781738, "learning_rate": 2.527281219481126e-05, "loss": 0.0913, "step": 18993 }, { "epoch": 5.806786915316417, "grad_norm": 0.41055214405059814, "learning_rate": 2.5272387584391324e-05, "loss": 0.122, "step": 18994 }, { "epoch": 5.807092632222562, "grad_norm": 0.49345529079437256, "learning_rate": 2.5271962973971382e-05, "loss": 0.1321, "step": 18995 }, { "epoch": 5.8073983491287064, "grad_norm": 2.291191577911377, "learning_rate": 2.527153836355144e-05, "loss": 0.1754, "step": 18996 }, { "epoch": 5.807704066034852, "grad_norm": 0.7780249714851379, "learning_rate": 2.5271113753131503e-05, "loss": 0.1522, "step": 18997 }, { "epoch": 5.808009782940997, "grad_norm": 0.38399580121040344, "learning_rate": 2.5270689142711562e-05, "loss": 0.1285, "step": 18998 }, { "epoch": 5.808315499847142, "grad_norm": 0.6040226817131042, "learning_rate": 2.5270264532291624e-05, "loss": 0.1802, "step": 18999 }, { "epoch": 5.808621216753286, "grad_norm": 0.49119094014167786, "learning_rate": 2.5269839921871683e-05, "loss": 0.1789, "step": 19000 }, { "epoch": 5.808621216753286, "eval_cer": 0.18776799450099832, "eval_loss": 0.22853875160217285, "eval_runtime": 19.1083, "eval_samples_per_second": 237.489, "eval_steps_per_second": 0.785, "eval_wer": 0.32423675316331946, "step": 19000 }, { "epoch": 5.808926933659431, "grad_norm": 2.3270373344421387, "learning_rate": 2.5269415311451745e-05, "loss": 0.159, "step": 19001 }, { "epoch": 5.809232650565576, "grad_norm": 0.7855657935142517, "learning_rate": 2.5268990701031803e-05, "loss": 0.1971, "step": 19002 }, { "epoch": 5.809538367471721, "grad_norm": 0.975879430770874, "learning_rate": 2.5268566090611865e-05, "loss": 0.1919, "step": 19003 }, { "epoch": 5.8098440843778665, "grad_norm": 0.8335655927658081, "learning_rate": 2.5268141480191924e-05, "loss": 0.163, "step": 19004 }, { "epoch": 5.810149801284011, "grad_norm": 1.7097290754318237, "learning_rate": 2.5267716869771986e-05, "loss": 0.2724, "step": 19005 }, { "epoch": 5.810455518190156, "grad_norm": 0.5645607113838196, "learning_rate": 2.5267292259352045e-05, "loss": 0.1358, "step": 19006 }, { "epoch": 5.810761235096301, "grad_norm": 1.3197035789489746, "learning_rate": 2.5266867648932104e-05, "loss": 0.0792, "step": 19007 }, { "epoch": 5.811066952002446, "grad_norm": 0.2225322723388672, "learning_rate": 2.5266443038512166e-05, "loss": 0.059, "step": 19008 }, { "epoch": 5.81137266890859, "grad_norm": 0.6353883147239685, "learning_rate": 2.5266018428092224e-05, "loss": 0.0775, "step": 19009 }, { "epoch": 5.811678385814735, "grad_norm": 0.367380291223526, "learning_rate": 2.5265593817672287e-05, "loss": 0.0442, "step": 19010 }, { "epoch": 5.8119841027208805, "grad_norm": 0.3106071949005127, "learning_rate": 2.5265169207252345e-05, "loss": 0.074, "step": 19011 }, { "epoch": 5.812289819627026, "grad_norm": 0.4566020965576172, "learning_rate": 2.5264744596832407e-05, "loss": 0.0761, "step": 19012 }, { "epoch": 5.81259553653317, "grad_norm": 0.19250476360321045, "learning_rate": 2.5264319986412466e-05, "loss": 0.0497, "step": 19013 }, { "epoch": 5.812901253439315, "grad_norm": 0.3424806296825409, "learning_rate": 2.5263895375992528e-05, "loss": 0.0627, "step": 19014 }, { "epoch": 5.81320697034546, "grad_norm": 0.43680572509765625, "learning_rate": 2.5263470765572587e-05, "loss": 0.0437, "step": 19015 }, { "epoch": 5.813512687251605, "grad_norm": 0.2979128062725067, "learning_rate": 2.526304615515265e-05, "loss": 0.0695, "step": 19016 }, { "epoch": 5.81381840415775, "grad_norm": 0.39472833275794983, "learning_rate": 2.5262621544732708e-05, "loss": 0.0548, "step": 19017 }, { "epoch": 5.8141241210638945, "grad_norm": 0.45670488476753235, "learning_rate": 2.526219693431277e-05, "loss": 0.092, "step": 19018 }, { "epoch": 5.81442983797004, "grad_norm": 1.7288717031478882, "learning_rate": 2.526177232389283e-05, "loss": 0.1215, "step": 19019 }, { "epoch": 5.814735554876185, "grad_norm": 0.7363646626472473, "learning_rate": 2.5261347713472887e-05, "loss": 0.1238, "step": 19020 }, { "epoch": 5.81504127178233, "grad_norm": 0.48483535647392273, "learning_rate": 2.526092310305295e-05, "loss": 0.104, "step": 19021 }, { "epoch": 5.815346988688474, "grad_norm": 0.4868695139884949, "learning_rate": 2.5260498492633008e-05, "loss": 0.1461, "step": 19022 }, { "epoch": 5.815652705594619, "grad_norm": 1.0249452590942383, "learning_rate": 2.526007388221307e-05, "loss": 0.1457, "step": 19023 }, { "epoch": 5.815958422500764, "grad_norm": 0.9076442122459412, "learning_rate": 2.525964927179313e-05, "loss": 0.1689, "step": 19024 }, { "epoch": 5.8162641394069095, "grad_norm": 0.8161860704421997, "learning_rate": 2.525922466137319e-05, "loss": 0.1815, "step": 19025 }, { "epoch": 5.816569856313054, "grad_norm": 0.9297267198562622, "learning_rate": 2.525880005095325e-05, "loss": 0.1832, "step": 19026 }, { "epoch": 5.816875573219199, "grad_norm": 0.6235979795455933, "learning_rate": 2.525837544053331e-05, "loss": 0.1784, "step": 19027 }, { "epoch": 5.817181290125344, "grad_norm": 0.7704402804374695, "learning_rate": 2.525795083011337e-05, "loss": 0.1654, "step": 19028 }, { "epoch": 5.817487007031489, "grad_norm": 0.5350661873817444, "learning_rate": 2.5257526219693432e-05, "loss": 0.1968, "step": 19029 }, { "epoch": 5.817792723937634, "grad_norm": 1.5195696353912354, "learning_rate": 2.525710160927349e-05, "loss": 0.2246, "step": 19030 }, { "epoch": 5.818098440843778, "grad_norm": 0.4375050365924835, "learning_rate": 2.5256676998853553e-05, "loss": 0.1956, "step": 19031 }, { "epoch": 5.8184041577499235, "grad_norm": 0.3375566899776459, "learning_rate": 2.5256252388433612e-05, "loss": 0.074, "step": 19032 }, { "epoch": 5.818709874656069, "grad_norm": 0.12916818261146545, "learning_rate": 2.525582777801367e-05, "loss": 0.0398, "step": 19033 }, { "epoch": 5.819015591562214, "grad_norm": 0.39624398946762085, "learning_rate": 2.5255403167593733e-05, "loss": 0.0639, "step": 19034 }, { "epoch": 5.819321308468358, "grad_norm": 0.2191012054681778, "learning_rate": 2.525497855717379e-05, "loss": 0.0551, "step": 19035 }, { "epoch": 5.819627025374503, "grad_norm": 0.2024151086807251, "learning_rate": 2.5254553946753853e-05, "loss": 0.0594, "step": 19036 }, { "epoch": 5.819932742280648, "grad_norm": 0.28345999121665955, "learning_rate": 2.5254129336333912e-05, "loss": 0.0431, "step": 19037 }, { "epoch": 5.820238459186793, "grad_norm": 0.46935129165649414, "learning_rate": 2.5253704725913974e-05, "loss": 0.057, "step": 19038 }, { "epoch": 5.8205441760929375, "grad_norm": 0.3655724823474884, "learning_rate": 2.5253280115494033e-05, "loss": 0.0888, "step": 19039 }, { "epoch": 5.820849892999083, "grad_norm": 0.1960141509771347, "learning_rate": 2.5252855505074095e-05, "loss": 0.0521, "step": 19040 }, { "epoch": 5.821155609905228, "grad_norm": 0.2857668101787567, "learning_rate": 2.5252430894654154e-05, "loss": 0.0735, "step": 19041 }, { "epoch": 5.821461326811373, "grad_norm": 0.6011529564857483, "learning_rate": 2.5252006284234216e-05, "loss": 0.0948, "step": 19042 }, { "epoch": 5.821767043717518, "grad_norm": 0.285253643989563, "learning_rate": 2.5251581673814274e-05, "loss": 0.1009, "step": 19043 }, { "epoch": 5.822072760623662, "grad_norm": 0.2802004814147949, "learning_rate": 2.5251157063394337e-05, "loss": 0.0947, "step": 19044 }, { "epoch": 5.822378477529807, "grad_norm": 0.6961274743080139, "learning_rate": 2.5250732452974395e-05, "loss": 0.1276, "step": 19045 }, { "epoch": 5.822684194435952, "grad_norm": 0.9320313334465027, "learning_rate": 2.5250307842554454e-05, "loss": 0.1223, "step": 19046 }, { "epoch": 5.8229899113420975, "grad_norm": 0.7647895812988281, "learning_rate": 2.5249883232134516e-05, "loss": 0.1668, "step": 19047 }, { "epoch": 5.823295628248242, "grad_norm": 0.41568607091903687, "learning_rate": 2.5249458621714575e-05, "loss": 0.1512, "step": 19048 }, { "epoch": 5.823601345154387, "grad_norm": 1.6566795110702515, "learning_rate": 2.5249034011294637e-05, "loss": 0.1732, "step": 19049 }, { "epoch": 5.823907062060532, "grad_norm": 0.8483220934867859, "learning_rate": 2.5248609400874696e-05, "loss": 0.1847, "step": 19050 }, { "epoch": 5.824212778966677, "grad_norm": 0.6394903063774109, "learning_rate": 2.5248184790454758e-05, "loss": 0.1742, "step": 19051 }, { "epoch": 5.824518495872821, "grad_norm": 1.07952880859375, "learning_rate": 2.5247760180034816e-05, "loss": 0.1749, "step": 19052 }, { "epoch": 5.8248242127789664, "grad_norm": 1.6004570722579956, "learning_rate": 2.524733556961488e-05, "loss": 0.1996, "step": 19053 }, { "epoch": 5.825129929685112, "grad_norm": 1.659911036491394, "learning_rate": 2.5246910959194937e-05, "loss": 0.1781, "step": 19054 }, { "epoch": 5.825435646591257, "grad_norm": 1.5110853910446167, "learning_rate": 2.5246486348775e-05, "loss": 0.237, "step": 19055 }, { "epoch": 5.825741363497402, "grad_norm": 0.4132724106311798, "learning_rate": 2.524606173835506e-05, "loss": 0.1667, "step": 19056 }, { "epoch": 5.826047080403546, "grad_norm": 0.22147899866104126, "learning_rate": 2.5245637127935123e-05, "loss": 0.0747, "step": 19057 }, { "epoch": 5.826352797309691, "grad_norm": 0.20992162823677063, "learning_rate": 2.5245212517515182e-05, "loss": 0.0567, "step": 19058 }, { "epoch": 5.826658514215836, "grad_norm": 0.36981332302093506, "learning_rate": 2.524478790709524e-05, "loss": 0.0657, "step": 19059 }, { "epoch": 5.826964231121981, "grad_norm": 0.5727090835571289, "learning_rate": 2.5244363296675303e-05, "loss": 0.0517, "step": 19060 }, { "epoch": 5.827269948028126, "grad_norm": 0.9639095664024353, "learning_rate": 2.524393868625536e-05, "loss": 0.0603, "step": 19061 }, { "epoch": 5.827575664934271, "grad_norm": 0.7354093194007874, "learning_rate": 2.5243514075835424e-05, "loss": 0.0528, "step": 19062 }, { "epoch": 5.827881381840416, "grad_norm": 0.27539145946502686, "learning_rate": 2.5243089465415482e-05, "loss": 0.0594, "step": 19063 }, { "epoch": 5.828187098746561, "grad_norm": 0.23275572061538696, "learning_rate": 2.5242664854995544e-05, "loss": 0.0644, "step": 19064 }, { "epoch": 5.828492815652705, "grad_norm": 0.3700043261051178, "learning_rate": 2.5242240244575603e-05, "loss": 0.0637, "step": 19065 }, { "epoch": 5.82879853255885, "grad_norm": 0.3132288455963135, "learning_rate": 2.5241815634155665e-05, "loss": 0.093, "step": 19066 }, { "epoch": 5.829104249464995, "grad_norm": 0.5715323686599731, "learning_rate": 2.5241391023735724e-05, "loss": 0.0892, "step": 19067 }, { "epoch": 5.8294099663711405, "grad_norm": 0.4390237331390381, "learning_rate": 2.5240966413315786e-05, "loss": 0.0903, "step": 19068 }, { "epoch": 5.829715683277286, "grad_norm": 0.6539603471755981, "learning_rate": 2.5240541802895845e-05, "loss": 0.1012, "step": 19069 }, { "epoch": 5.83002140018343, "grad_norm": 0.47944071888923645, "learning_rate": 2.5240117192475907e-05, "loss": 0.1412, "step": 19070 }, { "epoch": 5.830327117089575, "grad_norm": 0.3462805151939392, "learning_rate": 2.5239692582055966e-05, "loss": 0.1306, "step": 19071 }, { "epoch": 5.83063283399572, "grad_norm": 0.7652830481529236, "learning_rate": 2.5239267971636024e-05, "loss": 0.1644, "step": 19072 }, { "epoch": 5.830938550901865, "grad_norm": 0.9241788387298584, "learning_rate": 2.5238843361216086e-05, "loss": 0.1593, "step": 19073 }, { "epoch": 5.831244267808009, "grad_norm": 1.1383323669433594, "learning_rate": 2.5238418750796145e-05, "loss": 0.1713, "step": 19074 }, { "epoch": 5.8315499847141545, "grad_norm": 1.3354840278625488, "learning_rate": 2.5237994140376207e-05, "loss": 0.1771, "step": 19075 }, { "epoch": 5.8318557016203, "grad_norm": 1.3442927598953247, "learning_rate": 2.5237569529956266e-05, "loss": 0.1835, "step": 19076 }, { "epoch": 5.832161418526445, "grad_norm": 1.012730360031128, "learning_rate": 2.5237144919536328e-05, "loss": 0.1944, "step": 19077 }, { "epoch": 5.832467135432589, "grad_norm": 1.4650905132293701, "learning_rate": 2.5236720309116387e-05, "loss": 0.1909, "step": 19078 }, { "epoch": 5.832772852338734, "grad_norm": 2.168118715286255, "learning_rate": 2.523629569869645e-05, "loss": 0.1995, "step": 19079 }, { "epoch": 5.833078569244879, "grad_norm": 1.8927019834518433, "learning_rate": 2.5235871088276507e-05, "loss": 0.24, "step": 19080 }, { "epoch": 5.833384286151024, "grad_norm": 0.5278323292732239, "learning_rate": 2.523544647785657e-05, "loss": 0.1436, "step": 19081 }, { "epoch": 5.8336900030571694, "grad_norm": 0.4701160788536072, "learning_rate": 2.5235021867436628e-05, "loss": 0.0526, "step": 19082 }, { "epoch": 5.833995719963314, "grad_norm": 1.4706735610961914, "learning_rate": 2.523459725701669e-05, "loss": 0.0664, "step": 19083 }, { "epoch": 5.834301436869459, "grad_norm": 0.2388773262500763, "learning_rate": 2.523417264659675e-05, "loss": 0.0582, "step": 19084 }, { "epoch": 5.834607153775604, "grad_norm": 0.19922427833080292, "learning_rate": 2.5233748036176808e-05, "loss": 0.0486, "step": 19085 }, { "epoch": 5.834912870681749, "grad_norm": 0.2588753402233124, "learning_rate": 2.523332342575687e-05, "loss": 0.0565, "step": 19086 }, { "epoch": 5.835218587587893, "grad_norm": 0.23287342488765717, "learning_rate": 2.523289881533693e-05, "loss": 0.0554, "step": 19087 }, { "epoch": 5.835524304494038, "grad_norm": 0.6730877161026001, "learning_rate": 2.523247420491699e-05, "loss": 0.0532, "step": 19088 }, { "epoch": 5.8358300214001835, "grad_norm": 0.4405491054058075, "learning_rate": 2.523204959449705e-05, "loss": 0.0923, "step": 19089 }, { "epoch": 5.836135738306329, "grad_norm": 0.7262604832649231, "learning_rate": 2.523162498407711e-05, "loss": 0.0761, "step": 19090 }, { "epoch": 5.836441455212473, "grad_norm": 0.5177577137947083, "learning_rate": 2.523120037365717e-05, "loss": 0.0785, "step": 19091 }, { "epoch": 5.836747172118618, "grad_norm": 0.29227501153945923, "learning_rate": 2.5230775763237232e-05, "loss": 0.0721, "step": 19092 }, { "epoch": 5.837052889024763, "grad_norm": 0.4623161852359772, "learning_rate": 2.523035115281729e-05, "loss": 0.1515, "step": 19093 }, { "epoch": 5.837358605930908, "grad_norm": 0.5352635383605957, "learning_rate": 2.5229926542397353e-05, "loss": 0.127, "step": 19094 }, { "epoch": 5.837664322837053, "grad_norm": 0.3715769052505493, "learning_rate": 2.522950193197741e-05, "loss": 0.113, "step": 19095 }, { "epoch": 5.8379700397431975, "grad_norm": 0.4659007489681244, "learning_rate": 2.5229077321557474e-05, "loss": 0.152, "step": 19096 }, { "epoch": 5.838275756649343, "grad_norm": 0.6885432004928589, "learning_rate": 2.5228652711137532e-05, "loss": 0.1762, "step": 19097 }, { "epoch": 5.838581473555488, "grad_norm": 0.5305585265159607, "learning_rate": 2.522822810071759e-05, "loss": 0.16, "step": 19098 }, { "epoch": 5.838887190461633, "grad_norm": 0.8068288564682007, "learning_rate": 2.5227803490297653e-05, "loss": 0.1574, "step": 19099 }, { "epoch": 5.839192907367777, "grad_norm": 0.6634947061538696, "learning_rate": 2.5227378879877712e-05, "loss": 0.1427, "step": 19100 }, { "epoch": 5.839498624273922, "grad_norm": 1.227394700050354, "learning_rate": 2.5226954269457774e-05, "loss": 0.1784, "step": 19101 }, { "epoch": 5.839804341180067, "grad_norm": 2.6164021492004395, "learning_rate": 2.5226529659037833e-05, "loss": 0.1756, "step": 19102 }, { "epoch": 5.840110058086212, "grad_norm": 0.7215743660926819, "learning_rate": 2.5226105048617895e-05, "loss": 0.1824, "step": 19103 }, { "epoch": 5.840415774992357, "grad_norm": 0.8042263984680176, "learning_rate": 2.5225680438197953e-05, "loss": 0.1879, "step": 19104 }, { "epoch": 5.840721491898502, "grad_norm": 0.6891629099845886, "learning_rate": 2.5225255827778016e-05, "loss": 0.1667, "step": 19105 }, { "epoch": 5.841027208804647, "grad_norm": 0.3775773346424103, "learning_rate": 2.5224831217358074e-05, "loss": 0.1272, "step": 19106 }, { "epoch": 5.841332925710792, "grad_norm": 0.44594016671180725, "learning_rate": 2.5224406606938136e-05, "loss": 0.0602, "step": 19107 }, { "epoch": 5.841638642616937, "grad_norm": 0.3739071190357208, "learning_rate": 2.5223981996518195e-05, "loss": 0.0804, "step": 19108 }, { "epoch": 5.841944359523081, "grad_norm": 0.33262717723846436, "learning_rate": 2.5223557386098257e-05, "loss": 0.0476, "step": 19109 }, { "epoch": 5.8422500764292264, "grad_norm": 0.1588488668203354, "learning_rate": 2.5223132775678316e-05, "loss": 0.0446, "step": 19110 }, { "epoch": 5.842555793335372, "grad_norm": 1.830382227897644, "learning_rate": 2.5222708165258374e-05, "loss": 0.0682, "step": 19111 }, { "epoch": 5.842861510241517, "grad_norm": 0.43648961186408997, "learning_rate": 2.5222283554838437e-05, "loss": 0.0444, "step": 19112 }, { "epoch": 5.843167227147661, "grad_norm": 0.31787195801734924, "learning_rate": 2.5221858944418495e-05, "loss": 0.0757, "step": 19113 }, { "epoch": 5.843472944053806, "grad_norm": 0.512908935546875, "learning_rate": 2.5221434333998557e-05, "loss": 0.0739, "step": 19114 }, { "epoch": 5.843778660959951, "grad_norm": 0.24749788641929626, "learning_rate": 2.5221009723578616e-05, "loss": 0.0607, "step": 19115 }, { "epoch": 5.844084377866096, "grad_norm": 1.184554100036621, "learning_rate": 2.5220585113158678e-05, "loss": 0.0962, "step": 19116 }, { "epoch": 5.8443900947722405, "grad_norm": 0.4225708246231079, "learning_rate": 2.5220160502738737e-05, "loss": 0.0756, "step": 19117 }, { "epoch": 5.844695811678386, "grad_norm": 0.32628703117370605, "learning_rate": 2.52197358923188e-05, "loss": 0.1165, "step": 19118 }, { "epoch": 5.845001528584531, "grad_norm": 0.34752553701400757, "learning_rate": 2.5219311281898858e-05, "loss": 0.1241, "step": 19119 }, { "epoch": 5.845307245490676, "grad_norm": 0.37225770950317383, "learning_rate": 2.521888667147892e-05, "loss": 0.1124, "step": 19120 }, { "epoch": 5.845612962396821, "grad_norm": 0.7395799160003662, "learning_rate": 2.521846206105898e-05, "loss": 0.1301, "step": 19121 }, { "epoch": 5.845918679302965, "grad_norm": 1.2141687870025635, "learning_rate": 2.521803745063904e-05, "loss": 0.1535, "step": 19122 }, { "epoch": 5.84622439620911, "grad_norm": 0.6784619092941284, "learning_rate": 2.52176128402191e-05, "loss": 0.1883, "step": 19123 }, { "epoch": 5.846530113115255, "grad_norm": 0.6454549431800842, "learning_rate": 2.5217188229799158e-05, "loss": 0.1652, "step": 19124 }, { "epoch": 5.8468358300214005, "grad_norm": 1.1306620836257935, "learning_rate": 2.521676361937922e-05, "loss": 0.1873, "step": 19125 }, { "epoch": 5.847141546927545, "grad_norm": 0.898648202419281, "learning_rate": 2.521633900895928e-05, "loss": 0.149, "step": 19126 }, { "epoch": 5.84744726383369, "grad_norm": 0.5504564046859741, "learning_rate": 2.521591439853934e-05, "loss": 0.1726, "step": 19127 }, { "epoch": 5.847752980739835, "grad_norm": 0.949518084526062, "learning_rate": 2.52154897881194e-05, "loss": 0.1918, "step": 19128 }, { "epoch": 5.84805869764598, "grad_norm": 0.740281879901886, "learning_rate": 2.521506517769946e-05, "loss": 0.2515, "step": 19129 }, { "epoch": 5.848364414552124, "grad_norm": 0.8913405537605286, "learning_rate": 2.521464056727952e-05, "loss": 0.2091, "step": 19130 }, { "epoch": 5.848670131458269, "grad_norm": 0.42934450507164, "learning_rate": 2.5214215956859582e-05, "loss": 0.1441, "step": 19131 }, { "epoch": 5.8489758483644145, "grad_norm": 0.1734764575958252, "learning_rate": 2.521379134643964e-05, "loss": 0.0828, "step": 19132 }, { "epoch": 5.84928156527056, "grad_norm": 0.21428875625133514, "learning_rate": 2.5213366736019703e-05, "loss": 0.0577, "step": 19133 }, { "epoch": 5.849587282176705, "grad_norm": 0.27588725090026855, "learning_rate": 2.5212942125599762e-05, "loss": 0.0579, "step": 19134 }, { "epoch": 5.849892999082849, "grad_norm": 0.1811354011297226, "learning_rate": 2.521251751517982e-05, "loss": 0.0464, "step": 19135 }, { "epoch": 5.850198715988994, "grad_norm": 0.23466862738132477, "learning_rate": 2.5212092904759883e-05, "loss": 0.0509, "step": 19136 }, { "epoch": 5.850504432895139, "grad_norm": 0.31790339946746826, "learning_rate": 2.521166829433994e-05, "loss": 0.0634, "step": 19137 }, { "epoch": 5.850810149801284, "grad_norm": 0.3903902769088745, "learning_rate": 2.5211243683920003e-05, "loss": 0.0651, "step": 19138 }, { "epoch": 5.851115866707429, "grad_norm": 0.22120127081871033, "learning_rate": 2.5210819073500062e-05, "loss": 0.0604, "step": 19139 }, { "epoch": 5.851421583613574, "grad_norm": 0.268729031085968, "learning_rate": 2.5210394463080124e-05, "loss": 0.059, "step": 19140 }, { "epoch": 5.851727300519719, "grad_norm": 0.5876834988594055, "learning_rate": 2.5209969852660183e-05, "loss": 0.0826, "step": 19141 }, { "epoch": 5.852033017425864, "grad_norm": 0.3250431716442108, "learning_rate": 2.5209545242240245e-05, "loss": 0.0895, "step": 19142 }, { "epoch": 5.852338734332008, "grad_norm": 0.44211533665657043, "learning_rate": 2.5209120631820304e-05, "loss": 0.0929, "step": 19143 }, { "epoch": 5.852644451238153, "grad_norm": 0.6651122570037842, "learning_rate": 2.5208696021400366e-05, "loss": 0.1091, "step": 19144 }, { "epoch": 5.852950168144298, "grad_norm": 0.35861867666244507, "learning_rate": 2.5208271410980425e-05, "loss": 0.1026, "step": 19145 }, { "epoch": 5.8532558850504435, "grad_norm": 0.558383584022522, "learning_rate": 2.5207846800560487e-05, "loss": 0.118, "step": 19146 }, { "epoch": 5.853561601956589, "grad_norm": 0.8575935363769531, "learning_rate": 2.5207422190140545e-05, "loss": 0.1494, "step": 19147 }, { "epoch": 5.853867318862733, "grad_norm": 0.5476775169372559, "learning_rate": 2.5206997579720604e-05, "loss": 0.1646, "step": 19148 }, { "epoch": 5.854173035768878, "grad_norm": 0.43560197949409485, "learning_rate": 2.5206572969300666e-05, "loss": 0.1512, "step": 19149 }, { "epoch": 5.854478752675023, "grad_norm": 0.7842085957527161, "learning_rate": 2.5206148358880725e-05, "loss": 0.1716, "step": 19150 }, { "epoch": 5.854784469581168, "grad_norm": 1.799959421157837, "learning_rate": 2.5205723748460787e-05, "loss": 0.1914, "step": 19151 }, { "epoch": 5.855090186487312, "grad_norm": 0.6404722332954407, "learning_rate": 2.5205299138040846e-05, "loss": 0.1403, "step": 19152 }, { "epoch": 5.8553959033934575, "grad_norm": 1.0406317710876465, "learning_rate": 2.5204874527620908e-05, "loss": 0.1953, "step": 19153 }, { "epoch": 5.855701620299603, "grad_norm": 0.7992905378341675, "learning_rate": 2.5204449917200966e-05, "loss": 0.2051, "step": 19154 }, { "epoch": 5.856007337205748, "grad_norm": 2.247396230697632, "learning_rate": 2.520402530678103e-05, "loss": 0.2496, "step": 19155 }, { "epoch": 5.856313054111892, "grad_norm": 13.158391952514648, "learning_rate": 2.5203600696361087e-05, "loss": 0.1337, "step": 19156 }, { "epoch": 5.856618771018037, "grad_norm": 0.20794111490249634, "learning_rate": 2.520317608594115e-05, "loss": 0.0707, "step": 19157 }, { "epoch": 5.856924487924182, "grad_norm": 0.3728295862674713, "learning_rate": 2.5202751475521208e-05, "loss": 0.0662, "step": 19158 }, { "epoch": 5.857230204830327, "grad_norm": 0.308743953704834, "learning_rate": 2.520232686510127e-05, "loss": 0.0576, "step": 19159 }, { "epoch": 5.857535921736472, "grad_norm": 0.20730572938919067, "learning_rate": 2.5201902254681332e-05, "loss": 0.057, "step": 19160 }, { "epoch": 5.857841638642617, "grad_norm": 0.2442288100719452, "learning_rate": 2.520147764426139e-05, "loss": 0.0362, "step": 19161 }, { "epoch": 5.858147355548762, "grad_norm": 0.26767146587371826, "learning_rate": 2.5201053033841453e-05, "loss": 0.0464, "step": 19162 }, { "epoch": 5.858453072454907, "grad_norm": 0.4915364980697632, "learning_rate": 2.520062842342151e-05, "loss": 0.0677, "step": 19163 }, { "epoch": 5.858758789361052, "grad_norm": 0.30049729347229004, "learning_rate": 2.5200203813001574e-05, "loss": 0.0732, "step": 19164 }, { "epoch": 5.859064506267196, "grad_norm": 0.14985960721969604, "learning_rate": 2.5199779202581632e-05, "loss": 0.0464, "step": 19165 }, { "epoch": 5.859370223173341, "grad_norm": 0.8146577477455139, "learning_rate": 2.5199354592161694e-05, "loss": 0.0981, "step": 19166 }, { "epoch": 5.8596759400794864, "grad_norm": 0.35704419016838074, "learning_rate": 2.5198929981741753e-05, "loss": 0.0701, "step": 19167 }, { "epoch": 5.859981656985632, "grad_norm": 0.2989770770072937, "learning_rate": 2.5198505371321815e-05, "loss": 0.0716, "step": 19168 }, { "epoch": 5.860287373891776, "grad_norm": 0.44600850343704224, "learning_rate": 2.5198080760901874e-05, "loss": 0.1266, "step": 19169 }, { "epoch": 5.860593090797921, "grad_norm": 0.44195276498794556, "learning_rate": 2.5197656150481936e-05, "loss": 0.0971, "step": 19170 }, { "epoch": 5.860898807704066, "grad_norm": 0.4411501884460449, "learning_rate": 2.5197231540061995e-05, "loss": 0.1364, "step": 19171 }, { "epoch": 5.861204524610211, "grad_norm": 0.502195417881012, "learning_rate": 2.5196806929642057e-05, "loss": 0.1144, "step": 19172 }, { "epoch": 5.861510241516356, "grad_norm": 0.8265092372894287, "learning_rate": 2.5196382319222116e-05, "loss": 0.163, "step": 19173 }, { "epoch": 5.8618159584225005, "grad_norm": 0.7186024785041809, "learning_rate": 2.5195957708802174e-05, "loss": 0.1699, "step": 19174 }, { "epoch": 5.862121675328646, "grad_norm": 1.8108114004135132, "learning_rate": 2.5195533098382236e-05, "loss": 0.1636, "step": 19175 }, { "epoch": 5.862427392234791, "grad_norm": 0.4530941843986511, "learning_rate": 2.5195108487962295e-05, "loss": 0.1739, "step": 19176 }, { "epoch": 5.862733109140936, "grad_norm": 0.6948827505111694, "learning_rate": 2.5194683877542357e-05, "loss": 0.1659, "step": 19177 }, { "epoch": 5.86303882604708, "grad_norm": 0.6085458397865295, "learning_rate": 2.5194259267122416e-05, "loss": 0.1704, "step": 19178 }, { "epoch": 5.863344542953225, "grad_norm": 0.995928168296814, "learning_rate": 2.5193834656702478e-05, "loss": 0.1516, "step": 19179 }, { "epoch": 5.86365025985937, "grad_norm": 1.0410878658294678, "learning_rate": 2.5193410046282537e-05, "loss": 0.2475, "step": 19180 }, { "epoch": 5.863955976765515, "grad_norm": 0.39201393723487854, "learning_rate": 2.51929854358626e-05, "loss": 0.1704, "step": 19181 }, { "epoch": 5.86426169367166, "grad_norm": 0.3216928541660309, "learning_rate": 2.5192560825442657e-05, "loss": 0.0837, "step": 19182 }, { "epoch": 5.864567410577805, "grad_norm": 0.16221840679645538, "learning_rate": 2.519213621502272e-05, "loss": 0.0562, "step": 19183 }, { "epoch": 5.86487312748395, "grad_norm": 0.18749643862247467, "learning_rate": 2.5191711604602778e-05, "loss": 0.0758, "step": 19184 }, { "epoch": 5.865178844390095, "grad_norm": 0.8569106459617615, "learning_rate": 2.519128699418284e-05, "loss": 0.0401, "step": 19185 }, { "epoch": 5.86548456129624, "grad_norm": 0.14707133173942566, "learning_rate": 2.51908623837629e-05, "loss": 0.0653, "step": 19186 }, { "epoch": 5.865790278202384, "grad_norm": 0.2835550308227539, "learning_rate": 2.5190437773342958e-05, "loss": 0.0532, "step": 19187 }, { "epoch": 5.866095995108529, "grad_norm": 0.29077479243278503, "learning_rate": 2.519001316292302e-05, "loss": 0.0751, "step": 19188 }, { "epoch": 5.8664017120146745, "grad_norm": 0.4437263011932373, "learning_rate": 2.518958855250308e-05, "loss": 0.0767, "step": 19189 }, { "epoch": 5.86670742892082, "grad_norm": 0.21729806065559387, "learning_rate": 2.518916394208314e-05, "loss": 0.0738, "step": 19190 }, { "epoch": 5.867013145826964, "grad_norm": 0.6840935945510864, "learning_rate": 2.51887393316632e-05, "loss": 0.0827, "step": 19191 }, { "epoch": 5.867318862733109, "grad_norm": 0.3250899910926819, "learning_rate": 2.518831472124326e-05, "loss": 0.103, "step": 19192 }, { "epoch": 5.867624579639254, "grad_norm": 0.620754599571228, "learning_rate": 2.518789011082332e-05, "loss": 0.1134, "step": 19193 }, { "epoch": 5.867930296545399, "grad_norm": 0.5067532658576965, "learning_rate": 2.5187465500403382e-05, "loss": 0.096, "step": 19194 }, { "epoch": 5.868236013451543, "grad_norm": 0.6893195509910583, "learning_rate": 2.518704088998344e-05, "loss": 0.117, "step": 19195 }, { "epoch": 5.868541730357689, "grad_norm": 0.69980788230896, "learning_rate": 2.5186616279563503e-05, "loss": 0.136, "step": 19196 }, { "epoch": 5.868847447263834, "grad_norm": 2.030735969543457, "learning_rate": 2.518619166914356e-05, "loss": 0.1146, "step": 19197 }, { "epoch": 5.869153164169979, "grad_norm": 1.1113120317459106, "learning_rate": 2.5185767058723624e-05, "loss": 0.1354, "step": 19198 }, { "epoch": 5.869458881076124, "grad_norm": 1.6010582447052002, "learning_rate": 2.5185342448303682e-05, "loss": 0.1772, "step": 19199 }, { "epoch": 5.869764597982268, "grad_norm": 2.5533130168914795, "learning_rate": 2.518491783788374e-05, "loss": 0.2006, "step": 19200 }, { "epoch": 5.870070314888413, "grad_norm": 1.50563383102417, "learning_rate": 2.5184493227463803e-05, "loss": 0.181, "step": 19201 }, { "epoch": 5.870376031794558, "grad_norm": 1.0427240133285522, "learning_rate": 2.5184068617043862e-05, "loss": 0.2325, "step": 19202 }, { "epoch": 5.8706817487007035, "grad_norm": 1.4093061685562134, "learning_rate": 2.5183644006623924e-05, "loss": 0.1565, "step": 19203 }, { "epoch": 5.870987465606848, "grad_norm": 1.6389517784118652, "learning_rate": 2.5183219396203983e-05, "loss": 0.1782, "step": 19204 }, { "epoch": 5.871293182512993, "grad_norm": 11.893939018249512, "learning_rate": 2.5182794785784045e-05, "loss": 0.2118, "step": 19205 }, { "epoch": 5.871598899419138, "grad_norm": 0.4066401422023773, "learning_rate": 2.5182370175364103e-05, "loss": 0.1482, "step": 19206 }, { "epoch": 5.871904616325283, "grad_norm": 0.3811056911945343, "learning_rate": 2.5181945564944166e-05, "loss": 0.0802, "step": 19207 }, { "epoch": 5.872210333231427, "grad_norm": 0.2527361512184143, "learning_rate": 2.5181520954524224e-05, "loss": 0.0676, "step": 19208 }, { "epoch": 5.872516050137572, "grad_norm": 0.7376211285591125, "learning_rate": 2.5181096344104286e-05, "loss": 0.0713, "step": 19209 }, { "epoch": 5.8728217670437175, "grad_norm": 0.28652825951576233, "learning_rate": 2.5180671733684345e-05, "loss": 0.0513, "step": 19210 }, { "epoch": 5.873127483949863, "grad_norm": 0.2226138561964035, "learning_rate": 2.5180247123264407e-05, "loss": 0.072, "step": 19211 }, { "epoch": 5.873433200856008, "grad_norm": 0.2942057251930237, "learning_rate": 2.5179822512844466e-05, "loss": 0.066, "step": 19212 }, { "epoch": 5.873738917762152, "grad_norm": 0.3146311044692993, "learning_rate": 2.5179397902424525e-05, "loss": 0.0599, "step": 19213 }, { "epoch": 5.874044634668297, "grad_norm": 0.2903156876564026, "learning_rate": 2.5178973292004587e-05, "loss": 0.0734, "step": 19214 }, { "epoch": 5.874350351574442, "grad_norm": 0.4358319640159607, "learning_rate": 2.5178548681584645e-05, "loss": 0.0795, "step": 19215 }, { "epoch": 5.874656068480587, "grad_norm": 0.412552148103714, "learning_rate": 2.5178124071164707e-05, "loss": 0.107, "step": 19216 }, { "epoch": 5.8749617853867315, "grad_norm": 0.48161956667900085, "learning_rate": 2.5177699460744766e-05, "loss": 0.0971, "step": 19217 }, { "epoch": 5.875267502292877, "grad_norm": 0.6414896845817566, "learning_rate": 2.5177274850324828e-05, "loss": 0.1095, "step": 19218 }, { "epoch": 5.875573219199022, "grad_norm": 0.5457608699798584, "learning_rate": 2.5176850239904887e-05, "loss": 0.0817, "step": 19219 }, { "epoch": 5.875878936105167, "grad_norm": 0.4220584034919739, "learning_rate": 2.517642562948495e-05, "loss": 0.1494, "step": 19220 }, { "epoch": 5.876184653011311, "grad_norm": 0.4292669892311096, "learning_rate": 2.5176001019065008e-05, "loss": 0.1407, "step": 19221 }, { "epoch": 5.876490369917456, "grad_norm": 0.7266040444374084, "learning_rate": 2.517557640864507e-05, "loss": 0.1912, "step": 19222 }, { "epoch": 5.876796086823601, "grad_norm": 0.9692152142524719, "learning_rate": 2.517515179822513e-05, "loss": 0.1853, "step": 19223 }, { "epoch": 5.877101803729746, "grad_norm": 0.8977276086807251, "learning_rate": 2.517472718780519e-05, "loss": 0.1647, "step": 19224 }, { "epoch": 5.877407520635892, "grad_norm": 0.7464172840118408, "learning_rate": 2.517430257738525e-05, "loss": 0.1615, "step": 19225 }, { "epoch": 5.877713237542036, "grad_norm": 0.5374807119369507, "learning_rate": 2.5173877966965308e-05, "loss": 0.19, "step": 19226 }, { "epoch": 5.878018954448181, "grad_norm": 0.7495740652084351, "learning_rate": 2.517345335654537e-05, "loss": 0.1776, "step": 19227 }, { "epoch": 5.878324671354326, "grad_norm": 1.8599618673324585, "learning_rate": 2.517302874612543e-05, "loss": 0.1768, "step": 19228 }, { "epoch": 5.878630388260471, "grad_norm": 1.3268779516220093, "learning_rate": 2.517260413570549e-05, "loss": 0.1762, "step": 19229 }, { "epoch": 5.878936105166615, "grad_norm": 1.166237711906433, "learning_rate": 2.517217952528555e-05, "loss": 0.233, "step": 19230 }, { "epoch": 5.8792418220727605, "grad_norm": 0.2983999252319336, "learning_rate": 2.517175491486561e-05, "loss": 0.1206, "step": 19231 }, { "epoch": 5.879547538978906, "grad_norm": 0.4388585686683655, "learning_rate": 2.517133030444567e-05, "loss": 0.1001, "step": 19232 }, { "epoch": 5.879853255885051, "grad_norm": 0.3573589324951172, "learning_rate": 2.5170905694025732e-05, "loss": 0.0656, "step": 19233 }, { "epoch": 5.880158972791195, "grad_norm": 0.2884696424007416, "learning_rate": 2.517048108360579e-05, "loss": 0.0582, "step": 19234 }, { "epoch": 5.88046468969734, "grad_norm": 0.34900036454200745, "learning_rate": 2.5170056473185853e-05, "loss": 0.0542, "step": 19235 }, { "epoch": 5.880770406603485, "grad_norm": 0.15725117921829224, "learning_rate": 2.5169631862765912e-05, "loss": 0.0466, "step": 19236 }, { "epoch": 5.88107612350963, "grad_norm": 0.38328707218170166, "learning_rate": 2.5169207252345974e-05, "loss": 0.0557, "step": 19237 }, { "epoch": 5.881381840415775, "grad_norm": 0.1338314712047577, "learning_rate": 2.5168782641926033e-05, "loss": 0.0549, "step": 19238 }, { "epoch": 5.88168755732192, "grad_norm": 0.3749690353870392, "learning_rate": 2.516835803150609e-05, "loss": 0.0981, "step": 19239 }, { "epoch": 5.881993274228065, "grad_norm": 0.4031566381454468, "learning_rate": 2.5167933421086153e-05, "loss": 0.0862, "step": 19240 }, { "epoch": 5.88229899113421, "grad_norm": 0.26185858249664307, "learning_rate": 2.5167508810666212e-05, "loss": 0.0749, "step": 19241 }, { "epoch": 5.882604708040355, "grad_norm": 0.8116191625595093, "learning_rate": 2.5167084200246274e-05, "loss": 0.0932, "step": 19242 }, { "epoch": 5.882910424946499, "grad_norm": 0.4166949689388275, "learning_rate": 2.5166659589826333e-05, "loss": 0.0914, "step": 19243 }, { "epoch": 5.883216141852644, "grad_norm": 0.5345158576965332, "learning_rate": 2.5166234979406395e-05, "loss": 0.0941, "step": 19244 }, { "epoch": 5.883521858758789, "grad_norm": 0.9315260052680969, "learning_rate": 2.5165810368986454e-05, "loss": 0.1249, "step": 19245 }, { "epoch": 5.8838275756649345, "grad_norm": 0.6645663976669312, "learning_rate": 2.5165385758566516e-05, "loss": 0.1254, "step": 19246 }, { "epoch": 5.884133292571079, "grad_norm": 1.4170652627944946, "learning_rate": 2.5164961148146575e-05, "loss": 0.153, "step": 19247 }, { "epoch": 5.884439009477224, "grad_norm": 1.309107780456543, "learning_rate": 2.5164536537726637e-05, "loss": 0.1472, "step": 19248 }, { "epoch": 5.884744726383369, "grad_norm": 1.0216130018234253, "learning_rate": 2.5164111927306695e-05, "loss": 0.1578, "step": 19249 }, { "epoch": 5.885050443289514, "grad_norm": 0.5785285830497742, "learning_rate": 2.5163687316886754e-05, "loss": 0.175, "step": 19250 }, { "epoch": 5.885356160195659, "grad_norm": 0.8135616779327393, "learning_rate": 2.5163262706466816e-05, "loss": 0.1863, "step": 19251 }, { "epoch": 5.885661877101803, "grad_norm": 0.9957015514373779, "learning_rate": 2.5162838096046875e-05, "loss": 0.1708, "step": 19252 }, { "epoch": 5.8859675940079486, "grad_norm": 0.6178616285324097, "learning_rate": 2.5162413485626937e-05, "loss": 0.2112, "step": 19253 }, { "epoch": 5.886273310914094, "grad_norm": 0.6888741850852966, "learning_rate": 2.5161988875206996e-05, "loss": 0.1716, "step": 19254 }, { "epoch": 5.886579027820239, "grad_norm": 0.8947192430496216, "learning_rate": 2.5161564264787058e-05, "loss": 0.1584, "step": 19255 }, { "epoch": 5.886884744726383, "grad_norm": 0.4479350745677948, "learning_rate": 2.5161139654367116e-05, "loss": 0.1397, "step": 19256 }, { "epoch": 5.887190461632528, "grad_norm": 0.7320625185966492, "learning_rate": 2.516071504394718e-05, "loss": 0.1047, "step": 19257 }, { "epoch": 5.887496178538673, "grad_norm": 0.5740407109260559, "learning_rate": 2.5160290433527237e-05, "loss": 0.0532, "step": 19258 }, { "epoch": 5.887801895444818, "grad_norm": 0.9562541842460632, "learning_rate": 2.51598658231073e-05, "loss": 0.0405, "step": 19259 }, { "epoch": 5.888107612350963, "grad_norm": 0.2609972655773163, "learning_rate": 2.5159441212687358e-05, "loss": 0.0523, "step": 19260 }, { "epoch": 5.888413329257108, "grad_norm": 0.21863892674446106, "learning_rate": 2.515901660226742e-05, "loss": 0.0595, "step": 19261 }, { "epoch": 5.888719046163253, "grad_norm": 0.2180890291929245, "learning_rate": 2.5158591991847482e-05, "loss": 0.0459, "step": 19262 }, { "epoch": 5.889024763069398, "grad_norm": 0.3501117527484894, "learning_rate": 2.515816738142754e-05, "loss": 0.069, "step": 19263 }, { "epoch": 5.889330479975543, "grad_norm": 0.31091558933258057, "learning_rate": 2.5157742771007603e-05, "loss": 0.0563, "step": 19264 }, { "epoch": 5.889636196881687, "grad_norm": 0.2636590898036957, "learning_rate": 2.515731816058766e-05, "loss": 0.0806, "step": 19265 }, { "epoch": 5.889941913787832, "grad_norm": 0.6053327322006226, "learning_rate": 2.5156893550167724e-05, "loss": 0.0771, "step": 19266 }, { "epoch": 5.8902476306939775, "grad_norm": 0.18837866187095642, "learning_rate": 2.5156468939747782e-05, "loss": 0.0731, "step": 19267 }, { "epoch": 5.890553347600123, "grad_norm": 0.639278769493103, "learning_rate": 2.5156044329327844e-05, "loss": 0.1418, "step": 19268 }, { "epoch": 5.890859064506267, "grad_norm": 0.6608238816261292, "learning_rate": 2.5155619718907903e-05, "loss": 0.0738, "step": 19269 }, { "epoch": 5.891164781412412, "grad_norm": 0.7016557455062866, "learning_rate": 2.5155195108487965e-05, "loss": 0.1254, "step": 19270 }, { "epoch": 5.891470498318557, "grad_norm": 0.5010502338409424, "learning_rate": 2.5154770498068024e-05, "loss": 0.112, "step": 19271 }, { "epoch": 5.891776215224702, "grad_norm": 0.9330998063087463, "learning_rate": 2.5154345887648086e-05, "loss": 0.1396, "step": 19272 }, { "epoch": 5.892081932130846, "grad_norm": 0.9307645559310913, "learning_rate": 2.5153921277228145e-05, "loss": 0.1768, "step": 19273 }, { "epoch": 5.8923876490369915, "grad_norm": 0.9150912761688232, "learning_rate": 2.5153496666808207e-05, "loss": 0.1849, "step": 19274 }, { "epoch": 5.892693365943137, "grad_norm": 0.47753071784973145, "learning_rate": 2.5153072056388266e-05, "loss": 0.1441, "step": 19275 }, { "epoch": 5.892999082849282, "grad_norm": 0.665471076965332, "learning_rate": 2.5152647445968324e-05, "loss": 0.1585, "step": 19276 }, { "epoch": 5.893304799755427, "grad_norm": 1.7375291585922241, "learning_rate": 2.5152222835548386e-05, "loss": 0.168, "step": 19277 }, { "epoch": 5.893610516661571, "grad_norm": 0.7155013084411621, "learning_rate": 2.5151798225128445e-05, "loss": 0.1822, "step": 19278 }, { "epoch": 5.893916233567716, "grad_norm": 0.9998350143432617, "learning_rate": 2.5151373614708507e-05, "loss": 0.2206, "step": 19279 }, { "epoch": 5.894221950473861, "grad_norm": 0.8713080883026123, "learning_rate": 2.5150949004288566e-05, "loss": 0.1915, "step": 19280 }, { "epoch": 5.894527667380006, "grad_norm": 0.402566134929657, "learning_rate": 2.5150524393868628e-05, "loss": 0.1552, "step": 19281 }, { "epoch": 5.894833384286151, "grad_norm": 0.22498957812786102, "learning_rate": 2.5150099783448687e-05, "loss": 0.0834, "step": 19282 }, { "epoch": 5.895139101192296, "grad_norm": 0.3202205300331116, "learning_rate": 2.514967517302875e-05, "loss": 0.0592, "step": 19283 }, { "epoch": 5.895444818098441, "grad_norm": 0.6852295398712158, "learning_rate": 2.5149250562608807e-05, "loss": 0.0406, "step": 19284 }, { "epoch": 5.895750535004586, "grad_norm": 0.19600075483322144, "learning_rate": 2.514882595218887e-05, "loss": 0.0564, "step": 19285 }, { "epoch": 5.89605625191073, "grad_norm": 0.449668824672699, "learning_rate": 2.5148401341768928e-05, "loss": 0.0877, "step": 19286 }, { "epoch": 5.896361968816875, "grad_norm": 0.65451580286026, "learning_rate": 2.514797673134899e-05, "loss": 0.0419, "step": 19287 }, { "epoch": 5.8966676857230205, "grad_norm": 0.19185805320739746, "learning_rate": 2.514755212092905e-05, "loss": 0.0547, "step": 19288 }, { "epoch": 5.896973402629166, "grad_norm": 0.7627127170562744, "learning_rate": 2.5147127510509108e-05, "loss": 0.0853, "step": 19289 }, { "epoch": 5.897279119535311, "grad_norm": 0.4129268527030945, "learning_rate": 2.514670290008917e-05, "loss": 0.07, "step": 19290 }, { "epoch": 5.897584836441455, "grad_norm": 0.3280611038208008, "learning_rate": 2.514627828966923e-05, "loss": 0.0863, "step": 19291 }, { "epoch": 5.8978905533476, "grad_norm": 0.3465743362903595, "learning_rate": 2.514585367924929e-05, "loss": 0.0985, "step": 19292 }, { "epoch": 5.898196270253745, "grad_norm": 0.501606822013855, "learning_rate": 2.514542906882935e-05, "loss": 0.1193, "step": 19293 }, { "epoch": 5.89850198715989, "grad_norm": 0.9035157561302185, "learning_rate": 2.514500445840941e-05, "loss": 0.151, "step": 19294 }, { "epoch": 5.8988077040660345, "grad_norm": 0.9730497002601624, "learning_rate": 2.514457984798947e-05, "loss": 0.117, "step": 19295 }, { "epoch": 5.89911342097218, "grad_norm": 0.6614850163459778, "learning_rate": 2.5144155237569532e-05, "loss": 0.1206, "step": 19296 }, { "epoch": 5.899419137878325, "grad_norm": 0.8110998868942261, "learning_rate": 2.514373062714959e-05, "loss": 0.152, "step": 19297 }, { "epoch": 5.89972485478447, "grad_norm": 0.7681334018707275, "learning_rate": 2.5143306016729653e-05, "loss": 0.1565, "step": 19298 }, { "epoch": 5.900030571690614, "grad_norm": 0.7729600071907043, "learning_rate": 2.514288140630971e-05, "loss": 0.1915, "step": 19299 }, { "epoch": 5.900336288596759, "grad_norm": 0.9222761392593384, "learning_rate": 2.5142456795889774e-05, "loss": 0.1933, "step": 19300 }, { "epoch": 5.900642005502904, "grad_norm": 0.7130700945854187, "learning_rate": 2.5142032185469832e-05, "loss": 0.1736, "step": 19301 }, { "epoch": 5.900947722409049, "grad_norm": 1.4368807077407837, "learning_rate": 2.514160757504989e-05, "loss": 0.1882, "step": 19302 }, { "epoch": 5.9012534393151945, "grad_norm": 1.806085467338562, "learning_rate": 2.5141182964629953e-05, "loss": 0.1756, "step": 19303 }, { "epoch": 5.901559156221339, "grad_norm": 0.947275698184967, "learning_rate": 2.5140758354210012e-05, "loss": 0.1924, "step": 19304 }, { "epoch": 5.901864873127484, "grad_norm": 1.2720612287521362, "learning_rate": 2.5140333743790074e-05, "loss": 0.2524, "step": 19305 }, { "epoch": 5.902170590033629, "grad_norm": 0.627963125705719, "learning_rate": 2.5139909133370133e-05, "loss": 0.1412, "step": 19306 }, { "epoch": 5.902476306939774, "grad_norm": 0.2680966854095459, "learning_rate": 2.5139484522950195e-05, "loss": 0.104, "step": 19307 }, { "epoch": 5.902782023845918, "grad_norm": 0.2798987030982971, "learning_rate": 2.5139059912530253e-05, "loss": 0.0666, "step": 19308 }, { "epoch": 5.903087740752063, "grad_norm": 0.18553538620471954, "learning_rate": 2.5138635302110316e-05, "loss": 0.0746, "step": 19309 }, { "epoch": 5.9033934576582086, "grad_norm": 0.37357980012893677, "learning_rate": 2.5138210691690374e-05, "loss": 0.0642, "step": 19310 }, { "epoch": 5.903699174564354, "grad_norm": 0.39201727509498596, "learning_rate": 2.5137786081270436e-05, "loss": 0.0632, "step": 19311 }, { "epoch": 5.904004891470498, "grad_norm": 0.24874450266361237, "learning_rate": 2.5137361470850495e-05, "loss": 0.0448, "step": 19312 }, { "epoch": 5.904310608376643, "grad_norm": 0.24979637563228607, "learning_rate": 2.5136936860430557e-05, "loss": 0.0758, "step": 19313 }, { "epoch": 5.904616325282788, "grad_norm": 0.3114820122718811, "learning_rate": 2.5136512250010616e-05, "loss": 0.0696, "step": 19314 }, { "epoch": 5.904922042188933, "grad_norm": 0.31115126609802246, "learning_rate": 2.5136087639590675e-05, "loss": 0.0487, "step": 19315 }, { "epoch": 5.905227759095078, "grad_norm": 0.28700152039527893, "learning_rate": 2.5135663029170737e-05, "loss": 0.0917, "step": 19316 }, { "epoch": 5.905533476001223, "grad_norm": 0.3071611225605011, "learning_rate": 2.5135238418750795e-05, "loss": 0.0763, "step": 19317 }, { "epoch": 5.905839192907368, "grad_norm": 0.3638869822025299, "learning_rate": 2.5134813808330857e-05, "loss": 0.0806, "step": 19318 }, { "epoch": 5.906144909813513, "grad_norm": 0.907365620136261, "learning_rate": 2.5134389197910916e-05, "loss": 0.1115, "step": 19319 }, { "epoch": 5.906450626719658, "grad_norm": 0.5328813791275024, "learning_rate": 2.5133964587490978e-05, "loss": 0.1345, "step": 19320 }, { "epoch": 5.906756343625802, "grad_norm": 0.5128492116928101, "learning_rate": 2.5133539977071037e-05, "loss": 0.1443, "step": 19321 }, { "epoch": 5.907062060531947, "grad_norm": 0.4374108612537384, "learning_rate": 2.51331153666511e-05, "loss": 0.1638, "step": 19322 }, { "epoch": 5.907367777438092, "grad_norm": 0.871762752532959, "learning_rate": 2.5132690756231158e-05, "loss": 0.1754, "step": 19323 }, { "epoch": 5.9076734943442375, "grad_norm": 0.7301279306411743, "learning_rate": 2.513226614581122e-05, "loss": 0.1724, "step": 19324 }, { "epoch": 5.907979211250382, "grad_norm": 0.5457285642623901, "learning_rate": 2.513184153539128e-05, "loss": 0.1634, "step": 19325 }, { "epoch": 5.908284928156527, "grad_norm": 0.7590218782424927, "learning_rate": 2.513141692497134e-05, "loss": 0.1614, "step": 19326 }, { "epoch": 5.908590645062672, "grad_norm": 0.61636883020401, "learning_rate": 2.51309923145514e-05, "loss": 0.1692, "step": 19327 }, { "epoch": 5.908896361968817, "grad_norm": 3.4559507369995117, "learning_rate": 2.5130567704131458e-05, "loss": 0.2008, "step": 19328 }, { "epoch": 5.909202078874962, "grad_norm": 0.9150338172912598, "learning_rate": 2.513014309371152e-05, "loss": 0.1896, "step": 19329 }, { "epoch": 5.909507795781106, "grad_norm": 2.7075281143188477, "learning_rate": 2.512971848329158e-05, "loss": 0.2297, "step": 19330 }, { "epoch": 5.9098135126872515, "grad_norm": 0.36220529675483704, "learning_rate": 2.512929387287164e-05, "loss": 0.1516, "step": 19331 }, { "epoch": 5.910119229593397, "grad_norm": 0.2881115972995758, "learning_rate": 2.51288692624517e-05, "loss": 0.0681, "step": 19332 }, { "epoch": 5.910424946499542, "grad_norm": 0.25197121500968933, "learning_rate": 2.512844465203176e-05, "loss": 0.0658, "step": 19333 }, { "epoch": 5.910730663405686, "grad_norm": 0.2380155324935913, "learning_rate": 2.512802004161182e-05, "loss": 0.0576, "step": 19334 }, { "epoch": 5.911036380311831, "grad_norm": 0.3455945551395416, "learning_rate": 2.5127595431191882e-05, "loss": 0.0587, "step": 19335 }, { "epoch": 5.911342097217976, "grad_norm": 0.3227989971637726, "learning_rate": 2.512717082077194e-05, "loss": 0.039, "step": 19336 }, { "epoch": 5.911647814124121, "grad_norm": 0.195307195186615, "learning_rate": 2.5126746210352003e-05, "loss": 0.062, "step": 19337 }, { "epoch": 5.9119535310302656, "grad_norm": 0.3111739456653595, "learning_rate": 2.5126321599932062e-05, "loss": 0.076, "step": 19338 }, { "epoch": 5.912259247936411, "grad_norm": 0.28251972794532776, "learning_rate": 2.5125896989512124e-05, "loss": 0.0628, "step": 19339 }, { "epoch": 5.912564964842556, "grad_norm": 1.0548479557037354, "learning_rate": 2.5125472379092183e-05, "loss": 0.0727, "step": 19340 }, { "epoch": 5.912870681748701, "grad_norm": 2.009272336959839, "learning_rate": 2.512504776867224e-05, "loss": 0.0734, "step": 19341 }, { "epoch": 5.913176398654846, "grad_norm": 0.3186144530773163, "learning_rate": 2.5124623158252303e-05, "loss": 0.0687, "step": 19342 }, { "epoch": 5.91348211556099, "grad_norm": 0.21302063763141632, "learning_rate": 2.5124198547832362e-05, "loss": 0.0862, "step": 19343 }, { "epoch": 5.913787832467135, "grad_norm": 0.43608367443084717, "learning_rate": 2.5123773937412424e-05, "loss": 0.1051, "step": 19344 }, { "epoch": 5.9140935493732805, "grad_norm": 0.7218865752220154, "learning_rate": 2.5123349326992483e-05, "loss": 0.1275, "step": 19345 }, { "epoch": 5.914399266279426, "grad_norm": 0.7283331751823425, "learning_rate": 2.5122924716572545e-05, "loss": 0.1254, "step": 19346 }, { "epoch": 5.91470498318557, "grad_norm": 0.6846649050712585, "learning_rate": 2.5122500106152604e-05, "loss": 0.1188, "step": 19347 }, { "epoch": 5.915010700091715, "grad_norm": 0.7272336483001709, "learning_rate": 2.5122075495732666e-05, "loss": 0.1609, "step": 19348 }, { "epoch": 5.91531641699786, "grad_norm": 1.6203926801681519, "learning_rate": 2.5121650885312725e-05, "loss": 0.1548, "step": 19349 }, { "epoch": 5.915622133904005, "grad_norm": 0.6680417060852051, "learning_rate": 2.5121226274892787e-05, "loss": 0.1934, "step": 19350 }, { "epoch": 5.915927850810149, "grad_norm": 0.7587728500366211, "learning_rate": 2.5120801664472845e-05, "loss": 0.1782, "step": 19351 }, { "epoch": 5.9162335677162945, "grad_norm": 1.5187528133392334, "learning_rate": 2.5120377054052907e-05, "loss": 0.1815, "step": 19352 }, { "epoch": 5.91653928462244, "grad_norm": 0.7287006378173828, "learning_rate": 2.5119952443632966e-05, "loss": 0.2122, "step": 19353 }, { "epoch": 5.916845001528585, "grad_norm": 1.0378364324569702, "learning_rate": 2.5119527833213025e-05, "loss": 0.2347, "step": 19354 }, { "epoch": 5.91715071843473, "grad_norm": 1.0599008798599243, "learning_rate": 2.5119103222793087e-05, "loss": 0.2351, "step": 19355 }, { "epoch": 5.917456435340874, "grad_norm": 0.45212435722351074, "learning_rate": 2.5118678612373146e-05, "loss": 0.1611, "step": 19356 }, { "epoch": 5.917762152247019, "grad_norm": 0.30199187994003296, "learning_rate": 2.5118254001953208e-05, "loss": 0.0822, "step": 19357 }, { "epoch": 5.918067869153164, "grad_norm": 0.28145214915275574, "learning_rate": 2.5117829391533266e-05, "loss": 0.0429, "step": 19358 }, { "epoch": 5.918373586059309, "grad_norm": 0.6468340754508972, "learning_rate": 2.511740478111333e-05, "loss": 0.0832, "step": 19359 }, { "epoch": 5.918679302965454, "grad_norm": 0.3356952369213104, "learning_rate": 2.5116980170693387e-05, "loss": 0.0671, "step": 19360 }, { "epoch": 5.918985019871599, "grad_norm": 0.11755373328924179, "learning_rate": 2.511655556027345e-05, "loss": 0.0396, "step": 19361 }, { "epoch": 5.919290736777744, "grad_norm": 0.33308470249176025, "learning_rate": 2.5116130949853508e-05, "loss": 0.0458, "step": 19362 }, { "epoch": 5.919596453683889, "grad_norm": 0.36584314703941345, "learning_rate": 2.511570633943357e-05, "loss": 0.1077, "step": 19363 }, { "epoch": 5.919902170590033, "grad_norm": 0.33288946747779846, "learning_rate": 2.5115281729013632e-05, "loss": 0.0628, "step": 19364 }, { "epoch": 5.920207887496178, "grad_norm": 0.4057868421077728, "learning_rate": 2.511485711859369e-05, "loss": 0.0619, "step": 19365 }, { "epoch": 5.920513604402323, "grad_norm": 0.23282670974731445, "learning_rate": 2.5114432508173753e-05, "loss": 0.0829, "step": 19366 }, { "epoch": 5.9208193213084686, "grad_norm": 0.5698826909065247, "learning_rate": 2.511400789775381e-05, "loss": 0.0669, "step": 19367 }, { "epoch": 5.921125038214614, "grad_norm": 0.3565516471862793, "learning_rate": 2.5113583287333874e-05, "loss": 0.0791, "step": 19368 }, { "epoch": 5.921430755120758, "grad_norm": 0.2896840572357178, "learning_rate": 2.5113158676913932e-05, "loss": 0.0858, "step": 19369 }, { "epoch": 5.921736472026903, "grad_norm": 0.3069559633731842, "learning_rate": 2.5112734066493995e-05, "loss": 0.1123, "step": 19370 }, { "epoch": 5.922042188933048, "grad_norm": 0.34320345520973206, "learning_rate": 2.5112309456074053e-05, "loss": 0.1237, "step": 19371 }, { "epoch": 5.922347905839193, "grad_norm": 0.5666184425354004, "learning_rate": 2.5111884845654115e-05, "loss": 0.1717, "step": 19372 }, { "epoch": 5.9226536227453375, "grad_norm": 1.459834098815918, "learning_rate": 2.5111460235234174e-05, "loss": 0.1451, "step": 19373 }, { "epoch": 5.922959339651483, "grad_norm": 0.8346847891807556, "learning_rate": 2.5111035624814236e-05, "loss": 0.1451, "step": 19374 }, { "epoch": 5.923265056557628, "grad_norm": 0.5670163631439209, "learning_rate": 2.5110611014394295e-05, "loss": 0.1883, "step": 19375 }, { "epoch": 5.923570773463773, "grad_norm": 0.9293971061706543, "learning_rate": 2.5110186403974357e-05, "loss": 0.1732, "step": 19376 }, { "epoch": 5.923876490369917, "grad_norm": 1.2193454504013062, "learning_rate": 2.5109761793554416e-05, "loss": 0.178, "step": 19377 }, { "epoch": 5.924182207276062, "grad_norm": 1.7837120294570923, "learning_rate": 2.5109337183134474e-05, "loss": 0.1734, "step": 19378 }, { "epoch": 5.924487924182207, "grad_norm": 1.0031510591506958, "learning_rate": 2.5108912572714536e-05, "loss": 0.1673, "step": 19379 }, { "epoch": 5.924793641088352, "grad_norm": 1.4230597019195557, "learning_rate": 2.5108487962294595e-05, "loss": 0.2535, "step": 19380 }, { "epoch": 5.9250993579944975, "grad_norm": 0.3675248920917511, "learning_rate": 2.5108063351874657e-05, "loss": 0.1302, "step": 19381 }, { "epoch": 5.925405074900642, "grad_norm": 0.19947801530361176, "learning_rate": 2.5107638741454716e-05, "loss": 0.0639, "step": 19382 }, { "epoch": 5.925710791806787, "grad_norm": 0.19140473008155823, "learning_rate": 2.5107214131034778e-05, "loss": 0.0859, "step": 19383 }, { "epoch": 5.926016508712932, "grad_norm": 0.22323618829250336, "learning_rate": 2.5106789520614837e-05, "loss": 0.0582, "step": 19384 }, { "epoch": 5.926322225619077, "grad_norm": 0.21633540093898773, "learning_rate": 2.51063649101949e-05, "loss": 0.0616, "step": 19385 }, { "epoch": 5.926627942525221, "grad_norm": 0.27341073751449585, "learning_rate": 2.5105940299774957e-05, "loss": 0.0747, "step": 19386 }, { "epoch": 5.926933659431366, "grad_norm": 0.1573062688112259, "learning_rate": 2.510551568935502e-05, "loss": 0.0509, "step": 19387 }, { "epoch": 5.9272393763375115, "grad_norm": 0.5726405382156372, "learning_rate": 2.5105091078935078e-05, "loss": 0.0473, "step": 19388 }, { "epoch": 5.927545093243657, "grad_norm": 0.32459163665771484, "learning_rate": 2.510466646851514e-05, "loss": 0.0725, "step": 19389 }, { "epoch": 5.927850810149801, "grad_norm": 1.3135948181152344, "learning_rate": 2.51042418580952e-05, "loss": 0.0595, "step": 19390 }, { "epoch": 5.928156527055946, "grad_norm": 0.5969603657722473, "learning_rate": 2.5103817247675258e-05, "loss": 0.0792, "step": 19391 }, { "epoch": 5.928462243962091, "grad_norm": 1.018286108970642, "learning_rate": 2.510339263725532e-05, "loss": 0.0617, "step": 19392 }, { "epoch": 5.928767960868236, "grad_norm": 0.3703102767467499, "learning_rate": 2.510296802683538e-05, "loss": 0.1052, "step": 19393 }, { "epoch": 5.929073677774381, "grad_norm": 0.40961623191833496, "learning_rate": 2.510254341641544e-05, "loss": 0.1218, "step": 19394 }, { "epoch": 5.9293793946805256, "grad_norm": 0.4647846817970276, "learning_rate": 2.51021188059955e-05, "loss": 0.1264, "step": 19395 }, { "epoch": 5.929685111586671, "grad_norm": 0.5654824376106262, "learning_rate": 2.510169419557556e-05, "loss": 0.1233, "step": 19396 }, { "epoch": 5.929990828492816, "grad_norm": 0.4264219105243683, "learning_rate": 2.510126958515562e-05, "loss": 0.1442, "step": 19397 }, { "epoch": 5.930296545398961, "grad_norm": 0.41914093494415283, "learning_rate": 2.5100844974735682e-05, "loss": 0.1798, "step": 19398 }, { "epoch": 5.930602262305105, "grad_norm": 7.2592058181762695, "learning_rate": 2.510042036431574e-05, "loss": 0.1703, "step": 19399 }, { "epoch": 5.93090797921125, "grad_norm": 1.0752229690551758, "learning_rate": 2.5099995753895803e-05, "loss": 0.1891, "step": 19400 }, { "epoch": 5.931213696117395, "grad_norm": 1.420858383178711, "learning_rate": 2.509957114347586e-05, "loss": 0.1858, "step": 19401 }, { "epoch": 5.9315194130235405, "grad_norm": 0.9295209050178528, "learning_rate": 2.5099146533055924e-05, "loss": 0.1695, "step": 19402 }, { "epoch": 5.931825129929685, "grad_norm": 0.5501106977462769, "learning_rate": 2.5098721922635982e-05, "loss": 0.198, "step": 19403 }, { "epoch": 5.93213084683583, "grad_norm": 1.4369579553604126, "learning_rate": 2.509829731221604e-05, "loss": 0.1988, "step": 19404 }, { "epoch": 5.932436563741975, "grad_norm": 1.1379343271255493, "learning_rate": 2.5097872701796103e-05, "loss": 0.2265, "step": 19405 }, { "epoch": 5.93274228064812, "grad_norm": 0.47468113899230957, "learning_rate": 2.5097448091376162e-05, "loss": 0.15, "step": 19406 }, { "epoch": 5.933047997554265, "grad_norm": 0.49036866426467896, "learning_rate": 2.5097023480956224e-05, "loss": 0.0746, "step": 19407 }, { "epoch": 5.933353714460409, "grad_norm": 0.2539461851119995, "learning_rate": 2.5096598870536283e-05, "loss": 0.0681, "step": 19408 }, { "epoch": 5.9336594313665545, "grad_norm": 0.5319697260856628, "learning_rate": 2.5096174260116345e-05, "loss": 0.0671, "step": 19409 }, { "epoch": 5.9339651482727, "grad_norm": 0.1978350281715393, "learning_rate": 2.5095749649696404e-05, "loss": 0.0496, "step": 19410 }, { "epoch": 5.934270865178845, "grad_norm": 0.18242834508419037, "learning_rate": 2.5095325039276466e-05, "loss": 0.0473, "step": 19411 }, { "epoch": 5.934576582084989, "grad_norm": 0.28807616233825684, "learning_rate": 2.5094900428856524e-05, "loss": 0.0531, "step": 19412 }, { "epoch": 5.934882298991134, "grad_norm": 0.25577840209007263, "learning_rate": 2.5094475818436586e-05, "loss": 0.0564, "step": 19413 }, { "epoch": 5.935188015897279, "grad_norm": 0.3416120707988739, "learning_rate": 2.5094051208016645e-05, "loss": 0.0801, "step": 19414 }, { "epoch": 5.935493732803424, "grad_norm": 0.257282555103302, "learning_rate": 2.5093626597596707e-05, "loss": 0.0689, "step": 19415 }, { "epoch": 5.9357994497095685, "grad_norm": 0.33939364552497864, "learning_rate": 2.5093201987176766e-05, "loss": 0.0898, "step": 19416 }, { "epoch": 5.936105166615714, "grad_norm": 0.3091033399105072, "learning_rate": 2.5092777376756825e-05, "loss": 0.0834, "step": 19417 }, { "epoch": 5.936410883521859, "grad_norm": 0.3817943036556244, "learning_rate": 2.5092352766336887e-05, "loss": 0.0763, "step": 19418 }, { "epoch": 5.936716600428004, "grad_norm": 0.44353994727134705, "learning_rate": 2.5091928155916945e-05, "loss": 0.0871, "step": 19419 }, { "epoch": 5.937022317334149, "grad_norm": 0.8078013062477112, "learning_rate": 2.5091503545497007e-05, "loss": 0.1353, "step": 19420 }, { "epoch": 5.937328034240293, "grad_norm": 0.6397669315338135, "learning_rate": 2.5091078935077066e-05, "loss": 0.1425, "step": 19421 }, { "epoch": 5.937633751146438, "grad_norm": 1.0873156785964966, "learning_rate": 2.5090654324657128e-05, "loss": 0.155, "step": 19422 }, { "epoch": 5.937939468052583, "grad_norm": 0.8075398206710815, "learning_rate": 2.5090229714237187e-05, "loss": 0.1488, "step": 19423 }, { "epoch": 5.9382451849587286, "grad_norm": 0.40754443407058716, "learning_rate": 2.508980510381725e-05, "loss": 0.1762, "step": 19424 }, { "epoch": 5.938550901864873, "grad_norm": 0.5978090167045593, "learning_rate": 2.5089380493397308e-05, "loss": 0.2397, "step": 19425 }, { "epoch": 5.938856618771018, "grad_norm": 0.5942800641059875, "learning_rate": 2.508895588297737e-05, "loss": 0.1601, "step": 19426 }, { "epoch": 5.939162335677163, "grad_norm": 2.3426904678344727, "learning_rate": 2.508853127255743e-05, "loss": 0.1761, "step": 19427 }, { "epoch": 5.939468052583308, "grad_norm": 1.195210576057434, "learning_rate": 2.508810666213749e-05, "loss": 0.1585, "step": 19428 }, { "epoch": 5.939773769489452, "grad_norm": 0.8649086356163025, "learning_rate": 2.508768205171755e-05, "loss": 0.1599, "step": 19429 }, { "epoch": 5.9400794863955975, "grad_norm": 1.195273756980896, "learning_rate": 2.5087257441297608e-05, "loss": 0.2577, "step": 19430 }, { "epoch": 5.940385203301743, "grad_norm": 0.2626422047615051, "learning_rate": 2.508683283087767e-05, "loss": 0.1241, "step": 19431 }, { "epoch": 5.940690920207888, "grad_norm": 0.4111546277999878, "learning_rate": 2.508640822045773e-05, "loss": 0.0809, "step": 19432 }, { "epoch": 5.940996637114033, "grad_norm": 0.23813453316688538, "learning_rate": 2.508598361003779e-05, "loss": 0.068, "step": 19433 }, { "epoch": 5.941302354020177, "grad_norm": 0.2558896541595459, "learning_rate": 2.508555899961785e-05, "loss": 0.0657, "step": 19434 }, { "epoch": 5.941608070926322, "grad_norm": 0.9077708721160889, "learning_rate": 2.508513438919791e-05, "loss": 0.0743, "step": 19435 }, { "epoch": 5.941913787832467, "grad_norm": 0.21459341049194336, "learning_rate": 2.508470977877797e-05, "loss": 0.0503, "step": 19436 }, { "epoch": 5.942219504738612, "grad_norm": 0.24791236221790314, "learning_rate": 2.5084285168358032e-05, "loss": 0.0633, "step": 19437 }, { "epoch": 5.942525221644757, "grad_norm": 0.1933252066373825, "learning_rate": 2.508386055793809e-05, "loss": 0.0505, "step": 19438 }, { "epoch": 5.942830938550902, "grad_norm": 0.3054386079311371, "learning_rate": 2.5083435947518153e-05, "loss": 0.0652, "step": 19439 }, { "epoch": 5.943136655457047, "grad_norm": 0.2071492075920105, "learning_rate": 2.5083011337098212e-05, "loss": 0.0698, "step": 19440 }, { "epoch": 5.943442372363192, "grad_norm": 0.46436700224876404, "learning_rate": 2.5082586726678274e-05, "loss": 0.0926, "step": 19441 }, { "epoch": 5.943748089269336, "grad_norm": 0.3790648579597473, "learning_rate": 2.5082162116258333e-05, "loss": 0.0738, "step": 19442 }, { "epoch": 5.944053806175481, "grad_norm": 0.4102518558502197, "learning_rate": 2.508173750583839e-05, "loss": 0.1084, "step": 19443 }, { "epoch": 5.944359523081626, "grad_norm": 0.9612168073654175, "learning_rate": 2.5081312895418454e-05, "loss": 0.0995, "step": 19444 }, { "epoch": 5.9446652399877715, "grad_norm": 0.3008677065372467, "learning_rate": 2.5080888284998512e-05, "loss": 0.1082, "step": 19445 }, { "epoch": 5.944970956893917, "grad_norm": 0.6368591785430908, "learning_rate": 2.5080463674578574e-05, "loss": 0.138, "step": 19446 }, { "epoch": 5.945276673800061, "grad_norm": 0.8449358344078064, "learning_rate": 2.5080039064158633e-05, "loss": 0.1313, "step": 19447 }, { "epoch": 5.945582390706206, "grad_norm": 1.951158046722412, "learning_rate": 2.5079614453738695e-05, "loss": 0.1567, "step": 19448 }, { "epoch": 5.945888107612351, "grad_norm": 0.4118245542049408, "learning_rate": 2.5079189843318754e-05, "loss": 0.147, "step": 19449 }, { "epoch": 5.946193824518496, "grad_norm": 0.5540169477462769, "learning_rate": 2.5078765232898816e-05, "loss": 0.1941, "step": 19450 }, { "epoch": 5.94649954142464, "grad_norm": 0.8751447796821594, "learning_rate": 2.5078340622478875e-05, "loss": 0.2104, "step": 19451 }, { "epoch": 5.9468052583307855, "grad_norm": 0.7560058236122131, "learning_rate": 2.5077916012058937e-05, "loss": 0.1536, "step": 19452 }, { "epoch": 5.947110975236931, "grad_norm": 0.8436893224716187, "learning_rate": 2.5077491401638995e-05, "loss": 0.1418, "step": 19453 }, { "epoch": 5.947416692143076, "grad_norm": 0.9279645085334778, "learning_rate": 2.5077066791219057e-05, "loss": 0.1728, "step": 19454 }, { "epoch": 5.94772240904922, "grad_norm": 1.17820405960083, "learning_rate": 2.5076642180799116e-05, "loss": 0.2104, "step": 19455 }, { "epoch": 5.948028125955365, "grad_norm": 0.6803661584854126, "learning_rate": 2.5076217570379175e-05, "loss": 0.1353, "step": 19456 }, { "epoch": 5.94833384286151, "grad_norm": 0.29643911123275757, "learning_rate": 2.5075792959959237e-05, "loss": 0.0687, "step": 19457 }, { "epoch": 5.948639559767655, "grad_norm": 0.2178429663181305, "learning_rate": 2.5075368349539296e-05, "loss": 0.0777, "step": 19458 }, { "epoch": 5.9489452766738005, "grad_norm": 0.21246430277824402, "learning_rate": 2.5074943739119358e-05, "loss": 0.0414, "step": 19459 }, { "epoch": 5.949250993579945, "grad_norm": 0.21281498670578003, "learning_rate": 2.5074519128699416e-05, "loss": 0.0536, "step": 19460 }, { "epoch": 5.94955671048609, "grad_norm": 2.660342216491699, "learning_rate": 2.507409451827948e-05, "loss": 0.0244, "step": 19461 }, { "epoch": 5.949862427392235, "grad_norm": 0.1632928103208542, "learning_rate": 2.5073669907859537e-05, "loss": 0.0649, "step": 19462 }, { "epoch": 5.95016814429838, "grad_norm": 0.18873877823352814, "learning_rate": 2.50732452974396e-05, "loss": 0.0587, "step": 19463 }, { "epoch": 5.950473861204524, "grad_norm": 0.4006464183330536, "learning_rate": 2.5072820687019658e-05, "loss": 0.0639, "step": 19464 }, { "epoch": 5.950779578110669, "grad_norm": 0.3766739070415497, "learning_rate": 2.507239607659972e-05, "loss": 0.0503, "step": 19465 }, { "epoch": 5.9510852950168145, "grad_norm": 0.5194097757339478, "learning_rate": 2.5071971466179782e-05, "loss": 0.0589, "step": 19466 }, { "epoch": 5.95139101192296, "grad_norm": 0.4886767268180847, "learning_rate": 2.5071546855759844e-05, "loss": 0.07, "step": 19467 }, { "epoch": 5.951696728829104, "grad_norm": 0.6606088280677795, "learning_rate": 2.5071122245339903e-05, "loss": 0.0791, "step": 19468 }, { "epoch": 5.952002445735249, "grad_norm": 0.399305522441864, "learning_rate": 2.507069763491996e-05, "loss": 0.1267, "step": 19469 }, { "epoch": 5.952308162641394, "grad_norm": 0.5723254084587097, "learning_rate": 2.5070273024500024e-05, "loss": 0.1051, "step": 19470 }, { "epoch": 5.952613879547539, "grad_norm": 0.40713366866111755, "learning_rate": 2.5069848414080082e-05, "loss": 0.1597, "step": 19471 }, { "epoch": 5.952919596453684, "grad_norm": 0.7184001207351685, "learning_rate": 2.5069423803660145e-05, "loss": 0.139, "step": 19472 }, { "epoch": 5.9532253133598285, "grad_norm": 0.7897038459777832, "learning_rate": 2.5068999193240203e-05, "loss": 0.1855, "step": 19473 }, { "epoch": 5.953531030265974, "grad_norm": 0.7129080891609192, "learning_rate": 2.5068574582820265e-05, "loss": 0.1514, "step": 19474 }, { "epoch": 5.953836747172119, "grad_norm": 0.5329235196113586, "learning_rate": 2.5068149972400324e-05, "loss": 0.1673, "step": 19475 }, { "epoch": 5.954142464078264, "grad_norm": 0.6728454828262329, "learning_rate": 2.5067725361980386e-05, "loss": 0.1584, "step": 19476 }, { "epoch": 5.954448180984408, "grad_norm": 0.8235107064247131, "learning_rate": 2.5067300751560445e-05, "loss": 0.1817, "step": 19477 }, { "epoch": 5.954753897890553, "grad_norm": 1.392313003540039, "learning_rate": 2.5066876141140507e-05, "loss": 0.1817, "step": 19478 }, { "epoch": 5.955059614796698, "grad_norm": 1.2992238998413086, "learning_rate": 2.5066451530720566e-05, "loss": 0.19, "step": 19479 }, { "epoch": 5.955365331702843, "grad_norm": 3.1591720581054688, "learning_rate": 2.5066026920300624e-05, "loss": 0.212, "step": 19480 }, { "epoch": 5.955671048608988, "grad_norm": 0.4253385365009308, "learning_rate": 2.5065602309880686e-05, "loss": 0.1559, "step": 19481 }, { "epoch": 5.955976765515133, "grad_norm": 0.29594868421554565, "learning_rate": 2.5065177699460745e-05, "loss": 0.0773, "step": 19482 }, { "epoch": 5.956282482421278, "grad_norm": 0.312350332736969, "learning_rate": 2.5064753089040807e-05, "loss": 0.0716, "step": 19483 }, { "epoch": 5.956588199327423, "grad_norm": 0.19218643009662628, "learning_rate": 2.5064328478620866e-05, "loss": 0.0591, "step": 19484 }, { "epoch": 5.956893916233568, "grad_norm": 0.29431799054145813, "learning_rate": 2.5063903868200928e-05, "loss": 0.0672, "step": 19485 }, { "epoch": 5.957199633139712, "grad_norm": 0.16936875879764557, "learning_rate": 2.5063479257780987e-05, "loss": 0.0469, "step": 19486 }, { "epoch": 5.9575053500458575, "grad_norm": 0.28364893794059753, "learning_rate": 2.506305464736105e-05, "loss": 0.0618, "step": 19487 }, { "epoch": 5.957811066952003, "grad_norm": 0.21639122068881989, "learning_rate": 2.5062630036941107e-05, "loss": 0.053, "step": 19488 }, { "epoch": 5.958116783858148, "grad_norm": 0.5864650011062622, "learning_rate": 2.506220542652117e-05, "loss": 0.0614, "step": 19489 }, { "epoch": 5.958422500764292, "grad_norm": 0.4236242175102234, "learning_rate": 2.5061780816101228e-05, "loss": 0.0559, "step": 19490 }, { "epoch": 5.958728217670437, "grad_norm": 0.44452521204948425, "learning_rate": 2.506135620568129e-05, "loss": 0.0825, "step": 19491 }, { "epoch": 5.959033934576582, "grad_norm": 0.39763736724853516, "learning_rate": 2.506093159526135e-05, "loss": 0.0942, "step": 19492 }, { "epoch": 5.959339651482727, "grad_norm": 0.4829461872577667, "learning_rate": 2.5060506984841408e-05, "loss": 0.0775, "step": 19493 }, { "epoch": 5.9596453683888715, "grad_norm": 0.5648297667503357, "learning_rate": 2.506008237442147e-05, "loss": 0.0899, "step": 19494 }, { "epoch": 5.959951085295017, "grad_norm": 0.42947840690612793, "learning_rate": 2.505965776400153e-05, "loss": 0.1304, "step": 19495 }, { "epoch": 5.960256802201162, "grad_norm": 0.7985420823097229, "learning_rate": 2.505923315358159e-05, "loss": 0.1425, "step": 19496 }, { "epoch": 5.960562519107307, "grad_norm": 0.7140769958496094, "learning_rate": 2.505880854316165e-05, "loss": 0.1856, "step": 19497 }, { "epoch": 5.960868236013452, "grad_norm": 1.8104345798492432, "learning_rate": 2.505838393274171e-05, "loss": 0.1408, "step": 19498 }, { "epoch": 5.961173952919596, "grad_norm": 0.9748376607894897, "learning_rate": 2.505795932232177e-05, "loss": 0.1616, "step": 19499 }, { "epoch": 5.961479669825741, "grad_norm": 1.2011680603027344, "learning_rate": 2.5057534711901832e-05, "loss": 0.1388, "step": 19500 }, { "epoch": 5.961785386731886, "grad_norm": 0.7899913787841797, "learning_rate": 2.505711010148189e-05, "loss": 0.1568, "step": 19501 }, { "epoch": 5.9620911036380315, "grad_norm": 2.6257717609405518, "learning_rate": 2.5056685491061953e-05, "loss": 0.2034, "step": 19502 }, { "epoch": 5.962396820544176, "grad_norm": 0.7968435287475586, "learning_rate": 2.505626088064201e-05, "loss": 0.1417, "step": 19503 }, { "epoch": 5.962702537450321, "grad_norm": 1.2439836263656616, "learning_rate": 2.5055836270222074e-05, "loss": 0.1895, "step": 19504 }, { "epoch": 5.963008254356466, "grad_norm": 1.2923780679702759, "learning_rate": 2.5055411659802132e-05, "loss": 0.282, "step": 19505 }, { "epoch": 5.963313971262611, "grad_norm": 0.4747266471385956, "learning_rate": 2.505498704938219e-05, "loss": 0.1151, "step": 19506 }, { "epoch": 5.963619688168755, "grad_norm": 0.7225367426872253, "learning_rate": 2.5054562438962253e-05, "loss": 0.0916, "step": 19507 }, { "epoch": 5.9639254050749, "grad_norm": 0.34366586804389954, "learning_rate": 2.5054137828542312e-05, "loss": 0.0814, "step": 19508 }, { "epoch": 5.9642311219810455, "grad_norm": 0.21903009712696075, "learning_rate": 2.5053713218122374e-05, "loss": 0.0548, "step": 19509 }, { "epoch": 5.964536838887191, "grad_norm": 0.37310582399368286, "learning_rate": 2.5053288607702433e-05, "loss": 0.062, "step": 19510 }, { "epoch": 5.964842555793336, "grad_norm": 0.32283973693847656, "learning_rate": 2.5052863997282495e-05, "loss": 0.0635, "step": 19511 }, { "epoch": 5.96514827269948, "grad_norm": 0.4085811376571655, "learning_rate": 2.5052439386862554e-05, "loss": 0.0545, "step": 19512 }, { "epoch": 5.965453989605625, "grad_norm": 1.0053119659423828, "learning_rate": 2.5052014776442616e-05, "loss": 0.0455, "step": 19513 }, { "epoch": 5.96575970651177, "grad_norm": 1.1231598854064941, "learning_rate": 2.5051590166022674e-05, "loss": 0.0748, "step": 19514 }, { "epoch": 5.966065423417915, "grad_norm": 0.9664137959480286, "learning_rate": 2.5051165555602736e-05, "loss": 0.0612, "step": 19515 }, { "epoch": 5.96637114032406, "grad_norm": 0.7626085877418518, "learning_rate": 2.5050740945182795e-05, "loss": 0.0853, "step": 19516 }, { "epoch": 5.966676857230205, "grad_norm": 0.2925960421562195, "learning_rate": 2.5050316334762857e-05, "loss": 0.0749, "step": 19517 }, { "epoch": 5.96698257413635, "grad_norm": 0.30214470624923706, "learning_rate": 2.5049891724342916e-05, "loss": 0.095, "step": 19518 }, { "epoch": 5.967288291042495, "grad_norm": 0.5817774534225464, "learning_rate": 2.5049467113922975e-05, "loss": 0.0917, "step": 19519 }, { "epoch": 5.967594007948639, "grad_norm": 0.46605759859085083, "learning_rate": 2.5049042503503037e-05, "loss": 0.1285, "step": 19520 }, { "epoch": 5.967899724854784, "grad_norm": 0.3735159635543823, "learning_rate": 2.5048617893083095e-05, "loss": 0.1233, "step": 19521 }, { "epoch": 5.968205441760929, "grad_norm": 0.5422471165657043, "learning_rate": 2.5048193282663157e-05, "loss": 0.1454, "step": 19522 }, { "epoch": 5.9685111586670745, "grad_norm": 0.5181217193603516, "learning_rate": 2.5047768672243216e-05, "loss": 0.1552, "step": 19523 }, { "epoch": 5.96881687557322, "grad_norm": 6.312695026397705, "learning_rate": 2.5047344061823278e-05, "loss": 0.1727, "step": 19524 }, { "epoch": 5.969122592479364, "grad_norm": 0.5697810649871826, "learning_rate": 2.5046919451403337e-05, "loss": 0.1395, "step": 19525 }, { "epoch": 5.969428309385509, "grad_norm": 1.0555455684661865, "learning_rate": 2.50464948409834e-05, "loss": 0.1827, "step": 19526 }, { "epoch": 5.969734026291654, "grad_norm": 0.9553812146186829, "learning_rate": 2.5046070230563458e-05, "loss": 0.1634, "step": 19527 }, { "epoch": 5.970039743197799, "grad_norm": 0.7828571200370789, "learning_rate": 2.504564562014352e-05, "loss": 0.1969, "step": 19528 }, { "epoch": 5.970345460103943, "grad_norm": 1.2816498279571533, "learning_rate": 2.504522100972358e-05, "loss": 0.2292, "step": 19529 }, { "epoch": 5.9706511770100885, "grad_norm": 11.549436569213867, "learning_rate": 2.504479639930364e-05, "loss": 0.2117, "step": 19530 }, { "epoch": 5.970956893916234, "grad_norm": 0.8092071413993835, "learning_rate": 2.50443717888837e-05, "loss": 0.1401, "step": 19531 }, { "epoch": 5.971262610822379, "grad_norm": 0.32458874583244324, "learning_rate": 2.5043947178463758e-05, "loss": 0.0877, "step": 19532 }, { "epoch": 5.971568327728523, "grad_norm": 0.2091149389743805, "learning_rate": 2.504352256804382e-05, "loss": 0.0624, "step": 19533 }, { "epoch": 5.971874044634668, "grad_norm": 0.4134000837802887, "learning_rate": 2.504309795762388e-05, "loss": 0.0542, "step": 19534 }, { "epoch": 5.972179761540813, "grad_norm": 0.29373034834861755, "learning_rate": 2.504267334720394e-05, "loss": 0.0488, "step": 19535 }, { "epoch": 5.972485478446958, "grad_norm": 0.25158095359802246, "learning_rate": 2.5042248736784e-05, "loss": 0.07, "step": 19536 }, { "epoch": 5.972791195353103, "grad_norm": 0.26633065938949585, "learning_rate": 2.5041824126364062e-05, "loss": 0.0315, "step": 19537 }, { "epoch": 5.973096912259248, "grad_norm": 0.3357628285884857, "learning_rate": 2.504139951594412e-05, "loss": 0.0727, "step": 19538 }, { "epoch": 5.973402629165393, "grad_norm": 0.4296885132789612, "learning_rate": 2.5040974905524182e-05, "loss": 0.0614, "step": 19539 }, { "epoch": 5.973708346071538, "grad_norm": 0.3028337359428406, "learning_rate": 2.504055029510424e-05, "loss": 0.0629, "step": 19540 }, { "epoch": 5.974014062977683, "grad_norm": 0.25244998931884766, "learning_rate": 2.5040125684684303e-05, "loss": 0.0965, "step": 19541 }, { "epoch": 5.974319779883827, "grad_norm": 1.3188005685806274, "learning_rate": 2.5039701074264362e-05, "loss": 0.0621, "step": 19542 }, { "epoch": 5.974625496789972, "grad_norm": 0.6112147569656372, "learning_rate": 2.5039276463844424e-05, "loss": 0.0842, "step": 19543 }, { "epoch": 5.9749312136961175, "grad_norm": 1.105160117149353, "learning_rate": 2.5038851853424483e-05, "loss": 0.0906, "step": 19544 }, { "epoch": 5.975236930602263, "grad_norm": 1.2796275615692139, "learning_rate": 2.503842724300454e-05, "loss": 0.1087, "step": 19545 }, { "epoch": 5.975542647508407, "grad_norm": 0.5073051452636719, "learning_rate": 2.5038002632584604e-05, "loss": 0.1255, "step": 19546 }, { "epoch": 5.975848364414552, "grad_norm": 0.45870164036750793, "learning_rate": 2.5037578022164662e-05, "loss": 0.1419, "step": 19547 }, { "epoch": 5.976154081320697, "grad_norm": 0.7691488862037659, "learning_rate": 2.5037153411744724e-05, "loss": 0.1466, "step": 19548 }, { "epoch": 5.976459798226842, "grad_norm": 0.4717442989349365, "learning_rate": 2.5036728801324783e-05, "loss": 0.1214, "step": 19549 }, { "epoch": 5.976765515132987, "grad_norm": 0.7199065685272217, "learning_rate": 2.5036304190904845e-05, "loss": 0.1872, "step": 19550 }, { "epoch": 5.9770712320391315, "grad_norm": 0.7708861827850342, "learning_rate": 2.5035879580484904e-05, "loss": 0.1563, "step": 19551 }, { "epoch": 5.977376948945277, "grad_norm": 0.8596890568733215, "learning_rate": 2.5035454970064966e-05, "loss": 0.1879, "step": 19552 }, { "epoch": 5.977682665851422, "grad_norm": 1.019743800163269, "learning_rate": 2.5035030359645025e-05, "loss": 0.1629, "step": 19553 }, { "epoch": 5.977988382757567, "grad_norm": 4.502157211303711, "learning_rate": 2.5034605749225087e-05, "loss": 0.2052, "step": 19554 }, { "epoch": 5.978294099663711, "grad_norm": 1.8829307556152344, "learning_rate": 2.5034181138805145e-05, "loss": 0.2525, "step": 19555 }, { "epoch": 5.978599816569856, "grad_norm": 0.8420386910438538, "learning_rate": 2.5033756528385207e-05, "loss": 0.1344, "step": 19556 }, { "epoch": 5.978905533476001, "grad_norm": 0.2856535315513611, "learning_rate": 2.5033331917965266e-05, "loss": 0.0865, "step": 19557 }, { "epoch": 5.979211250382146, "grad_norm": 0.28210267424583435, "learning_rate": 2.5032907307545325e-05, "loss": 0.0538, "step": 19558 }, { "epoch": 5.979516967288291, "grad_norm": 0.28120824694633484, "learning_rate": 2.5032482697125387e-05, "loss": 0.0454, "step": 19559 }, { "epoch": 5.979822684194436, "grad_norm": 0.18178091943264008, "learning_rate": 2.5032058086705446e-05, "loss": 0.045, "step": 19560 }, { "epoch": 5.980128401100581, "grad_norm": 0.28960031270980835, "learning_rate": 2.5031633476285508e-05, "loss": 0.0664, "step": 19561 }, { "epoch": 5.980434118006726, "grad_norm": 0.2327905148267746, "learning_rate": 2.5031208865865566e-05, "loss": 0.0684, "step": 19562 }, { "epoch": 5.980739834912871, "grad_norm": 0.18513278663158417, "learning_rate": 2.503078425544563e-05, "loss": 0.0513, "step": 19563 }, { "epoch": 5.981045551819015, "grad_norm": 0.22318772971630096, "learning_rate": 2.5030359645025687e-05, "loss": 0.0844, "step": 19564 }, { "epoch": 5.98135126872516, "grad_norm": 0.21850377321243286, "learning_rate": 2.502993503460575e-05, "loss": 0.0594, "step": 19565 }, { "epoch": 5.9816569856313055, "grad_norm": 0.5177920460700989, "learning_rate": 2.5029510424185808e-05, "loss": 0.1053, "step": 19566 }, { "epoch": 5.981962702537451, "grad_norm": 0.44628655910491943, "learning_rate": 2.502908581376587e-05, "loss": 0.0903, "step": 19567 }, { "epoch": 5.982268419443595, "grad_norm": 0.43890196084976196, "learning_rate": 2.5028661203345932e-05, "loss": 0.0699, "step": 19568 }, { "epoch": 5.98257413634974, "grad_norm": 0.528836190700531, "learning_rate": 2.5028236592925994e-05, "loss": 0.1446, "step": 19569 }, { "epoch": 5.982879853255885, "grad_norm": 1.7460819482803345, "learning_rate": 2.5027811982506053e-05, "loss": 0.1196, "step": 19570 }, { "epoch": 5.98318557016203, "grad_norm": 14.803101539611816, "learning_rate": 2.5027387372086112e-05, "loss": 0.1561, "step": 19571 }, { "epoch": 5.9834912870681745, "grad_norm": 0.7834157943725586, "learning_rate": 2.5026962761666174e-05, "loss": 0.1778, "step": 19572 }, { "epoch": 5.98379700397432, "grad_norm": 0.6418386101722717, "learning_rate": 2.5026538151246232e-05, "loss": 0.2116, "step": 19573 }, { "epoch": 5.984102720880465, "grad_norm": 1.1294807195663452, "learning_rate": 2.5026113540826295e-05, "loss": 0.1744, "step": 19574 }, { "epoch": 5.98440843778661, "grad_norm": 0.7583065629005432, "learning_rate": 2.5025688930406353e-05, "loss": 0.1908, "step": 19575 }, { "epoch": 5.984714154692755, "grad_norm": 2.665764570236206, "learning_rate": 2.5025264319986415e-05, "loss": 0.1528, "step": 19576 }, { "epoch": 5.985019871598899, "grad_norm": 0.7799127697944641, "learning_rate": 2.5024839709566474e-05, "loss": 0.1778, "step": 19577 }, { "epoch": 5.985325588505044, "grad_norm": 1.3619141578674316, "learning_rate": 2.5024415099146536e-05, "loss": 0.1621, "step": 19578 }, { "epoch": 5.985631305411189, "grad_norm": 0.6861135363578796, "learning_rate": 2.5023990488726595e-05, "loss": 0.1626, "step": 19579 }, { "epoch": 5.9859370223173345, "grad_norm": 1.0758802890777588, "learning_rate": 2.5023565878306657e-05, "loss": 0.2781, "step": 19580 }, { "epoch": 5.986242739223479, "grad_norm": 0.5714113712310791, "learning_rate": 2.5023141267886716e-05, "loss": 0.1355, "step": 19581 }, { "epoch": 5.986548456129624, "grad_norm": 0.22001738846302032, "learning_rate": 2.5022716657466778e-05, "loss": 0.0858, "step": 19582 }, { "epoch": 5.986854173035769, "grad_norm": 0.6732701063156128, "learning_rate": 2.5022292047046836e-05, "loss": 0.0516, "step": 19583 }, { "epoch": 5.987159889941914, "grad_norm": 0.5018265843391418, "learning_rate": 2.5021867436626895e-05, "loss": 0.0664, "step": 19584 }, { "epoch": 5.987465606848058, "grad_norm": 0.19145376980304718, "learning_rate": 2.5021442826206957e-05, "loss": 0.0408, "step": 19585 }, { "epoch": 5.987771323754203, "grad_norm": 0.326712042093277, "learning_rate": 2.5021018215787016e-05, "loss": 0.0641, "step": 19586 }, { "epoch": 5.9880770406603485, "grad_norm": 0.4448851943016052, "learning_rate": 2.5020593605367078e-05, "loss": 0.0499, "step": 19587 }, { "epoch": 5.988382757566494, "grad_norm": 0.2647816240787506, "learning_rate": 2.5020168994947137e-05, "loss": 0.0559, "step": 19588 }, { "epoch": 5.988688474472639, "grad_norm": 0.7183878421783447, "learning_rate": 2.50197443845272e-05, "loss": 0.0564, "step": 19589 }, { "epoch": 5.988994191378783, "grad_norm": 0.47890540957450867, "learning_rate": 2.5019319774107258e-05, "loss": 0.0728, "step": 19590 }, { "epoch": 5.989299908284928, "grad_norm": 0.32270529866218567, "learning_rate": 2.501889516368732e-05, "loss": 0.0833, "step": 19591 }, { "epoch": 5.989605625191073, "grad_norm": 0.4088325798511505, "learning_rate": 2.5018470553267378e-05, "loss": 0.096, "step": 19592 }, { "epoch": 5.989911342097218, "grad_norm": 0.34805288910865784, "learning_rate": 2.501804594284744e-05, "loss": 0.1076, "step": 19593 }, { "epoch": 5.9902170590033625, "grad_norm": 0.7529622316360474, "learning_rate": 2.50176213324275e-05, "loss": 0.1411, "step": 19594 }, { "epoch": 5.990522775909508, "grad_norm": 0.394332617521286, "learning_rate": 2.501719672200756e-05, "loss": 0.118, "step": 19595 }, { "epoch": 5.990828492815653, "grad_norm": 0.43290090560913086, "learning_rate": 2.501677211158762e-05, "loss": 0.0916, "step": 19596 }, { "epoch": 5.991134209721798, "grad_norm": 0.877945601940155, "learning_rate": 2.501634750116768e-05, "loss": 0.1445, "step": 19597 }, { "epoch": 5.991439926627942, "grad_norm": 0.42660650610923767, "learning_rate": 2.501592289074774e-05, "loss": 0.1219, "step": 19598 }, { "epoch": 5.991745643534087, "grad_norm": 0.5819357633590698, "learning_rate": 2.50154982803278e-05, "loss": 0.1495, "step": 19599 }, { "epoch": 5.992051360440232, "grad_norm": 1.1735187768936157, "learning_rate": 2.501507366990786e-05, "loss": 0.1421, "step": 19600 }, { "epoch": 5.9923570773463775, "grad_norm": 1.3237652778625488, "learning_rate": 2.501464905948792e-05, "loss": 0.1937, "step": 19601 }, { "epoch": 5.992662794252523, "grad_norm": 1.955494999885559, "learning_rate": 2.5014224449067982e-05, "loss": 0.1741, "step": 19602 }, { "epoch": 5.992968511158667, "grad_norm": 1.2559304237365723, "learning_rate": 2.501379983864804e-05, "loss": 0.1766, "step": 19603 }, { "epoch": 5.993274228064812, "grad_norm": 0.6370802521705627, "learning_rate": 2.5013375228228103e-05, "loss": 0.184, "step": 19604 }, { "epoch": 5.993579944970957, "grad_norm": 1.8884328603744507, "learning_rate": 2.5012950617808162e-05, "loss": 0.2139, "step": 19605 }, { "epoch": 5.993885661877102, "grad_norm": 0.4733698070049286, "learning_rate": 2.5012526007388224e-05, "loss": 0.1537, "step": 19606 }, { "epoch": 5.994191378783246, "grad_norm": 0.31924986839294434, "learning_rate": 2.5012101396968283e-05, "loss": 0.1036, "step": 19607 }, { "epoch": 5.9944970956893915, "grad_norm": 0.24746398627758026, "learning_rate": 2.501167678654834e-05, "loss": 0.0477, "step": 19608 }, { "epoch": 5.994802812595537, "grad_norm": 0.31354594230651855, "learning_rate": 2.5011252176128403e-05, "loss": 0.0482, "step": 19609 }, { "epoch": 5.995108529501682, "grad_norm": 0.13765211403369904, "learning_rate": 2.5010827565708462e-05, "loss": 0.0463, "step": 19610 }, { "epoch": 5.995414246407826, "grad_norm": 0.2699909806251526, "learning_rate": 2.5010402955288524e-05, "loss": 0.0605, "step": 19611 }, { "epoch": 5.995719963313971, "grad_norm": 0.20700767636299133, "learning_rate": 2.5009978344868583e-05, "loss": 0.0708, "step": 19612 }, { "epoch": 5.996025680220116, "grad_norm": 0.4271169602870941, "learning_rate": 2.5009553734448645e-05, "loss": 0.0707, "step": 19613 }, { "epoch": 5.996331397126261, "grad_norm": 0.4340328574180603, "learning_rate": 2.5009129124028704e-05, "loss": 0.1106, "step": 19614 }, { "epoch": 5.996637114032406, "grad_norm": 0.3136501610279083, "learning_rate": 2.5008704513608766e-05, "loss": 0.0907, "step": 19615 }, { "epoch": 5.996942830938551, "grad_norm": 0.2974627912044525, "learning_rate": 2.5008279903188824e-05, "loss": 0.081, "step": 19616 }, { "epoch": 5.997248547844696, "grad_norm": 0.40147215127944946, "learning_rate": 2.5007855292768886e-05, "loss": 0.1113, "step": 19617 }, { "epoch": 5.997554264750841, "grad_norm": 0.4122265875339508, "learning_rate": 2.5007430682348945e-05, "loss": 0.1069, "step": 19618 }, { "epoch": 5.997859981656986, "grad_norm": 0.6440460085868835, "learning_rate": 2.5007006071929007e-05, "loss": 0.1585, "step": 19619 }, { "epoch": 5.99816569856313, "grad_norm": 0.45459091663360596, "learning_rate": 2.5006581461509066e-05, "loss": 0.1467, "step": 19620 }, { "epoch": 5.998471415469275, "grad_norm": 0.7922104001045227, "learning_rate": 2.5006156851089125e-05, "loss": 0.1558, "step": 19621 }, { "epoch": 5.99877713237542, "grad_norm": 0.5950323939323425, "learning_rate": 2.5005732240669187e-05, "loss": 0.1813, "step": 19622 }, { "epoch": 5.9990828492815655, "grad_norm": 1.681985855102539, "learning_rate": 2.5005307630249245e-05, "loss": 0.1949, "step": 19623 }, { "epoch": 5.99938856618771, "grad_norm": 1.9843248128890991, "learning_rate": 2.5004883019829308e-05, "loss": 0.1664, "step": 19624 }, { "epoch": 5.999694283093855, "grad_norm": 1.6840102672576904, "learning_rate": 2.5004458409409366e-05, "loss": 0.2272, "step": 19625 }, { "epoch": 6.0, "grad_norm": 1.1381633281707764, "learning_rate": 2.5004033798989428e-05, "loss": 0.2045, "step": 19626 }, { "epoch": 6.000305716906145, "grad_norm": 0.34952595829963684, "learning_rate": 2.5003609188569487e-05, "loss": 0.1524, "step": 19627 }, { "epoch": 6.00061143381229, "grad_norm": 0.2995494306087494, "learning_rate": 2.500318457814955e-05, "loss": 0.0741, "step": 19628 }, { "epoch": 6.0009171507184345, "grad_norm": 0.17897705733776093, "learning_rate": 2.5002759967729608e-05, "loss": 0.0604, "step": 19629 }, { "epoch": 6.00122286762458, "grad_norm": 0.2088368535041809, "learning_rate": 2.500233535730967e-05, "loss": 0.0497, "step": 19630 }, { "epoch": 6.001528584530725, "grad_norm": 0.9484440684318542, "learning_rate": 2.500191074688973e-05, "loss": 0.0437, "step": 19631 }, { "epoch": 6.00183430143687, "grad_norm": 0.12535977363586426, "learning_rate": 2.500148613646979e-05, "loss": 0.0352, "step": 19632 }, { "epoch": 6.002140018343014, "grad_norm": 0.22380635142326355, "learning_rate": 2.500106152604985e-05, "loss": 0.0549, "step": 19633 }, { "epoch": 6.002445735249159, "grad_norm": 0.4616662859916687, "learning_rate": 2.5000636915629908e-05, "loss": 0.0999, "step": 19634 }, { "epoch": 6.002751452155304, "grad_norm": 0.31804221868515015, "learning_rate": 2.500021230520997e-05, "loss": 0.0484, "step": 19635 }, { "epoch": 6.003057169061449, "grad_norm": 0.5451232194900513, "learning_rate": 2.499978769479003e-05, "loss": 0.0471, "step": 19636 }, { "epoch": 6.003362885967594, "grad_norm": 0.2553704082965851, "learning_rate": 2.499936308437009e-05, "loss": 0.1081, "step": 19637 }, { "epoch": 6.003668602873739, "grad_norm": 0.26924532651901245, "learning_rate": 2.499893847395015e-05, "loss": 0.0899, "step": 19638 }, { "epoch": 6.003974319779884, "grad_norm": 0.5008813142776489, "learning_rate": 2.4998513863530212e-05, "loss": 0.0877, "step": 19639 }, { "epoch": 6.004280036686029, "grad_norm": 0.4848710000514984, "learning_rate": 2.499808925311027e-05, "loss": 0.1235, "step": 19640 }, { "epoch": 6.004585753592174, "grad_norm": 0.388339102268219, "learning_rate": 2.4997664642690333e-05, "loss": 0.1149, "step": 19641 }, { "epoch": 6.004891470498318, "grad_norm": 0.6681573390960693, "learning_rate": 2.499724003227039e-05, "loss": 0.1356, "step": 19642 }, { "epoch": 6.005197187404463, "grad_norm": 2.147925853729248, "learning_rate": 2.4996815421850453e-05, "loss": 0.1667, "step": 19643 }, { "epoch": 6.0055029043106085, "grad_norm": 0.7772433161735535, "learning_rate": 2.4996390811430512e-05, "loss": 0.1673, "step": 19644 }, { "epoch": 6.005808621216754, "grad_norm": 0.48681193590164185, "learning_rate": 2.4995966201010574e-05, "loss": 0.1618, "step": 19645 }, { "epoch": 6.006114338122898, "grad_norm": 0.6073489189147949, "learning_rate": 2.4995541590590633e-05, "loss": 0.1581, "step": 19646 }, { "epoch": 6.006420055029043, "grad_norm": 0.4532187879085541, "learning_rate": 2.499511698017069e-05, "loss": 0.1496, "step": 19647 }, { "epoch": 6.006725771935188, "grad_norm": 0.9044231176376343, "learning_rate": 2.4994692369750754e-05, "loss": 0.1834, "step": 19648 }, { "epoch": 6.007031488841333, "grad_norm": 1.0255244970321655, "learning_rate": 2.4994267759330812e-05, "loss": 0.1862, "step": 19649 }, { "epoch": 6.007337205747477, "grad_norm": 1.41958487033844, "learning_rate": 2.4993843148910874e-05, "loss": 0.2057, "step": 19650 }, { "epoch": 6.0076429226536225, "grad_norm": 1.3323336839675903, "learning_rate": 2.4993418538490933e-05, "loss": 0.2864, "step": 19651 }, { "epoch": 6.007948639559768, "grad_norm": 0.287812739610672, "learning_rate": 2.4992993928070995e-05, "loss": 0.1217, "step": 19652 }, { "epoch": 6.008254356465913, "grad_norm": 0.24882523715496063, "learning_rate": 2.4992569317651054e-05, "loss": 0.0791, "step": 19653 }, { "epoch": 6.008560073372058, "grad_norm": 0.4178329110145569, "learning_rate": 2.4992144707231116e-05, "loss": 0.0676, "step": 19654 }, { "epoch": 6.008865790278202, "grad_norm": 0.16728951036930084, "learning_rate": 2.4991720096811175e-05, "loss": 0.0794, "step": 19655 }, { "epoch": 6.009171507184347, "grad_norm": 0.30056172609329224, "learning_rate": 2.4991295486391237e-05, "loss": 0.0682, "step": 19656 }, { "epoch": 6.009477224090492, "grad_norm": 0.17148078978061676, "learning_rate": 2.4990870875971295e-05, "loss": 0.0446, "step": 19657 }, { "epoch": 6.0097829409966375, "grad_norm": 0.31725725531578064, "learning_rate": 2.4990446265551358e-05, "loss": 0.0438, "step": 19658 }, { "epoch": 6.010088657902782, "grad_norm": 0.18021926283836365, "learning_rate": 2.4990021655131416e-05, "loss": 0.0774, "step": 19659 }, { "epoch": 6.010394374808927, "grad_norm": 0.26388242840766907, "learning_rate": 2.4989597044711475e-05, "loss": 0.0479, "step": 19660 }, { "epoch": 6.010700091715072, "grad_norm": 0.24389494955539703, "learning_rate": 2.4989172434291537e-05, "loss": 0.0659, "step": 19661 }, { "epoch": 6.011005808621217, "grad_norm": 0.5069573521614075, "learning_rate": 2.4988747823871596e-05, "loss": 0.0993, "step": 19662 }, { "epoch": 6.011311525527361, "grad_norm": 0.4760988652706146, "learning_rate": 2.4988323213451658e-05, "loss": 0.0691, "step": 19663 }, { "epoch": 6.011617242433506, "grad_norm": 0.25795963406562805, "learning_rate": 2.4987898603031716e-05, "loss": 0.0945, "step": 19664 }, { "epoch": 6.0119229593396515, "grad_norm": 0.2391241490840912, "learning_rate": 2.498747399261178e-05, "loss": 0.0945, "step": 19665 }, { "epoch": 6.012228676245797, "grad_norm": 0.5070210695266724, "learning_rate": 2.4987049382191837e-05, "loss": 0.1151, "step": 19666 }, { "epoch": 6.012534393151942, "grad_norm": 0.5113363265991211, "learning_rate": 2.49866247717719e-05, "loss": 0.1102, "step": 19667 }, { "epoch": 6.012840110058086, "grad_norm": 0.33754050731658936, "learning_rate": 2.4986200161351958e-05, "loss": 0.1536, "step": 19668 }, { "epoch": 6.013145826964231, "grad_norm": 0.6806614995002747, "learning_rate": 2.498577555093202e-05, "loss": 0.1397, "step": 19669 }, { "epoch": 6.013451543870376, "grad_norm": 0.37392884492874146, "learning_rate": 2.4985350940512082e-05, "loss": 0.1388, "step": 19670 }, { "epoch": 6.013757260776521, "grad_norm": 0.8566163778305054, "learning_rate": 2.4984926330092144e-05, "loss": 0.1469, "step": 19671 }, { "epoch": 6.0140629776826655, "grad_norm": 0.8546872138977051, "learning_rate": 2.4984501719672203e-05, "loss": 0.1532, "step": 19672 }, { "epoch": 6.014368694588811, "grad_norm": 1.4943913221359253, "learning_rate": 2.4984077109252262e-05, "loss": 0.1762, "step": 19673 }, { "epoch": 6.014674411494956, "grad_norm": 1.196723461151123, "learning_rate": 2.4983652498832324e-05, "loss": 0.1703, "step": 19674 }, { "epoch": 6.014980128401101, "grad_norm": 2.7256839275360107, "learning_rate": 2.4983227888412383e-05, "loss": 0.175, "step": 19675 }, { "epoch": 6.015285845307245, "grad_norm": 0.9685139656066895, "learning_rate": 2.4982803277992445e-05, "loss": 0.2013, "step": 19676 }, { "epoch": 6.01559156221339, "grad_norm": 0.37462419271469116, "learning_rate": 2.4982378667572503e-05, "loss": 0.1294, "step": 19677 }, { "epoch": 6.015897279119535, "grad_norm": 0.2824888229370117, "learning_rate": 2.4981954057152565e-05, "loss": 0.0641, "step": 19678 }, { "epoch": 6.01620299602568, "grad_norm": 0.9086729884147644, "learning_rate": 2.4981529446732624e-05, "loss": 0.0646, "step": 19679 }, { "epoch": 6.0165087129318255, "grad_norm": 0.2181077003479004, "learning_rate": 2.4981104836312686e-05, "loss": 0.0474, "step": 19680 }, { "epoch": 6.01681442983797, "grad_norm": 0.5027155876159668, "learning_rate": 2.4980680225892745e-05, "loss": 0.072, "step": 19681 }, { "epoch": 6.017120146744115, "grad_norm": 0.2302534133195877, "learning_rate": 2.4980255615472807e-05, "loss": 0.0529, "step": 19682 }, { "epoch": 6.01742586365026, "grad_norm": 0.14662078022956848, "learning_rate": 2.4979831005052866e-05, "loss": 0.0365, "step": 19683 }, { "epoch": 6.017731580556405, "grad_norm": 0.24089466035366058, "learning_rate": 2.4979406394632928e-05, "loss": 0.05, "step": 19684 }, { "epoch": 6.018037297462549, "grad_norm": 0.28180450201034546, "learning_rate": 2.4978981784212986e-05, "loss": 0.0482, "step": 19685 }, { "epoch": 6.0183430143686945, "grad_norm": 0.2690946161746979, "learning_rate": 2.4978557173793045e-05, "loss": 0.0607, "step": 19686 }, { "epoch": 6.01864873127484, "grad_norm": 1.0741316080093384, "learning_rate": 2.4978132563373107e-05, "loss": 0.0897, "step": 19687 }, { "epoch": 6.018954448180985, "grad_norm": 0.3193904459476471, "learning_rate": 2.4977707952953166e-05, "loss": 0.0612, "step": 19688 }, { "epoch": 6.019260165087129, "grad_norm": 0.34909215569496155, "learning_rate": 2.4977283342533228e-05, "loss": 0.0885, "step": 19689 }, { "epoch": 6.019565881993274, "grad_norm": 2.0530853271484375, "learning_rate": 2.4976858732113287e-05, "loss": 0.1382, "step": 19690 }, { "epoch": 6.019871598899419, "grad_norm": 0.5125149488449097, "learning_rate": 2.497643412169335e-05, "loss": 0.1318, "step": 19691 }, { "epoch": 6.020177315805564, "grad_norm": 0.4957102835178375, "learning_rate": 2.4976009511273408e-05, "loss": 0.1782, "step": 19692 }, { "epoch": 6.020483032711709, "grad_norm": 0.7164099216461182, "learning_rate": 2.497558490085347e-05, "loss": 0.1422, "step": 19693 }, { "epoch": 6.020788749617854, "grad_norm": 1.0956158638000488, "learning_rate": 2.497516029043353e-05, "loss": 0.1666, "step": 19694 }, { "epoch": 6.021094466523999, "grad_norm": 0.5561614036560059, "learning_rate": 2.497473568001359e-05, "loss": 0.1445, "step": 19695 }, { "epoch": 6.021400183430144, "grad_norm": 0.5434489250183105, "learning_rate": 2.497431106959365e-05, "loss": 0.1613, "step": 19696 }, { "epoch": 6.021705900336289, "grad_norm": 0.45424774289131165, "learning_rate": 2.497388645917371e-05, "loss": 0.1682, "step": 19697 }, { "epoch": 6.022011617242433, "grad_norm": 0.7786252498626709, "learning_rate": 2.497346184875377e-05, "loss": 0.2052, "step": 19698 }, { "epoch": 6.022317334148578, "grad_norm": 2.7799439430236816, "learning_rate": 2.497303723833383e-05, "loss": 0.1977, "step": 19699 }, { "epoch": 6.022623051054723, "grad_norm": 1.1755698919296265, "learning_rate": 2.497261262791389e-05, "loss": 0.1973, "step": 19700 }, { "epoch": 6.0229287679608685, "grad_norm": 3.7342851161956787, "learning_rate": 2.497218801749395e-05, "loss": 0.2472, "step": 19701 }, { "epoch": 6.023234484867013, "grad_norm": 0.42324861884117126, "learning_rate": 2.497176340707401e-05, "loss": 0.1547, "step": 19702 }, { "epoch": 6.023540201773158, "grad_norm": 0.3137890100479126, "learning_rate": 2.497133879665407e-05, "loss": 0.0773, "step": 19703 }, { "epoch": 6.023845918679303, "grad_norm": 0.3793846070766449, "learning_rate": 2.4970914186234132e-05, "loss": 0.0906, "step": 19704 }, { "epoch": 6.024151635585448, "grad_norm": 0.36400285363197327, "learning_rate": 2.497048957581419e-05, "loss": 0.0754, "step": 19705 }, { "epoch": 6.024457352491593, "grad_norm": 0.18969549238681793, "learning_rate": 2.4970064965394253e-05, "loss": 0.0503, "step": 19706 }, { "epoch": 6.024763069397737, "grad_norm": 0.25845181941986084, "learning_rate": 2.4969640354974312e-05, "loss": 0.0693, "step": 19707 }, { "epoch": 6.0250687863038825, "grad_norm": 0.2271837294101715, "learning_rate": 2.4969215744554374e-05, "loss": 0.0582, "step": 19708 }, { "epoch": 6.025374503210028, "grad_norm": 0.577528715133667, "learning_rate": 2.4968791134134433e-05, "loss": 0.0501, "step": 19709 }, { "epoch": 6.025680220116173, "grad_norm": 0.33606410026550293, "learning_rate": 2.4968366523714495e-05, "loss": 0.0704, "step": 19710 }, { "epoch": 6.025985937022317, "grad_norm": 0.4328300952911377, "learning_rate": 2.4967941913294553e-05, "loss": 0.0681, "step": 19711 }, { "epoch": 6.026291653928462, "grad_norm": 0.5989173054695129, "learning_rate": 2.4967517302874612e-05, "loss": 0.0977, "step": 19712 }, { "epoch": 6.026597370834607, "grad_norm": 0.48202064633369446, "learning_rate": 2.4967092692454674e-05, "loss": 0.0763, "step": 19713 }, { "epoch": 6.026903087740752, "grad_norm": 0.3836113214492798, "learning_rate": 2.4966668082034733e-05, "loss": 0.1013, "step": 19714 }, { "epoch": 6.027208804646897, "grad_norm": 0.3378424644470215, "learning_rate": 2.4966243471614795e-05, "loss": 0.0965, "step": 19715 }, { "epoch": 6.027514521553042, "grad_norm": 0.6236023306846619, "learning_rate": 2.4965818861194854e-05, "loss": 0.1113, "step": 19716 }, { "epoch": 6.027820238459187, "grad_norm": 0.9040539860725403, "learning_rate": 2.4965394250774916e-05, "loss": 0.1351, "step": 19717 }, { "epoch": 6.028125955365332, "grad_norm": 0.5101187229156494, "learning_rate": 2.4964969640354974e-05, "loss": 0.1458, "step": 19718 }, { "epoch": 6.028431672271477, "grad_norm": 0.5186773538589478, "learning_rate": 2.4964545029935036e-05, "loss": 0.1296, "step": 19719 }, { "epoch": 6.028737389177621, "grad_norm": 0.9947389364242554, "learning_rate": 2.4964120419515095e-05, "loss": 0.1618, "step": 19720 }, { "epoch": 6.029043106083766, "grad_norm": 0.9769911766052246, "learning_rate": 2.4963695809095157e-05, "loss": 0.1556, "step": 19721 }, { "epoch": 6.0293488229899115, "grad_norm": 0.7994919419288635, "learning_rate": 2.4963271198675216e-05, "loss": 0.129, "step": 19722 }, { "epoch": 6.029654539896057, "grad_norm": 1.4712209701538086, "learning_rate": 2.4962846588255275e-05, "loss": 0.1812, "step": 19723 }, { "epoch": 6.029960256802201, "grad_norm": 2.7065675258636475, "learning_rate": 2.4962421977835337e-05, "loss": 0.171, "step": 19724 }, { "epoch": 6.030265973708346, "grad_norm": 2.243659734725952, "learning_rate": 2.4961997367415395e-05, "loss": 0.2103, "step": 19725 }, { "epoch": 6.030571690614491, "grad_norm": 2.3423871994018555, "learning_rate": 2.4961572756995458e-05, "loss": 0.2299, "step": 19726 }, { "epoch": 6.030877407520636, "grad_norm": 0.749951958656311, "learning_rate": 2.4961148146575516e-05, "loss": 0.1284, "step": 19727 }, { "epoch": 6.03118312442678, "grad_norm": 0.266623318195343, "learning_rate": 2.496072353615558e-05, "loss": 0.1106, "step": 19728 }, { "epoch": 6.0314888413329255, "grad_norm": 0.2954595983028412, "learning_rate": 2.4960298925735637e-05, "loss": 0.0788, "step": 19729 }, { "epoch": 6.031794558239071, "grad_norm": 0.2384815365076065, "learning_rate": 2.49598743153157e-05, "loss": 0.041, "step": 19730 }, { "epoch": 6.032100275145216, "grad_norm": 0.18361565470695496, "learning_rate": 2.4959449704895758e-05, "loss": 0.0502, "step": 19731 }, { "epoch": 6.032405992051361, "grad_norm": 0.5082542896270752, "learning_rate": 2.495902509447582e-05, "loss": 0.0547, "step": 19732 }, { "epoch": 6.032711708957505, "grad_norm": 0.14404518902301788, "learning_rate": 2.495860048405588e-05, "loss": 0.0487, "step": 19733 }, { "epoch": 6.03301742586365, "grad_norm": 0.20602881908416748, "learning_rate": 2.495817587363594e-05, "loss": 0.0635, "step": 19734 }, { "epoch": 6.033323142769795, "grad_norm": 0.9228920340538025, "learning_rate": 2.4957751263216e-05, "loss": 0.0387, "step": 19735 }, { "epoch": 6.03362885967594, "grad_norm": 0.4560678005218506, "learning_rate": 2.4957326652796058e-05, "loss": 0.056, "step": 19736 }, { "epoch": 6.033934576582085, "grad_norm": 0.2667970061302185, "learning_rate": 2.495690204237612e-05, "loss": 0.083, "step": 19737 }, { "epoch": 6.03424029348823, "grad_norm": 0.3759712874889374, "learning_rate": 2.495647743195618e-05, "loss": 0.0893, "step": 19738 }, { "epoch": 6.034546010394375, "grad_norm": 0.4735567569732666, "learning_rate": 2.495605282153624e-05, "loss": 0.0869, "step": 19739 }, { "epoch": 6.03485172730052, "grad_norm": 0.5290672779083252, "learning_rate": 2.49556282111163e-05, "loss": 0.1068, "step": 19740 }, { "epoch": 6.035157444206664, "grad_norm": 0.5540738701820374, "learning_rate": 2.4955203600696362e-05, "loss": 0.093, "step": 19741 }, { "epoch": 6.035463161112809, "grad_norm": 0.9901629090309143, "learning_rate": 2.495477899027642e-05, "loss": 0.144, "step": 19742 }, { "epoch": 6.0357688780189545, "grad_norm": 0.7805847525596619, "learning_rate": 2.4954354379856483e-05, "loss": 0.123, "step": 19743 }, { "epoch": 6.0360745949251, "grad_norm": 0.5085725784301758, "learning_rate": 2.495392976943654e-05, "loss": 0.1498, "step": 19744 }, { "epoch": 6.036380311831245, "grad_norm": 2.349203109741211, "learning_rate": 2.4953505159016603e-05, "loss": 0.1417, "step": 19745 }, { "epoch": 6.036686028737389, "grad_norm": 0.6316443085670471, "learning_rate": 2.4953080548596662e-05, "loss": 0.1751, "step": 19746 }, { "epoch": 6.036991745643534, "grad_norm": 2.451103925704956, "learning_rate": 2.4952655938176724e-05, "loss": 0.1862, "step": 19747 }, { "epoch": 6.037297462549679, "grad_norm": 0.5679850578308105, "learning_rate": 2.4952231327756783e-05, "loss": 0.1353, "step": 19748 }, { "epoch": 6.037603179455824, "grad_norm": 0.8942867517471313, "learning_rate": 2.495180671733684e-05, "loss": 0.1907, "step": 19749 }, { "epoch": 6.0379088963619685, "grad_norm": 2.3785934448242188, "learning_rate": 2.4951382106916904e-05, "loss": 0.1995, "step": 19750 }, { "epoch": 6.038214613268114, "grad_norm": 1.1997150182724, "learning_rate": 2.4950957496496962e-05, "loss": 0.2144, "step": 19751 }, { "epoch": 6.038520330174259, "grad_norm": 0.43878063559532166, "learning_rate": 2.4950532886077024e-05, "loss": 0.1269, "step": 19752 }, { "epoch": 6.038826047080404, "grad_norm": 0.19527341425418854, "learning_rate": 2.4950108275657083e-05, "loss": 0.0591, "step": 19753 }, { "epoch": 6.039131763986548, "grad_norm": 0.15989787876605988, "learning_rate": 2.4949683665237145e-05, "loss": 0.0627, "step": 19754 }, { "epoch": 6.039437480892693, "grad_norm": 0.2767657935619354, "learning_rate": 2.4949259054817204e-05, "loss": 0.0558, "step": 19755 }, { "epoch": 6.039743197798838, "grad_norm": 0.17647336423397064, "learning_rate": 2.4948834444397266e-05, "loss": 0.0372, "step": 19756 }, { "epoch": 6.040048914704983, "grad_norm": 0.3467719852924347, "learning_rate": 2.4948409833977325e-05, "loss": 0.0568, "step": 19757 }, { "epoch": 6.0403546316111285, "grad_norm": 0.2675173580646515, "learning_rate": 2.4947985223557387e-05, "loss": 0.0503, "step": 19758 }, { "epoch": 6.040660348517273, "grad_norm": 0.1708517223596573, "learning_rate": 2.4947560613137445e-05, "loss": 0.0476, "step": 19759 }, { "epoch": 6.040966065423418, "grad_norm": 0.3023242652416229, "learning_rate": 2.4947136002717508e-05, "loss": 0.0975, "step": 19760 }, { "epoch": 6.041271782329563, "grad_norm": 0.2691199779510498, "learning_rate": 2.4946711392297566e-05, "loss": 0.0667, "step": 19761 }, { "epoch": 6.041577499235708, "grad_norm": 0.25243616104125977, "learning_rate": 2.4946286781877625e-05, "loss": 0.0666, "step": 19762 }, { "epoch": 6.041883216141852, "grad_norm": 0.2543877959251404, "learning_rate": 2.4945862171457687e-05, "loss": 0.0558, "step": 19763 }, { "epoch": 6.042188933047997, "grad_norm": 1.8872523307800293, "learning_rate": 2.4945437561037746e-05, "loss": 0.0901, "step": 19764 }, { "epoch": 6.0424946499541425, "grad_norm": 0.3511504530906677, "learning_rate": 2.4945012950617808e-05, "loss": 0.096, "step": 19765 }, { "epoch": 6.042800366860288, "grad_norm": 0.496861070394516, "learning_rate": 2.4944588340197867e-05, "loss": 0.1115, "step": 19766 }, { "epoch": 6.043106083766432, "grad_norm": 1.546974778175354, "learning_rate": 2.494416372977793e-05, "loss": 0.1153, "step": 19767 }, { "epoch": 6.043411800672577, "grad_norm": 0.6918210387229919, "learning_rate": 2.4943739119357987e-05, "loss": 0.1652, "step": 19768 }, { "epoch": 6.043717517578722, "grad_norm": 1.2330892086029053, "learning_rate": 2.494331450893805e-05, "loss": 0.1484, "step": 19769 }, { "epoch": 6.044023234484867, "grad_norm": 0.9603457450866699, "learning_rate": 2.4942889898518108e-05, "loss": 0.1517, "step": 19770 }, { "epoch": 6.044328951391012, "grad_norm": 0.6684256196022034, "learning_rate": 2.494246528809817e-05, "loss": 0.1235, "step": 19771 }, { "epoch": 6.044634668297157, "grad_norm": 3.0305638313293457, "learning_rate": 2.4942040677678232e-05, "loss": 0.1992, "step": 19772 }, { "epoch": 6.044940385203302, "grad_norm": 0.9363853335380554, "learning_rate": 2.4941616067258294e-05, "loss": 0.1709, "step": 19773 }, { "epoch": 6.045246102109447, "grad_norm": 1.250166893005371, "learning_rate": 2.4941191456838353e-05, "loss": 0.2289, "step": 19774 }, { "epoch": 6.045551819015592, "grad_norm": 0.9464903473854065, "learning_rate": 2.4940766846418412e-05, "loss": 0.1791, "step": 19775 }, { "epoch": 6.045857535921736, "grad_norm": 0.9455187320709229, "learning_rate": 2.4940342235998474e-05, "loss": 0.226, "step": 19776 }, { "epoch": 6.046163252827881, "grad_norm": 0.4738786220550537, "learning_rate": 2.4939917625578533e-05, "loss": 0.141, "step": 19777 }, { "epoch": 6.046468969734026, "grad_norm": 0.3402746617794037, "learning_rate": 2.4939493015158595e-05, "loss": 0.0618, "step": 19778 }, { "epoch": 6.0467746866401715, "grad_norm": 0.41399553418159485, "learning_rate": 2.4939068404738653e-05, "loss": 0.0687, "step": 19779 }, { "epoch": 6.047080403546316, "grad_norm": 0.20001734793186188, "learning_rate": 2.4938643794318715e-05, "loss": 0.0727, "step": 19780 }, { "epoch": 6.047386120452461, "grad_norm": 0.18454477190971375, "learning_rate": 2.4938219183898774e-05, "loss": 0.0454, "step": 19781 }, { "epoch": 6.047691837358606, "grad_norm": 0.46053823828697205, "learning_rate": 2.4937794573478836e-05, "loss": 0.0387, "step": 19782 }, { "epoch": 6.047997554264751, "grad_norm": 1.17256498336792, "learning_rate": 2.4937369963058895e-05, "loss": 0.0765, "step": 19783 }, { "epoch": 6.048303271170896, "grad_norm": 0.34308114647865295, "learning_rate": 2.4936945352638957e-05, "loss": 0.0422, "step": 19784 }, { "epoch": 6.04860898807704, "grad_norm": 0.3065820336341858, "learning_rate": 2.4936520742219016e-05, "loss": 0.0607, "step": 19785 }, { "epoch": 6.0489147049831855, "grad_norm": 0.5873053073883057, "learning_rate": 2.4936096131799078e-05, "loss": 0.0666, "step": 19786 }, { "epoch": 6.049220421889331, "grad_norm": 1.4158662557601929, "learning_rate": 2.4935671521379136e-05, "loss": 0.1312, "step": 19787 }, { "epoch": 6.049526138795476, "grad_norm": 0.43408331274986267, "learning_rate": 2.4935246910959195e-05, "loss": 0.0935, "step": 19788 }, { "epoch": 6.04983185570162, "grad_norm": 0.26505404710769653, "learning_rate": 2.4934822300539257e-05, "loss": 0.0631, "step": 19789 }, { "epoch": 6.050137572607765, "grad_norm": 0.43603190779685974, "learning_rate": 2.4934397690119316e-05, "loss": 0.1194, "step": 19790 }, { "epoch": 6.05044328951391, "grad_norm": 0.8014103174209595, "learning_rate": 2.4933973079699378e-05, "loss": 0.1206, "step": 19791 }, { "epoch": 6.050749006420055, "grad_norm": 0.753441333770752, "learning_rate": 2.4933548469279437e-05, "loss": 0.172, "step": 19792 }, { "epoch": 6.0510547233261995, "grad_norm": 0.6911348104476929, "learning_rate": 2.49331238588595e-05, "loss": 0.1219, "step": 19793 }, { "epoch": 6.051360440232345, "grad_norm": 1.0179394483566284, "learning_rate": 2.4932699248439558e-05, "loss": 0.1459, "step": 19794 }, { "epoch": 6.05166615713849, "grad_norm": 1.9344879388809204, "learning_rate": 2.493227463801962e-05, "loss": 0.165, "step": 19795 }, { "epoch": 6.051971874044635, "grad_norm": 1.6405668258666992, "learning_rate": 2.493185002759968e-05, "loss": 0.1779, "step": 19796 }, { "epoch": 6.05227759095078, "grad_norm": 2.2410714626312256, "learning_rate": 2.493142541717974e-05, "loss": 0.1494, "step": 19797 }, { "epoch": 6.052583307856924, "grad_norm": 0.4794202744960785, "learning_rate": 2.49310008067598e-05, "loss": 0.1646, "step": 19798 }, { "epoch": 6.052889024763069, "grad_norm": 0.9932767748832703, "learning_rate": 2.493057619633986e-05, "loss": 0.1598, "step": 19799 }, { "epoch": 6.0531947416692145, "grad_norm": 1.390828013420105, "learning_rate": 2.493015158591992e-05, "loss": 0.1612, "step": 19800 }, { "epoch": 6.05350045857536, "grad_norm": 1.5844480991363525, "learning_rate": 2.492972697549998e-05, "loss": 0.1972, "step": 19801 }, { "epoch": 6.053806175481504, "grad_norm": 0.43144652247428894, "learning_rate": 2.492930236508004e-05, "loss": 0.1447, "step": 19802 }, { "epoch": 6.054111892387649, "grad_norm": 0.17638416588306427, "learning_rate": 2.49288777546601e-05, "loss": 0.0595, "step": 19803 }, { "epoch": 6.054417609293794, "grad_norm": 0.8418397903442383, "learning_rate": 2.492845314424016e-05, "loss": 0.0646, "step": 19804 }, { "epoch": 6.054723326199939, "grad_norm": 0.45638126134872437, "learning_rate": 2.492802853382022e-05, "loss": 0.0528, "step": 19805 }, { "epoch": 6.055029043106083, "grad_norm": 0.14947156608104706, "learning_rate": 2.4927603923400282e-05, "loss": 0.0542, "step": 19806 }, { "epoch": 6.0553347600122285, "grad_norm": 0.500843346118927, "learning_rate": 2.492717931298034e-05, "loss": 0.0581, "step": 19807 }, { "epoch": 6.055640476918374, "grad_norm": 1.773140549659729, "learning_rate": 2.4926754702560403e-05, "loss": 0.0531, "step": 19808 }, { "epoch": 6.055946193824519, "grad_norm": 0.9463527202606201, "learning_rate": 2.4926330092140462e-05, "loss": 0.0585, "step": 19809 }, { "epoch": 6.056251910730664, "grad_norm": 0.541976273059845, "learning_rate": 2.4925905481720524e-05, "loss": 0.0774, "step": 19810 }, { "epoch": 6.056557627636808, "grad_norm": 0.5023859143257141, "learning_rate": 2.4925480871300583e-05, "loss": 0.0808, "step": 19811 }, { "epoch": 6.056863344542953, "grad_norm": 0.3275667428970337, "learning_rate": 2.4925056260880645e-05, "loss": 0.0721, "step": 19812 }, { "epoch": 6.057169061449098, "grad_norm": 0.2444440722465515, "learning_rate": 2.4924631650460703e-05, "loss": 0.0739, "step": 19813 }, { "epoch": 6.057474778355243, "grad_norm": 0.3330933749675751, "learning_rate": 2.4924207040040762e-05, "loss": 0.077, "step": 19814 }, { "epoch": 6.057780495261388, "grad_norm": 0.3769465684890747, "learning_rate": 2.4923782429620824e-05, "loss": 0.1072, "step": 19815 }, { "epoch": 6.058086212167533, "grad_norm": 0.5746341943740845, "learning_rate": 2.4923357819200883e-05, "loss": 0.1163, "step": 19816 }, { "epoch": 6.058391929073678, "grad_norm": 0.5680860280990601, "learning_rate": 2.4922933208780945e-05, "loss": 0.1407, "step": 19817 }, { "epoch": 6.058697645979823, "grad_norm": 0.4831336736679077, "learning_rate": 2.4922508598361004e-05, "loss": 0.154, "step": 19818 }, { "epoch": 6.059003362885967, "grad_norm": 0.6971529126167297, "learning_rate": 2.4922083987941066e-05, "loss": 0.1616, "step": 19819 }, { "epoch": 6.059309079792112, "grad_norm": 0.5299879908561707, "learning_rate": 2.4921659377521124e-05, "loss": 0.1596, "step": 19820 }, { "epoch": 6.059614796698257, "grad_norm": 0.47773125767707825, "learning_rate": 2.4921234767101187e-05, "loss": 0.1534, "step": 19821 }, { "epoch": 6.0599205136044025, "grad_norm": 1.2608202695846558, "learning_rate": 2.4920810156681245e-05, "loss": 0.1741, "step": 19822 }, { "epoch": 6.060226230510548, "grad_norm": 1.481078028678894, "learning_rate": 2.4920385546261307e-05, "loss": 0.1738, "step": 19823 }, { "epoch": 6.060531947416692, "grad_norm": 0.9232473373413086, "learning_rate": 2.4919960935841366e-05, "loss": 0.1393, "step": 19824 }, { "epoch": 6.060837664322837, "grad_norm": 0.9329032301902771, "learning_rate": 2.4919536325421428e-05, "loss": 0.1538, "step": 19825 }, { "epoch": 6.061143381228982, "grad_norm": 3.004714012145996, "learning_rate": 2.4919111715001487e-05, "loss": 0.2318, "step": 19826 }, { "epoch": 6.061449098135127, "grad_norm": 0.36691975593566895, "learning_rate": 2.4918687104581545e-05, "loss": 0.1217, "step": 19827 }, { "epoch": 6.0617548150412714, "grad_norm": 0.19279059767723083, "learning_rate": 2.4918262494161608e-05, "loss": 0.0668, "step": 19828 }, { "epoch": 6.062060531947417, "grad_norm": 0.1994250863790512, "learning_rate": 2.4917837883741666e-05, "loss": 0.0897, "step": 19829 }, { "epoch": 6.062366248853562, "grad_norm": 0.17671608924865723, "learning_rate": 2.491741327332173e-05, "loss": 0.06, "step": 19830 }, { "epoch": 6.062671965759707, "grad_norm": 0.22143885493278503, "learning_rate": 2.4916988662901787e-05, "loss": 0.0505, "step": 19831 }, { "epoch": 6.062977682665851, "grad_norm": 1.8278026580810547, "learning_rate": 2.491656405248185e-05, "loss": 0.0655, "step": 19832 }, { "epoch": 6.063283399571996, "grad_norm": 0.381874680519104, "learning_rate": 2.4916139442061908e-05, "loss": 0.054, "step": 19833 }, { "epoch": 6.063589116478141, "grad_norm": 0.6876820921897888, "learning_rate": 2.491571483164197e-05, "loss": 0.0633, "step": 19834 }, { "epoch": 6.063894833384286, "grad_norm": 0.30713239312171936, "learning_rate": 2.491529022122203e-05, "loss": 0.0532, "step": 19835 }, { "epoch": 6.0642005502904315, "grad_norm": 0.3278258740901947, "learning_rate": 2.491486561080209e-05, "loss": 0.0816, "step": 19836 }, { "epoch": 6.064506267196576, "grad_norm": 0.604788064956665, "learning_rate": 2.491444100038215e-05, "loss": 0.0867, "step": 19837 }, { "epoch": 6.064811984102721, "grad_norm": 0.29718002676963806, "learning_rate": 2.4914016389962208e-05, "loss": 0.0531, "step": 19838 }, { "epoch": 6.065117701008866, "grad_norm": 0.22023062407970428, "learning_rate": 2.491359177954227e-05, "loss": 0.0614, "step": 19839 }, { "epoch": 6.065423417915011, "grad_norm": 0.3811971843242645, "learning_rate": 2.491316716912233e-05, "loss": 0.1128, "step": 19840 }, { "epoch": 6.065729134821155, "grad_norm": 0.4026639461517334, "learning_rate": 2.491274255870239e-05, "loss": 0.1012, "step": 19841 }, { "epoch": 6.0660348517273, "grad_norm": 0.6640880703926086, "learning_rate": 2.491231794828245e-05, "loss": 0.1514, "step": 19842 }, { "epoch": 6.0663405686334455, "grad_norm": 0.6493112444877625, "learning_rate": 2.4911893337862512e-05, "loss": 0.129, "step": 19843 }, { "epoch": 6.066646285539591, "grad_norm": 0.5000889897346497, "learning_rate": 2.491146872744257e-05, "loss": 0.1587, "step": 19844 }, { "epoch": 6.066952002445735, "grad_norm": 0.42486119270324707, "learning_rate": 2.4911044117022633e-05, "loss": 0.1476, "step": 19845 }, { "epoch": 6.06725771935188, "grad_norm": 0.6946542859077454, "learning_rate": 2.491061950660269e-05, "loss": 0.1411, "step": 19846 }, { "epoch": 6.067563436258025, "grad_norm": 0.5544107556343079, "learning_rate": 2.4910194896182753e-05, "loss": 0.1221, "step": 19847 }, { "epoch": 6.06786915316417, "grad_norm": 0.6024075746536255, "learning_rate": 2.4909770285762812e-05, "loss": 0.1838, "step": 19848 }, { "epoch": 6.068174870070315, "grad_norm": 1.319591760635376, "learning_rate": 2.4909345675342874e-05, "loss": 0.1949, "step": 19849 }, { "epoch": 6.0684805869764595, "grad_norm": 0.8768177032470703, "learning_rate": 2.4908921064922933e-05, "loss": 0.1782, "step": 19850 }, { "epoch": 6.068786303882605, "grad_norm": 1.6386322975158691, "learning_rate": 2.490849645450299e-05, "loss": 0.2144, "step": 19851 }, { "epoch": 6.06909202078875, "grad_norm": 0.40137630701065063, "learning_rate": 2.4908071844083054e-05, "loss": 0.1481, "step": 19852 }, { "epoch": 6.069397737694895, "grad_norm": 0.17534969747066498, "learning_rate": 2.4907647233663112e-05, "loss": 0.0551, "step": 19853 }, { "epoch": 6.069703454601039, "grad_norm": 0.21559211611747742, "learning_rate": 2.4907222623243174e-05, "loss": 0.0613, "step": 19854 }, { "epoch": 6.070009171507184, "grad_norm": 0.12090151757001877, "learning_rate": 2.4906798012823233e-05, "loss": 0.0519, "step": 19855 }, { "epoch": 6.070314888413329, "grad_norm": 0.18735161423683167, "learning_rate": 2.4906373402403295e-05, "loss": 0.0453, "step": 19856 }, { "epoch": 6.0706206053194744, "grad_norm": 0.28165388107299805, "learning_rate": 2.4905948791983354e-05, "loss": 0.0597, "step": 19857 }, { "epoch": 6.070926322225619, "grad_norm": 0.8542152047157288, "learning_rate": 2.4905524181563416e-05, "loss": 0.065, "step": 19858 }, { "epoch": 6.071232039131764, "grad_norm": 0.32240030169487, "learning_rate": 2.4905099571143475e-05, "loss": 0.0676, "step": 19859 }, { "epoch": 6.071537756037909, "grad_norm": 0.3412214517593384, "learning_rate": 2.4904674960723537e-05, "loss": 0.109, "step": 19860 }, { "epoch": 6.071843472944054, "grad_norm": 0.3560123145580292, "learning_rate": 2.4904250350303595e-05, "loss": 0.0659, "step": 19861 }, { "epoch": 6.072149189850199, "grad_norm": 0.3535694479942322, "learning_rate": 2.4903825739883658e-05, "loss": 0.0949, "step": 19862 }, { "epoch": 6.072454906756343, "grad_norm": 0.29662221670150757, "learning_rate": 2.4903401129463716e-05, "loss": 0.0523, "step": 19863 }, { "epoch": 6.0727606236624885, "grad_norm": 0.28648024797439575, "learning_rate": 2.4902976519043775e-05, "loss": 0.0706, "step": 19864 }, { "epoch": 6.073066340568634, "grad_norm": 0.4250667095184326, "learning_rate": 2.4902551908623837e-05, "loss": 0.1296, "step": 19865 }, { "epoch": 6.073372057474779, "grad_norm": 0.5882096290588379, "learning_rate": 2.4902127298203896e-05, "loss": 0.1109, "step": 19866 }, { "epoch": 6.073677774380923, "grad_norm": 1.104628562927246, "learning_rate": 2.4901702687783958e-05, "loss": 0.15, "step": 19867 }, { "epoch": 6.073983491287068, "grad_norm": 0.7408469915390015, "learning_rate": 2.4901278077364017e-05, "loss": 0.1437, "step": 19868 }, { "epoch": 6.074289208193213, "grad_norm": 0.5308287739753723, "learning_rate": 2.490085346694408e-05, "loss": 0.1406, "step": 19869 }, { "epoch": 6.074594925099358, "grad_norm": 0.7462009191513062, "learning_rate": 2.4900428856524137e-05, "loss": 0.1521, "step": 19870 }, { "epoch": 6.0749006420055025, "grad_norm": 1.3398385047912598, "learning_rate": 2.49000042461042e-05, "loss": 0.1541, "step": 19871 }, { "epoch": 6.075206358911648, "grad_norm": 0.808414101600647, "learning_rate": 2.4899579635684258e-05, "loss": 0.1727, "step": 19872 }, { "epoch": 6.075512075817793, "grad_norm": 0.5440101623535156, "learning_rate": 2.489915502526432e-05, "loss": 0.1652, "step": 19873 }, { "epoch": 6.075817792723938, "grad_norm": 0.7239301800727844, "learning_rate": 2.4898730414844382e-05, "loss": 0.1892, "step": 19874 }, { "epoch": 6.076123509630083, "grad_norm": 1.8097336292266846, "learning_rate": 2.4898305804424444e-05, "loss": 0.185, "step": 19875 }, { "epoch": 6.076429226536227, "grad_norm": 1.2732813358306885, "learning_rate": 2.4897881194004503e-05, "loss": 0.2367, "step": 19876 }, { "epoch": 6.076734943442372, "grad_norm": 0.6691729426383972, "learning_rate": 2.4897456583584562e-05, "loss": 0.1315, "step": 19877 }, { "epoch": 6.077040660348517, "grad_norm": 0.5767754912376404, "learning_rate": 2.4897031973164624e-05, "loss": 0.0841, "step": 19878 }, { "epoch": 6.0773463772546625, "grad_norm": 0.2968657612800598, "learning_rate": 2.4896607362744683e-05, "loss": 0.066, "step": 19879 }, { "epoch": 6.077652094160807, "grad_norm": 0.18441042304039001, "learning_rate": 2.4896182752324745e-05, "loss": 0.0528, "step": 19880 }, { "epoch": 6.077957811066952, "grad_norm": 0.1822187453508377, "learning_rate": 2.4895758141904803e-05, "loss": 0.063, "step": 19881 }, { "epoch": 6.078263527973097, "grad_norm": 0.1468658149242401, "learning_rate": 2.4895333531484865e-05, "loss": 0.0329, "step": 19882 }, { "epoch": 6.078569244879242, "grad_norm": 0.3944590985774994, "learning_rate": 2.4894908921064924e-05, "loss": 0.0654, "step": 19883 }, { "epoch": 6.078874961785386, "grad_norm": 0.15712283551692963, "learning_rate": 2.4894484310644986e-05, "loss": 0.0679, "step": 19884 }, { "epoch": 6.0791806786915314, "grad_norm": 0.2002270668745041, "learning_rate": 2.4894059700225045e-05, "loss": 0.0482, "step": 19885 }, { "epoch": 6.079486395597677, "grad_norm": 0.19557620584964752, "learning_rate": 2.4893635089805107e-05, "loss": 0.0767, "step": 19886 }, { "epoch": 6.079792112503822, "grad_norm": 0.24094459414482117, "learning_rate": 2.4893210479385166e-05, "loss": 0.0885, "step": 19887 }, { "epoch": 6.080097829409967, "grad_norm": 0.25817742943763733, "learning_rate": 2.4892785868965228e-05, "loss": 0.0734, "step": 19888 }, { "epoch": 6.080403546316111, "grad_norm": 0.4193473160266876, "learning_rate": 2.4892361258545287e-05, "loss": 0.1027, "step": 19889 }, { "epoch": 6.080709263222256, "grad_norm": 0.221846804022789, "learning_rate": 2.4891936648125345e-05, "loss": 0.0992, "step": 19890 }, { "epoch": 6.081014980128401, "grad_norm": 0.3911884129047394, "learning_rate": 2.4891512037705407e-05, "loss": 0.1077, "step": 19891 }, { "epoch": 6.081320697034546, "grad_norm": 0.33835628628730774, "learning_rate": 2.4891087427285466e-05, "loss": 0.1288, "step": 19892 }, { "epoch": 6.081626413940691, "grad_norm": 0.6224759221076965, "learning_rate": 2.4890662816865528e-05, "loss": 0.1402, "step": 19893 }, { "epoch": 6.081932130846836, "grad_norm": 0.8952966332435608, "learning_rate": 2.4890238206445587e-05, "loss": 0.1304, "step": 19894 }, { "epoch": 6.082237847752981, "grad_norm": 0.6202644109725952, "learning_rate": 2.488981359602565e-05, "loss": 0.1682, "step": 19895 }, { "epoch": 6.082543564659126, "grad_norm": 0.6069774627685547, "learning_rate": 2.4889388985605708e-05, "loss": 0.187, "step": 19896 }, { "epoch": 6.08284928156527, "grad_norm": 0.8295747637748718, "learning_rate": 2.488896437518577e-05, "loss": 0.1895, "step": 19897 }, { "epoch": 6.083154998471415, "grad_norm": 0.4835392236709595, "learning_rate": 2.488853976476583e-05, "loss": 0.1586, "step": 19898 }, { "epoch": 6.08346071537756, "grad_norm": 0.5007269382476807, "learning_rate": 2.488811515434589e-05, "loss": 0.1604, "step": 19899 }, { "epoch": 6.0837664322837055, "grad_norm": 5.311293125152588, "learning_rate": 2.488769054392595e-05, "loss": 0.1644, "step": 19900 }, { "epoch": 6.084072149189851, "grad_norm": 1.0553345680236816, "learning_rate": 2.488726593350601e-05, "loss": 0.203, "step": 19901 }, { "epoch": 6.084377866095995, "grad_norm": 0.3781841993331909, "learning_rate": 2.488684132308607e-05, "loss": 0.1333, "step": 19902 }, { "epoch": 6.08468358300214, "grad_norm": 0.18943357467651367, "learning_rate": 2.488641671266613e-05, "loss": 0.077, "step": 19903 }, { "epoch": 6.084989299908285, "grad_norm": 0.3493582606315613, "learning_rate": 2.488599210224619e-05, "loss": 0.0644, "step": 19904 }, { "epoch": 6.08529501681443, "grad_norm": 0.2636881172657013, "learning_rate": 2.488556749182625e-05, "loss": 0.0722, "step": 19905 }, { "epoch": 6.085600733720574, "grad_norm": 0.5306391716003418, "learning_rate": 2.488514288140631e-05, "loss": 0.0285, "step": 19906 }, { "epoch": 6.0859064506267195, "grad_norm": 0.30418357253074646, "learning_rate": 2.488471827098637e-05, "loss": 0.065, "step": 19907 }, { "epoch": 6.086212167532865, "grad_norm": 0.40167877078056335, "learning_rate": 2.4884293660566432e-05, "loss": 0.053, "step": 19908 }, { "epoch": 6.08651788443901, "grad_norm": 0.30347880721092224, "learning_rate": 2.488386905014649e-05, "loss": 0.0546, "step": 19909 }, { "epoch": 6.086823601345154, "grad_norm": 1.325457215309143, "learning_rate": 2.4883444439726553e-05, "loss": 0.0556, "step": 19910 }, { "epoch": 6.087129318251299, "grad_norm": 0.31938791275024414, "learning_rate": 2.4883019829306612e-05, "loss": 0.046, "step": 19911 }, { "epoch": 6.087435035157444, "grad_norm": 1.9249578714370728, "learning_rate": 2.4882595218886674e-05, "loss": 0.0931, "step": 19912 }, { "epoch": 6.087740752063589, "grad_norm": 0.1830013394355774, "learning_rate": 2.4882170608466733e-05, "loss": 0.0511, "step": 19913 }, { "epoch": 6.0880464689697344, "grad_norm": 0.2424648106098175, "learning_rate": 2.4881745998046795e-05, "loss": 0.0921, "step": 19914 }, { "epoch": 6.088352185875879, "grad_norm": 0.4016853868961334, "learning_rate": 2.4881321387626853e-05, "loss": 0.0955, "step": 19915 }, { "epoch": 6.088657902782024, "grad_norm": 0.7585169076919556, "learning_rate": 2.4880896777206912e-05, "loss": 0.1305, "step": 19916 }, { "epoch": 6.088963619688169, "grad_norm": 0.6287317872047424, "learning_rate": 2.4880472166786974e-05, "loss": 0.1423, "step": 19917 }, { "epoch": 6.089269336594314, "grad_norm": 0.5696415305137634, "learning_rate": 2.4880047556367033e-05, "loss": 0.1295, "step": 19918 }, { "epoch": 6.089575053500458, "grad_norm": 0.6140748262405396, "learning_rate": 2.4879622945947095e-05, "loss": 0.1315, "step": 19919 }, { "epoch": 6.089880770406603, "grad_norm": 0.6834477186203003, "learning_rate": 2.4879198335527154e-05, "loss": 0.1596, "step": 19920 }, { "epoch": 6.0901864873127485, "grad_norm": 0.5184687972068787, "learning_rate": 2.4878773725107216e-05, "loss": 0.1939, "step": 19921 }, { "epoch": 6.090492204218894, "grad_norm": 0.6512826085090637, "learning_rate": 2.4878349114687274e-05, "loss": 0.1552, "step": 19922 }, { "epoch": 6.090797921125038, "grad_norm": 1.5190485715866089, "learning_rate": 2.4877924504267337e-05, "loss": 0.1739, "step": 19923 }, { "epoch": 6.091103638031183, "grad_norm": 1.1432491540908813, "learning_rate": 2.4877499893847395e-05, "loss": 0.1626, "step": 19924 }, { "epoch": 6.091409354937328, "grad_norm": 1.1763533353805542, "learning_rate": 2.4877075283427457e-05, "loss": 0.1918, "step": 19925 }, { "epoch": 6.091715071843473, "grad_norm": 0.6478255987167358, "learning_rate": 2.4876650673007516e-05, "loss": 0.1736, "step": 19926 }, { "epoch": 6.092020788749618, "grad_norm": 0.4107470214366913, "learning_rate": 2.4876226062587578e-05, "loss": 0.1285, "step": 19927 }, { "epoch": 6.0923265056557625, "grad_norm": 0.3474617600440979, "learning_rate": 2.4875801452167637e-05, "loss": 0.0837, "step": 19928 }, { "epoch": 6.092632222561908, "grad_norm": 0.19752146303653717, "learning_rate": 2.4875376841747696e-05, "loss": 0.0623, "step": 19929 }, { "epoch": 6.092937939468053, "grad_norm": 0.18005865812301636, "learning_rate": 2.4874952231327758e-05, "loss": 0.0534, "step": 19930 }, { "epoch": 6.093243656374198, "grad_norm": 0.1684458702802658, "learning_rate": 2.4874527620907816e-05, "loss": 0.0308, "step": 19931 }, { "epoch": 6.093549373280342, "grad_norm": 1.1553531885147095, "learning_rate": 2.487410301048788e-05, "loss": 0.0567, "step": 19932 }, { "epoch": 6.093855090186487, "grad_norm": 0.33901405334472656, "learning_rate": 2.4873678400067937e-05, "loss": 0.0566, "step": 19933 }, { "epoch": 6.094160807092632, "grad_norm": 0.29207420349121094, "learning_rate": 2.4873253789648e-05, "loss": 0.0677, "step": 19934 }, { "epoch": 6.094466523998777, "grad_norm": 0.34399598836898804, "learning_rate": 2.4872829179228058e-05, "loss": 0.0634, "step": 19935 }, { "epoch": 6.094772240904922, "grad_norm": 0.26935985684394836, "learning_rate": 2.487240456880812e-05, "loss": 0.0607, "step": 19936 }, { "epoch": 6.095077957811067, "grad_norm": 0.2906731367111206, "learning_rate": 2.487197995838818e-05, "loss": 0.0836, "step": 19937 }, { "epoch": 6.095383674717212, "grad_norm": 1.903587818145752, "learning_rate": 2.487155534796824e-05, "loss": 0.0724, "step": 19938 }, { "epoch": 6.095689391623357, "grad_norm": 0.5834397077560425, "learning_rate": 2.48711307375483e-05, "loss": 0.0677, "step": 19939 }, { "epoch": 6.095995108529502, "grad_norm": 0.6854841113090515, "learning_rate": 2.487070612712836e-05, "loss": 0.1433, "step": 19940 }, { "epoch": 6.096300825435646, "grad_norm": 0.5331303477287292, "learning_rate": 2.487028151670842e-05, "loss": 0.094, "step": 19941 }, { "epoch": 6.0966065423417914, "grad_norm": 0.4218860864639282, "learning_rate": 2.486985690628848e-05, "loss": 0.1291, "step": 19942 }, { "epoch": 6.096912259247937, "grad_norm": 1.2486780881881714, "learning_rate": 2.486943229586854e-05, "loss": 0.1637, "step": 19943 }, { "epoch": 6.097217976154082, "grad_norm": 1.419704794883728, "learning_rate": 2.48690076854486e-05, "loss": 0.1519, "step": 19944 }, { "epoch": 6.097523693060226, "grad_norm": 0.5460822582244873, "learning_rate": 2.4868583075028662e-05, "loss": 0.1408, "step": 19945 }, { "epoch": 6.097829409966371, "grad_norm": 0.7606019377708435, "learning_rate": 2.486815846460872e-05, "loss": 0.1483, "step": 19946 }, { "epoch": 6.098135126872516, "grad_norm": 0.43589282035827637, "learning_rate": 2.4867733854188783e-05, "loss": 0.1537, "step": 19947 }, { "epoch": 6.098440843778661, "grad_norm": 3.147671937942505, "learning_rate": 2.486730924376884e-05, "loss": 0.1668, "step": 19948 }, { "epoch": 6.0987465606848055, "grad_norm": 0.9603869318962097, "learning_rate": 2.4866884633348903e-05, "loss": 0.1612, "step": 19949 }, { "epoch": 6.099052277590951, "grad_norm": 1.0561351776123047, "learning_rate": 2.4866460022928962e-05, "loss": 0.1757, "step": 19950 }, { "epoch": 6.099357994497096, "grad_norm": 2.0390894412994385, "learning_rate": 2.4866035412509024e-05, "loss": 0.1951, "step": 19951 }, { "epoch": 6.099663711403241, "grad_norm": 0.5840420126914978, "learning_rate": 2.4865610802089083e-05, "loss": 0.1288, "step": 19952 }, { "epoch": 6.099969428309386, "grad_norm": 0.32540029287338257, "learning_rate": 2.486518619166914e-05, "loss": 0.0815, "step": 19953 }, { "epoch": 6.10027514521553, "grad_norm": 0.25858598947525024, "learning_rate": 2.4864761581249204e-05, "loss": 0.0782, "step": 19954 }, { "epoch": 6.100580862121675, "grad_norm": 0.5324111580848694, "learning_rate": 2.4864336970829262e-05, "loss": 0.0664, "step": 19955 }, { "epoch": 6.10088657902782, "grad_norm": 0.19806672632694244, "learning_rate": 2.4863912360409324e-05, "loss": 0.0462, "step": 19956 }, { "epoch": 6.1011922959339655, "grad_norm": 0.4267543852329254, "learning_rate": 2.4863487749989383e-05, "loss": 0.0554, "step": 19957 }, { "epoch": 6.10149801284011, "grad_norm": 0.42229020595550537, "learning_rate": 2.4863063139569445e-05, "loss": 0.0633, "step": 19958 }, { "epoch": 6.101803729746255, "grad_norm": 0.31061115860939026, "learning_rate": 2.4862638529149504e-05, "loss": 0.0691, "step": 19959 }, { "epoch": 6.1021094466524, "grad_norm": 0.2036275714635849, "learning_rate": 2.4862213918729566e-05, "loss": 0.0616, "step": 19960 }, { "epoch": 6.102415163558545, "grad_norm": 0.2506101131439209, "learning_rate": 2.4861789308309625e-05, "loss": 0.0645, "step": 19961 }, { "epoch": 6.102720880464689, "grad_norm": 0.2963463068008423, "learning_rate": 2.4861364697889687e-05, "loss": 0.092, "step": 19962 }, { "epoch": 6.103026597370834, "grad_norm": 0.3584853410720825, "learning_rate": 2.4860940087469746e-05, "loss": 0.0885, "step": 19963 }, { "epoch": 6.1033323142769795, "grad_norm": 0.30029886960983276, "learning_rate": 2.4860515477049808e-05, "loss": 0.0862, "step": 19964 }, { "epoch": 6.103638031183125, "grad_norm": 0.410064697265625, "learning_rate": 2.4860090866629866e-05, "loss": 0.1131, "step": 19965 }, { "epoch": 6.10394374808927, "grad_norm": 0.4744073748588562, "learning_rate": 2.4859666256209925e-05, "loss": 0.0979, "step": 19966 }, { "epoch": 6.104249464995414, "grad_norm": 0.6344954371452332, "learning_rate": 2.4859241645789987e-05, "loss": 0.13, "step": 19967 }, { "epoch": 6.104555181901559, "grad_norm": 0.739997923374176, "learning_rate": 2.4858817035370046e-05, "loss": 0.1505, "step": 19968 }, { "epoch": 6.104860898807704, "grad_norm": 0.8400346636772156, "learning_rate": 2.4858392424950108e-05, "loss": 0.1793, "step": 19969 }, { "epoch": 6.105166615713849, "grad_norm": 0.9937939643859863, "learning_rate": 2.4857967814530167e-05, "loss": 0.1728, "step": 19970 }, { "epoch": 6.105472332619994, "grad_norm": 1.13655424118042, "learning_rate": 2.485754320411023e-05, "loss": 0.1691, "step": 19971 }, { "epoch": 6.105778049526139, "grad_norm": 0.7149106860160828, "learning_rate": 2.4857118593690287e-05, "loss": 0.1535, "step": 19972 }, { "epoch": 6.106083766432284, "grad_norm": 1.8682109117507935, "learning_rate": 2.485669398327035e-05, "loss": 0.1627, "step": 19973 }, { "epoch": 6.106389483338429, "grad_norm": 0.9349708557128906, "learning_rate": 2.4856269372850408e-05, "loss": 0.1605, "step": 19974 }, { "epoch": 6.106695200244573, "grad_norm": 3.0610692501068115, "learning_rate": 2.485584476243047e-05, "loss": 0.1849, "step": 19975 }, { "epoch": 6.107000917150718, "grad_norm": 1.691949725151062, "learning_rate": 2.485542015201053e-05, "loss": 0.1821, "step": 19976 }, { "epoch": 6.107306634056863, "grad_norm": 0.48088592290878296, "learning_rate": 2.485499554159059e-05, "loss": 0.1435, "step": 19977 }, { "epoch": 6.1076123509630085, "grad_norm": 1.1346766948699951, "learning_rate": 2.4854570931170653e-05, "loss": 0.0866, "step": 19978 }, { "epoch": 6.107918067869154, "grad_norm": 0.6547658443450928, "learning_rate": 2.4854146320750712e-05, "loss": 0.0645, "step": 19979 }, { "epoch": 6.108223784775298, "grad_norm": 0.2037203460931778, "learning_rate": 2.4853721710330774e-05, "loss": 0.0522, "step": 19980 }, { "epoch": 6.108529501681443, "grad_norm": 0.1902245432138443, "learning_rate": 2.4853297099910833e-05, "loss": 0.044, "step": 19981 }, { "epoch": 6.108835218587588, "grad_norm": 0.578431248664856, "learning_rate": 2.4852872489490895e-05, "loss": 0.06, "step": 19982 }, { "epoch": 6.109140935493733, "grad_norm": 1.2664282321929932, "learning_rate": 2.4852447879070953e-05, "loss": 0.0329, "step": 19983 }, { "epoch": 6.109446652399877, "grad_norm": 0.12993751466274261, "learning_rate": 2.4852023268651015e-05, "loss": 0.0376, "step": 19984 }, { "epoch": 6.1097523693060225, "grad_norm": 0.4201193153858185, "learning_rate": 2.4851598658231074e-05, "loss": 0.0699, "step": 19985 }, { "epoch": 6.110058086212168, "grad_norm": 0.3284445106983185, "learning_rate": 2.4851174047811136e-05, "loss": 0.0633, "step": 19986 }, { "epoch": 6.110363803118313, "grad_norm": 0.869158923625946, "learning_rate": 2.4850749437391195e-05, "loss": 0.0797, "step": 19987 }, { "epoch": 6.110669520024457, "grad_norm": 0.7455506920814514, "learning_rate": 2.4850324826971257e-05, "loss": 0.054, "step": 19988 }, { "epoch": 6.110975236930602, "grad_norm": 1.1545263528823853, "learning_rate": 2.4849900216551316e-05, "loss": 0.0957, "step": 19989 }, { "epoch": 6.111280953836747, "grad_norm": 0.46228376030921936, "learning_rate": 2.4849475606131378e-05, "loss": 0.1145, "step": 19990 }, { "epoch": 6.111586670742892, "grad_norm": 0.48746687173843384, "learning_rate": 2.4849050995711437e-05, "loss": 0.1184, "step": 19991 }, { "epoch": 6.111892387649037, "grad_norm": 0.6009523868560791, "learning_rate": 2.4848626385291495e-05, "loss": 0.1555, "step": 19992 }, { "epoch": 6.112198104555182, "grad_norm": 0.6588444709777832, "learning_rate": 2.4848201774871557e-05, "loss": 0.2024, "step": 19993 }, { "epoch": 6.112503821461327, "grad_norm": 0.514028787612915, "learning_rate": 2.4847777164451616e-05, "loss": 0.1988, "step": 19994 }, { "epoch": 6.112809538367472, "grad_norm": 0.7283613085746765, "learning_rate": 2.4847352554031678e-05, "loss": 0.1483, "step": 19995 }, { "epoch": 6.113115255273617, "grad_norm": 1.2133303880691528, "learning_rate": 2.4846927943611737e-05, "loss": 0.183, "step": 19996 }, { "epoch": 6.113420972179761, "grad_norm": 0.912172794342041, "learning_rate": 2.48465033331918e-05, "loss": 0.1694, "step": 19997 }, { "epoch": 6.113726689085906, "grad_norm": 0.9513779282569885, "learning_rate": 2.4846078722771858e-05, "loss": 0.1842, "step": 19998 }, { "epoch": 6.1140324059920514, "grad_norm": 2.020568609237671, "learning_rate": 2.484565411235192e-05, "loss": 0.2147, "step": 19999 }, { "epoch": 6.114338122898197, "grad_norm": 1.0235761404037476, "learning_rate": 2.484522950193198e-05, "loss": 0.2498, "step": 20000 }, { "epoch": 6.114338122898197, "eval_cer": 0.18867358406162374, "eval_loss": 0.22898529469966888, "eval_runtime": 19.0149, "eval_samples_per_second": 238.655, "eval_steps_per_second": 0.789, "eval_wer": 0.3290969525990894, "step": 20000 }, { "epoch": 6.114643839804341, "grad_norm": 1.2547979354858398, "learning_rate": 2.484480489151204e-05, "loss": 0.2028, "step": 20001 }, { "epoch": 6.114949556710486, "grad_norm": 0.3573864698410034, "learning_rate": 2.48443802810921e-05, "loss": 0.1321, "step": 20002 }, { "epoch": 6.115255273616631, "grad_norm": 0.2522013783454895, "learning_rate": 2.484395567067216e-05, "loss": 0.0886, "step": 20003 }, { "epoch": 6.115560990522776, "grad_norm": 0.4584152102470398, "learning_rate": 2.484353106025222e-05, "loss": 0.0651, "step": 20004 }, { "epoch": 6.115866707428921, "grad_norm": 0.3608650267124176, "learning_rate": 2.484310644983228e-05, "loss": 0.0537, "step": 20005 }, { "epoch": 6.1161724243350655, "grad_norm": 0.17288556694984436, "learning_rate": 2.484268183941234e-05, "loss": 0.037, "step": 20006 }, { "epoch": 6.116478141241211, "grad_norm": 0.6777356863021851, "learning_rate": 2.48422572289924e-05, "loss": 0.04, "step": 20007 }, { "epoch": 6.116783858147356, "grad_norm": 0.19145022332668304, "learning_rate": 2.484183261857246e-05, "loss": 0.0498, "step": 20008 }, { "epoch": 6.117089575053501, "grad_norm": 0.7982898950576782, "learning_rate": 2.484140800815252e-05, "loss": 0.0738, "step": 20009 }, { "epoch": 6.117395291959645, "grad_norm": 0.3078325688838959, "learning_rate": 2.4840983397732582e-05, "loss": 0.0708, "step": 20010 }, { "epoch": 6.11770100886579, "grad_norm": 0.1905602216720581, "learning_rate": 2.484055878731264e-05, "loss": 0.063, "step": 20011 }, { "epoch": 6.118006725771935, "grad_norm": 1.7849355936050415, "learning_rate": 2.4840134176892703e-05, "loss": 0.0819, "step": 20012 }, { "epoch": 6.11831244267808, "grad_norm": 0.2498578578233719, "learning_rate": 2.4839709566472762e-05, "loss": 0.0681, "step": 20013 }, { "epoch": 6.118618159584225, "grad_norm": 0.5292383432388306, "learning_rate": 2.4839284956052824e-05, "loss": 0.0938, "step": 20014 }, { "epoch": 6.11892387649037, "grad_norm": 1.4859169721603394, "learning_rate": 2.4838860345632883e-05, "loss": 0.1431, "step": 20015 }, { "epoch": 6.119229593396515, "grad_norm": 0.5185948014259338, "learning_rate": 2.4838435735212945e-05, "loss": 0.1372, "step": 20016 }, { "epoch": 6.11953531030266, "grad_norm": 0.9317001700401306, "learning_rate": 2.4838011124793003e-05, "loss": 0.1459, "step": 20017 }, { "epoch": 6.119841027208805, "grad_norm": 0.9072756767272949, "learning_rate": 2.4837586514373062e-05, "loss": 0.1456, "step": 20018 }, { "epoch": 6.120146744114949, "grad_norm": 0.385139137506485, "learning_rate": 2.4837161903953124e-05, "loss": 0.15, "step": 20019 }, { "epoch": 6.120452461021094, "grad_norm": 0.9279235005378723, "learning_rate": 2.4836737293533183e-05, "loss": 0.1477, "step": 20020 }, { "epoch": 6.1207581779272395, "grad_norm": 0.6589357256889343, "learning_rate": 2.4836312683113245e-05, "loss": 0.1946, "step": 20021 }, { "epoch": 6.121063894833385, "grad_norm": 1.8496410846710205, "learning_rate": 2.4835888072693304e-05, "loss": 0.1623, "step": 20022 }, { "epoch": 6.121369611739529, "grad_norm": 0.7936557531356812, "learning_rate": 2.4835463462273366e-05, "loss": 0.1485, "step": 20023 }, { "epoch": 6.121675328645674, "grad_norm": 0.9255039691925049, "learning_rate": 2.4835038851853424e-05, "loss": 0.172, "step": 20024 }, { "epoch": 6.121981045551819, "grad_norm": 0.9961348176002502, "learning_rate": 2.4834614241433487e-05, "loss": 0.2215, "step": 20025 }, { "epoch": 6.122286762457964, "grad_norm": 4.02610445022583, "learning_rate": 2.4834189631013545e-05, "loss": 0.1927, "step": 20026 }, { "epoch": 6.122592479364108, "grad_norm": 0.5254974961280823, "learning_rate": 2.4833765020593607e-05, "loss": 0.1316, "step": 20027 }, { "epoch": 6.122898196270254, "grad_norm": 0.43087247014045715, "learning_rate": 2.4833340410173666e-05, "loss": 0.0748, "step": 20028 }, { "epoch": 6.123203913176399, "grad_norm": 0.19606365263462067, "learning_rate": 2.4832915799753728e-05, "loss": 0.0579, "step": 20029 }, { "epoch": 6.123509630082544, "grad_norm": 0.45763981342315674, "learning_rate": 2.4832491189333787e-05, "loss": 0.052, "step": 20030 }, { "epoch": 6.123815346988689, "grad_norm": 0.17534081637859344, "learning_rate": 2.4832066578913846e-05, "loss": 0.0713, "step": 20031 }, { "epoch": 6.124121063894833, "grad_norm": 0.35705211758613586, "learning_rate": 2.4831641968493908e-05, "loss": 0.0639, "step": 20032 }, { "epoch": 6.124426780800978, "grad_norm": 0.22972489893436432, "learning_rate": 2.4831217358073966e-05, "loss": 0.0413, "step": 20033 }, { "epoch": 6.124732497707123, "grad_norm": 0.5749980807304382, "learning_rate": 2.483079274765403e-05, "loss": 0.0506, "step": 20034 }, { "epoch": 6.1250382146132685, "grad_norm": 0.4807780086994171, "learning_rate": 2.4830368137234087e-05, "loss": 0.0766, "step": 20035 }, { "epoch": 6.125343931519413, "grad_norm": 0.4539906978607178, "learning_rate": 2.482994352681415e-05, "loss": 0.0754, "step": 20036 }, { "epoch": 6.125649648425558, "grad_norm": 0.18951162695884705, "learning_rate": 2.4829518916394208e-05, "loss": 0.0711, "step": 20037 }, { "epoch": 6.125955365331703, "grad_norm": 0.37651053071022034, "learning_rate": 2.482909430597427e-05, "loss": 0.0562, "step": 20038 }, { "epoch": 6.126261082237848, "grad_norm": 0.34798187017440796, "learning_rate": 2.482866969555433e-05, "loss": 0.1166, "step": 20039 }, { "epoch": 6.126566799143992, "grad_norm": 0.31354188919067383, "learning_rate": 2.482824508513439e-05, "loss": 0.119, "step": 20040 }, { "epoch": 6.126872516050137, "grad_norm": 0.4313022494316101, "learning_rate": 2.482782047471445e-05, "loss": 0.1272, "step": 20041 }, { "epoch": 6.1271782329562825, "grad_norm": 0.4057951867580414, "learning_rate": 2.482739586429451e-05, "loss": 0.1371, "step": 20042 }, { "epoch": 6.127483949862428, "grad_norm": 0.4876025915145874, "learning_rate": 2.482697125387457e-05, "loss": 0.1541, "step": 20043 }, { "epoch": 6.127789666768573, "grad_norm": 0.6185230612754822, "learning_rate": 2.482654664345463e-05, "loss": 0.1729, "step": 20044 }, { "epoch": 6.128095383674717, "grad_norm": 0.6111595034599304, "learning_rate": 2.482612203303469e-05, "loss": 0.175, "step": 20045 }, { "epoch": 6.128401100580862, "grad_norm": 0.5053378343582153, "learning_rate": 2.482569742261475e-05, "loss": 0.1627, "step": 20046 }, { "epoch": 6.128706817487007, "grad_norm": 1.4949229955673218, "learning_rate": 2.4825272812194812e-05, "loss": 0.1556, "step": 20047 }, { "epoch": 6.129012534393152, "grad_norm": 1.9458024501800537, "learning_rate": 2.482484820177487e-05, "loss": 0.1423, "step": 20048 }, { "epoch": 6.1293182512992965, "grad_norm": 1.2721468210220337, "learning_rate": 2.4824423591354933e-05, "loss": 0.2013, "step": 20049 }, { "epoch": 6.129623968205442, "grad_norm": 1.4745368957519531, "learning_rate": 2.482399898093499e-05, "loss": 0.1799, "step": 20050 }, { "epoch": 6.129929685111587, "grad_norm": 1.9840422868728638, "learning_rate": 2.4823574370515053e-05, "loss": 0.186, "step": 20051 }, { "epoch": 6.130235402017732, "grad_norm": 0.2885703146457672, "learning_rate": 2.4823149760095112e-05, "loss": 0.1243, "step": 20052 }, { "epoch": 6.130541118923876, "grad_norm": 0.397899329662323, "learning_rate": 2.4822725149675174e-05, "loss": 0.0566, "step": 20053 }, { "epoch": 6.130846835830021, "grad_norm": 0.19123125076293945, "learning_rate": 2.4822300539255233e-05, "loss": 0.0675, "step": 20054 }, { "epoch": 6.131152552736166, "grad_norm": 0.28262609243392944, "learning_rate": 2.4821875928835295e-05, "loss": 0.0595, "step": 20055 }, { "epoch": 6.131458269642311, "grad_norm": 0.32557299733161926, "learning_rate": 2.4821451318415354e-05, "loss": 0.0602, "step": 20056 }, { "epoch": 6.131763986548457, "grad_norm": 0.19462990760803223, "learning_rate": 2.4821026707995412e-05, "loss": 0.0321, "step": 20057 }, { "epoch": 6.132069703454601, "grad_norm": 0.2688189446926117, "learning_rate": 2.4820602097575474e-05, "loss": 0.0522, "step": 20058 }, { "epoch": 6.132375420360746, "grad_norm": 0.23523059487342834, "learning_rate": 2.4820177487155533e-05, "loss": 0.0511, "step": 20059 }, { "epoch": 6.132681137266891, "grad_norm": 0.229433074593544, "learning_rate": 2.4819752876735595e-05, "loss": 0.0479, "step": 20060 }, { "epoch": 6.132986854173036, "grad_norm": 0.24379856884479523, "learning_rate": 2.4819328266315654e-05, "loss": 0.0573, "step": 20061 }, { "epoch": 6.13329257107918, "grad_norm": 1.0900676250457764, "learning_rate": 2.4818903655895716e-05, "loss": 0.0833, "step": 20062 }, { "epoch": 6.1335982879853255, "grad_norm": 1.153342843055725, "learning_rate": 2.4818479045475775e-05, "loss": 0.0984, "step": 20063 }, { "epoch": 6.133904004891471, "grad_norm": 0.4444144666194916, "learning_rate": 2.4818054435055837e-05, "loss": 0.0689, "step": 20064 }, { "epoch": 6.134209721797616, "grad_norm": 2.719917058944702, "learning_rate": 2.4817629824635896e-05, "loss": 0.1342, "step": 20065 }, { "epoch": 6.13451543870376, "grad_norm": 0.45973753929138184, "learning_rate": 2.4817205214215958e-05, "loss": 0.0953, "step": 20066 }, { "epoch": 6.134821155609905, "grad_norm": 1.5075887441635132, "learning_rate": 2.4816780603796016e-05, "loss": 0.1266, "step": 20067 }, { "epoch": 6.13512687251605, "grad_norm": 2.6875383853912354, "learning_rate": 2.4816355993376075e-05, "loss": 0.1279, "step": 20068 }, { "epoch": 6.135432589422195, "grad_norm": 1.0308547019958496, "learning_rate": 2.4815931382956137e-05, "loss": 0.1435, "step": 20069 }, { "epoch": 6.13573830632834, "grad_norm": 1.358459711074829, "learning_rate": 2.4815506772536196e-05, "loss": 0.1726, "step": 20070 }, { "epoch": 6.136044023234485, "grad_norm": 12.8378324508667, "learning_rate": 2.4815082162116258e-05, "loss": 0.2297, "step": 20071 }, { "epoch": 6.13634974014063, "grad_norm": 2.2743887901306152, "learning_rate": 2.4814657551696317e-05, "loss": 0.1869, "step": 20072 }, { "epoch": 6.136655457046775, "grad_norm": 1.2718961238861084, "learning_rate": 2.481423294127638e-05, "loss": 0.1552, "step": 20073 }, { "epoch": 6.13696117395292, "grad_norm": 0.9508559107780457, "learning_rate": 2.4813808330856437e-05, "loss": 0.1746, "step": 20074 }, { "epoch": 6.137266890859064, "grad_norm": 1.2071402072906494, "learning_rate": 2.48133837204365e-05, "loss": 0.1606, "step": 20075 }, { "epoch": 6.137572607765209, "grad_norm": 1.7062819004058838, "learning_rate": 2.4812959110016558e-05, "loss": 0.2458, "step": 20076 }, { "epoch": 6.137878324671354, "grad_norm": 0.38832446932792664, "learning_rate": 2.481253449959662e-05, "loss": 0.1294, "step": 20077 }, { "epoch": 6.1381840415774995, "grad_norm": 0.3766445815563202, "learning_rate": 2.481210988917668e-05, "loss": 0.0675, "step": 20078 }, { "epoch": 6.138489758483644, "grad_norm": 0.4837579131126404, "learning_rate": 2.481168527875674e-05, "loss": 0.0693, "step": 20079 }, { "epoch": 6.138795475389789, "grad_norm": 1.7309014797210693, "learning_rate": 2.4811260668336803e-05, "loss": 0.0514, "step": 20080 }, { "epoch": 6.139101192295934, "grad_norm": 1.0511382818222046, "learning_rate": 2.4810836057916862e-05, "loss": 0.064, "step": 20081 }, { "epoch": 6.139406909202079, "grad_norm": 0.22710208594799042, "learning_rate": 2.4810411447496924e-05, "loss": 0.0671, "step": 20082 }, { "epoch": 6.139712626108224, "grad_norm": 0.2142905443906784, "learning_rate": 2.4809986837076983e-05, "loss": 0.0511, "step": 20083 }, { "epoch": 6.140018343014368, "grad_norm": 0.22528788447380066, "learning_rate": 2.4809562226657045e-05, "loss": 0.0449, "step": 20084 }, { "epoch": 6.1403240599205136, "grad_norm": 0.30468952655792236, "learning_rate": 2.4809137616237103e-05, "loss": 0.0849, "step": 20085 }, { "epoch": 6.140629776826659, "grad_norm": 0.32673898339271545, "learning_rate": 2.4808713005817166e-05, "loss": 0.0616, "step": 20086 }, { "epoch": 6.140935493732804, "grad_norm": 0.39549100399017334, "learning_rate": 2.4808288395397224e-05, "loss": 0.0897, "step": 20087 }, { "epoch": 6.141241210638948, "grad_norm": 0.3479348421096802, "learning_rate": 2.4807863784977286e-05, "loss": 0.0834, "step": 20088 }, { "epoch": 6.141546927545093, "grad_norm": 0.5725882649421692, "learning_rate": 2.4807439174557345e-05, "loss": 0.0891, "step": 20089 }, { "epoch": 6.141852644451238, "grad_norm": 0.7765361666679382, "learning_rate": 2.4807014564137407e-05, "loss": 0.0983, "step": 20090 }, { "epoch": 6.142158361357383, "grad_norm": 1.6028186082839966, "learning_rate": 2.4806589953717466e-05, "loss": 0.1118, "step": 20091 }, { "epoch": 6.142464078263528, "grad_norm": 0.7016972303390503, "learning_rate": 2.4806165343297528e-05, "loss": 0.158, "step": 20092 }, { "epoch": 6.142769795169673, "grad_norm": 0.3718530237674713, "learning_rate": 2.4805740732877587e-05, "loss": 0.1476, "step": 20093 }, { "epoch": 6.143075512075818, "grad_norm": 1.1421587467193604, "learning_rate": 2.4805316122457645e-05, "loss": 0.1636, "step": 20094 }, { "epoch": 6.143381228981963, "grad_norm": 0.48134684562683105, "learning_rate": 2.4804891512037707e-05, "loss": 0.1494, "step": 20095 }, { "epoch": 6.143686945888108, "grad_norm": 0.6081392765045166, "learning_rate": 2.4804466901617766e-05, "loss": 0.1764, "step": 20096 }, { "epoch": 6.143992662794252, "grad_norm": 1.2247326374053955, "learning_rate": 2.4804042291197828e-05, "loss": 0.1909, "step": 20097 }, { "epoch": 6.144298379700397, "grad_norm": 0.6731672286987305, "learning_rate": 2.4803617680777887e-05, "loss": 0.1701, "step": 20098 }, { "epoch": 6.1446040966065425, "grad_norm": 0.9438702464103699, "learning_rate": 2.480319307035795e-05, "loss": 0.153, "step": 20099 }, { "epoch": 6.144909813512688, "grad_norm": 0.7976147532463074, "learning_rate": 2.4802768459938008e-05, "loss": 0.1753, "step": 20100 }, { "epoch": 6.145215530418832, "grad_norm": 2.526564359664917, "learning_rate": 2.480234384951807e-05, "loss": 0.2032, "step": 20101 }, { "epoch": 6.145521247324977, "grad_norm": 0.5862466096878052, "learning_rate": 2.480191923909813e-05, "loss": 0.1493, "step": 20102 }, { "epoch": 6.145826964231122, "grad_norm": 0.2493760585784912, "learning_rate": 2.480149462867819e-05, "loss": 0.0728, "step": 20103 }, { "epoch": 6.146132681137267, "grad_norm": 0.40468549728393555, "learning_rate": 2.480107001825825e-05, "loss": 0.0751, "step": 20104 }, { "epoch": 6.146438398043411, "grad_norm": 0.14847777783870697, "learning_rate": 2.480064540783831e-05, "loss": 0.0459, "step": 20105 }, { "epoch": 6.1467441149495565, "grad_norm": 0.3227570950984955, "learning_rate": 2.480022079741837e-05, "loss": 0.0553, "step": 20106 }, { "epoch": 6.147049831855702, "grad_norm": 0.3809252679347992, "learning_rate": 2.479979618699843e-05, "loss": 0.0454, "step": 20107 }, { "epoch": 6.147355548761847, "grad_norm": 0.26486828923225403, "learning_rate": 2.479937157657849e-05, "loss": 0.0484, "step": 20108 }, { "epoch": 6.147661265667992, "grad_norm": 0.26664048433303833, "learning_rate": 2.479894696615855e-05, "loss": 0.0581, "step": 20109 }, { "epoch": 6.147966982574136, "grad_norm": 0.3954389989376068, "learning_rate": 2.479852235573861e-05, "loss": 0.0934, "step": 20110 }, { "epoch": 6.148272699480281, "grad_norm": 0.3424180746078491, "learning_rate": 2.479809774531867e-05, "loss": 0.0599, "step": 20111 }, { "epoch": 6.148578416386426, "grad_norm": 0.5172868371009827, "learning_rate": 2.4797673134898732e-05, "loss": 0.0813, "step": 20112 }, { "epoch": 6.148884133292571, "grad_norm": 0.34510165452957153, "learning_rate": 2.479724852447879e-05, "loss": 0.0854, "step": 20113 }, { "epoch": 6.149189850198716, "grad_norm": 0.6055574417114258, "learning_rate": 2.4796823914058853e-05, "loss": 0.0774, "step": 20114 }, { "epoch": 6.149495567104861, "grad_norm": 0.7596948146820068, "learning_rate": 2.4796399303638912e-05, "loss": 0.0813, "step": 20115 }, { "epoch": 6.149801284011006, "grad_norm": 0.6818168759346008, "learning_rate": 2.4795974693218974e-05, "loss": 0.1337, "step": 20116 }, { "epoch": 6.150107000917151, "grad_norm": 0.73282390832901, "learning_rate": 2.4795550082799033e-05, "loss": 0.1154, "step": 20117 }, { "epoch": 6.150412717823295, "grad_norm": 1.3488432168960571, "learning_rate": 2.4795125472379095e-05, "loss": 0.1252, "step": 20118 }, { "epoch": 6.15071843472944, "grad_norm": 0.3932856023311615, "learning_rate": 2.4794700861959153e-05, "loss": 0.145, "step": 20119 }, { "epoch": 6.1510241516355855, "grad_norm": 0.7178389430046082, "learning_rate": 2.4794276251539212e-05, "loss": 0.1951, "step": 20120 }, { "epoch": 6.151329868541731, "grad_norm": 1.585100769996643, "learning_rate": 2.4793851641119274e-05, "loss": 0.1671, "step": 20121 }, { "epoch": 6.151635585447876, "grad_norm": 0.7396789193153381, "learning_rate": 2.4793427030699333e-05, "loss": 0.1708, "step": 20122 }, { "epoch": 6.15194130235402, "grad_norm": 1.4580085277557373, "learning_rate": 2.4793002420279395e-05, "loss": 0.1626, "step": 20123 }, { "epoch": 6.152247019260165, "grad_norm": 2.7738986015319824, "learning_rate": 2.4792577809859454e-05, "loss": 0.1559, "step": 20124 }, { "epoch": 6.15255273616631, "grad_norm": 1.3182408809661865, "learning_rate": 2.4792153199439516e-05, "loss": 0.1743, "step": 20125 }, { "epoch": 6.152858453072455, "grad_norm": 2.207758903503418, "learning_rate": 2.4791728589019575e-05, "loss": 0.2052, "step": 20126 }, { "epoch": 6.1531641699785995, "grad_norm": 0.4596731662750244, "learning_rate": 2.4791303978599637e-05, "loss": 0.1171, "step": 20127 }, { "epoch": 6.153469886884745, "grad_norm": 0.1942089945077896, "learning_rate": 2.4790879368179695e-05, "loss": 0.0672, "step": 20128 }, { "epoch": 6.15377560379089, "grad_norm": 0.18716222047805786, "learning_rate": 2.4790454757759757e-05, "loss": 0.0591, "step": 20129 }, { "epoch": 6.154081320697035, "grad_norm": 0.21345117688179016, "learning_rate": 2.4790030147339816e-05, "loss": 0.0693, "step": 20130 }, { "epoch": 6.154387037603179, "grad_norm": 0.5924190878868103, "learning_rate": 2.4789605536919878e-05, "loss": 0.0547, "step": 20131 }, { "epoch": 6.154692754509324, "grad_norm": 0.2695184051990509, "learning_rate": 2.4789180926499937e-05, "loss": 0.0382, "step": 20132 }, { "epoch": 6.154998471415469, "grad_norm": 0.35957202315330505, "learning_rate": 2.4788756316079996e-05, "loss": 0.0748, "step": 20133 }, { "epoch": 6.155304188321614, "grad_norm": 0.8777598142623901, "learning_rate": 2.4788331705660058e-05, "loss": 0.0367, "step": 20134 }, { "epoch": 6.1556099052277595, "grad_norm": 0.3967845141887665, "learning_rate": 2.4787907095240116e-05, "loss": 0.0629, "step": 20135 }, { "epoch": 6.155915622133904, "grad_norm": 0.29549726843833923, "learning_rate": 2.478748248482018e-05, "loss": 0.0629, "step": 20136 }, { "epoch": 6.156221339040049, "grad_norm": 0.3033007085323334, "learning_rate": 2.4787057874400237e-05, "loss": 0.073, "step": 20137 }, { "epoch": 6.156527055946194, "grad_norm": 0.9561851024627686, "learning_rate": 2.47866332639803e-05, "loss": 0.0585, "step": 20138 }, { "epoch": 6.156832772852339, "grad_norm": 1.0223305225372314, "learning_rate": 2.4786208653560358e-05, "loss": 0.1067, "step": 20139 }, { "epoch": 6.157138489758483, "grad_norm": 0.4633241891860962, "learning_rate": 2.478578404314042e-05, "loss": 0.1032, "step": 20140 }, { "epoch": 6.157444206664628, "grad_norm": 2.452773332595825, "learning_rate": 2.478535943272048e-05, "loss": 0.1105, "step": 20141 }, { "epoch": 6.1577499235707736, "grad_norm": 0.9274468421936035, "learning_rate": 2.478493482230054e-05, "loss": 0.1194, "step": 20142 }, { "epoch": 6.158055640476919, "grad_norm": 0.8864976167678833, "learning_rate": 2.47845102118806e-05, "loss": 0.139, "step": 20143 }, { "epoch": 6.158361357383063, "grad_norm": 0.5005914568901062, "learning_rate": 2.478408560146066e-05, "loss": 0.1459, "step": 20144 }, { "epoch": 6.158667074289208, "grad_norm": 1.0965567827224731, "learning_rate": 2.478366099104072e-05, "loss": 0.1278, "step": 20145 }, { "epoch": 6.158972791195353, "grad_norm": 1.031868815422058, "learning_rate": 2.478323638062078e-05, "loss": 0.2094, "step": 20146 }, { "epoch": 6.159278508101498, "grad_norm": 1.5783618688583374, "learning_rate": 2.478281177020084e-05, "loss": 0.1409, "step": 20147 }, { "epoch": 6.159584225007643, "grad_norm": 0.676652193069458, "learning_rate": 2.47823871597809e-05, "loss": 0.182, "step": 20148 }, { "epoch": 6.159889941913788, "grad_norm": 0.7211918234825134, "learning_rate": 2.4781962549360962e-05, "loss": 0.1545, "step": 20149 }, { "epoch": 6.160195658819933, "grad_norm": 1.4745007753372192, "learning_rate": 2.478153793894102e-05, "loss": 0.1784, "step": 20150 }, { "epoch": 6.160501375726078, "grad_norm": 1.1542949676513672, "learning_rate": 2.4781113328521083e-05, "loss": 0.2012, "step": 20151 }, { "epoch": 6.160807092632223, "grad_norm": 0.46301695704460144, "learning_rate": 2.478068871810114e-05, "loss": 0.1652, "step": 20152 }, { "epoch": 6.161112809538367, "grad_norm": 0.23654909431934357, "learning_rate": 2.4780264107681203e-05, "loss": 0.0656, "step": 20153 }, { "epoch": 6.161418526444512, "grad_norm": 0.17184892296791077, "learning_rate": 2.4779839497261262e-05, "loss": 0.0532, "step": 20154 }, { "epoch": 6.161724243350657, "grad_norm": 0.6853874325752258, "learning_rate": 2.4779414886841324e-05, "loss": 0.052, "step": 20155 }, { "epoch": 6.1620299602568025, "grad_norm": 0.15484920144081116, "learning_rate": 2.4778990276421383e-05, "loss": 0.0429, "step": 20156 }, { "epoch": 6.162335677162947, "grad_norm": 0.24733856320381165, "learning_rate": 2.4778565666001445e-05, "loss": 0.0671, "step": 20157 }, { "epoch": 6.162641394069092, "grad_norm": 0.3381647765636444, "learning_rate": 2.4778141055581504e-05, "loss": 0.0469, "step": 20158 }, { "epoch": 6.162947110975237, "grad_norm": 0.22932559251785278, "learning_rate": 2.4777716445161562e-05, "loss": 0.0606, "step": 20159 }, { "epoch": 6.163252827881382, "grad_norm": 1.1800395250320435, "learning_rate": 2.4777291834741625e-05, "loss": 0.0814, "step": 20160 }, { "epoch": 6.163558544787527, "grad_norm": 0.4484797418117523, "learning_rate": 2.4776867224321683e-05, "loss": 0.0565, "step": 20161 }, { "epoch": 6.163864261693671, "grad_norm": 0.42320960760116577, "learning_rate": 2.4776442613901745e-05, "loss": 0.0908, "step": 20162 }, { "epoch": 6.1641699785998165, "grad_norm": 0.5567547678947449, "learning_rate": 2.4776018003481804e-05, "loss": 0.0769, "step": 20163 }, { "epoch": 6.164475695505962, "grad_norm": 0.9069399833679199, "learning_rate": 2.4775593393061866e-05, "loss": 0.105, "step": 20164 }, { "epoch": 6.164781412412107, "grad_norm": 0.7071782350540161, "learning_rate": 2.4775168782641925e-05, "loss": 0.123, "step": 20165 }, { "epoch": 6.165087129318251, "grad_norm": 0.30127501487731934, "learning_rate": 2.4774744172221987e-05, "loss": 0.1018, "step": 20166 }, { "epoch": 6.165392846224396, "grad_norm": 0.3987311124801636, "learning_rate": 2.4774319561802046e-05, "loss": 0.1183, "step": 20167 }, { "epoch": 6.165698563130541, "grad_norm": 0.41334062814712524, "learning_rate": 2.4773894951382108e-05, "loss": 0.1465, "step": 20168 }, { "epoch": 6.166004280036686, "grad_norm": 0.4757171869277954, "learning_rate": 2.4773470340962166e-05, "loss": 0.1369, "step": 20169 }, { "epoch": 6.1663099969428306, "grad_norm": 0.7952318787574768, "learning_rate": 2.477304573054223e-05, "loss": 0.1387, "step": 20170 }, { "epoch": 6.166615713848976, "grad_norm": 3.015622615814209, "learning_rate": 2.4772621120122287e-05, "loss": 0.1802, "step": 20171 }, { "epoch": 6.166921430755121, "grad_norm": 0.6810576319694519, "learning_rate": 2.4772196509702346e-05, "loss": 0.1517, "step": 20172 }, { "epoch": 6.167227147661266, "grad_norm": 1.1769286394119263, "learning_rate": 2.4771771899282408e-05, "loss": 0.1633, "step": 20173 }, { "epoch": 6.167532864567411, "grad_norm": 0.8638617992401123, "learning_rate": 2.4771347288862467e-05, "loss": 0.1826, "step": 20174 }, { "epoch": 6.167838581473555, "grad_norm": 1.1985764503479004, "learning_rate": 2.477092267844253e-05, "loss": 0.167, "step": 20175 }, { "epoch": 6.1681442983797, "grad_norm": 1.9234542846679688, "learning_rate": 2.4770498068022587e-05, "loss": 0.1918, "step": 20176 }, { "epoch": 6.1684500152858455, "grad_norm": 0.3582899272441864, "learning_rate": 2.477007345760265e-05, "loss": 0.1329, "step": 20177 }, { "epoch": 6.168755732191991, "grad_norm": 0.3255607783794403, "learning_rate": 2.4769648847182708e-05, "loss": 0.08, "step": 20178 }, { "epoch": 6.169061449098135, "grad_norm": 0.22057455778121948, "learning_rate": 2.476922423676277e-05, "loss": 0.0793, "step": 20179 }, { "epoch": 6.16936716600428, "grad_norm": 0.17923881113529205, "learning_rate": 2.476879962634283e-05, "loss": 0.0447, "step": 20180 }, { "epoch": 6.169672882910425, "grad_norm": 0.33315160870552063, "learning_rate": 2.476837501592289e-05, "loss": 0.0614, "step": 20181 }, { "epoch": 6.16997859981657, "grad_norm": 0.5757232904434204, "learning_rate": 2.4767950405502953e-05, "loss": 0.0585, "step": 20182 }, { "epoch": 6.170284316722714, "grad_norm": 0.18154026567935944, "learning_rate": 2.4767525795083015e-05, "loss": 0.0329, "step": 20183 }, { "epoch": 6.1705900336288595, "grad_norm": 0.269721657037735, "learning_rate": 2.4767101184663074e-05, "loss": 0.0599, "step": 20184 }, { "epoch": 6.170895750535005, "grad_norm": 0.32140469551086426, "learning_rate": 2.4766676574243133e-05, "loss": 0.0828, "step": 20185 }, { "epoch": 6.17120146744115, "grad_norm": 0.2949029803276062, "learning_rate": 2.4766251963823195e-05, "loss": 0.0546, "step": 20186 }, { "epoch": 6.171507184347295, "grad_norm": 0.6139920353889465, "learning_rate": 2.4765827353403253e-05, "loss": 0.0957, "step": 20187 }, { "epoch": 6.171812901253439, "grad_norm": 0.37113186717033386, "learning_rate": 2.4765402742983316e-05, "loss": 0.0671, "step": 20188 }, { "epoch": 6.172118618159584, "grad_norm": 0.3969821631908417, "learning_rate": 2.4764978132563374e-05, "loss": 0.0937, "step": 20189 }, { "epoch": 6.172424335065729, "grad_norm": 0.49632489681243896, "learning_rate": 2.4764553522143436e-05, "loss": 0.1123, "step": 20190 }, { "epoch": 6.172730051971874, "grad_norm": 0.5312902331352234, "learning_rate": 2.4764128911723495e-05, "loss": 0.1145, "step": 20191 }, { "epoch": 6.173035768878019, "grad_norm": 0.3676295280456543, "learning_rate": 2.4763704301303557e-05, "loss": 0.14, "step": 20192 }, { "epoch": 6.173341485784164, "grad_norm": 0.5329541563987732, "learning_rate": 2.4763279690883616e-05, "loss": 0.1478, "step": 20193 }, { "epoch": 6.173647202690309, "grad_norm": 1.453994870185852, "learning_rate": 2.4762855080463678e-05, "loss": 0.1573, "step": 20194 }, { "epoch": 6.173952919596454, "grad_norm": 0.7711138129234314, "learning_rate": 2.4762430470043737e-05, "loss": 0.1236, "step": 20195 }, { "epoch": 6.174258636502598, "grad_norm": 2.3496155738830566, "learning_rate": 2.4762005859623795e-05, "loss": 0.1784, "step": 20196 }, { "epoch": 6.174564353408743, "grad_norm": 0.5918294191360474, "learning_rate": 2.4761581249203857e-05, "loss": 0.1751, "step": 20197 }, { "epoch": 6.174870070314888, "grad_norm": 0.8149284720420837, "learning_rate": 2.4761156638783916e-05, "loss": 0.1436, "step": 20198 }, { "epoch": 6.1751757872210336, "grad_norm": 0.9209536910057068, "learning_rate": 2.4760732028363978e-05, "loss": 0.1786, "step": 20199 }, { "epoch": 6.175481504127179, "grad_norm": 0.7016799449920654, "learning_rate": 2.4760307417944037e-05, "loss": 0.1868, "step": 20200 }, { "epoch": 6.175787221033323, "grad_norm": 1.2125885486602783, "learning_rate": 2.47598828075241e-05, "loss": 0.2085, "step": 20201 }, { "epoch": 6.176092937939468, "grad_norm": 0.4991765022277832, "learning_rate": 2.4759458197104158e-05, "loss": 0.1271, "step": 20202 }, { "epoch": 6.176398654845613, "grad_norm": 0.3573283851146698, "learning_rate": 2.475903358668422e-05, "loss": 0.0732, "step": 20203 }, { "epoch": 6.176704371751758, "grad_norm": 0.4796690046787262, "learning_rate": 2.475860897626428e-05, "loss": 0.0743, "step": 20204 }, { "epoch": 6.1770100886579025, "grad_norm": 0.25219032168388367, "learning_rate": 2.475818436584434e-05, "loss": 0.0512, "step": 20205 }, { "epoch": 6.177315805564048, "grad_norm": 0.2673010528087616, "learning_rate": 2.47577597554244e-05, "loss": 0.0645, "step": 20206 }, { "epoch": 6.177621522470193, "grad_norm": 0.2365383803844452, "learning_rate": 2.475733514500446e-05, "loss": 0.0564, "step": 20207 }, { "epoch": 6.177927239376338, "grad_norm": 0.2932113707065582, "learning_rate": 2.475691053458452e-05, "loss": 0.046, "step": 20208 }, { "epoch": 6.178232956282482, "grad_norm": 0.18076029419898987, "learning_rate": 2.475648592416458e-05, "loss": 0.0578, "step": 20209 }, { "epoch": 6.178538673188627, "grad_norm": 19.174489974975586, "learning_rate": 2.475606131374464e-05, "loss": 0.0776, "step": 20210 }, { "epoch": 6.178844390094772, "grad_norm": 0.3176775872707367, "learning_rate": 2.47556367033247e-05, "loss": 0.0659, "step": 20211 }, { "epoch": 6.179150107000917, "grad_norm": 0.43118345737457275, "learning_rate": 2.475521209290476e-05, "loss": 0.0954, "step": 20212 }, { "epoch": 6.1794558239070625, "grad_norm": 0.2538415789604187, "learning_rate": 2.475478748248482e-05, "loss": 0.0854, "step": 20213 }, { "epoch": 6.179761540813207, "grad_norm": 0.6249674558639526, "learning_rate": 2.4754362872064882e-05, "loss": 0.0854, "step": 20214 }, { "epoch": 6.180067257719352, "grad_norm": 0.781885027885437, "learning_rate": 2.475393826164494e-05, "loss": 0.1154, "step": 20215 }, { "epoch": 6.180372974625497, "grad_norm": 0.7962324619293213, "learning_rate": 2.4753513651225003e-05, "loss": 0.1272, "step": 20216 }, { "epoch": 6.180678691531642, "grad_norm": 0.7083433270454407, "learning_rate": 2.4753089040805062e-05, "loss": 0.1153, "step": 20217 }, { "epoch": 6.180984408437786, "grad_norm": 2.552975654602051, "learning_rate": 2.4752664430385124e-05, "loss": 0.1321, "step": 20218 }, { "epoch": 6.181290125343931, "grad_norm": 0.5999229550361633, "learning_rate": 2.4752239819965183e-05, "loss": 0.1271, "step": 20219 }, { "epoch": 6.1815958422500765, "grad_norm": 0.5960456132888794, "learning_rate": 2.4751815209545245e-05, "loss": 0.1359, "step": 20220 }, { "epoch": 6.181901559156222, "grad_norm": 0.35781851410865784, "learning_rate": 2.4751390599125303e-05, "loss": 0.1318, "step": 20221 }, { "epoch": 6.182207276062366, "grad_norm": 1.110703468322754, "learning_rate": 2.4750965988705362e-05, "loss": 0.1814, "step": 20222 }, { "epoch": 6.182512992968511, "grad_norm": 1.2965927124023438, "learning_rate": 2.4750541378285424e-05, "loss": 0.2047, "step": 20223 }, { "epoch": 6.182818709874656, "grad_norm": 1.7790979146957397, "learning_rate": 2.4750116767865483e-05, "loss": 0.1886, "step": 20224 }, { "epoch": 6.183124426780801, "grad_norm": 6.327579021453857, "learning_rate": 2.4749692157445545e-05, "loss": 0.2047, "step": 20225 }, { "epoch": 6.183430143686946, "grad_norm": 1.880335807800293, "learning_rate": 2.4749267547025604e-05, "loss": 0.234, "step": 20226 }, { "epoch": 6.1837358605930905, "grad_norm": 0.305472195148468, "learning_rate": 2.4748842936605666e-05, "loss": 0.1301, "step": 20227 }, { "epoch": 6.184041577499236, "grad_norm": 0.24218297004699707, "learning_rate": 2.4748418326185725e-05, "loss": 0.0697, "step": 20228 }, { "epoch": 6.184347294405381, "grad_norm": 0.24553491175174713, "learning_rate": 2.4747993715765787e-05, "loss": 0.0595, "step": 20229 }, { "epoch": 6.184653011311526, "grad_norm": 0.3771705627441406, "learning_rate": 2.4747569105345845e-05, "loss": 0.0462, "step": 20230 }, { "epoch": 6.18495872821767, "grad_norm": 0.16506467759609222, "learning_rate": 2.4747144494925907e-05, "loss": 0.0499, "step": 20231 }, { "epoch": 6.185264445123815, "grad_norm": 0.2945798933506012, "learning_rate": 2.4746719884505966e-05, "loss": 0.0476, "step": 20232 }, { "epoch": 6.18557016202996, "grad_norm": 0.4549633860588074, "learning_rate": 2.4746295274086028e-05, "loss": 0.069, "step": 20233 }, { "epoch": 6.1858758789361055, "grad_norm": 0.2400166094303131, "learning_rate": 2.4745870663666087e-05, "loss": 0.0531, "step": 20234 }, { "epoch": 6.18618159584225, "grad_norm": 0.2049706131219864, "learning_rate": 2.4745446053246146e-05, "loss": 0.0819, "step": 20235 }, { "epoch": 6.186487312748395, "grad_norm": 0.21390201151371002, "learning_rate": 2.4745021442826208e-05, "loss": 0.0485, "step": 20236 }, { "epoch": 6.18679302965454, "grad_norm": 0.3643645942211151, "learning_rate": 2.4744596832406266e-05, "loss": 0.0636, "step": 20237 }, { "epoch": 6.187098746560685, "grad_norm": 0.4589006006717682, "learning_rate": 2.474417222198633e-05, "loss": 0.1111, "step": 20238 }, { "epoch": 6.18740446346683, "grad_norm": 0.2854933440685272, "learning_rate": 2.4743747611566387e-05, "loss": 0.0792, "step": 20239 }, { "epoch": 6.187710180372974, "grad_norm": 0.9229062795639038, "learning_rate": 2.474332300114645e-05, "loss": 0.128, "step": 20240 }, { "epoch": 6.1880158972791195, "grad_norm": 0.7534128427505493, "learning_rate": 2.4742898390726508e-05, "loss": 0.1303, "step": 20241 }, { "epoch": 6.188321614185265, "grad_norm": 0.4578711986541748, "learning_rate": 2.474247378030657e-05, "loss": 0.125, "step": 20242 }, { "epoch": 6.18862733109141, "grad_norm": 1.9258586168289185, "learning_rate": 2.474204916988663e-05, "loss": 0.116, "step": 20243 }, { "epoch": 6.188933047997554, "grad_norm": 0.48379313945770264, "learning_rate": 2.474162455946669e-05, "loss": 0.1494, "step": 20244 }, { "epoch": 6.189238764903699, "grad_norm": 1.5178927183151245, "learning_rate": 2.474119994904675e-05, "loss": 0.1764, "step": 20245 }, { "epoch": 6.189544481809844, "grad_norm": 0.4633564352989197, "learning_rate": 2.474077533862681e-05, "loss": 0.1455, "step": 20246 }, { "epoch": 6.189850198715989, "grad_norm": 0.4652079939842224, "learning_rate": 2.474035072820687e-05, "loss": 0.1966, "step": 20247 }, { "epoch": 6.1901559156221335, "grad_norm": 1.40357506275177, "learning_rate": 2.473992611778693e-05, "loss": 0.182, "step": 20248 }, { "epoch": 6.190461632528279, "grad_norm": 0.9409819841384888, "learning_rate": 2.473950150736699e-05, "loss": 0.1665, "step": 20249 }, { "epoch": 6.190767349434424, "grad_norm": 0.749186098575592, "learning_rate": 2.473907689694705e-05, "loss": 0.2059, "step": 20250 }, { "epoch": 6.191073066340569, "grad_norm": 1.1089478731155396, "learning_rate": 2.4738652286527112e-05, "loss": 0.2004, "step": 20251 }, { "epoch": 6.191378783246714, "grad_norm": 0.3464609980583191, "learning_rate": 2.473822767610717e-05, "loss": 0.1439, "step": 20252 }, { "epoch": 6.191684500152858, "grad_norm": 0.5277817845344543, "learning_rate": 2.4737803065687233e-05, "loss": 0.0762, "step": 20253 }, { "epoch": 6.191990217059003, "grad_norm": 0.17343983054161072, "learning_rate": 2.473737845526729e-05, "loss": 0.0483, "step": 20254 }, { "epoch": 6.192295933965148, "grad_norm": 0.24818333983421326, "learning_rate": 2.4736953844847353e-05, "loss": 0.0871, "step": 20255 }, { "epoch": 6.1926016508712936, "grad_norm": 0.6048958897590637, "learning_rate": 2.4736529234427412e-05, "loss": 0.0502, "step": 20256 }, { "epoch": 6.192907367777438, "grad_norm": 0.179742693901062, "learning_rate": 2.4736104624007474e-05, "loss": 0.0512, "step": 20257 }, { "epoch": 6.193213084683583, "grad_norm": 6.514947414398193, "learning_rate": 2.4735680013587533e-05, "loss": 0.0636, "step": 20258 }, { "epoch": 6.193518801589728, "grad_norm": 0.898047149181366, "learning_rate": 2.4735255403167595e-05, "loss": 0.0772, "step": 20259 }, { "epoch": 6.193824518495873, "grad_norm": 0.4281521737575531, "learning_rate": 2.4734830792747654e-05, "loss": 0.0567, "step": 20260 }, { "epoch": 6.194130235402017, "grad_norm": 0.2294282466173172, "learning_rate": 2.4734406182327712e-05, "loss": 0.0573, "step": 20261 }, { "epoch": 6.1944359523081625, "grad_norm": 0.28275278210639954, "learning_rate": 2.4733981571907775e-05, "loss": 0.069, "step": 20262 }, { "epoch": 6.194741669214308, "grad_norm": 1.383967399597168, "learning_rate": 2.4733556961487833e-05, "loss": 0.0529, "step": 20263 }, { "epoch": 6.195047386120453, "grad_norm": 0.34884849190711975, "learning_rate": 2.4733132351067895e-05, "loss": 0.0752, "step": 20264 }, { "epoch": 6.195353103026598, "grad_norm": 0.6759428977966309, "learning_rate": 2.4732707740647954e-05, "loss": 0.1048, "step": 20265 }, { "epoch": 6.195658819932742, "grad_norm": 1.746781349182129, "learning_rate": 2.4732283130228016e-05, "loss": 0.0921, "step": 20266 }, { "epoch": 6.195964536838887, "grad_norm": 1.0125584602355957, "learning_rate": 2.4731858519808075e-05, "loss": 0.1583, "step": 20267 }, { "epoch": 6.196270253745032, "grad_norm": 0.6413965225219727, "learning_rate": 2.4731433909388137e-05, "loss": 0.1137, "step": 20268 }, { "epoch": 6.196575970651177, "grad_norm": 0.4849487245082855, "learning_rate": 2.4731009298968196e-05, "loss": 0.1596, "step": 20269 }, { "epoch": 6.196881687557322, "grad_norm": 0.5380329489707947, "learning_rate": 2.4730584688548258e-05, "loss": 0.1544, "step": 20270 }, { "epoch": 6.197187404463467, "grad_norm": 1.119619369506836, "learning_rate": 2.4730160078128316e-05, "loss": 0.1788, "step": 20271 }, { "epoch": 6.197493121369612, "grad_norm": 0.6747249364852905, "learning_rate": 2.472973546770838e-05, "loss": 0.164, "step": 20272 }, { "epoch": 6.197798838275757, "grad_norm": 1.9542324542999268, "learning_rate": 2.4729310857288437e-05, "loss": 0.2003, "step": 20273 }, { "epoch": 6.198104555181901, "grad_norm": 1.247698187828064, "learning_rate": 2.4728886246868496e-05, "loss": 0.1334, "step": 20274 }, { "epoch": 6.198410272088046, "grad_norm": 1.8968443870544434, "learning_rate": 2.4728461636448558e-05, "loss": 0.1841, "step": 20275 }, { "epoch": 6.198715988994191, "grad_norm": 1.875147819519043, "learning_rate": 2.4728037026028617e-05, "loss": 0.2135, "step": 20276 }, { "epoch": 6.1990217059003365, "grad_norm": 0.3943915069103241, "learning_rate": 2.472761241560868e-05, "loss": 0.1262, "step": 20277 }, { "epoch": 6.199327422806482, "grad_norm": 0.6918339729309082, "learning_rate": 2.4727187805188737e-05, "loss": 0.0741, "step": 20278 }, { "epoch": 6.199633139712626, "grad_norm": 0.5521054863929749, "learning_rate": 2.47267631947688e-05, "loss": 0.0658, "step": 20279 }, { "epoch": 6.199938856618771, "grad_norm": 0.2903653383255005, "learning_rate": 2.4726338584348858e-05, "loss": 0.0527, "step": 20280 }, { "epoch": 6.200244573524916, "grad_norm": 0.21777872741222382, "learning_rate": 2.472591397392892e-05, "loss": 0.0442, "step": 20281 }, { "epoch": 6.200550290431061, "grad_norm": 0.21777355670928955, "learning_rate": 2.472548936350898e-05, "loss": 0.0603, "step": 20282 }, { "epoch": 6.200856007337205, "grad_norm": 2.189793825149536, "learning_rate": 2.472506475308904e-05, "loss": 0.0386, "step": 20283 }, { "epoch": 6.2011617242433505, "grad_norm": 0.5332350134849548, "learning_rate": 2.4724640142669103e-05, "loss": 0.0649, "step": 20284 }, { "epoch": 6.201467441149496, "grad_norm": 0.48060232400894165, "learning_rate": 2.4724215532249165e-05, "loss": 0.0627, "step": 20285 }, { "epoch": 6.201773158055641, "grad_norm": 0.4529995322227478, "learning_rate": 2.4723790921829224e-05, "loss": 0.0555, "step": 20286 }, { "epoch": 6.202078874961785, "grad_norm": 0.3075523376464844, "learning_rate": 2.4723366311409283e-05, "loss": 0.0782, "step": 20287 }, { "epoch": 6.20238459186793, "grad_norm": 0.4020178020000458, "learning_rate": 2.4722941700989345e-05, "loss": 0.1044, "step": 20288 }, { "epoch": 6.202690308774075, "grad_norm": 0.559368371963501, "learning_rate": 2.4722517090569403e-05, "loss": 0.0839, "step": 20289 }, { "epoch": 6.20299602568022, "grad_norm": 0.5567466616630554, "learning_rate": 2.4722092480149466e-05, "loss": 0.1216, "step": 20290 }, { "epoch": 6.2033017425863655, "grad_norm": 0.7004779577255249, "learning_rate": 2.4721667869729524e-05, "loss": 0.1398, "step": 20291 }, { "epoch": 6.20360745949251, "grad_norm": 0.9349180459976196, "learning_rate": 2.4721243259309586e-05, "loss": 0.1531, "step": 20292 }, { "epoch": 6.203913176398655, "grad_norm": 0.8051888942718506, "learning_rate": 2.4720818648889645e-05, "loss": 0.1267, "step": 20293 }, { "epoch": 6.2042188933048, "grad_norm": 0.5031418204307556, "learning_rate": 2.4720394038469707e-05, "loss": 0.1522, "step": 20294 }, { "epoch": 6.204524610210945, "grad_norm": 0.6306962966918945, "learning_rate": 2.4719969428049766e-05, "loss": 0.1492, "step": 20295 }, { "epoch": 6.204830327117089, "grad_norm": 0.6830251812934875, "learning_rate": 2.4719544817629828e-05, "loss": 0.1718, "step": 20296 }, { "epoch": 6.205136044023234, "grad_norm": 0.8635686039924622, "learning_rate": 2.4719120207209887e-05, "loss": 0.1652, "step": 20297 }, { "epoch": 6.2054417609293795, "grad_norm": 1.1389591693878174, "learning_rate": 2.471869559678995e-05, "loss": 0.1705, "step": 20298 }, { "epoch": 6.205747477835525, "grad_norm": 1.3800934553146362, "learning_rate": 2.4718270986370007e-05, "loss": 0.1631, "step": 20299 }, { "epoch": 6.206053194741669, "grad_norm": 0.8757414817810059, "learning_rate": 2.4717846375950066e-05, "loss": 0.189, "step": 20300 }, { "epoch": 6.206358911647814, "grad_norm": 2.3627359867095947, "learning_rate": 2.4717421765530128e-05, "loss": 0.2421, "step": 20301 }, { "epoch": 6.206664628553959, "grad_norm": 0.4976535737514496, "learning_rate": 2.4716997155110187e-05, "loss": 0.1424, "step": 20302 }, { "epoch": 6.206970345460104, "grad_norm": 0.32796710729599, "learning_rate": 2.471657254469025e-05, "loss": 0.0865, "step": 20303 }, { "epoch": 6.207276062366249, "grad_norm": 0.47776225209236145, "learning_rate": 2.4716147934270308e-05, "loss": 0.0713, "step": 20304 }, { "epoch": 6.2075817792723935, "grad_norm": 0.7580615878105164, "learning_rate": 2.471572332385037e-05, "loss": 0.0533, "step": 20305 }, { "epoch": 6.207887496178539, "grad_norm": 1.521399736404419, "learning_rate": 2.471529871343043e-05, "loss": 0.0458, "step": 20306 }, { "epoch": 6.208193213084684, "grad_norm": 0.42115673422813416, "learning_rate": 2.471487410301049e-05, "loss": 0.0462, "step": 20307 }, { "epoch": 6.208498929990829, "grad_norm": 0.27821221947669983, "learning_rate": 2.471444949259055e-05, "loss": 0.0541, "step": 20308 }, { "epoch": 6.208804646896973, "grad_norm": 0.4424527883529663, "learning_rate": 2.471402488217061e-05, "loss": 0.0637, "step": 20309 }, { "epoch": 6.209110363803118, "grad_norm": 0.5357734560966492, "learning_rate": 2.471360027175067e-05, "loss": 0.0707, "step": 20310 }, { "epoch": 6.209416080709263, "grad_norm": 0.4614108204841614, "learning_rate": 2.471317566133073e-05, "loss": 0.0684, "step": 20311 }, { "epoch": 6.209721797615408, "grad_norm": 0.37310779094696045, "learning_rate": 2.471275105091079e-05, "loss": 0.0758, "step": 20312 }, { "epoch": 6.210027514521553, "grad_norm": 0.7252302765846252, "learning_rate": 2.471232644049085e-05, "loss": 0.0654, "step": 20313 }, { "epoch": 6.210333231427698, "grad_norm": 0.952415406703949, "learning_rate": 2.471190183007091e-05, "loss": 0.1094, "step": 20314 }, { "epoch": 6.210638948333843, "grad_norm": 0.48074832558631897, "learning_rate": 2.471147721965097e-05, "loss": 0.1029, "step": 20315 }, { "epoch": 6.210944665239988, "grad_norm": 0.3620920777320862, "learning_rate": 2.4711052609231032e-05, "loss": 0.108, "step": 20316 }, { "epoch": 6.211250382146133, "grad_norm": 0.8033514022827148, "learning_rate": 2.471062799881109e-05, "loss": 0.1207, "step": 20317 }, { "epoch": 6.211556099052277, "grad_norm": 2.7571237087249756, "learning_rate": 2.4710203388391153e-05, "loss": 0.165, "step": 20318 }, { "epoch": 6.2118618159584225, "grad_norm": 1.9437448978424072, "learning_rate": 2.4709778777971212e-05, "loss": 0.1431, "step": 20319 }, { "epoch": 6.212167532864568, "grad_norm": 0.47176235914230347, "learning_rate": 2.4709354167551274e-05, "loss": 0.1594, "step": 20320 }, { "epoch": 6.212473249770713, "grad_norm": 0.7748541235923767, "learning_rate": 2.4708929557131333e-05, "loss": 0.1557, "step": 20321 }, { "epoch": 6.212778966676857, "grad_norm": 2.09682297706604, "learning_rate": 2.4708504946711395e-05, "loss": 0.1667, "step": 20322 }, { "epoch": 6.213084683583002, "grad_norm": 1.5884188413619995, "learning_rate": 2.4708080336291454e-05, "loss": 0.1784, "step": 20323 }, { "epoch": 6.213390400489147, "grad_norm": 1.2436048984527588, "learning_rate": 2.4707655725871512e-05, "loss": 0.2034, "step": 20324 }, { "epoch": 6.213696117395292, "grad_norm": 1.0126806497573853, "learning_rate": 2.4707231115451574e-05, "loss": 0.1985, "step": 20325 }, { "epoch": 6.2140018343014365, "grad_norm": 0.9211768507957458, "learning_rate": 2.4706806505031633e-05, "loss": 0.2087, "step": 20326 }, { "epoch": 6.214307551207582, "grad_norm": 0.468448668718338, "learning_rate": 2.4706381894611695e-05, "loss": 0.1309, "step": 20327 }, { "epoch": 6.214613268113727, "grad_norm": 2.9734864234924316, "learning_rate": 2.4705957284191754e-05, "loss": 0.0769, "step": 20328 }, { "epoch": 6.214918985019872, "grad_norm": 0.4350382089614868, "learning_rate": 2.4705532673771816e-05, "loss": 0.0695, "step": 20329 }, { "epoch": 6.215224701926017, "grad_norm": 0.7325553894042969, "learning_rate": 2.4705108063351875e-05, "loss": 0.0613, "step": 20330 }, { "epoch": 6.215530418832161, "grad_norm": 0.17086546123027802, "learning_rate": 2.4704683452931937e-05, "loss": 0.0578, "step": 20331 }, { "epoch": 6.215836135738306, "grad_norm": 0.1995551884174347, "learning_rate": 2.4704258842511995e-05, "loss": 0.053, "step": 20332 }, { "epoch": 6.216141852644451, "grad_norm": 0.9328352808952332, "learning_rate": 2.4703834232092057e-05, "loss": 0.0717, "step": 20333 }, { "epoch": 6.2164475695505965, "grad_norm": 0.19965557754039764, "learning_rate": 2.4703409621672116e-05, "loss": 0.0394, "step": 20334 }, { "epoch": 6.216753286456741, "grad_norm": 1.0501383543014526, "learning_rate": 2.4702985011252178e-05, "loss": 0.0791, "step": 20335 }, { "epoch": 6.217059003362886, "grad_norm": 0.24805772304534912, "learning_rate": 2.4702560400832237e-05, "loss": 0.0859, "step": 20336 }, { "epoch": 6.217364720269031, "grad_norm": 0.5678126811981201, "learning_rate": 2.4702135790412296e-05, "loss": 0.1034, "step": 20337 }, { "epoch": 6.217670437175176, "grad_norm": 0.32150399684906006, "learning_rate": 2.4701711179992358e-05, "loss": 0.0599, "step": 20338 }, { "epoch": 6.21797615408132, "grad_norm": 0.48797255754470825, "learning_rate": 2.4701286569572416e-05, "loss": 0.0767, "step": 20339 }, { "epoch": 6.218281870987465, "grad_norm": 0.47549283504486084, "learning_rate": 2.470086195915248e-05, "loss": 0.1201, "step": 20340 }, { "epoch": 6.2185875878936105, "grad_norm": 0.7120321989059448, "learning_rate": 2.4700437348732537e-05, "loss": 0.1078, "step": 20341 }, { "epoch": 6.218893304799756, "grad_norm": 0.7800975441932678, "learning_rate": 2.47000127383126e-05, "loss": 0.1247, "step": 20342 }, { "epoch": 6.219199021705901, "grad_norm": 0.8157461285591125, "learning_rate": 2.4699588127892658e-05, "loss": 0.1185, "step": 20343 }, { "epoch": 6.219504738612045, "grad_norm": 0.8764286637306213, "learning_rate": 2.469916351747272e-05, "loss": 0.1857, "step": 20344 }, { "epoch": 6.21981045551819, "grad_norm": 0.8343998789787292, "learning_rate": 2.469873890705278e-05, "loss": 0.1959, "step": 20345 }, { "epoch": 6.220116172424335, "grad_norm": 1.2634660005569458, "learning_rate": 2.469831429663284e-05, "loss": 0.1451, "step": 20346 }, { "epoch": 6.22042188933048, "grad_norm": 3.563753843307495, "learning_rate": 2.46978896862129e-05, "loss": 0.1661, "step": 20347 }, { "epoch": 6.220727606236625, "grad_norm": 1.0946975946426392, "learning_rate": 2.469746507579296e-05, "loss": 0.2206, "step": 20348 }, { "epoch": 6.22103332314277, "grad_norm": 6.346890449523926, "learning_rate": 2.469704046537302e-05, "loss": 0.1679, "step": 20349 }, { "epoch": 6.221339040048915, "grad_norm": 1.3643969297409058, "learning_rate": 2.469661585495308e-05, "loss": 0.1688, "step": 20350 }, { "epoch": 6.22164475695506, "grad_norm": 3.7358057498931885, "learning_rate": 2.469619124453314e-05, "loss": 0.2188, "step": 20351 }, { "epoch": 6.221950473861204, "grad_norm": 0.8019737601280212, "learning_rate": 2.46957666341132e-05, "loss": 0.1315, "step": 20352 }, { "epoch": 6.222256190767349, "grad_norm": 0.294727623462677, "learning_rate": 2.4695342023693262e-05, "loss": 0.0784, "step": 20353 }, { "epoch": 6.222561907673494, "grad_norm": 0.2841339409351349, "learning_rate": 2.469491741327332e-05, "loss": 0.0613, "step": 20354 }, { "epoch": 6.2228676245796395, "grad_norm": 0.3266289532184601, "learning_rate": 2.4694492802853383e-05, "loss": 0.0639, "step": 20355 }, { "epoch": 6.223173341485785, "grad_norm": 0.33084091544151306, "learning_rate": 2.469406819243344e-05, "loss": 0.0758, "step": 20356 }, { "epoch": 6.223479058391929, "grad_norm": 0.15423481166362762, "learning_rate": 2.4693643582013504e-05, "loss": 0.0375, "step": 20357 }, { "epoch": 6.223784775298074, "grad_norm": 0.17553012073040009, "learning_rate": 2.4693218971593562e-05, "loss": 0.0527, "step": 20358 }, { "epoch": 6.224090492204219, "grad_norm": 0.3865848481655121, "learning_rate": 2.4692794361173624e-05, "loss": 0.0593, "step": 20359 }, { "epoch": 6.224396209110364, "grad_norm": 0.24709106981754303, "learning_rate": 2.4692369750753683e-05, "loss": 0.0836, "step": 20360 }, { "epoch": 6.224701926016508, "grad_norm": 1.2337173223495483, "learning_rate": 2.4691945140333745e-05, "loss": 0.0686, "step": 20361 }, { "epoch": 6.2250076429226535, "grad_norm": 0.43926841020584106, "learning_rate": 2.4691520529913804e-05, "loss": 0.099, "step": 20362 }, { "epoch": 6.225313359828799, "grad_norm": 0.6429024338722229, "learning_rate": 2.4691095919493862e-05, "loss": 0.0795, "step": 20363 }, { "epoch": 6.225619076734944, "grad_norm": 0.33768510818481445, "learning_rate": 2.4690671309073925e-05, "loss": 0.1387, "step": 20364 }, { "epoch": 6.225924793641088, "grad_norm": 0.5089166164398193, "learning_rate": 2.4690246698653983e-05, "loss": 0.129, "step": 20365 }, { "epoch": 6.226230510547233, "grad_norm": 0.856375515460968, "learning_rate": 2.4689822088234045e-05, "loss": 0.1247, "step": 20366 }, { "epoch": 6.226536227453378, "grad_norm": 2.171257495880127, "learning_rate": 2.4689397477814104e-05, "loss": 0.1024, "step": 20367 }, { "epoch": 6.226841944359523, "grad_norm": 2.108734369277954, "learning_rate": 2.4688972867394166e-05, "loss": 0.1438, "step": 20368 }, { "epoch": 6.227147661265668, "grad_norm": 0.9025638699531555, "learning_rate": 2.4688548256974225e-05, "loss": 0.1471, "step": 20369 }, { "epoch": 6.227453378171813, "grad_norm": 0.6141025424003601, "learning_rate": 2.4688123646554287e-05, "loss": 0.1708, "step": 20370 }, { "epoch": 6.227759095077958, "grad_norm": 0.4326651096343994, "learning_rate": 2.4687699036134346e-05, "loss": 0.1423, "step": 20371 }, { "epoch": 6.228064811984103, "grad_norm": 1.9154560565948486, "learning_rate": 2.4687274425714408e-05, "loss": 0.1897, "step": 20372 }, { "epoch": 6.228370528890248, "grad_norm": 0.8144353628158569, "learning_rate": 2.4686849815294466e-05, "loss": 0.1759, "step": 20373 }, { "epoch": 6.228676245796392, "grad_norm": 3.4713642597198486, "learning_rate": 2.468642520487453e-05, "loss": 0.1969, "step": 20374 }, { "epoch": 6.228981962702537, "grad_norm": 0.7728306651115417, "learning_rate": 2.4686000594454587e-05, "loss": 0.2051, "step": 20375 }, { "epoch": 6.2292876796086825, "grad_norm": 1.1823674440383911, "learning_rate": 2.4685575984034646e-05, "loss": 0.2062, "step": 20376 }, { "epoch": 6.229593396514828, "grad_norm": 0.5330361723899841, "learning_rate": 2.4685151373614708e-05, "loss": 0.2329, "step": 20377 }, { "epoch": 6.229899113420972, "grad_norm": 0.37567320466041565, "learning_rate": 2.4684726763194767e-05, "loss": 0.0843, "step": 20378 }, { "epoch": 6.230204830327117, "grad_norm": 0.24004384875297546, "learning_rate": 2.468430215277483e-05, "loss": 0.0624, "step": 20379 }, { "epoch": 6.230510547233262, "grad_norm": 0.31485190987586975, "learning_rate": 2.4683877542354887e-05, "loss": 0.0619, "step": 20380 }, { "epoch": 6.230816264139407, "grad_norm": 0.4025113880634308, "learning_rate": 2.468345293193495e-05, "loss": 0.0399, "step": 20381 }, { "epoch": 6.231121981045552, "grad_norm": 0.3457639217376709, "learning_rate": 2.4683028321515008e-05, "loss": 0.0597, "step": 20382 }, { "epoch": 6.2314276979516965, "grad_norm": 0.28959396481513977, "learning_rate": 2.468260371109507e-05, "loss": 0.0405, "step": 20383 }, { "epoch": 6.231733414857842, "grad_norm": 0.2906443774700165, "learning_rate": 2.468217910067513e-05, "loss": 0.0728, "step": 20384 }, { "epoch": 6.232039131763987, "grad_norm": 0.4155977964401245, "learning_rate": 2.468175449025519e-05, "loss": 0.0715, "step": 20385 }, { "epoch": 6.232344848670132, "grad_norm": 1.1568701267242432, "learning_rate": 2.4681329879835253e-05, "loss": 0.0666, "step": 20386 }, { "epoch": 6.232650565576276, "grad_norm": 0.47581946849823, "learning_rate": 2.4680905269415315e-05, "loss": 0.0785, "step": 20387 }, { "epoch": 6.232956282482421, "grad_norm": 0.32814714312553406, "learning_rate": 2.4680480658995374e-05, "loss": 0.088, "step": 20388 }, { "epoch": 6.233261999388566, "grad_norm": 0.6214190125465393, "learning_rate": 2.4680056048575433e-05, "loss": 0.0827, "step": 20389 }, { "epoch": 6.233567716294711, "grad_norm": 0.4383259117603302, "learning_rate": 2.4679631438155495e-05, "loss": 0.1033, "step": 20390 }, { "epoch": 6.233873433200856, "grad_norm": 0.7100869417190552, "learning_rate": 2.4679206827735554e-05, "loss": 0.1103, "step": 20391 }, { "epoch": 6.234179150107001, "grad_norm": 0.4185184836387634, "learning_rate": 2.4678782217315616e-05, "loss": 0.1148, "step": 20392 }, { "epoch": 6.234484867013146, "grad_norm": 2.1383917331695557, "learning_rate": 2.4678357606895674e-05, "loss": 0.1537, "step": 20393 }, { "epoch": 6.234790583919291, "grad_norm": 0.6003685593605042, "learning_rate": 2.4677932996475736e-05, "loss": 0.188, "step": 20394 }, { "epoch": 6.235096300825436, "grad_norm": 0.9529579281806946, "learning_rate": 2.4677508386055795e-05, "loss": 0.1876, "step": 20395 }, { "epoch": 6.23540201773158, "grad_norm": 0.8564022183418274, "learning_rate": 2.4677083775635857e-05, "loss": 0.1464, "step": 20396 }, { "epoch": 6.235707734637725, "grad_norm": 0.9400513768196106, "learning_rate": 2.4676659165215916e-05, "loss": 0.1642, "step": 20397 }, { "epoch": 6.2360134515438705, "grad_norm": 0.7940175533294678, "learning_rate": 2.4676234554795978e-05, "loss": 0.164, "step": 20398 }, { "epoch": 6.236319168450016, "grad_norm": 1.1054922342300415, "learning_rate": 2.4675809944376037e-05, "loss": 0.1855, "step": 20399 }, { "epoch": 6.23662488535616, "grad_norm": 2.535687208175659, "learning_rate": 2.46753853339561e-05, "loss": 0.2094, "step": 20400 }, { "epoch": 6.236930602262305, "grad_norm": 6.577815532684326, "learning_rate": 2.4674960723536157e-05, "loss": 0.2242, "step": 20401 }, { "epoch": 6.23723631916845, "grad_norm": 0.5669596195220947, "learning_rate": 2.4674536113116216e-05, "loss": 0.1586, "step": 20402 }, { "epoch": 6.237542036074595, "grad_norm": 0.251908540725708, "learning_rate": 2.4674111502696278e-05, "loss": 0.0728, "step": 20403 }, { "epoch": 6.2378477529807395, "grad_norm": 1.5923668146133423, "learning_rate": 2.4673686892276337e-05, "loss": 0.0513, "step": 20404 }, { "epoch": 6.238153469886885, "grad_norm": 0.2648613452911377, "learning_rate": 2.46732622818564e-05, "loss": 0.0504, "step": 20405 }, { "epoch": 6.23845918679303, "grad_norm": 0.16591887176036835, "learning_rate": 2.4672837671436458e-05, "loss": 0.0516, "step": 20406 }, { "epoch": 6.238764903699175, "grad_norm": 0.3671473562717438, "learning_rate": 2.467241306101652e-05, "loss": 0.06, "step": 20407 }, { "epoch": 6.23907062060532, "grad_norm": 0.3575999140739441, "learning_rate": 2.467198845059658e-05, "loss": 0.0456, "step": 20408 }, { "epoch": 6.239376337511464, "grad_norm": 0.18171587586402893, "learning_rate": 2.467156384017664e-05, "loss": 0.0541, "step": 20409 }, { "epoch": 6.239682054417609, "grad_norm": 0.9433700442314148, "learning_rate": 2.46711392297567e-05, "loss": 0.0543, "step": 20410 }, { "epoch": 6.239987771323754, "grad_norm": 0.5618821978569031, "learning_rate": 2.467071461933676e-05, "loss": 0.0493, "step": 20411 }, { "epoch": 6.2402934882298995, "grad_norm": 0.43723753094673157, "learning_rate": 2.467029000891682e-05, "loss": 0.0784, "step": 20412 }, { "epoch": 6.240599205136044, "grad_norm": 0.1724397987127304, "learning_rate": 2.4669865398496882e-05, "loss": 0.0458, "step": 20413 }, { "epoch": 6.240904922042189, "grad_norm": 0.3042412996292114, "learning_rate": 2.466944078807694e-05, "loss": 0.0869, "step": 20414 }, { "epoch": 6.241210638948334, "grad_norm": 0.795559287071228, "learning_rate": 2.4669016177657e-05, "loss": 0.1245, "step": 20415 }, { "epoch": 6.241516355854479, "grad_norm": 0.4103502035140991, "learning_rate": 2.466859156723706e-05, "loss": 0.1234, "step": 20416 }, { "epoch": 6.241822072760623, "grad_norm": 0.6305211782455444, "learning_rate": 2.466816695681712e-05, "loss": 0.1214, "step": 20417 }, { "epoch": 6.242127789666768, "grad_norm": 0.6498208045959473, "learning_rate": 2.4667742346397182e-05, "loss": 0.1707, "step": 20418 }, { "epoch": 6.2424335065729135, "grad_norm": 0.4978840947151184, "learning_rate": 2.466731773597724e-05, "loss": 0.1417, "step": 20419 }, { "epoch": 6.242739223479059, "grad_norm": 1.135201096534729, "learning_rate": 2.4666893125557303e-05, "loss": 0.2007, "step": 20420 }, { "epoch": 6.243044940385204, "grad_norm": 0.5574920773506165, "learning_rate": 2.4666468515137362e-05, "loss": 0.1525, "step": 20421 }, { "epoch": 6.243350657291348, "grad_norm": 0.7807177305221558, "learning_rate": 2.4666043904717424e-05, "loss": 0.2022, "step": 20422 }, { "epoch": 6.243656374197493, "grad_norm": 0.917242705821991, "learning_rate": 2.4665619294297483e-05, "loss": 0.1553, "step": 20423 }, { "epoch": 6.243962091103638, "grad_norm": 2.0076963901519775, "learning_rate": 2.4665194683877545e-05, "loss": 0.1955, "step": 20424 }, { "epoch": 6.244267808009783, "grad_norm": 0.8111322522163391, "learning_rate": 2.4664770073457604e-05, "loss": 0.1648, "step": 20425 }, { "epoch": 6.2445735249159275, "grad_norm": 1.671962022781372, "learning_rate": 2.4664345463037662e-05, "loss": 0.2431, "step": 20426 }, { "epoch": 6.244879241822073, "grad_norm": 0.530168890953064, "learning_rate": 2.4663920852617724e-05, "loss": 0.1281, "step": 20427 }, { "epoch": 6.245184958728218, "grad_norm": 0.31049084663391113, "learning_rate": 2.4663496242197783e-05, "loss": 0.0966, "step": 20428 }, { "epoch": 6.245490675634363, "grad_norm": 0.70224928855896, "learning_rate": 2.4663071631777845e-05, "loss": 0.0973, "step": 20429 }, { "epoch": 6.245796392540507, "grad_norm": 0.5096299648284912, "learning_rate": 2.4662647021357904e-05, "loss": 0.0426, "step": 20430 }, { "epoch": 6.246102109446652, "grad_norm": 0.4333474040031433, "learning_rate": 2.4662222410937966e-05, "loss": 0.054, "step": 20431 }, { "epoch": 6.246407826352797, "grad_norm": 0.2833002209663391, "learning_rate": 2.4661797800518025e-05, "loss": 0.0685, "step": 20432 }, { "epoch": 6.2467135432589425, "grad_norm": 0.5185602903366089, "learning_rate": 2.4661373190098087e-05, "loss": 0.0782, "step": 20433 }, { "epoch": 6.247019260165088, "grad_norm": 0.6193782687187195, "learning_rate": 2.4660948579678145e-05, "loss": 0.0699, "step": 20434 }, { "epoch": 6.247324977071232, "grad_norm": 0.6355588436126709, "learning_rate": 2.4660523969258207e-05, "loss": 0.0928, "step": 20435 }, { "epoch": 6.247630693977377, "grad_norm": 0.973335862159729, "learning_rate": 2.4660099358838266e-05, "loss": 0.0448, "step": 20436 }, { "epoch": 6.247936410883522, "grad_norm": 0.3475862741470337, "learning_rate": 2.4659674748418328e-05, "loss": 0.096, "step": 20437 }, { "epoch": 6.248242127789667, "grad_norm": 0.26958906650543213, "learning_rate": 2.4659250137998387e-05, "loss": 0.0931, "step": 20438 }, { "epoch": 6.248547844695811, "grad_norm": 1.1396828889846802, "learning_rate": 2.4658825527578446e-05, "loss": 0.1027, "step": 20439 }, { "epoch": 6.2488535616019565, "grad_norm": 0.577954888343811, "learning_rate": 2.4658400917158508e-05, "loss": 0.1041, "step": 20440 }, { "epoch": 6.249159278508102, "grad_norm": 0.4021517038345337, "learning_rate": 2.4657976306738566e-05, "loss": 0.1109, "step": 20441 }, { "epoch": 6.249464995414247, "grad_norm": 0.33919593691825867, "learning_rate": 2.465755169631863e-05, "loss": 0.1193, "step": 20442 }, { "epoch": 6.249770712320391, "grad_norm": 0.30558836460113525, "learning_rate": 2.4657127085898687e-05, "loss": 0.1319, "step": 20443 }, { "epoch": 6.250076429226536, "grad_norm": 1.13478422164917, "learning_rate": 2.465670247547875e-05, "loss": 0.1828, "step": 20444 }, { "epoch": 6.250382146132681, "grad_norm": 0.750201940536499, "learning_rate": 2.4656277865058808e-05, "loss": 0.1791, "step": 20445 }, { "epoch": 6.250687863038826, "grad_norm": 0.5285910367965698, "learning_rate": 2.465585325463887e-05, "loss": 0.1812, "step": 20446 }, { "epoch": 6.250993579944971, "grad_norm": 1.8283168077468872, "learning_rate": 2.465542864421893e-05, "loss": 0.1477, "step": 20447 }, { "epoch": 6.251299296851116, "grad_norm": 0.849952220916748, "learning_rate": 2.465500403379899e-05, "loss": 0.1783, "step": 20448 }, { "epoch": 6.251605013757261, "grad_norm": 1.9097341299057007, "learning_rate": 2.465457942337905e-05, "loss": 0.2151, "step": 20449 }, { "epoch": 6.251910730663406, "grad_norm": 2.980550765991211, "learning_rate": 2.465415481295911e-05, "loss": 0.2013, "step": 20450 }, { "epoch": 6.252216447569551, "grad_norm": 1.2095954418182373, "learning_rate": 2.465373020253917e-05, "loss": 0.2161, "step": 20451 }, { "epoch": 6.252522164475695, "grad_norm": 0.5546583533287048, "learning_rate": 2.465330559211923e-05, "loss": 0.1478, "step": 20452 }, { "epoch": 6.25282788138184, "grad_norm": 0.357021301984787, "learning_rate": 2.465288098169929e-05, "loss": 0.0779, "step": 20453 }, { "epoch": 6.253133598287985, "grad_norm": 0.878466784954071, "learning_rate": 2.465245637127935e-05, "loss": 0.1049, "step": 20454 }, { "epoch": 6.2534393151941305, "grad_norm": 0.2155127376317978, "learning_rate": 2.4652031760859412e-05, "loss": 0.0736, "step": 20455 }, { "epoch": 6.253745032100275, "grad_norm": 0.6554815769195557, "learning_rate": 2.465160715043947e-05, "loss": 0.0525, "step": 20456 }, { "epoch": 6.25405074900642, "grad_norm": 0.31401634216308594, "learning_rate": 2.4651182540019533e-05, "loss": 0.0616, "step": 20457 }, { "epoch": 6.254356465912565, "grad_norm": 0.35591956973075867, "learning_rate": 2.465075792959959e-05, "loss": 0.0533, "step": 20458 }, { "epoch": 6.25466218281871, "grad_norm": 0.28896448016166687, "learning_rate": 2.4650333319179654e-05, "loss": 0.0979, "step": 20459 }, { "epoch": 6.254967899724855, "grad_norm": 0.5964662432670593, "learning_rate": 2.4649908708759712e-05, "loss": 0.0631, "step": 20460 }, { "epoch": 6.2552736166309995, "grad_norm": 1.6005606651306152, "learning_rate": 2.4649484098339774e-05, "loss": 0.065, "step": 20461 }, { "epoch": 6.255579333537145, "grad_norm": 2.3987035751342773, "learning_rate": 2.4649059487919833e-05, "loss": 0.1008, "step": 20462 }, { "epoch": 6.25588505044329, "grad_norm": 0.2599339187145233, "learning_rate": 2.4648634877499895e-05, "loss": 0.0638, "step": 20463 }, { "epoch": 6.256190767349435, "grad_norm": 0.6531580090522766, "learning_rate": 2.4648210267079954e-05, "loss": 0.0815, "step": 20464 }, { "epoch": 6.256496484255579, "grad_norm": 0.46375197172164917, "learning_rate": 2.4647785656660013e-05, "loss": 0.0945, "step": 20465 }, { "epoch": 6.256802201161724, "grad_norm": 0.7840709090232849, "learning_rate": 2.4647361046240075e-05, "loss": 0.1384, "step": 20466 }, { "epoch": 6.257107918067869, "grad_norm": 0.36975187063217163, "learning_rate": 2.4646936435820133e-05, "loss": 0.1504, "step": 20467 }, { "epoch": 6.257413634974014, "grad_norm": 1.101265549659729, "learning_rate": 2.4646511825400195e-05, "loss": 0.1489, "step": 20468 }, { "epoch": 6.257719351880159, "grad_norm": 0.4178028106689453, "learning_rate": 2.4646087214980254e-05, "loss": 0.1715, "step": 20469 }, { "epoch": 6.258025068786304, "grad_norm": 0.7832471132278442, "learning_rate": 2.4645662604560316e-05, "loss": 0.1623, "step": 20470 }, { "epoch": 6.258330785692449, "grad_norm": 0.7726292610168457, "learning_rate": 2.4645237994140375e-05, "loss": 0.1643, "step": 20471 }, { "epoch": 6.258636502598594, "grad_norm": 0.7250409126281738, "learning_rate": 2.4644813383720437e-05, "loss": 0.1887, "step": 20472 }, { "epoch": 6.258942219504739, "grad_norm": 1.2560112476348877, "learning_rate": 2.4644388773300496e-05, "loss": 0.1824, "step": 20473 }, { "epoch": 6.259247936410883, "grad_norm": 17.838964462280273, "learning_rate": 2.4643964162880558e-05, "loss": 0.1577, "step": 20474 }, { "epoch": 6.259553653317028, "grad_norm": 2.5190422534942627, "learning_rate": 2.4643539552460616e-05, "loss": 0.1675, "step": 20475 }, { "epoch": 6.2598593702231735, "grad_norm": 2.486232042312622, "learning_rate": 2.464311494204068e-05, "loss": 0.2486, "step": 20476 }, { "epoch": 6.260165087129319, "grad_norm": 0.38996395468711853, "learning_rate": 2.4642690331620737e-05, "loss": 0.1228, "step": 20477 }, { "epoch": 6.260470804035463, "grad_norm": 0.18455414474010468, "learning_rate": 2.4642265721200796e-05, "loss": 0.0671, "step": 20478 }, { "epoch": 6.260776520941608, "grad_norm": 0.37149757146835327, "learning_rate": 2.4641841110780858e-05, "loss": 0.0806, "step": 20479 }, { "epoch": 6.261082237847753, "grad_norm": 0.8032139539718628, "learning_rate": 2.4641416500360917e-05, "loss": 0.0713, "step": 20480 }, { "epoch": 6.261387954753898, "grad_norm": 0.27390196919441223, "learning_rate": 2.464099188994098e-05, "loss": 0.0518, "step": 20481 }, { "epoch": 6.261693671660042, "grad_norm": 0.21533255279064178, "learning_rate": 2.4640567279521038e-05, "loss": 0.0604, "step": 20482 }, { "epoch": 6.2619993885661875, "grad_norm": 0.23303869366645813, "learning_rate": 2.46401426691011e-05, "loss": 0.044, "step": 20483 }, { "epoch": 6.262305105472333, "grad_norm": 0.3563937246799469, "learning_rate": 2.463971805868116e-05, "loss": 0.0579, "step": 20484 }, { "epoch": 6.262610822378478, "grad_norm": 0.2558675706386566, "learning_rate": 2.463929344826122e-05, "loss": 0.0709, "step": 20485 }, { "epoch": 6.262916539284623, "grad_norm": 0.2855742871761322, "learning_rate": 2.463886883784128e-05, "loss": 0.0814, "step": 20486 }, { "epoch": 6.263222256190767, "grad_norm": 1.0314935445785522, "learning_rate": 2.463844422742134e-05, "loss": 0.0773, "step": 20487 }, { "epoch": 6.263527973096912, "grad_norm": 0.5812258124351501, "learning_rate": 2.4638019617001403e-05, "loss": 0.0698, "step": 20488 }, { "epoch": 6.263833690003057, "grad_norm": 0.47934162616729736, "learning_rate": 2.4637595006581465e-05, "loss": 0.0931, "step": 20489 }, { "epoch": 6.2641394069092025, "grad_norm": 0.4094822108745575, "learning_rate": 2.4637170396161524e-05, "loss": 0.0992, "step": 20490 }, { "epoch": 6.264445123815347, "grad_norm": 0.32210662961006165, "learning_rate": 2.4636745785741583e-05, "loss": 0.1004, "step": 20491 }, { "epoch": 6.264750840721492, "grad_norm": 0.3991433084011078, "learning_rate": 2.4636321175321645e-05, "loss": 0.1254, "step": 20492 }, { "epoch": 6.265056557627637, "grad_norm": 1.300249695777893, "learning_rate": 2.4635896564901704e-05, "loss": 0.1522, "step": 20493 }, { "epoch": 6.265362274533782, "grad_norm": 3.147094488143921, "learning_rate": 2.4635471954481766e-05, "loss": 0.1414, "step": 20494 }, { "epoch": 6.265667991439926, "grad_norm": 1.2231025695800781, "learning_rate": 2.4635047344061824e-05, "loss": 0.1838, "step": 20495 }, { "epoch": 6.265973708346071, "grad_norm": 4.264914512634277, "learning_rate": 2.4634622733641886e-05, "loss": 0.1731, "step": 20496 }, { "epoch": 6.2662794252522165, "grad_norm": 1.781532883644104, "learning_rate": 2.4634198123221945e-05, "loss": 0.1808, "step": 20497 }, { "epoch": 6.266585142158362, "grad_norm": 4.431701183319092, "learning_rate": 2.4633773512802007e-05, "loss": 0.1894, "step": 20498 }, { "epoch": 6.266890859064507, "grad_norm": 0.6199817657470703, "learning_rate": 2.4633348902382066e-05, "loss": 0.1837, "step": 20499 }, { "epoch": 6.267196575970651, "grad_norm": 0.9559866189956665, "learning_rate": 2.4632924291962128e-05, "loss": 0.1684, "step": 20500 }, { "epoch": 6.267502292876796, "grad_norm": 0.8026087880134583, "learning_rate": 2.4632499681542187e-05, "loss": 0.2236, "step": 20501 }, { "epoch": 6.267808009782941, "grad_norm": 0.5169669985771179, "learning_rate": 2.463207507112225e-05, "loss": 0.139, "step": 20502 }, { "epoch": 6.268113726689086, "grad_norm": 0.5535604953765869, "learning_rate": 2.4631650460702307e-05, "loss": 0.1034, "step": 20503 }, { "epoch": 6.2684194435952305, "grad_norm": 0.44175973534584045, "learning_rate": 2.4631225850282366e-05, "loss": 0.0694, "step": 20504 }, { "epoch": 6.268725160501376, "grad_norm": 0.5184630751609802, "learning_rate": 2.4630801239862428e-05, "loss": 0.0548, "step": 20505 }, { "epoch": 6.269030877407521, "grad_norm": 0.2202775627374649, "learning_rate": 2.4630376629442487e-05, "loss": 0.0514, "step": 20506 }, { "epoch": 6.269336594313666, "grad_norm": 1.5866072177886963, "learning_rate": 2.462995201902255e-05, "loss": 0.0834, "step": 20507 }, { "epoch": 6.26964231121981, "grad_norm": 0.6425320506095886, "learning_rate": 2.4629527408602608e-05, "loss": 0.0811, "step": 20508 }, { "epoch": 6.269948028125955, "grad_norm": 0.2666257619857788, "learning_rate": 2.462910279818267e-05, "loss": 0.0723, "step": 20509 }, { "epoch": 6.2702537450321, "grad_norm": 0.4337107837200165, "learning_rate": 2.462867818776273e-05, "loss": 0.0592, "step": 20510 }, { "epoch": 6.270559461938245, "grad_norm": 0.23916378617286682, "learning_rate": 2.462825357734279e-05, "loss": 0.0646, "step": 20511 }, { "epoch": 6.2708651788443905, "grad_norm": 0.39709213376045227, "learning_rate": 2.462782896692285e-05, "loss": 0.0808, "step": 20512 }, { "epoch": 6.271170895750535, "grad_norm": 0.2951914668083191, "learning_rate": 2.462740435650291e-05, "loss": 0.0819, "step": 20513 }, { "epoch": 6.27147661265668, "grad_norm": 0.5491182208061218, "learning_rate": 2.462697974608297e-05, "loss": 0.1002, "step": 20514 }, { "epoch": 6.271782329562825, "grad_norm": 0.7246773838996887, "learning_rate": 2.4626555135663032e-05, "loss": 0.1234, "step": 20515 }, { "epoch": 6.27208804646897, "grad_norm": 1.0907995700836182, "learning_rate": 2.462613052524309e-05, "loss": 0.1128, "step": 20516 }, { "epoch": 6.272393763375114, "grad_norm": 0.9849863052368164, "learning_rate": 2.462570591482315e-05, "loss": 0.1084, "step": 20517 }, { "epoch": 6.2726994802812595, "grad_norm": 0.7069827318191528, "learning_rate": 2.4625281304403212e-05, "loss": 0.1698, "step": 20518 }, { "epoch": 6.273005197187405, "grad_norm": 0.9225093722343445, "learning_rate": 2.462485669398327e-05, "loss": 0.1555, "step": 20519 }, { "epoch": 6.27331091409355, "grad_norm": 0.6662130355834961, "learning_rate": 2.4624432083563332e-05, "loss": 0.1819, "step": 20520 }, { "epoch": 6.273616630999694, "grad_norm": 2.3405823707580566, "learning_rate": 2.462400747314339e-05, "loss": 0.1746, "step": 20521 }, { "epoch": 6.273922347905839, "grad_norm": 0.7264167070388794, "learning_rate": 2.4623582862723453e-05, "loss": 0.145, "step": 20522 }, { "epoch": 6.274228064811984, "grad_norm": 0.8808690309524536, "learning_rate": 2.4623158252303512e-05, "loss": 0.1559, "step": 20523 }, { "epoch": 6.274533781718129, "grad_norm": 1.1378982067108154, "learning_rate": 2.4622733641883574e-05, "loss": 0.1647, "step": 20524 }, { "epoch": 6.274839498624274, "grad_norm": 1.0388157367706299, "learning_rate": 2.4622309031463633e-05, "loss": 0.1912, "step": 20525 }, { "epoch": 6.275145215530419, "grad_norm": 2.374178409576416, "learning_rate": 2.4621884421043695e-05, "loss": 0.1958, "step": 20526 }, { "epoch": 6.275450932436564, "grad_norm": 0.38017863035202026, "learning_rate": 2.4621459810623754e-05, "loss": 0.1526, "step": 20527 }, { "epoch": 6.275756649342709, "grad_norm": 0.2916736304759979, "learning_rate": 2.4621035200203816e-05, "loss": 0.0889, "step": 20528 }, { "epoch": 6.276062366248854, "grad_norm": 0.5035272240638733, "learning_rate": 2.4620610589783874e-05, "loss": 0.0694, "step": 20529 }, { "epoch": 6.276368083154998, "grad_norm": 2.2162024974823, "learning_rate": 2.4620185979363933e-05, "loss": 0.0617, "step": 20530 }, { "epoch": 6.276673800061143, "grad_norm": 0.22445549070835114, "learning_rate": 2.4619761368943995e-05, "loss": 0.0456, "step": 20531 }, { "epoch": 6.276979516967288, "grad_norm": 0.2082822173833847, "learning_rate": 2.4619336758524054e-05, "loss": 0.0537, "step": 20532 }, { "epoch": 6.2772852338734335, "grad_norm": 0.3477279841899872, "learning_rate": 2.4618912148104116e-05, "loss": 0.0473, "step": 20533 }, { "epoch": 6.277590950779578, "grad_norm": 0.3551945686340332, "learning_rate": 2.4618487537684175e-05, "loss": 0.0622, "step": 20534 }, { "epoch": 6.277896667685723, "grad_norm": 0.44842466711997986, "learning_rate": 2.4618062927264237e-05, "loss": 0.067, "step": 20535 }, { "epoch": 6.278202384591868, "grad_norm": 0.2340509593486786, "learning_rate": 2.4617638316844295e-05, "loss": 0.0566, "step": 20536 }, { "epoch": 6.278508101498013, "grad_norm": 0.41659513115882874, "learning_rate": 2.4617213706424357e-05, "loss": 0.0786, "step": 20537 }, { "epoch": 6.278813818404158, "grad_norm": 0.8422670960426331, "learning_rate": 2.4616789096004416e-05, "loss": 0.0841, "step": 20538 }, { "epoch": 6.279119535310302, "grad_norm": 0.8786563873291016, "learning_rate": 2.4616364485584478e-05, "loss": 0.0821, "step": 20539 }, { "epoch": 6.2794252522164475, "grad_norm": 0.6427392959594727, "learning_rate": 2.4615939875164537e-05, "loss": 0.1293, "step": 20540 }, { "epoch": 6.279730969122593, "grad_norm": 1.0062838792800903, "learning_rate": 2.4615515264744596e-05, "loss": 0.1298, "step": 20541 }, { "epoch": 6.280036686028738, "grad_norm": 0.5200824737548828, "learning_rate": 2.4615090654324658e-05, "loss": 0.153, "step": 20542 }, { "epoch": 6.280342402934882, "grad_norm": 0.5701680183410645, "learning_rate": 2.4614666043904716e-05, "loss": 0.1631, "step": 20543 }, { "epoch": 6.280648119841027, "grad_norm": 0.603046715259552, "learning_rate": 2.461424143348478e-05, "loss": 0.1787, "step": 20544 }, { "epoch": 6.280953836747172, "grad_norm": 0.8817220330238342, "learning_rate": 2.4613816823064837e-05, "loss": 0.1667, "step": 20545 }, { "epoch": 6.281259553653317, "grad_norm": 0.9578282833099365, "learning_rate": 2.46133922126449e-05, "loss": 0.175, "step": 20546 }, { "epoch": 6.281565270559462, "grad_norm": 0.7770274877548218, "learning_rate": 2.4612967602224958e-05, "loss": 0.1768, "step": 20547 }, { "epoch": 6.281870987465607, "grad_norm": 0.5808359980583191, "learning_rate": 2.461254299180502e-05, "loss": 0.1702, "step": 20548 }, { "epoch": 6.282176704371752, "grad_norm": 1.485681176185608, "learning_rate": 2.461211838138508e-05, "loss": 0.2104, "step": 20549 }, { "epoch": 6.282482421277897, "grad_norm": 1.1663455963134766, "learning_rate": 2.461169377096514e-05, "loss": 0.2005, "step": 20550 }, { "epoch": 6.282788138184042, "grad_norm": 3.701110601425171, "learning_rate": 2.46112691605452e-05, "loss": 0.2762, "step": 20551 }, { "epoch": 6.283093855090186, "grad_norm": 0.5110062956809998, "learning_rate": 2.4610844550125262e-05, "loss": 0.1414, "step": 20552 }, { "epoch": 6.283399571996331, "grad_norm": 1.519737720489502, "learning_rate": 2.461041993970532e-05, "loss": 0.0831, "step": 20553 }, { "epoch": 6.2837052889024765, "grad_norm": 0.5599533319473267, "learning_rate": 2.460999532928538e-05, "loss": 0.0776, "step": 20554 }, { "epoch": 6.284011005808622, "grad_norm": 0.3613547384738922, "learning_rate": 2.460957071886544e-05, "loss": 0.0478, "step": 20555 }, { "epoch": 6.284316722714766, "grad_norm": 0.257935494184494, "learning_rate": 2.46091461084455e-05, "loss": 0.0437, "step": 20556 }, { "epoch": 6.284622439620911, "grad_norm": 0.21573634445667267, "learning_rate": 2.4608721498025562e-05, "loss": 0.0408, "step": 20557 }, { "epoch": 6.284928156527056, "grad_norm": 0.2586638033390045, "learning_rate": 2.460829688760562e-05, "loss": 0.0755, "step": 20558 }, { "epoch": 6.285233873433201, "grad_norm": 0.4912983477115631, "learning_rate": 2.4607872277185683e-05, "loss": 0.0834, "step": 20559 }, { "epoch": 6.285539590339345, "grad_norm": 1.028978943824768, "learning_rate": 2.460744766676574e-05, "loss": 0.0466, "step": 20560 }, { "epoch": 6.2858453072454905, "grad_norm": 0.32856065034866333, "learning_rate": 2.4607023056345804e-05, "loss": 0.0629, "step": 20561 }, { "epoch": 6.286151024151636, "grad_norm": 0.30250054597854614, "learning_rate": 2.4606598445925862e-05, "loss": 0.0657, "step": 20562 }, { "epoch": 6.286456741057781, "grad_norm": 0.5031628608703613, "learning_rate": 2.4606173835505924e-05, "loss": 0.1104, "step": 20563 }, { "epoch": 6.286762457963926, "grad_norm": 0.7825090885162354, "learning_rate": 2.4605749225085983e-05, "loss": 0.1201, "step": 20564 }, { "epoch": 6.28706817487007, "grad_norm": 0.48126354813575745, "learning_rate": 2.4605324614666045e-05, "loss": 0.1084, "step": 20565 }, { "epoch": 6.287373891776215, "grad_norm": 0.8235324025154114, "learning_rate": 2.4604900004246104e-05, "loss": 0.1319, "step": 20566 }, { "epoch": 6.28767960868236, "grad_norm": 1.2460702657699585, "learning_rate": 2.4604475393826163e-05, "loss": 0.1393, "step": 20567 }, { "epoch": 6.287985325588505, "grad_norm": 0.5433230996131897, "learning_rate": 2.4604050783406225e-05, "loss": 0.1395, "step": 20568 }, { "epoch": 6.28829104249465, "grad_norm": 1.5368655920028687, "learning_rate": 2.4603626172986283e-05, "loss": 0.1844, "step": 20569 }, { "epoch": 6.288596759400795, "grad_norm": 2.4901227951049805, "learning_rate": 2.4603201562566345e-05, "loss": 0.2008, "step": 20570 }, { "epoch": 6.28890247630694, "grad_norm": 1.2130221128463745, "learning_rate": 2.4602776952146404e-05, "loss": 0.1814, "step": 20571 }, { "epoch": 6.289208193213085, "grad_norm": 1.7252039909362793, "learning_rate": 2.4602352341726466e-05, "loss": 0.1724, "step": 20572 }, { "epoch": 6.289513910119229, "grad_norm": 1.013859510421753, "learning_rate": 2.4601927731306525e-05, "loss": 0.1732, "step": 20573 }, { "epoch": 6.289819627025374, "grad_norm": 1.4870346784591675, "learning_rate": 2.4601503120886587e-05, "loss": 0.2275, "step": 20574 }, { "epoch": 6.2901253439315195, "grad_norm": 1.3030496835708618, "learning_rate": 2.4601078510466646e-05, "loss": 0.2415, "step": 20575 }, { "epoch": 6.290431060837665, "grad_norm": 0.9679726362228394, "learning_rate": 2.4600653900046708e-05, "loss": 0.2182, "step": 20576 }, { "epoch": 6.29073677774381, "grad_norm": 0.8472238183021545, "learning_rate": 2.4600229289626766e-05, "loss": 0.1506, "step": 20577 }, { "epoch": 6.291042494649954, "grad_norm": 0.5921621322631836, "learning_rate": 2.459980467920683e-05, "loss": 0.0789, "step": 20578 }, { "epoch": 6.291348211556099, "grad_norm": 0.5565490126609802, "learning_rate": 2.4599380068786887e-05, "loss": 0.0758, "step": 20579 }, { "epoch": 6.291653928462244, "grad_norm": 0.3953445553779602, "learning_rate": 2.4598955458366946e-05, "loss": 0.0736, "step": 20580 }, { "epoch": 6.291959645368389, "grad_norm": 0.14287282526493073, "learning_rate": 2.4598530847947008e-05, "loss": 0.0514, "step": 20581 }, { "epoch": 6.2922653622745335, "grad_norm": 0.7322268486022949, "learning_rate": 2.4598106237527067e-05, "loss": 0.0736, "step": 20582 }, { "epoch": 6.292571079180679, "grad_norm": 0.22059392929077148, "learning_rate": 2.459768162710713e-05, "loss": 0.0608, "step": 20583 }, { "epoch": 6.292876796086824, "grad_norm": 0.2099466472864151, "learning_rate": 2.4597257016687188e-05, "loss": 0.0735, "step": 20584 }, { "epoch": 6.293182512992969, "grad_norm": 1.0382838249206543, "learning_rate": 2.459683240626725e-05, "loss": 0.0666, "step": 20585 }, { "epoch": 6.293488229899113, "grad_norm": 0.39644548296928406, "learning_rate": 2.459640779584731e-05, "loss": 0.0649, "step": 20586 }, { "epoch": 6.293793946805258, "grad_norm": 0.3076232671737671, "learning_rate": 2.459598318542737e-05, "loss": 0.0804, "step": 20587 }, { "epoch": 6.294099663711403, "grad_norm": 0.3773645758628845, "learning_rate": 2.459555857500743e-05, "loss": 0.0592, "step": 20588 }, { "epoch": 6.294405380617548, "grad_norm": 0.33619147539138794, "learning_rate": 2.459513396458749e-05, "loss": 0.0837, "step": 20589 }, { "epoch": 6.2947110975236935, "grad_norm": 0.5117653608322144, "learning_rate": 2.4594709354167553e-05, "loss": 0.095, "step": 20590 }, { "epoch": 6.295016814429838, "grad_norm": 0.5010021924972534, "learning_rate": 2.4594284743747615e-05, "loss": 0.1023, "step": 20591 }, { "epoch": 6.295322531335983, "grad_norm": 0.7011531591415405, "learning_rate": 2.4593860133327674e-05, "loss": 0.1313, "step": 20592 }, { "epoch": 6.295628248242128, "grad_norm": 0.5661648511886597, "learning_rate": 2.4593435522907733e-05, "loss": 0.1378, "step": 20593 }, { "epoch": 6.295933965148273, "grad_norm": 0.940248966217041, "learning_rate": 2.4593010912487795e-05, "loss": 0.1592, "step": 20594 }, { "epoch": 6.296239682054417, "grad_norm": 0.5201418995857239, "learning_rate": 2.4592586302067854e-05, "loss": 0.1264, "step": 20595 }, { "epoch": 6.296545398960562, "grad_norm": 1.1981229782104492, "learning_rate": 2.4592161691647916e-05, "loss": 0.157, "step": 20596 }, { "epoch": 6.2968511158667075, "grad_norm": 0.626494824886322, "learning_rate": 2.4591737081227974e-05, "loss": 0.1554, "step": 20597 }, { "epoch": 6.297156832772853, "grad_norm": 0.9556162357330322, "learning_rate": 2.4591312470808036e-05, "loss": 0.1704, "step": 20598 }, { "epoch": 6.297462549678997, "grad_norm": 1.36262845993042, "learning_rate": 2.4590887860388095e-05, "loss": 0.1672, "step": 20599 }, { "epoch": 6.297768266585142, "grad_norm": 1.3679521083831787, "learning_rate": 2.4590463249968157e-05, "loss": 0.2065, "step": 20600 }, { "epoch": 6.298073983491287, "grad_norm": 3.951383352279663, "learning_rate": 2.4590038639548216e-05, "loss": 0.2026, "step": 20601 }, { "epoch": 6.298379700397432, "grad_norm": 0.2837997078895569, "learning_rate": 2.4589614029128278e-05, "loss": 0.1346, "step": 20602 }, { "epoch": 6.298685417303577, "grad_norm": 0.23962466418743134, "learning_rate": 2.4589189418708337e-05, "loss": 0.0726, "step": 20603 }, { "epoch": 6.298991134209722, "grad_norm": 0.5052900314331055, "learning_rate": 2.45887648082884e-05, "loss": 0.1082, "step": 20604 }, { "epoch": 6.299296851115867, "grad_norm": 0.4810147285461426, "learning_rate": 2.4588340197868458e-05, "loss": 0.0703, "step": 20605 }, { "epoch": 6.299602568022012, "grad_norm": 0.5401982665061951, "learning_rate": 2.4587915587448516e-05, "loss": 0.0612, "step": 20606 }, { "epoch": 6.299908284928157, "grad_norm": 0.19769784808158875, "learning_rate": 2.458749097702858e-05, "loss": 0.045, "step": 20607 }, { "epoch": 6.300214001834301, "grad_norm": 0.5537563562393188, "learning_rate": 2.4587066366608637e-05, "loss": 0.0592, "step": 20608 }, { "epoch": 6.300519718740446, "grad_norm": 0.8241945505142212, "learning_rate": 2.45866417561887e-05, "loss": 0.0572, "step": 20609 }, { "epoch": 6.300825435646591, "grad_norm": 0.3160577416419983, "learning_rate": 2.4586217145768758e-05, "loss": 0.0581, "step": 20610 }, { "epoch": 6.3011311525527365, "grad_norm": 0.3463614583015442, "learning_rate": 2.458579253534882e-05, "loss": 0.0696, "step": 20611 }, { "epoch": 6.301436869458881, "grad_norm": 0.970562756061554, "learning_rate": 2.458536792492888e-05, "loss": 0.0918, "step": 20612 }, { "epoch": 6.301742586365026, "grad_norm": 0.4095824956893921, "learning_rate": 2.458494331450894e-05, "loss": 0.0764, "step": 20613 }, { "epoch": 6.302048303271171, "grad_norm": 0.3340032994747162, "learning_rate": 2.4584518704089e-05, "loss": 0.0914, "step": 20614 }, { "epoch": 6.302354020177316, "grad_norm": 0.5890899300575256, "learning_rate": 2.458409409366906e-05, "loss": 0.1265, "step": 20615 }, { "epoch": 6.302659737083461, "grad_norm": 0.4970606863498688, "learning_rate": 2.458366948324912e-05, "loss": 0.099, "step": 20616 }, { "epoch": 6.302965453989605, "grad_norm": 0.5350572466850281, "learning_rate": 2.4583244872829182e-05, "loss": 0.1473, "step": 20617 }, { "epoch": 6.3032711708957505, "grad_norm": 1.008342981338501, "learning_rate": 2.458282026240924e-05, "loss": 0.1574, "step": 20618 }, { "epoch": 6.303576887801896, "grad_norm": 0.7805175185203552, "learning_rate": 2.45823956519893e-05, "loss": 0.1447, "step": 20619 }, { "epoch": 6.303882604708041, "grad_norm": 0.8615054488182068, "learning_rate": 2.4581971041569362e-05, "loss": 0.168, "step": 20620 }, { "epoch": 6.304188321614185, "grad_norm": 1.2642954587936401, "learning_rate": 2.458154643114942e-05, "loss": 0.1564, "step": 20621 }, { "epoch": 6.30449403852033, "grad_norm": 0.9221076369285583, "learning_rate": 2.4581121820729483e-05, "loss": 0.1439, "step": 20622 }, { "epoch": 6.304799755426475, "grad_norm": 0.9218765497207642, "learning_rate": 2.458069721030954e-05, "loss": 0.21, "step": 20623 }, { "epoch": 6.30510547233262, "grad_norm": 0.8245738744735718, "learning_rate": 2.4580272599889603e-05, "loss": 0.1822, "step": 20624 }, { "epoch": 6.3054111892387645, "grad_norm": 0.8447156548500061, "learning_rate": 2.4579847989469662e-05, "loss": 0.1802, "step": 20625 }, { "epoch": 6.30571690614491, "grad_norm": 1.2079252004623413, "learning_rate": 2.4579423379049724e-05, "loss": 0.2252, "step": 20626 }, { "epoch": 6.306022623051055, "grad_norm": 0.35744717717170715, "learning_rate": 2.4578998768629783e-05, "loss": 0.1383, "step": 20627 }, { "epoch": 6.3063283399572, "grad_norm": 0.5224502682685852, "learning_rate": 2.4578574158209845e-05, "loss": 0.0804, "step": 20628 }, { "epoch": 6.306634056863345, "grad_norm": 0.3353728950023651, "learning_rate": 2.4578149547789904e-05, "loss": 0.0727, "step": 20629 }, { "epoch": 6.306939773769489, "grad_norm": 0.22134660184383392, "learning_rate": 2.4577724937369966e-05, "loss": 0.056, "step": 20630 }, { "epoch": 6.307245490675634, "grad_norm": 0.32569220662117004, "learning_rate": 2.4577300326950024e-05, "loss": 0.0532, "step": 20631 }, { "epoch": 6.3075512075817795, "grad_norm": 0.18778513371944427, "learning_rate": 2.4576875716530083e-05, "loss": 0.0448, "step": 20632 }, { "epoch": 6.307856924487925, "grad_norm": 0.28508737683296204, "learning_rate": 2.4576451106110145e-05, "loss": 0.0747, "step": 20633 }, { "epoch": 6.308162641394069, "grad_norm": 0.273972749710083, "learning_rate": 2.4576026495690204e-05, "loss": 0.0774, "step": 20634 }, { "epoch": 6.308468358300214, "grad_norm": 0.3331663906574249, "learning_rate": 2.4575601885270266e-05, "loss": 0.0672, "step": 20635 }, { "epoch": 6.308774075206359, "grad_norm": 0.5940147042274475, "learning_rate": 2.4575177274850325e-05, "loss": 0.0736, "step": 20636 }, { "epoch": 6.309079792112504, "grad_norm": 0.23327326774597168, "learning_rate": 2.4574752664430387e-05, "loss": 0.085, "step": 20637 }, { "epoch": 6.309385509018648, "grad_norm": 0.27928027510643005, "learning_rate": 2.4574328054010445e-05, "loss": 0.0915, "step": 20638 }, { "epoch": 6.3096912259247935, "grad_norm": 1.98115873336792, "learning_rate": 2.4573903443590508e-05, "loss": 0.0841, "step": 20639 }, { "epoch": 6.309996942830939, "grad_norm": 0.42020612955093384, "learning_rate": 2.4573478833170566e-05, "loss": 0.0902, "step": 20640 }, { "epoch": 6.310302659737084, "grad_norm": 0.5758538246154785, "learning_rate": 2.457305422275063e-05, "loss": 0.1308, "step": 20641 }, { "epoch": 6.310608376643229, "grad_norm": 0.3528634011745453, "learning_rate": 2.4572629612330687e-05, "loss": 0.1226, "step": 20642 }, { "epoch": 6.310914093549373, "grad_norm": 0.5203597545623779, "learning_rate": 2.457220500191075e-05, "loss": 0.1294, "step": 20643 }, { "epoch": 6.311219810455518, "grad_norm": 0.4377926290035248, "learning_rate": 2.4571780391490808e-05, "loss": 0.152, "step": 20644 }, { "epoch": 6.311525527361663, "grad_norm": 2.1450998783111572, "learning_rate": 2.4571355781070867e-05, "loss": 0.1368, "step": 20645 }, { "epoch": 6.311831244267808, "grad_norm": 0.9800832271575928, "learning_rate": 2.457093117065093e-05, "loss": 0.173, "step": 20646 }, { "epoch": 6.312136961173953, "grad_norm": 1.5519853830337524, "learning_rate": 2.4570506560230987e-05, "loss": 0.1517, "step": 20647 }, { "epoch": 6.312442678080098, "grad_norm": 1.508379578590393, "learning_rate": 2.457008194981105e-05, "loss": 0.1604, "step": 20648 }, { "epoch": 6.312748394986243, "grad_norm": 1.1199580430984497, "learning_rate": 2.4569657339391108e-05, "loss": 0.1553, "step": 20649 }, { "epoch": 6.313054111892388, "grad_norm": 1.0097436904907227, "learning_rate": 2.456923272897117e-05, "loss": 0.2098, "step": 20650 }, { "epoch": 6.313359828798532, "grad_norm": 1.8226103782653809, "learning_rate": 2.456880811855123e-05, "loss": 0.2263, "step": 20651 }, { "epoch": 6.313665545704677, "grad_norm": 0.4581280052661896, "learning_rate": 2.456838350813129e-05, "loss": 0.1465, "step": 20652 }, { "epoch": 6.313971262610822, "grad_norm": 0.6009646058082581, "learning_rate": 2.456795889771135e-05, "loss": 0.0905, "step": 20653 }, { "epoch": 6.3142769795169675, "grad_norm": 0.3219684958457947, "learning_rate": 2.4567534287291412e-05, "loss": 0.0555, "step": 20654 }, { "epoch": 6.314582696423113, "grad_norm": 0.20476704835891724, "learning_rate": 2.456710967687147e-05, "loss": 0.0567, "step": 20655 }, { "epoch": 6.314888413329257, "grad_norm": 0.21086819469928741, "learning_rate": 2.4566685066451533e-05, "loss": 0.0678, "step": 20656 }, { "epoch": 6.315194130235402, "grad_norm": 0.332366406917572, "learning_rate": 2.456626045603159e-05, "loss": 0.0373, "step": 20657 }, { "epoch": 6.315499847141547, "grad_norm": 0.18618114292621613, "learning_rate": 2.456583584561165e-05, "loss": 0.052, "step": 20658 }, { "epoch": 6.315805564047692, "grad_norm": 0.23005954921245575, "learning_rate": 2.4565411235191712e-05, "loss": 0.0516, "step": 20659 }, { "epoch": 6.3161112809538364, "grad_norm": 0.3353956341743469, "learning_rate": 2.456498662477177e-05, "loss": 0.0672, "step": 20660 }, { "epoch": 6.316416997859982, "grad_norm": 0.2813246548175812, "learning_rate": 2.4564562014351833e-05, "loss": 0.0563, "step": 20661 }, { "epoch": 6.316722714766127, "grad_norm": 0.5143972635269165, "learning_rate": 2.456413740393189e-05, "loss": 0.0966, "step": 20662 }, { "epoch": 6.317028431672272, "grad_norm": 0.3341114819049835, "learning_rate": 2.4563712793511954e-05, "loss": 0.0757, "step": 20663 }, { "epoch": 6.317334148578416, "grad_norm": 0.35507798194885254, "learning_rate": 2.4563288183092012e-05, "loss": 0.0709, "step": 20664 }, { "epoch": 6.317639865484561, "grad_norm": 0.9922248721122742, "learning_rate": 2.4562863572672074e-05, "loss": 0.1226, "step": 20665 }, { "epoch": 6.317945582390706, "grad_norm": 0.4807116985321045, "learning_rate": 2.4562438962252133e-05, "loss": 0.0967, "step": 20666 }, { "epoch": 6.318251299296851, "grad_norm": 0.9948383569717407, "learning_rate": 2.4562014351832195e-05, "loss": 0.1273, "step": 20667 }, { "epoch": 6.3185570162029965, "grad_norm": 0.360777884721756, "learning_rate": 2.4561589741412254e-05, "loss": 0.1348, "step": 20668 }, { "epoch": 6.318862733109141, "grad_norm": 0.38129696249961853, "learning_rate": 2.4561165130992313e-05, "loss": 0.1451, "step": 20669 }, { "epoch": 6.319168450015286, "grad_norm": 0.5303404927253723, "learning_rate": 2.4560740520572375e-05, "loss": 0.1839, "step": 20670 }, { "epoch": 6.319474166921431, "grad_norm": 0.8339914083480835, "learning_rate": 2.4560315910152433e-05, "loss": 0.1717, "step": 20671 }, { "epoch": 6.319779883827576, "grad_norm": 1.5652018785476685, "learning_rate": 2.4559891299732495e-05, "loss": 0.1501, "step": 20672 }, { "epoch": 6.32008560073372, "grad_norm": 0.7208895087242126, "learning_rate": 2.4559466689312554e-05, "loss": 0.1587, "step": 20673 }, { "epoch": 6.320391317639865, "grad_norm": 1.7053261995315552, "learning_rate": 2.4559042078892616e-05, "loss": 0.1708, "step": 20674 }, { "epoch": 6.3206970345460105, "grad_norm": 2.1989877223968506, "learning_rate": 2.4558617468472675e-05, "loss": 0.2237, "step": 20675 }, { "epoch": 6.321002751452156, "grad_norm": 0.9788371324539185, "learning_rate": 2.4558192858052737e-05, "loss": 0.2243, "step": 20676 }, { "epoch": 6.3213084683583, "grad_norm": 0.4325160086154938, "learning_rate": 2.4557768247632796e-05, "loss": 0.1352, "step": 20677 }, { "epoch": 6.321614185264445, "grad_norm": 0.39763376116752625, "learning_rate": 2.4557343637212858e-05, "loss": 0.0599, "step": 20678 }, { "epoch": 6.32191990217059, "grad_norm": 0.20961564779281616, "learning_rate": 2.4556919026792917e-05, "loss": 0.0548, "step": 20679 }, { "epoch": 6.322225619076735, "grad_norm": 0.39249399304389954, "learning_rate": 2.455649441637298e-05, "loss": 0.0562, "step": 20680 }, { "epoch": 6.32253133598288, "grad_norm": 0.31461864709854126, "learning_rate": 2.4556069805953037e-05, "loss": 0.0523, "step": 20681 }, { "epoch": 6.3228370528890245, "grad_norm": 0.3155258297920227, "learning_rate": 2.4555645195533096e-05, "loss": 0.0548, "step": 20682 }, { "epoch": 6.32314276979517, "grad_norm": 0.1866508275270462, "learning_rate": 2.4555220585113158e-05, "loss": 0.0424, "step": 20683 }, { "epoch": 6.323448486701315, "grad_norm": 0.35070833563804626, "learning_rate": 2.4554795974693217e-05, "loss": 0.0538, "step": 20684 }, { "epoch": 6.32375420360746, "grad_norm": 0.9568943977355957, "learning_rate": 2.455437136427328e-05, "loss": 0.0782, "step": 20685 }, { "epoch": 6.324059920513604, "grad_norm": 0.45156583189964294, "learning_rate": 2.4553946753853338e-05, "loss": 0.0729, "step": 20686 }, { "epoch": 6.324365637419749, "grad_norm": 0.4691314697265625, "learning_rate": 2.45535221434334e-05, "loss": 0.1364, "step": 20687 }, { "epoch": 6.324671354325894, "grad_norm": 0.3992782235145569, "learning_rate": 2.455309753301346e-05, "loss": 0.076, "step": 20688 }, { "epoch": 6.3249770712320394, "grad_norm": 0.21582773327827454, "learning_rate": 2.455267292259352e-05, "loss": 0.0807, "step": 20689 }, { "epoch": 6.325282788138184, "grad_norm": 0.6696636080741882, "learning_rate": 2.455224831217358e-05, "loss": 0.114, "step": 20690 }, { "epoch": 6.325588505044329, "grad_norm": 0.43466562032699585, "learning_rate": 2.455182370175364e-05, "loss": 0.1076, "step": 20691 }, { "epoch": 6.325894221950474, "grad_norm": 0.4800623059272766, "learning_rate": 2.4551399091333703e-05, "loss": 0.1402, "step": 20692 }, { "epoch": 6.326199938856619, "grad_norm": 0.4984884560108185, "learning_rate": 2.4550974480913765e-05, "loss": 0.1244, "step": 20693 }, { "epoch": 6.326505655762764, "grad_norm": 0.730692446231842, "learning_rate": 2.4550549870493824e-05, "loss": 0.1488, "step": 20694 }, { "epoch": 6.326811372668908, "grad_norm": 0.5954033732414246, "learning_rate": 2.4550125260073883e-05, "loss": 0.1822, "step": 20695 }, { "epoch": 6.3271170895750535, "grad_norm": 0.6148799657821655, "learning_rate": 2.4549700649653945e-05, "loss": 0.1643, "step": 20696 }, { "epoch": 6.327422806481199, "grad_norm": 0.808568000793457, "learning_rate": 2.4549276039234004e-05, "loss": 0.1451, "step": 20697 }, { "epoch": 6.327728523387344, "grad_norm": 0.4807569086551666, "learning_rate": 2.4548851428814066e-05, "loss": 0.1708, "step": 20698 }, { "epoch": 6.328034240293488, "grad_norm": 0.930526614189148, "learning_rate": 2.4548426818394124e-05, "loss": 0.174, "step": 20699 }, { "epoch": 6.328339957199633, "grad_norm": 1.6664994955062866, "learning_rate": 2.4548002207974186e-05, "loss": 0.1877, "step": 20700 }, { "epoch": 6.328645674105778, "grad_norm": 0.9991376996040344, "learning_rate": 2.4547577597554245e-05, "loss": 0.2723, "step": 20701 }, { "epoch": 6.328951391011923, "grad_norm": 0.5056381821632385, "learning_rate": 2.4547152987134307e-05, "loss": 0.1233, "step": 20702 }, { "epoch": 6.3292571079180675, "grad_norm": 0.24575479328632355, "learning_rate": 2.4546728376714366e-05, "loss": 0.1002, "step": 20703 }, { "epoch": 6.329562824824213, "grad_norm": 0.712785005569458, "learning_rate": 2.4546303766294428e-05, "loss": 0.0603, "step": 20704 }, { "epoch": 6.329868541730358, "grad_norm": 0.26045116782188416, "learning_rate": 2.4545879155874487e-05, "loss": 0.0388, "step": 20705 }, { "epoch": 6.330174258636503, "grad_norm": 0.2323751002550125, "learning_rate": 2.454545454545455e-05, "loss": 0.0503, "step": 20706 }, { "epoch": 6.330479975542648, "grad_norm": 0.11217483133077621, "learning_rate": 2.4545029935034608e-05, "loss": 0.0521, "step": 20707 }, { "epoch": 6.330785692448792, "grad_norm": 0.42428502440452576, "learning_rate": 2.4544605324614666e-05, "loss": 0.063, "step": 20708 }, { "epoch": 6.331091409354937, "grad_norm": 0.16802555322647095, "learning_rate": 2.454418071419473e-05, "loss": 0.045, "step": 20709 }, { "epoch": 6.331397126261082, "grad_norm": 0.20309320092201233, "learning_rate": 2.4543756103774787e-05, "loss": 0.0779, "step": 20710 }, { "epoch": 6.3317028431672275, "grad_norm": 0.5917788743972778, "learning_rate": 2.454333149335485e-05, "loss": 0.0822, "step": 20711 }, { "epoch": 6.332008560073372, "grad_norm": 0.3148867189884186, "learning_rate": 2.4542906882934908e-05, "loss": 0.0767, "step": 20712 }, { "epoch": 6.332314276979517, "grad_norm": 1.3999415636062622, "learning_rate": 2.454248227251497e-05, "loss": 0.0968, "step": 20713 }, { "epoch": 6.332619993885662, "grad_norm": 0.7691846489906311, "learning_rate": 2.454205766209503e-05, "loss": 0.1202, "step": 20714 }, { "epoch": 6.332925710791807, "grad_norm": 0.3591379225254059, "learning_rate": 2.454163305167509e-05, "loss": 0.0949, "step": 20715 }, { "epoch": 6.333231427697951, "grad_norm": 0.4425278902053833, "learning_rate": 2.454120844125515e-05, "loss": 0.1215, "step": 20716 }, { "epoch": 6.3335371446040964, "grad_norm": 0.7344486117362976, "learning_rate": 2.454078383083521e-05, "loss": 0.1613, "step": 20717 }, { "epoch": 6.333842861510242, "grad_norm": 0.37918445467948914, "learning_rate": 2.454035922041527e-05, "loss": 0.1513, "step": 20718 }, { "epoch": 6.334148578416387, "grad_norm": 0.8392148613929749, "learning_rate": 2.4539934609995332e-05, "loss": 0.1305, "step": 20719 }, { "epoch": 6.334454295322532, "grad_norm": 0.9530242681503296, "learning_rate": 2.453950999957539e-05, "loss": 0.1377, "step": 20720 }, { "epoch": 6.334760012228676, "grad_norm": 0.8755111694335938, "learning_rate": 2.453908538915545e-05, "loss": 0.1379, "step": 20721 }, { "epoch": 6.335065729134821, "grad_norm": 2.222134590148926, "learning_rate": 2.4538660778735512e-05, "loss": 0.1687, "step": 20722 }, { "epoch": 6.335371446040966, "grad_norm": 1.2035216093063354, "learning_rate": 2.453823616831557e-05, "loss": 0.2013, "step": 20723 }, { "epoch": 6.335677162947111, "grad_norm": 3.0631656646728516, "learning_rate": 2.4537811557895633e-05, "loss": 0.1775, "step": 20724 }, { "epoch": 6.335982879853256, "grad_norm": 2.9048237800598145, "learning_rate": 2.453738694747569e-05, "loss": 0.1893, "step": 20725 }, { "epoch": 6.336288596759401, "grad_norm": 2.0024354457855225, "learning_rate": 2.4536962337055753e-05, "loss": 0.2678, "step": 20726 }, { "epoch": 6.336594313665546, "grad_norm": 0.4606305658817291, "learning_rate": 2.4536537726635812e-05, "loss": 0.1313, "step": 20727 }, { "epoch": 6.336900030571691, "grad_norm": 0.6858883500099182, "learning_rate": 2.4536113116215874e-05, "loss": 0.0613, "step": 20728 }, { "epoch": 6.337205747477835, "grad_norm": 0.252150297164917, "learning_rate": 2.4535688505795933e-05, "loss": 0.0663, "step": 20729 }, { "epoch": 6.33751146438398, "grad_norm": 0.29252558946609497, "learning_rate": 2.4535263895375995e-05, "loss": 0.0562, "step": 20730 }, { "epoch": 6.337817181290125, "grad_norm": 0.2432592362165451, "learning_rate": 2.4534839284956054e-05, "loss": 0.0591, "step": 20731 }, { "epoch": 6.3381228981962705, "grad_norm": 0.3308227062225342, "learning_rate": 2.4534414674536116e-05, "loss": 0.0514, "step": 20732 }, { "epoch": 6.338428615102416, "grad_norm": 0.4596160054206848, "learning_rate": 2.4533990064116174e-05, "loss": 0.0653, "step": 20733 }, { "epoch": 6.33873433200856, "grad_norm": 0.1775357723236084, "learning_rate": 2.4533565453696233e-05, "loss": 0.0389, "step": 20734 }, { "epoch": 6.339040048914705, "grad_norm": 0.301155149936676, "learning_rate": 2.4533140843276295e-05, "loss": 0.0656, "step": 20735 }, { "epoch": 6.33934576582085, "grad_norm": 1.173912763595581, "learning_rate": 2.4532716232856354e-05, "loss": 0.0759, "step": 20736 }, { "epoch": 6.339651482726995, "grad_norm": 0.6503285765647888, "learning_rate": 2.4532291622436416e-05, "loss": 0.0727, "step": 20737 }, { "epoch": 6.339957199633139, "grad_norm": 0.22250650823116302, "learning_rate": 2.4531867012016475e-05, "loss": 0.0965, "step": 20738 }, { "epoch": 6.3402629165392845, "grad_norm": 0.35069698095321655, "learning_rate": 2.4531442401596537e-05, "loss": 0.0936, "step": 20739 }, { "epoch": 6.34056863344543, "grad_norm": 0.8836945295333862, "learning_rate": 2.4531017791176595e-05, "loss": 0.143, "step": 20740 }, { "epoch": 6.340874350351575, "grad_norm": 0.894981324672699, "learning_rate": 2.4530593180756658e-05, "loss": 0.0915, "step": 20741 }, { "epoch": 6.341180067257719, "grad_norm": 0.35233303904533386, "learning_rate": 2.4530168570336716e-05, "loss": 0.134, "step": 20742 }, { "epoch": 6.341485784163864, "grad_norm": 1.5254148244857788, "learning_rate": 2.452974395991678e-05, "loss": 0.1609, "step": 20743 }, { "epoch": 6.341791501070009, "grad_norm": 2.1809418201446533, "learning_rate": 2.4529319349496837e-05, "loss": 0.1462, "step": 20744 }, { "epoch": 6.342097217976154, "grad_norm": 0.7249935269355774, "learning_rate": 2.45288947390769e-05, "loss": 0.1651, "step": 20745 }, { "epoch": 6.3424029348822994, "grad_norm": 0.6359952092170715, "learning_rate": 2.4528470128656958e-05, "loss": 0.1714, "step": 20746 }, { "epoch": 6.342708651788444, "grad_norm": 1.4797629117965698, "learning_rate": 2.4528045518237017e-05, "loss": 0.1768, "step": 20747 }, { "epoch": 6.343014368694589, "grad_norm": 0.7335125803947449, "learning_rate": 2.452762090781708e-05, "loss": 0.1703, "step": 20748 }, { "epoch": 6.343320085600734, "grad_norm": 3.1622724533081055, "learning_rate": 2.4527196297397137e-05, "loss": 0.1895, "step": 20749 }, { "epoch": 6.343625802506879, "grad_norm": 0.7952778935432434, "learning_rate": 2.45267716869772e-05, "loss": 0.2027, "step": 20750 }, { "epoch": 6.343931519413023, "grad_norm": 1.5136553049087524, "learning_rate": 2.4526347076557258e-05, "loss": 0.2534, "step": 20751 }, { "epoch": 6.344237236319168, "grad_norm": 0.5944880247116089, "learning_rate": 2.452592246613732e-05, "loss": 0.1391, "step": 20752 }, { "epoch": 6.3445429532253135, "grad_norm": 0.18037955462932587, "learning_rate": 2.452549785571738e-05, "loss": 0.0709, "step": 20753 }, { "epoch": 6.344848670131459, "grad_norm": 0.32461684942245483, "learning_rate": 2.452507324529744e-05, "loss": 0.1001, "step": 20754 }, { "epoch": 6.345154387037603, "grad_norm": 0.193599671125412, "learning_rate": 2.45246486348775e-05, "loss": 0.0488, "step": 20755 }, { "epoch": 6.345460103943748, "grad_norm": 0.9051923155784607, "learning_rate": 2.4524224024457562e-05, "loss": 0.0555, "step": 20756 }, { "epoch": 6.345765820849893, "grad_norm": 0.7588847279548645, "learning_rate": 2.452379941403762e-05, "loss": 0.0435, "step": 20757 }, { "epoch": 6.346071537756038, "grad_norm": 0.4420318901538849, "learning_rate": 2.4523374803617683e-05, "loss": 0.0526, "step": 20758 }, { "epoch": 6.346377254662183, "grad_norm": 0.8531930446624756, "learning_rate": 2.452295019319774e-05, "loss": 0.0436, "step": 20759 }, { "epoch": 6.3466829715683275, "grad_norm": 0.33859848976135254, "learning_rate": 2.45225255827778e-05, "loss": 0.079, "step": 20760 }, { "epoch": 6.346988688474473, "grad_norm": 1.8424781560897827, "learning_rate": 2.4522100972357862e-05, "loss": 0.053, "step": 20761 }, { "epoch": 6.347294405380618, "grad_norm": 0.42021453380584717, "learning_rate": 2.452167636193792e-05, "loss": 0.0854, "step": 20762 }, { "epoch": 6.347600122286763, "grad_norm": 0.7745544910430908, "learning_rate": 2.4521251751517983e-05, "loss": 0.094, "step": 20763 }, { "epoch": 6.347905839192907, "grad_norm": 0.6826174259185791, "learning_rate": 2.452082714109804e-05, "loss": 0.1036, "step": 20764 }, { "epoch": 6.348211556099052, "grad_norm": 0.41461485624313354, "learning_rate": 2.4520402530678104e-05, "loss": 0.1018, "step": 20765 }, { "epoch": 6.348517273005197, "grad_norm": 0.46224361658096313, "learning_rate": 2.4519977920258162e-05, "loss": 0.1255, "step": 20766 }, { "epoch": 6.348822989911342, "grad_norm": 0.7324396967887878, "learning_rate": 2.4519553309838224e-05, "loss": 0.1259, "step": 20767 }, { "epoch": 6.349128706817487, "grad_norm": 0.4478324055671692, "learning_rate": 2.4519128699418283e-05, "loss": 0.1419, "step": 20768 }, { "epoch": 6.349434423723632, "grad_norm": 1.0068215131759644, "learning_rate": 2.4518704088998345e-05, "loss": 0.1592, "step": 20769 }, { "epoch": 6.349740140629777, "grad_norm": 0.8977115750312805, "learning_rate": 2.4518279478578404e-05, "loss": 0.2058, "step": 20770 }, { "epoch": 6.350045857535922, "grad_norm": 0.8925197720527649, "learning_rate": 2.4517854868158466e-05, "loss": 0.167, "step": 20771 }, { "epoch": 6.350351574442067, "grad_norm": 0.7771360874176025, "learning_rate": 2.4517430257738525e-05, "loss": 0.1556, "step": 20772 }, { "epoch": 6.350657291348211, "grad_norm": 1.0169256925582886, "learning_rate": 2.4517005647318583e-05, "loss": 0.1699, "step": 20773 }, { "epoch": 6.3509630082543564, "grad_norm": 0.9958142042160034, "learning_rate": 2.4516581036898645e-05, "loss": 0.1904, "step": 20774 }, { "epoch": 6.351268725160502, "grad_norm": 1.563978672027588, "learning_rate": 2.4516156426478704e-05, "loss": 0.1852, "step": 20775 }, { "epoch": 6.351574442066647, "grad_norm": 1.5779340267181396, "learning_rate": 2.4515731816058766e-05, "loss": 0.2279, "step": 20776 }, { "epoch": 6.351880158972791, "grad_norm": 0.47913533449172974, "learning_rate": 2.4515307205638825e-05, "loss": 0.1481, "step": 20777 }, { "epoch": 6.352185875878936, "grad_norm": 0.29675450921058655, "learning_rate": 2.4514882595218887e-05, "loss": 0.0922, "step": 20778 }, { "epoch": 6.352491592785081, "grad_norm": 0.33241361379623413, "learning_rate": 2.4514457984798946e-05, "loss": 0.0605, "step": 20779 }, { "epoch": 6.352797309691226, "grad_norm": 0.2878945469856262, "learning_rate": 2.4514033374379008e-05, "loss": 0.0548, "step": 20780 }, { "epoch": 6.3531030265973705, "grad_norm": 0.5336966514587402, "learning_rate": 2.4513608763959067e-05, "loss": 0.0523, "step": 20781 }, { "epoch": 6.353408743503516, "grad_norm": 0.2831367254257202, "learning_rate": 2.451318415353913e-05, "loss": 0.0759, "step": 20782 }, { "epoch": 6.353714460409661, "grad_norm": 0.6436519026756287, "learning_rate": 2.4512759543119187e-05, "loss": 0.0522, "step": 20783 }, { "epoch": 6.354020177315806, "grad_norm": 0.4807453453540802, "learning_rate": 2.4512334932699246e-05, "loss": 0.0922, "step": 20784 }, { "epoch": 6.354325894221951, "grad_norm": 0.21853721141815186, "learning_rate": 2.4511910322279308e-05, "loss": 0.0617, "step": 20785 }, { "epoch": 6.354631611128095, "grad_norm": 0.25355514883995056, "learning_rate": 2.4511485711859367e-05, "loss": 0.0663, "step": 20786 }, { "epoch": 6.35493732803424, "grad_norm": 0.35892587900161743, "learning_rate": 2.451106110143943e-05, "loss": 0.087, "step": 20787 }, { "epoch": 6.355243044940385, "grad_norm": 0.2573871910572052, "learning_rate": 2.4510636491019488e-05, "loss": 0.0865, "step": 20788 }, { "epoch": 6.3555487618465305, "grad_norm": 0.444635808467865, "learning_rate": 2.451021188059955e-05, "loss": 0.09, "step": 20789 }, { "epoch": 6.355854478752675, "grad_norm": 0.5155428647994995, "learning_rate": 2.450978727017961e-05, "loss": 0.1107, "step": 20790 }, { "epoch": 6.35616019565882, "grad_norm": 1.15459406375885, "learning_rate": 2.450936265975967e-05, "loss": 0.1055, "step": 20791 }, { "epoch": 6.356465912564965, "grad_norm": 1.2904051542282104, "learning_rate": 2.450893804933973e-05, "loss": 0.1346, "step": 20792 }, { "epoch": 6.35677162947111, "grad_norm": 0.6227739453315735, "learning_rate": 2.450851343891979e-05, "loss": 0.1657, "step": 20793 }, { "epoch": 6.357077346377254, "grad_norm": 0.9869690537452698, "learning_rate": 2.450808882849985e-05, "loss": 0.1448, "step": 20794 }, { "epoch": 6.357383063283399, "grad_norm": 0.746314525604248, "learning_rate": 2.4507664218079912e-05, "loss": 0.1544, "step": 20795 }, { "epoch": 6.3576887801895445, "grad_norm": 0.612938404083252, "learning_rate": 2.4507239607659974e-05, "loss": 0.1968, "step": 20796 }, { "epoch": 6.35799449709569, "grad_norm": 1.0481189489364624, "learning_rate": 2.4506814997240033e-05, "loss": 0.1601, "step": 20797 }, { "epoch": 6.358300214001835, "grad_norm": 0.8710691332817078, "learning_rate": 2.4506390386820095e-05, "loss": 0.1592, "step": 20798 }, { "epoch": 6.358605930907979, "grad_norm": 0.7082776427268982, "learning_rate": 2.4505965776400154e-05, "loss": 0.1833, "step": 20799 }, { "epoch": 6.358911647814124, "grad_norm": 1.2541494369506836, "learning_rate": 2.4505541165980216e-05, "loss": 0.1659, "step": 20800 }, { "epoch": 6.359217364720269, "grad_norm": 2.1047730445861816, "learning_rate": 2.4505116555560274e-05, "loss": 0.2303, "step": 20801 }, { "epoch": 6.359523081626414, "grad_norm": 0.48106300830841064, "learning_rate": 2.4504691945140337e-05, "loss": 0.1571, "step": 20802 }, { "epoch": 6.359828798532559, "grad_norm": 0.7652716040611267, "learning_rate": 2.4504267334720395e-05, "loss": 0.0896, "step": 20803 }, { "epoch": 6.360134515438704, "grad_norm": 0.44108498096466064, "learning_rate": 2.4503842724300457e-05, "loss": 0.0902, "step": 20804 }, { "epoch": 6.360440232344849, "grad_norm": 0.14487072825431824, "learning_rate": 2.4503418113880516e-05, "loss": 0.0515, "step": 20805 }, { "epoch": 6.360745949250994, "grad_norm": 0.26723524928092957, "learning_rate": 2.4502993503460578e-05, "loss": 0.0784, "step": 20806 }, { "epoch": 6.361051666157138, "grad_norm": 0.6172659397125244, "learning_rate": 2.4502568893040637e-05, "loss": 0.0671, "step": 20807 }, { "epoch": 6.361357383063283, "grad_norm": 0.18334268033504486, "learning_rate": 2.45021442826207e-05, "loss": 0.0473, "step": 20808 }, { "epoch": 6.361663099969428, "grad_norm": 0.3601658046245575, "learning_rate": 2.4501719672200758e-05, "loss": 0.0653, "step": 20809 }, { "epoch": 6.3619688168755735, "grad_norm": 0.24478258192539215, "learning_rate": 2.4501295061780816e-05, "loss": 0.0625, "step": 20810 }, { "epoch": 6.362274533781719, "grad_norm": 0.1743299514055252, "learning_rate": 2.450087045136088e-05, "loss": 0.0646, "step": 20811 }, { "epoch": 6.362580250687863, "grad_norm": 0.201565682888031, "learning_rate": 2.4500445840940937e-05, "loss": 0.0732, "step": 20812 }, { "epoch": 6.362885967594008, "grad_norm": 1.6817764043807983, "learning_rate": 2.4500021230521e-05, "loss": 0.085, "step": 20813 }, { "epoch": 6.363191684500153, "grad_norm": 0.2865011692047119, "learning_rate": 2.4499596620101058e-05, "loss": 0.0955, "step": 20814 }, { "epoch": 6.363497401406298, "grad_norm": 1.1337571144104004, "learning_rate": 2.449917200968112e-05, "loss": 0.1188, "step": 20815 }, { "epoch": 6.363803118312442, "grad_norm": 0.7490277290344238, "learning_rate": 2.449874739926118e-05, "loss": 0.0948, "step": 20816 }, { "epoch": 6.3641088352185875, "grad_norm": 0.6772965788841248, "learning_rate": 2.449832278884124e-05, "loss": 0.1351, "step": 20817 }, { "epoch": 6.364414552124733, "grad_norm": 0.67130047082901, "learning_rate": 2.44978981784213e-05, "loss": 0.1329, "step": 20818 }, { "epoch": 6.364720269030878, "grad_norm": 0.5600557923316956, "learning_rate": 2.449747356800136e-05, "loss": 0.1373, "step": 20819 }, { "epoch": 6.365025985937022, "grad_norm": 0.7295796275138855, "learning_rate": 2.449704895758142e-05, "loss": 0.1858, "step": 20820 }, { "epoch": 6.365331702843167, "grad_norm": 0.5538411140441895, "learning_rate": 2.4496624347161482e-05, "loss": 0.1718, "step": 20821 }, { "epoch": 6.365637419749312, "grad_norm": 2.0557425022125244, "learning_rate": 2.449619973674154e-05, "loss": 0.1529, "step": 20822 }, { "epoch": 6.365943136655457, "grad_norm": 1.2598568201065063, "learning_rate": 2.44957751263216e-05, "loss": 0.18, "step": 20823 }, { "epoch": 6.366248853561602, "grad_norm": 1.1065735816955566, "learning_rate": 2.4495350515901662e-05, "loss": 0.1769, "step": 20824 }, { "epoch": 6.366554570467747, "grad_norm": 0.6158260107040405, "learning_rate": 2.449492590548172e-05, "loss": 0.1542, "step": 20825 }, { "epoch": 6.366860287373892, "grad_norm": 3.604067802429199, "learning_rate": 2.4494501295061783e-05, "loss": 0.2732, "step": 20826 }, { "epoch": 6.367166004280037, "grad_norm": 0.3545355796813965, "learning_rate": 2.449407668464184e-05, "loss": 0.1425, "step": 20827 }, { "epoch": 6.367471721186182, "grad_norm": 0.34290552139282227, "learning_rate": 2.4493652074221903e-05, "loss": 0.0969, "step": 20828 }, { "epoch": 6.367777438092326, "grad_norm": 0.5364081859588623, "learning_rate": 2.4493227463801962e-05, "loss": 0.059, "step": 20829 }, { "epoch": 6.368083154998471, "grad_norm": 0.22025887668132782, "learning_rate": 2.4492802853382024e-05, "loss": 0.0564, "step": 20830 }, { "epoch": 6.368388871904616, "grad_norm": 0.45138081908226013, "learning_rate": 2.4492378242962083e-05, "loss": 0.0575, "step": 20831 }, { "epoch": 6.368694588810762, "grad_norm": 0.29509344696998596, "learning_rate": 2.4491953632542145e-05, "loss": 0.0443, "step": 20832 }, { "epoch": 6.369000305716906, "grad_norm": 0.7519261837005615, "learning_rate": 2.4491529022122204e-05, "loss": 0.0554, "step": 20833 }, { "epoch": 6.369306022623051, "grad_norm": 0.2366943359375, "learning_rate": 2.4491104411702266e-05, "loss": 0.0538, "step": 20834 }, { "epoch": 6.369611739529196, "grad_norm": 0.7248708009719849, "learning_rate": 2.4490679801282324e-05, "loss": 0.0641, "step": 20835 }, { "epoch": 6.369917456435341, "grad_norm": 0.4046768248081207, "learning_rate": 2.4490255190862383e-05, "loss": 0.0663, "step": 20836 }, { "epoch": 6.370223173341486, "grad_norm": 0.6326362490653992, "learning_rate": 2.4489830580442445e-05, "loss": 0.0822, "step": 20837 }, { "epoch": 6.3705288902476305, "grad_norm": 0.6660568118095398, "learning_rate": 2.4489405970022504e-05, "loss": 0.0764, "step": 20838 }, { "epoch": 6.370834607153776, "grad_norm": 0.5228462815284729, "learning_rate": 2.4488981359602566e-05, "loss": 0.0939, "step": 20839 }, { "epoch": 6.371140324059921, "grad_norm": 1.2988239526748657, "learning_rate": 2.4488556749182625e-05, "loss": 0.139, "step": 20840 }, { "epoch": 6.371446040966066, "grad_norm": 0.5169496536254883, "learning_rate": 2.4488132138762687e-05, "loss": 0.1294, "step": 20841 }, { "epoch": 6.37175175787221, "grad_norm": 0.9262478351593018, "learning_rate": 2.4487707528342745e-05, "loss": 0.1359, "step": 20842 }, { "epoch": 6.372057474778355, "grad_norm": 0.42597657442092896, "learning_rate": 2.4487282917922808e-05, "loss": 0.1927, "step": 20843 }, { "epoch": 6.3723631916845, "grad_norm": 0.8654471039772034, "learning_rate": 2.4486858307502866e-05, "loss": 0.1342, "step": 20844 }, { "epoch": 6.372668908590645, "grad_norm": 0.9644010663032532, "learning_rate": 2.448643369708293e-05, "loss": 0.1697, "step": 20845 }, { "epoch": 6.37297462549679, "grad_norm": 0.9850157499313354, "learning_rate": 2.4486009086662987e-05, "loss": 0.2122, "step": 20846 }, { "epoch": 6.373280342402935, "grad_norm": 0.9532856941223145, "learning_rate": 2.448558447624305e-05, "loss": 0.1656, "step": 20847 }, { "epoch": 6.37358605930908, "grad_norm": 0.9923420548439026, "learning_rate": 2.4485159865823108e-05, "loss": 0.1798, "step": 20848 }, { "epoch": 6.373891776215225, "grad_norm": 0.9938650727272034, "learning_rate": 2.4484735255403167e-05, "loss": 0.1582, "step": 20849 }, { "epoch": 6.37419749312137, "grad_norm": 0.8436753749847412, "learning_rate": 2.448431064498323e-05, "loss": 0.1749, "step": 20850 }, { "epoch": 6.374503210027514, "grad_norm": 1.0561366081237793, "learning_rate": 2.4483886034563287e-05, "loss": 0.2199, "step": 20851 }, { "epoch": 6.374808926933659, "grad_norm": 0.6752643585205078, "learning_rate": 2.448346142414335e-05, "loss": 0.1148, "step": 20852 }, { "epoch": 6.3751146438398045, "grad_norm": 0.38259372115135193, "learning_rate": 2.4483036813723408e-05, "loss": 0.0711, "step": 20853 }, { "epoch": 6.37542036074595, "grad_norm": 0.8574398159980774, "learning_rate": 2.448261220330347e-05, "loss": 0.0559, "step": 20854 }, { "epoch": 6.375726077652094, "grad_norm": 0.6379141807556152, "learning_rate": 2.448218759288353e-05, "loss": 0.0487, "step": 20855 }, { "epoch": 6.376031794558239, "grad_norm": 0.16213653981685638, "learning_rate": 2.448176298246359e-05, "loss": 0.0311, "step": 20856 }, { "epoch": 6.376337511464384, "grad_norm": 0.5738499164581299, "learning_rate": 2.448133837204365e-05, "loss": 0.0858, "step": 20857 }, { "epoch": 6.376643228370529, "grad_norm": 0.48762479424476624, "learning_rate": 2.4480913761623712e-05, "loss": 0.0511, "step": 20858 }, { "epoch": 6.376948945276673, "grad_norm": 0.19697903096675873, "learning_rate": 2.448048915120377e-05, "loss": 0.0414, "step": 20859 }, { "epoch": 6.377254662182819, "grad_norm": 0.3272898197174072, "learning_rate": 2.4480064540783833e-05, "loss": 0.068, "step": 20860 }, { "epoch": 6.377560379088964, "grad_norm": 0.4670545160770416, "learning_rate": 2.447963993036389e-05, "loss": 0.0515, "step": 20861 }, { "epoch": 6.377866095995109, "grad_norm": 0.4414689540863037, "learning_rate": 2.447921531994395e-05, "loss": 0.105, "step": 20862 }, { "epoch": 6.378171812901254, "grad_norm": 0.2731935977935791, "learning_rate": 2.4478790709524012e-05, "loss": 0.0802, "step": 20863 }, { "epoch": 6.378477529807398, "grad_norm": 0.3282011151313782, "learning_rate": 2.447836609910407e-05, "loss": 0.0784, "step": 20864 }, { "epoch": 6.378783246713543, "grad_norm": 2.73058819770813, "learning_rate": 2.4477941488684133e-05, "loss": 0.0896, "step": 20865 }, { "epoch": 6.379088963619688, "grad_norm": 0.6785876750946045, "learning_rate": 2.447751687826419e-05, "loss": 0.127, "step": 20866 }, { "epoch": 6.3793946805258335, "grad_norm": 0.46412205696105957, "learning_rate": 2.4477092267844254e-05, "loss": 0.1528, "step": 20867 }, { "epoch": 6.379700397431978, "grad_norm": 0.7207449674606323, "learning_rate": 2.4476667657424312e-05, "loss": 0.1521, "step": 20868 }, { "epoch": 6.380006114338123, "grad_norm": 0.7529546618461609, "learning_rate": 2.4476243047004374e-05, "loss": 0.1508, "step": 20869 }, { "epoch": 6.380311831244268, "grad_norm": 1.1604433059692383, "learning_rate": 2.4475818436584433e-05, "loss": 0.1871, "step": 20870 }, { "epoch": 6.380617548150413, "grad_norm": 0.6885167956352234, "learning_rate": 2.4475393826164495e-05, "loss": 0.1978, "step": 20871 }, { "epoch": 6.380923265056557, "grad_norm": 1.1538333892822266, "learning_rate": 2.4474969215744554e-05, "loss": 0.1497, "step": 20872 }, { "epoch": 6.381228981962702, "grad_norm": 0.6469218730926514, "learning_rate": 2.4474544605324616e-05, "loss": 0.1705, "step": 20873 }, { "epoch": 6.3815346988688475, "grad_norm": 0.9707721471786499, "learning_rate": 2.4474119994904675e-05, "loss": 0.1643, "step": 20874 }, { "epoch": 6.381840415774993, "grad_norm": 0.6740341782569885, "learning_rate": 2.4473695384484733e-05, "loss": 0.2088, "step": 20875 }, { "epoch": 6.382146132681138, "grad_norm": 4.004771709442139, "learning_rate": 2.4473270774064796e-05, "loss": 0.2795, "step": 20876 }, { "epoch": 6.382451849587282, "grad_norm": 0.9554669857025146, "learning_rate": 2.4472846163644854e-05, "loss": 0.124, "step": 20877 }, { "epoch": 6.382757566493427, "grad_norm": 0.22563211619853973, "learning_rate": 2.4472421553224916e-05, "loss": 0.0717, "step": 20878 }, { "epoch": 6.383063283399572, "grad_norm": 0.4400072395801544, "learning_rate": 2.4471996942804975e-05, "loss": 0.0517, "step": 20879 }, { "epoch": 6.383369000305717, "grad_norm": 0.22288772463798523, "learning_rate": 2.4471572332385037e-05, "loss": 0.0577, "step": 20880 }, { "epoch": 6.3836747172118615, "grad_norm": 0.6335301995277405, "learning_rate": 2.4471147721965096e-05, "loss": 0.0533, "step": 20881 }, { "epoch": 6.383980434118007, "grad_norm": 0.29078352451324463, "learning_rate": 2.4470723111545158e-05, "loss": 0.0641, "step": 20882 }, { "epoch": 6.384286151024152, "grad_norm": 0.9629629850387573, "learning_rate": 2.4470298501125217e-05, "loss": 0.0722, "step": 20883 }, { "epoch": 6.384591867930297, "grad_norm": 0.15762904286384583, "learning_rate": 2.446987389070528e-05, "loss": 0.0444, "step": 20884 }, { "epoch": 6.384897584836441, "grad_norm": 0.3288898766040802, "learning_rate": 2.4469449280285337e-05, "loss": 0.0728, "step": 20885 }, { "epoch": 6.385203301742586, "grad_norm": 0.2522473633289337, "learning_rate": 2.44690246698654e-05, "loss": 0.0525, "step": 20886 }, { "epoch": 6.385509018648731, "grad_norm": 0.40743783116340637, "learning_rate": 2.4468600059445458e-05, "loss": 0.0982, "step": 20887 }, { "epoch": 6.385814735554876, "grad_norm": 1.1068310737609863, "learning_rate": 2.4468175449025517e-05, "loss": 0.0771, "step": 20888 }, { "epoch": 6.386120452461022, "grad_norm": 0.2642365097999573, "learning_rate": 2.446775083860558e-05, "loss": 0.1148, "step": 20889 }, { "epoch": 6.386426169367166, "grad_norm": 0.4598744809627533, "learning_rate": 2.4467326228185638e-05, "loss": 0.0833, "step": 20890 }, { "epoch": 6.386731886273311, "grad_norm": 0.8053269386291504, "learning_rate": 2.44669016177657e-05, "loss": 0.1197, "step": 20891 }, { "epoch": 6.387037603179456, "grad_norm": 0.7563912868499756, "learning_rate": 2.446647700734576e-05, "loss": 0.1323, "step": 20892 }, { "epoch": 6.387343320085601, "grad_norm": 0.40227070450782776, "learning_rate": 2.446605239692582e-05, "loss": 0.1708, "step": 20893 }, { "epoch": 6.387649036991745, "grad_norm": 1.0295498371124268, "learning_rate": 2.446562778650588e-05, "loss": 0.1564, "step": 20894 }, { "epoch": 6.3879547538978905, "grad_norm": 0.5338943600654602, "learning_rate": 2.446520317608594e-05, "loss": 0.1285, "step": 20895 }, { "epoch": 6.388260470804036, "grad_norm": 0.6048856973648071, "learning_rate": 2.4464778565666e-05, "loss": 0.1666, "step": 20896 }, { "epoch": 6.388566187710181, "grad_norm": 0.7623575329780579, "learning_rate": 2.4464353955246062e-05, "loss": 0.1865, "step": 20897 }, { "epoch": 6.388871904616325, "grad_norm": 1.2240349054336548, "learning_rate": 2.4463929344826124e-05, "loss": 0.1771, "step": 20898 }, { "epoch": 6.38917762152247, "grad_norm": 0.5403316020965576, "learning_rate": 2.4463504734406183e-05, "loss": 0.1766, "step": 20899 }, { "epoch": 6.389483338428615, "grad_norm": 0.9294285178184509, "learning_rate": 2.4463080123986245e-05, "loss": 0.1696, "step": 20900 }, { "epoch": 6.38978905533476, "grad_norm": 1.5928575992584229, "learning_rate": 2.4462655513566304e-05, "loss": 0.2068, "step": 20901 }, { "epoch": 6.390094772240905, "grad_norm": 0.4389861226081848, "learning_rate": 2.4462230903146366e-05, "loss": 0.1273, "step": 20902 }, { "epoch": 6.39040048914705, "grad_norm": 0.18371862173080444, "learning_rate": 2.4461806292726424e-05, "loss": 0.0728, "step": 20903 }, { "epoch": 6.390706206053195, "grad_norm": 0.4088149964809418, "learning_rate": 2.4461381682306487e-05, "loss": 0.0448, "step": 20904 }, { "epoch": 6.39101192295934, "grad_norm": 0.2643793225288391, "learning_rate": 2.4460957071886545e-05, "loss": 0.0601, "step": 20905 }, { "epoch": 6.391317639865485, "grad_norm": 0.1651867777109146, "learning_rate": 2.4460532461466607e-05, "loss": 0.0556, "step": 20906 }, { "epoch": 6.391623356771629, "grad_norm": 0.24067145586013794, "learning_rate": 2.4460107851046666e-05, "loss": 0.0343, "step": 20907 }, { "epoch": 6.391929073677774, "grad_norm": 0.30842170119285583, "learning_rate": 2.4459683240626728e-05, "loss": 0.0743, "step": 20908 }, { "epoch": 6.392234790583919, "grad_norm": 0.2775842845439911, "learning_rate": 2.4459258630206787e-05, "loss": 0.0466, "step": 20909 }, { "epoch": 6.3925405074900645, "grad_norm": 1.0951910018920898, "learning_rate": 2.445883401978685e-05, "loss": 0.0737, "step": 20910 }, { "epoch": 6.392846224396209, "grad_norm": 0.43480005860328674, "learning_rate": 2.4458409409366908e-05, "loss": 0.0642, "step": 20911 }, { "epoch": 6.393151941302354, "grad_norm": 0.6323080062866211, "learning_rate": 2.4457984798946966e-05, "loss": 0.1166, "step": 20912 }, { "epoch": 6.393457658208499, "grad_norm": 0.2505916655063629, "learning_rate": 2.445756018852703e-05, "loss": 0.0707, "step": 20913 }, { "epoch": 6.393763375114644, "grad_norm": 0.2705225944519043, "learning_rate": 2.4457135578107087e-05, "loss": 0.0784, "step": 20914 }, { "epoch": 6.394069092020789, "grad_norm": 0.4187723696231842, "learning_rate": 2.445671096768715e-05, "loss": 0.1164, "step": 20915 }, { "epoch": 6.394374808926933, "grad_norm": 0.5521932244300842, "learning_rate": 2.4456286357267208e-05, "loss": 0.1174, "step": 20916 }, { "epoch": 6.3946805258330786, "grad_norm": 0.820152223110199, "learning_rate": 2.445586174684727e-05, "loss": 0.1305, "step": 20917 }, { "epoch": 6.394986242739224, "grad_norm": 0.5626426935195923, "learning_rate": 2.445543713642733e-05, "loss": 0.1485, "step": 20918 }, { "epoch": 6.395291959645369, "grad_norm": 0.4893423020839691, "learning_rate": 2.445501252600739e-05, "loss": 0.1516, "step": 20919 }, { "epoch": 6.395597676551513, "grad_norm": 1.0744307041168213, "learning_rate": 2.445458791558745e-05, "loss": 0.1577, "step": 20920 }, { "epoch": 6.395903393457658, "grad_norm": 1.0428416728973389, "learning_rate": 2.445416330516751e-05, "loss": 0.1917, "step": 20921 }, { "epoch": 6.396209110363803, "grad_norm": 0.792060911655426, "learning_rate": 2.445373869474757e-05, "loss": 0.1863, "step": 20922 }, { "epoch": 6.396514827269948, "grad_norm": 0.6615444421768188, "learning_rate": 2.4453314084327632e-05, "loss": 0.1877, "step": 20923 }, { "epoch": 6.396820544176093, "grad_norm": 3.0174639225006104, "learning_rate": 2.445288947390769e-05, "loss": 0.1908, "step": 20924 }, { "epoch": 6.397126261082238, "grad_norm": 2.803884744644165, "learning_rate": 2.445246486348775e-05, "loss": 0.2222, "step": 20925 }, { "epoch": 6.397431977988383, "grad_norm": 1.2058576345443726, "learning_rate": 2.4452040253067812e-05, "loss": 0.2077, "step": 20926 }, { "epoch": 6.397737694894528, "grad_norm": 0.34948837757110596, "learning_rate": 2.445161564264787e-05, "loss": 0.123, "step": 20927 }, { "epoch": 6.398043411800673, "grad_norm": 0.5455261468887329, "learning_rate": 2.4451191032227933e-05, "loss": 0.0708, "step": 20928 }, { "epoch": 6.398349128706817, "grad_norm": 0.3028753697872162, "learning_rate": 2.445076642180799e-05, "loss": 0.0672, "step": 20929 }, { "epoch": 6.398654845612962, "grad_norm": 0.3303554952144623, "learning_rate": 2.4450341811388053e-05, "loss": 0.0651, "step": 20930 }, { "epoch": 6.3989605625191075, "grad_norm": 0.1941302865743637, "learning_rate": 2.4449917200968112e-05, "loss": 0.054, "step": 20931 }, { "epoch": 6.399266279425253, "grad_norm": 0.33486422896385193, "learning_rate": 2.4449492590548174e-05, "loss": 0.0622, "step": 20932 }, { "epoch": 6.399571996331397, "grad_norm": 2.237304449081421, "learning_rate": 2.4449067980128233e-05, "loss": 0.0391, "step": 20933 }, { "epoch": 6.399877713237542, "grad_norm": 0.2167193442583084, "learning_rate": 2.4448643369708295e-05, "loss": 0.0515, "step": 20934 }, { "epoch": 6.400183430143687, "grad_norm": 0.7899556159973145, "learning_rate": 2.4448218759288354e-05, "loss": 0.0855, "step": 20935 }, { "epoch": 6.400489147049832, "grad_norm": 0.5321210026741028, "learning_rate": 2.4447794148868416e-05, "loss": 0.0595, "step": 20936 }, { "epoch": 6.400794863955976, "grad_norm": 0.4954352080821991, "learning_rate": 2.4447369538448474e-05, "loss": 0.0978, "step": 20937 }, { "epoch": 6.4011005808621215, "grad_norm": 0.8271102905273438, "learning_rate": 2.4446944928028533e-05, "loss": 0.107, "step": 20938 }, { "epoch": 6.401406297768267, "grad_norm": 0.23471355438232422, "learning_rate": 2.4446520317608595e-05, "loss": 0.0693, "step": 20939 }, { "epoch": 6.401712014674412, "grad_norm": 0.5535313487052917, "learning_rate": 2.4446095707188654e-05, "loss": 0.1064, "step": 20940 }, { "epoch": 6.402017731580557, "grad_norm": 0.9927694201469421, "learning_rate": 2.4445671096768716e-05, "loss": 0.1368, "step": 20941 }, { "epoch": 6.402323448486701, "grad_norm": 0.5947354435920715, "learning_rate": 2.4445246486348775e-05, "loss": 0.1348, "step": 20942 }, { "epoch": 6.402629165392846, "grad_norm": 0.7741236090660095, "learning_rate": 2.4444821875928837e-05, "loss": 0.1458, "step": 20943 }, { "epoch": 6.402934882298991, "grad_norm": 1.5347063541412354, "learning_rate": 2.4444397265508896e-05, "loss": 0.1626, "step": 20944 }, { "epoch": 6.403240599205136, "grad_norm": 0.4697580635547638, "learning_rate": 2.4443972655088958e-05, "loss": 0.1674, "step": 20945 }, { "epoch": 6.403546316111281, "grad_norm": 0.8594868183135986, "learning_rate": 2.4443548044669016e-05, "loss": 0.1731, "step": 20946 }, { "epoch": 6.403852033017426, "grad_norm": 0.8566654324531555, "learning_rate": 2.444312343424908e-05, "loss": 0.1829, "step": 20947 }, { "epoch": 6.404157749923571, "grad_norm": 0.840917706489563, "learning_rate": 2.4442698823829137e-05, "loss": 0.1875, "step": 20948 }, { "epoch": 6.404463466829716, "grad_norm": 2.012878179550171, "learning_rate": 2.44422742134092e-05, "loss": 0.178, "step": 20949 }, { "epoch": 6.40476918373586, "grad_norm": 0.996599555015564, "learning_rate": 2.4441849602989258e-05, "loss": 0.1991, "step": 20950 }, { "epoch": 6.405074900642005, "grad_norm": 2.073925733566284, "learning_rate": 2.4441424992569317e-05, "loss": 0.2642, "step": 20951 }, { "epoch": 6.4053806175481505, "grad_norm": 0.4770854711532593, "learning_rate": 2.444100038214938e-05, "loss": 0.124, "step": 20952 }, { "epoch": 6.405686334454296, "grad_norm": 1.6834198236465454, "learning_rate": 2.4440575771729437e-05, "loss": 0.0926, "step": 20953 }, { "epoch": 6.405992051360441, "grad_norm": 1.2110487222671509, "learning_rate": 2.44401511613095e-05, "loss": 0.0647, "step": 20954 }, { "epoch": 6.406297768266585, "grad_norm": 0.8055396676063538, "learning_rate": 2.4439726550889558e-05, "loss": 0.0705, "step": 20955 }, { "epoch": 6.40660348517273, "grad_norm": 0.2481374591588974, "learning_rate": 2.443930194046962e-05, "loss": 0.0478, "step": 20956 }, { "epoch": 6.406909202078875, "grad_norm": 0.3532992899417877, "learning_rate": 2.443887733004968e-05, "loss": 0.0529, "step": 20957 }, { "epoch": 6.40721491898502, "grad_norm": 0.40429946780204773, "learning_rate": 2.443845271962974e-05, "loss": 0.0537, "step": 20958 }, { "epoch": 6.4075206358911645, "grad_norm": 0.8092073798179626, "learning_rate": 2.44380281092098e-05, "loss": 0.071, "step": 20959 }, { "epoch": 6.40782635279731, "grad_norm": 0.25486835837364197, "learning_rate": 2.4437603498789862e-05, "loss": 0.0741, "step": 20960 }, { "epoch": 6.408132069703455, "grad_norm": 0.24176417291164398, "learning_rate": 2.443717888836992e-05, "loss": 0.0421, "step": 20961 }, { "epoch": 6.4084377866096, "grad_norm": 1.0445218086242676, "learning_rate": 2.4436754277949983e-05, "loss": 0.087, "step": 20962 }, { "epoch": 6.408743503515744, "grad_norm": 0.25016719102859497, "learning_rate": 2.443632966753004e-05, "loss": 0.0715, "step": 20963 }, { "epoch": 6.409049220421889, "grad_norm": 0.5545873045921326, "learning_rate": 2.44359050571101e-05, "loss": 0.0758, "step": 20964 }, { "epoch": 6.409354937328034, "grad_norm": 0.4359446167945862, "learning_rate": 2.4435480446690162e-05, "loss": 0.1114, "step": 20965 }, { "epoch": 6.409660654234179, "grad_norm": 0.9792553186416626, "learning_rate": 2.443505583627022e-05, "loss": 0.1338, "step": 20966 }, { "epoch": 6.4099663711403245, "grad_norm": 1.0446492433547974, "learning_rate": 2.4434631225850283e-05, "loss": 0.108, "step": 20967 }, { "epoch": 6.410272088046469, "grad_norm": 0.6228029131889343, "learning_rate": 2.443420661543034e-05, "loss": 0.1793, "step": 20968 }, { "epoch": 6.410577804952614, "grad_norm": 0.4919849932193756, "learning_rate": 2.4433782005010404e-05, "loss": 0.1443, "step": 20969 }, { "epoch": 6.410883521858759, "grad_norm": 0.7031140923500061, "learning_rate": 2.4433357394590462e-05, "loss": 0.1481, "step": 20970 }, { "epoch": 6.411189238764904, "grad_norm": 0.589214026927948, "learning_rate": 2.4432932784170524e-05, "loss": 0.151, "step": 20971 }, { "epoch": 6.411494955671048, "grad_norm": 0.8298421502113342, "learning_rate": 2.4432508173750583e-05, "loss": 0.1722, "step": 20972 }, { "epoch": 6.411800672577193, "grad_norm": 0.6949973702430725, "learning_rate": 2.4432083563330645e-05, "loss": 0.1682, "step": 20973 }, { "epoch": 6.4121063894833386, "grad_norm": 1.1325324773788452, "learning_rate": 2.4431658952910704e-05, "loss": 0.1936, "step": 20974 }, { "epoch": 6.412412106389484, "grad_norm": 0.7531071901321411, "learning_rate": 2.4431234342490766e-05, "loss": 0.1696, "step": 20975 }, { "epoch": 6.412717823295628, "grad_norm": 1.4907678365707397, "learning_rate": 2.4430809732070825e-05, "loss": 0.233, "step": 20976 }, { "epoch": 6.413023540201773, "grad_norm": 0.4583301544189453, "learning_rate": 2.4430385121650883e-05, "loss": 0.1811, "step": 20977 }, { "epoch": 6.413329257107918, "grad_norm": 0.3427920639514923, "learning_rate": 2.4429960511230946e-05, "loss": 0.0689, "step": 20978 }, { "epoch": 6.413634974014063, "grad_norm": 1.3025399446487427, "learning_rate": 2.4429535900811004e-05, "loss": 0.0815, "step": 20979 }, { "epoch": 6.413940690920208, "grad_norm": 0.5231608748435974, "learning_rate": 2.4429111290391066e-05, "loss": 0.04, "step": 20980 }, { "epoch": 6.414246407826353, "grad_norm": 0.14207690954208374, "learning_rate": 2.4428686679971125e-05, "loss": 0.0351, "step": 20981 }, { "epoch": 6.414552124732498, "grad_norm": 0.24199821054935455, "learning_rate": 2.4428262069551187e-05, "loss": 0.0422, "step": 20982 }, { "epoch": 6.414857841638643, "grad_norm": 0.1719154715538025, "learning_rate": 2.4427837459131246e-05, "loss": 0.0448, "step": 20983 }, { "epoch": 6.415163558544788, "grad_norm": 0.20321805775165558, "learning_rate": 2.4427412848711308e-05, "loss": 0.069, "step": 20984 }, { "epoch": 6.415469275450932, "grad_norm": 0.29996562004089355, "learning_rate": 2.4426988238291367e-05, "loss": 0.0777, "step": 20985 }, { "epoch": 6.415774992357077, "grad_norm": 0.86492919921875, "learning_rate": 2.442656362787143e-05, "loss": 0.0745, "step": 20986 }, { "epoch": 6.416080709263222, "grad_norm": 1.0349539518356323, "learning_rate": 2.4426139017451487e-05, "loss": 0.0628, "step": 20987 }, { "epoch": 6.4163864261693675, "grad_norm": 0.4135187566280365, "learning_rate": 2.442571440703155e-05, "loss": 0.0719, "step": 20988 }, { "epoch": 6.416692143075512, "grad_norm": 0.5115470290184021, "learning_rate": 2.4425289796611608e-05, "loss": 0.1258, "step": 20989 }, { "epoch": 6.416997859981657, "grad_norm": 1.1505101919174194, "learning_rate": 2.4424865186191667e-05, "loss": 0.0993, "step": 20990 }, { "epoch": 6.417303576887802, "grad_norm": 0.3594244718551636, "learning_rate": 2.442444057577173e-05, "loss": 0.124, "step": 20991 }, { "epoch": 6.417609293793947, "grad_norm": 0.3739723861217499, "learning_rate": 2.4424015965351788e-05, "loss": 0.1516, "step": 20992 }, { "epoch": 6.417915010700092, "grad_norm": 0.8124770522117615, "learning_rate": 2.442359135493185e-05, "loss": 0.1643, "step": 20993 }, { "epoch": 6.418220727606236, "grad_norm": 3.407886266708374, "learning_rate": 2.442316674451191e-05, "loss": 0.139, "step": 20994 }, { "epoch": 6.4185264445123815, "grad_norm": 0.5973469614982605, "learning_rate": 2.442274213409197e-05, "loss": 0.1647, "step": 20995 }, { "epoch": 6.418832161418527, "grad_norm": 0.8993066549301147, "learning_rate": 2.442231752367203e-05, "loss": 0.1798, "step": 20996 }, { "epoch": 6.419137878324672, "grad_norm": 0.6276899576187134, "learning_rate": 2.442189291325209e-05, "loss": 0.1815, "step": 20997 }, { "epoch": 6.419443595230816, "grad_norm": 0.9851887822151184, "learning_rate": 2.442146830283215e-05, "loss": 0.1989, "step": 20998 }, { "epoch": 6.419749312136961, "grad_norm": 1.0412907600402832, "learning_rate": 2.4421043692412212e-05, "loss": 0.1803, "step": 20999 }, { "epoch": 6.420055029043106, "grad_norm": 0.9052855372428894, "learning_rate": 2.4420619081992274e-05, "loss": 0.1528, "step": 21000 }, { "epoch": 6.420055029043106, "eval_cer": 0.18867358406162374, "eval_loss": 0.2342015504837036, "eval_runtime": 18.9244, "eval_samples_per_second": 239.796, "eval_steps_per_second": 0.793, "eval_wer": 0.3315829166783051, "step": 21000 }, { "epoch": 6.420360745949251, "grad_norm": 0.900945246219635, "learning_rate": 2.4420194471572336e-05, "loss": 0.2155, "step": 21001 }, { "epoch": 6.4206664628553956, "grad_norm": 0.8385405540466309, "learning_rate": 2.4419769861152395e-05, "loss": 0.1264, "step": 21002 }, { "epoch": 6.420972179761541, "grad_norm": 0.34769541025161743, "learning_rate": 2.4419345250732454e-05, "loss": 0.0706, "step": 21003 }, { "epoch": 6.421277896667686, "grad_norm": 0.6061612367630005, "learning_rate": 2.4418920640312516e-05, "loss": 0.0633, "step": 21004 }, { "epoch": 6.421583613573831, "grad_norm": 0.1728285253047943, "learning_rate": 2.4418496029892574e-05, "loss": 0.0605, "step": 21005 }, { "epoch": 6.421889330479976, "grad_norm": 0.24451500177383423, "learning_rate": 2.4418071419472637e-05, "loss": 0.0472, "step": 21006 }, { "epoch": 6.42219504738612, "grad_norm": 0.5165807604789734, "learning_rate": 2.4417646809052695e-05, "loss": 0.0505, "step": 21007 }, { "epoch": 6.422500764292265, "grad_norm": 0.644522488117218, "learning_rate": 2.4417222198632757e-05, "loss": 0.0682, "step": 21008 }, { "epoch": 6.4228064811984105, "grad_norm": 0.21140913665294647, "learning_rate": 2.4416797588212816e-05, "loss": 0.0474, "step": 21009 }, { "epoch": 6.423112198104556, "grad_norm": 0.7118785381317139, "learning_rate": 2.4416372977792878e-05, "loss": 0.0784, "step": 21010 }, { "epoch": 6.4234179150107, "grad_norm": 0.6465210914611816, "learning_rate": 2.4415948367372937e-05, "loss": 0.0642, "step": 21011 }, { "epoch": 6.423723631916845, "grad_norm": 0.4150407910346985, "learning_rate": 2.4415523756953e-05, "loss": 0.079, "step": 21012 }, { "epoch": 6.42402934882299, "grad_norm": 0.5677318572998047, "learning_rate": 2.4415099146533058e-05, "loss": 0.0764, "step": 21013 }, { "epoch": 6.424335065729135, "grad_norm": 0.3003270924091339, "learning_rate": 2.4414674536113116e-05, "loss": 0.1021, "step": 21014 }, { "epoch": 6.424640782635279, "grad_norm": 2.3295013904571533, "learning_rate": 2.441424992569318e-05, "loss": 0.1211, "step": 21015 }, { "epoch": 6.4249464995414245, "grad_norm": 0.8933654427528381, "learning_rate": 2.4413825315273237e-05, "loss": 0.1065, "step": 21016 }, { "epoch": 6.42525221644757, "grad_norm": 0.7383325099945068, "learning_rate": 2.44134007048533e-05, "loss": 0.1465, "step": 21017 }, { "epoch": 6.425557933353715, "grad_norm": 0.4500420391559601, "learning_rate": 2.4412976094433358e-05, "loss": 0.1624, "step": 21018 }, { "epoch": 6.42586365025986, "grad_norm": 0.5502668619155884, "learning_rate": 2.441255148401342e-05, "loss": 0.1199, "step": 21019 }, { "epoch": 6.426169367166004, "grad_norm": 0.8627225756645203, "learning_rate": 2.441212687359348e-05, "loss": 0.156, "step": 21020 }, { "epoch": 6.426475084072149, "grad_norm": 0.7447921633720398, "learning_rate": 2.441170226317354e-05, "loss": 0.1833, "step": 21021 }, { "epoch": 6.426780800978294, "grad_norm": 0.45281049609184265, "learning_rate": 2.44112776527536e-05, "loss": 0.1595, "step": 21022 }, { "epoch": 6.427086517884439, "grad_norm": 0.952216625213623, "learning_rate": 2.441085304233366e-05, "loss": 0.1702, "step": 21023 }, { "epoch": 6.427392234790584, "grad_norm": 0.7891136407852173, "learning_rate": 2.441042843191372e-05, "loss": 0.159, "step": 21024 }, { "epoch": 6.427697951696729, "grad_norm": 1.0049792528152466, "learning_rate": 2.4410003821493782e-05, "loss": 0.2175, "step": 21025 }, { "epoch": 6.428003668602874, "grad_norm": 1.150678277015686, "learning_rate": 2.440957921107384e-05, "loss": 0.2293, "step": 21026 }, { "epoch": 6.428309385509019, "grad_norm": 0.39033812284469604, "learning_rate": 2.44091546006539e-05, "loss": 0.1411, "step": 21027 }, { "epoch": 6.428615102415163, "grad_norm": 0.2557485103607178, "learning_rate": 2.4408729990233962e-05, "loss": 0.0761, "step": 21028 }, { "epoch": 6.428920819321308, "grad_norm": 0.15679402649402618, "learning_rate": 2.440830537981402e-05, "loss": 0.0519, "step": 21029 }, { "epoch": 6.429226536227453, "grad_norm": 0.8043853640556335, "learning_rate": 2.4407880769394083e-05, "loss": 0.0591, "step": 21030 }, { "epoch": 6.4295322531335986, "grad_norm": 0.8588950037956238, "learning_rate": 2.440745615897414e-05, "loss": 0.0461, "step": 21031 }, { "epoch": 6.429837970039744, "grad_norm": 0.2987615466117859, "learning_rate": 2.4407031548554203e-05, "loss": 0.0692, "step": 21032 }, { "epoch": 6.430143686945888, "grad_norm": 0.2938638925552368, "learning_rate": 2.4406606938134262e-05, "loss": 0.0421, "step": 21033 }, { "epoch": 6.430449403852033, "grad_norm": 0.5555037260055542, "learning_rate": 2.4406182327714324e-05, "loss": 0.0816, "step": 21034 }, { "epoch": 6.430755120758178, "grad_norm": 0.2958492934703827, "learning_rate": 2.4405757717294383e-05, "loss": 0.0678, "step": 21035 }, { "epoch": 6.431060837664323, "grad_norm": 0.2664194405078888, "learning_rate": 2.4405333106874445e-05, "loss": 0.0586, "step": 21036 }, { "epoch": 6.4313665545704675, "grad_norm": 0.33985042572021484, "learning_rate": 2.4404908496454504e-05, "loss": 0.0834, "step": 21037 }, { "epoch": 6.431672271476613, "grad_norm": 0.22911058366298676, "learning_rate": 2.4404483886034566e-05, "loss": 0.0536, "step": 21038 }, { "epoch": 6.431977988382758, "grad_norm": 0.8127742409706116, "learning_rate": 2.4404059275614624e-05, "loss": 0.103, "step": 21039 }, { "epoch": 6.432283705288903, "grad_norm": 0.4137517511844635, "learning_rate": 2.4403634665194683e-05, "loss": 0.1064, "step": 21040 }, { "epoch": 6.432589422195047, "grad_norm": 0.5957220792770386, "learning_rate": 2.4403210054774745e-05, "loss": 0.1463, "step": 21041 }, { "epoch": 6.432895139101192, "grad_norm": 1.5609619617462158, "learning_rate": 2.4402785444354804e-05, "loss": 0.1256, "step": 21042 }, { "epoch": 6.433200856007337, "grad_norm": 0.44585365056991577, "learning_rate": 2.4402360833934866e-05, "loss": 0.1585, "step": 21043 }, { "epoch": 6.433506572913482, "grad_norm": 0.6135515570640564, "learning_rate": 2.4401936223514925e-05, "loss": 0.1812, "step": 21044 }, { "epoch": 6.4338122898196275, "grad_norm": 1.0240330696105957, "learning_rate": 2.4401511613094987e-05, "loss": 0.1757, "step": 21045 }, { "epoch": 6.434118006725772, "grad_norm": 0.7277366518974304, "learning_rate": 2.4401087002675046e-05, "loss": 0.164, "step": 21046 }, { "epoch": 6.434423723631917, "grad_norm": 1.0739461183547974, "learning_rate": 2.4400662392255108e-05, "loss": 0.1574, "step": 21047 }, { "epoch": 6.434729440538062, "grad_norm": 0.8486257791519165, "learning_rate": 2.4400237781835166e-05, "loss": 0.1793, "step": 21048 }, { "epoch": 6.435035157444207, "grad_norm": 1.1004884243011475, "learning_rate": 2.439981317141523e-05, "loss": 0.2013, "step": 21049 }, { "epoch": 6.435340874350351, "grad_norm": 0.9021925926208496, "learning_rate": 2.4399388560995287e-05, "loss": 0.2157, "step": 21050 }, { "epoch": 6.435646591256496, "grad_norm": 3.2436225414276123, "learning_rate": 2.439896395057535e-05, "loss": 0.2175, "step": 21051 }, { "epoch": 6.4359523081626415, "grad_norm": 0.4639446437358856, "learning_rate": 2.4398539340155408e-05, "loss": 0.1273, "step": 21052 }, { "epoch": 6.436258025068787, "grad_norm": 0.3214331865310669, "learning_rate": 2.4398114729735467e-05, "loss": 0.0632, "step": 21053 }, { "epoch": 6.436563741974931, "grad_norm": 0.17591001093387604, "learning_rate": 2.439769011931553e-05, "loss": 0.0667, "step": 21054 }, { "epoch": 6.436869458881076, "grad_norm": 0.32777875661849976, "learning_rate": 2.4397265508895587e-05, "loss": 0.062, "step": 21055 }, { "epoch": 6.437175175787221, "grad_norm": 0.31625083088874817, "learning_rate": 2.439684089847565e-05, "loss": 0.0462, "step": 21056 }, { "epoch": 6.437480892693366, "grad_norm": 0.34037840366363525, "learning_rate": 2.4396416288055708e-05, "loss": 0.0535, "step": 21057 }, { "epoch": 6.437786609599511, "grad_norm": 0.24832956492900848, "learning_rate": 2.439599167763577e-05, "loss": 0.0467, "step": 21058 }, { "epoch": 6.4380923265056555, "grad_norm": 0.18948182463645935, "learning_rate": 2.439556706721583e-05, "loss": 0.0707, "step": 21059 }, { "epoch": 6.438398043411801, "grad_norm": 0.1696087121963501, "learning_rate": 2.439514245679589e-05, "loss": 0.0609, "step": 21060 }, { "epoch": 6.438703760317946, "grad_norm": 0.49500882625579834, "learning_rate": 2.439471784637595e-05, "loss": 0.0856, "step": 21061 }, { "epoch": 6.439009477224091, "grad_norm": 0.3146938681602478, "learning_rate": 2.4394293235956012e-05, "loss": 0.1212, "step": 21062 }, { "epoch": 6.439315194130235, "grad_norm": 0.5197005271911621, "learning_rate": 2.439386862553607e-05, "loss": 0.0918, "step": 21063 }, { "epoch": 6.43962091103638, "grad_norm": 0.3072299063205719, "learning_rate": 2.4393444015116133e-05, "loss": 0.0867, "step": 21064 }, { "epoch": 6.439926627942525, "grad_norm": 0.4778369069099426, "learning_rate": 2.439301940469619e-05, "loss": 0.0982, "step": 21065 }, { "epoch": 6.4402323448486705, "grad_norm": 1.1907615661621094, "learning_rate": 2.439259479427625e-05, "loss": 0.0935, "step": 21066 }, { "epoch": 6.440538061754815, "grad_norm": 0.4020076096057892, "learning_rate": 2.4392170183856312e-05, "loss": 0.1297, "step": 21067 }, { "epoch": 6.44084377866096, "grad_norm": 0.5872511267662048, "learning_rate": 2.439174557343637e-05, "loss": 0.1518, "step": 21068 }, { "epoch": 6.441149495567105, "grad_norm": 0.9960411787033081, "learning_rate": 2.4391320963016433e-05, "loss": 0.1629, "step": 21069 }, { "epoch": 6.44145521247325, "grad_norm": 0.7921140789985657, "learning_rate": 2.439089635259649e-05, "loss": 0.1751, "step": 21070 }, { "epoch": 6.441760929379395, "grad_norm": 0.9016783833503723, "learning_rate": 2.4390471742176554e-05, "loss": 0.1693, "step": 21071 }, { "epoch": 6.442066646285539, "grad_norm": 0.7743616104125977, "learning_rate": 2.4390047131756612e-05, "loss": 0.1702, "step": 21072 }, { "epoch": 6.4423723631916845, "grad_norm": 1.034303903579712, "learning_rate": 2.4389622521336675e-05, "loss": 0.202, "step": 21073 }, { "epoch": 6.44267808009783, "grad_norm": 1.3581154346466064, "learning_rate": 2.4389197910916733e-05, "loss": 0.188, "step": 21074 }, { "epoch": 6.442983797003975, "grad_norm": 1.2159839868545532, "learning_rate": 2.4388773300496795e-05, "loss": 0.1717, "step": 21075 }, { "epoch": 6.443289513910119, "grad_norm": 2.840715169906616, "learning_rate": 2.4388348690076854e-05, "loss": 0.2777, "step": 21076 }, { "epoch": 6.443595230816264, "grad_norm": 0.2984501123428345, "learning_rate": 2.4387924079656916e-05, "loss": 0.1224, "step": 21077 }, { "epoch": 6.443900947722409, "grad_norm": 0.3564610183238983, "learning_rate": 2.4387499469236975e-05, "loss": 0.0797, "step": 21078 }, { "epoch": 6.444206664628554, "grad_norm": 0.3684658706188202, "learning_rate": 2.4387074858817033e-05, "loss": 0.0664, "step": 21079 }, { "epoch": 6.4445123815346985, "grad_norm": 0.2812865674495697, "learning_rate": 2.4386650248397096e-05, "loss": 0.0712, "step": 21080 }, { "epoch": 6.444818098440844, "grad_norm": 0.18568815290927887, "learning_rate": 2.4386225637977154e-05, "loss": 0.0596, "step": 21081 }, { "epoch": 6.445123815346989, "grad_norm": 0.3036751449108124, "learning_rate": 2.4385801027557216e-05, "loss": 0.0648, "step": 21082 }, { "epoch": 6.445429532253134, "grad_norm": 0.2439431995153427, "learning_rate": 2.4385376417137275e-05, "loss": 0.0535, "step": 21083 }, { "epoch": 6.445735249159279, "grad_norm": 0.23463274538516998, "learning_rate": 2.4384951806717337e-05, "loss": 0.0416, "step": 21084 }, { "epoch": 6.446040966065423, "grad_norm": 0.376630961894989, "learning_rate": 2.4384527196297396e-05, "loss": 0.0918, "step": 21085 }, { "epoch": 6.446346682971568, "grad_norm": 0.6959889531135559, "learning_rate": 2.4384102585877458e-05, "loss": 0.0669, "step": 21086 }, { "epoch": 6.446652399877713, "grad_norm": 0.24960863590240479, "learning_rate": 2.4383677975457517e-05, "loss": 0.0689, "step": 21087 }, { "epoch": 6.4469581167838586, "grad_norm": 0.3386487364768982, "learning_rate": 2.438325336503758e-05, "loss": 0.0649, "step": 21088 }, { "epoch": 6.447263833690003, "grad_norm": 0.6030732989311218, "learning_rate": 2.4382828754617637e-05, "loss": 0.0999, "step": 21089 }, { "epoch": 6.447569550596148, "grad_norm": 0.87702876329422, "learning_rate": 2.43824041441977e-05, "loss": 0.1487, "step": 21090 }, { "epoch": 6.447875267502293, "grad_norm": 1.2755635976791382, "learning_rate": 2.4381979533777758e-05, "loss": 0.1003, "step": 21091 }, { "epoch": 6.448180984408438, "grad_norm": 0.76798415184021, "learning_rate": 2.4381554923357817e-05, "loss": 0.1197, "step": 21092 }, { "epoch": 6.448486701314582, "grad_norm": 0.4681628346443176, "learning_rate": 2.438113031293788e-05, "loss": 0.1591, "step": 21093 }, { "epoch": 6.4487924182207275, "grad_norm": 0.5950112342834473, "learning_rate": 2.4380705702517938e-05, "loss": 0.136, "step": 21094 }, { "epoch": 6.449098135126873, "grad_norm": 0.589030921459198, "learning_rate": 2.4380281092098e-05, "loss": 0.1551, "step": 21095 }, { "epoch": 6.449403852033018, "grad_norm": 0.9314661622047424, "learning_rate": 2.437985648167806e-05, "loss": 0.1545, "step": 21096 }, { "epoch": 6.449709568939163, "grad_norm": 1.1314061880111694, "learning_rate": 2.437943187125812e-05, "loss": 0.1619, "step": 21097 }, { "epoch": 6.450015285845307, "grad_norm": 0.8863416910171509, "learning_rate": 2.437900726083818e-05, "loss": 0.1754, "step": 21098 }, { "epoch": 6.450321002751452, "grad_norm": 1.1803622245788574, "learning_rate": 2.437858265041824e-05, "loss": 0.1611, "step": 21099 }, { "epoch": 6.450626719657597, "grad_norm": 0.7805042266845703, "learning_rate": 2.43781580399983e-05, "loss": 0.1763, "step": 21100 }, { "epoch": 6.450932436563742, "grad_norm": 1.3070231676101685, "learning_rate": 2.4377733429578362e-05, "loss": 0.2398, "step": 21101 }, { "epoch": 6.451238153469887, "grad_norm": 0.319174200296402, "learning_rate": 2.4377308819158424e-05, "loss": 0.109, "step": 21102 }, { "epoch": 6.451543870376032, "grad_norm": 0.22961270809173584, "learning_rate": 2.4376884208738486e-05, "loss": 0.0725, "step": 21103 }, { "epoch": 6.451849587282177, "grad_norm": 0.2593769133090973, "learning_rate": 2.4376459598318545e-05, "loss": 0.0551, "step": 21104 }, { "epoch": 6.452155304188322, "grad_norm": 0.38287097215652466, "learning_rate": 2.4376034987898604e-05, "loss": 0.0783, "step": 21105 }, { "epoch": 6.452461021094466, "grad_norm": 0.18084017932415009, "learning_rate": 2.4375610377478666e-05, "loss": 0.0488, "step": 21106 }, { "epoch": 6.452766738000611, "grad_norm": 0.37445926666259766, "learning_rate": 2.4375185767058725e-05, "loss": 0.0446, "step": 21107 }, { "epoch": 6.453072454906756, "grad_norm": 0.23587492108345032, "learning_rate": 2.4374761156638787e-05, "loss": 0.0601, "step": 21108 }, { "epoch": 6.4533781718129015, "grad_norm": 0.1700010597705841, "learning_rate": 2.4374336546218845e-05, "loss": 0.056, "step": 21109 }, { "epoch": 6.453683888719046, "grad_norm": 0.35836100578308105, "learning_rate": 2.4373911935798907e-05, "loss": 0.0718, "step": 21110 }, { "epoch": 6.453989605625191, "grad_norm": 0.3419038653373718, "learning_rate": 2.4373487325378966e-05, "loss": 0.0797, "step": 21111 }, { "epoch": 6.454295322531336, "grad_norm": 0.2842985689640045, "learning_rate": 2.4373062714959028e-05, "loss": 0.0868, "step": 21112 }, { "epoch": 6.454601039437481, "grad_norm": 0.2802741229534149, "learning_rate": 2.4372638104539087e-05, "loss": 0.0674, "step": 21113 }, { "epoch": 6.454906756343626, "grad_norm": 0.3051473796367645, "learning_rate": 2.437221349411915e-05, "loss": 0.1094, "step": 21114 }, { "epoch": 6.45521247324977, "grad_norm": 0.5988242626190186, "learning_rate": 2.4371788883699208e-05, "loss": 0.1114, "step": 21115 }, { "epoch": 6.4555181901559155, "grad_norm": 0.3546742796897888, "learning_rate": 2.437136427327927e-05, "loss": 0.1349, "step": 21116 }, { "epoch": 6.455823907062061, "grad_norm": 0.47592130303382874, "learning_rate": 2.437093966285933e-05, "loss": 0.1208, "step": 21117 }, { "epoch": 6.456129623968206, "grad_norm": 0.7901360988616943, "learning_rate": 2.4370515052439387e-05, "loss": 0.1365, "step": 21118 }, { "epoch": 6.45643534087435, "grad_norm": 0.5274546146392822, "learning_rate": 2.437009044201945e-05, "loss": 0.1402, "step": 21119 }, { "epoch": 6.456741057780495, "grad_norm": 0.6214151978492737, "learning_rate": 2.4369665831599508e-05, "loss": 0.1552, "step": 21120 }, { "epoch": 6.45704677468664, "grad_norm": 0.8494945764541626, "learning_rate": 2.436924122117957e-05, "loss": 0.1553, "step": 21121 }, { "epoch": 6.457352491592785, "grad_norm": 0.8504915237426758, "learning_rate": 2.436881661075963e-05, "loss": 0.2114, "step": 21122 }, { "epoch": 6.45765820849893, "grad_norm": 1.2069203853607178, "learning_rate": 2.436839200033969e-05, "loss": 0.1942, "step": 21123 }, { "epoch": 6.457963925405075, "grad_norm": 1.2964320182800293, "learning_rate": 2.436796738991975e-05, "loss": 0.1904, "step": 21124 }, { "epoch": 6.45826964231122, "grad_norm": 0.7485326528549194, "learning_rate": 2.436754277949981e-05, "loss": 0.1658, "step": 21125 }, { "epoch": 6.458575359217365, "grad_norm": 1.1776810884475708, "learning_rate": 2.436711816907987e-05, "loss": 0.2204, "step": 21126 }, { "epoch": 6.45888107612351, "grad_norm": 0.4840731620788574, "learning_rate": 2.4366693558659932e-05, "loss": 0.163, "step": 21127 }, { "epoch": 6.459186793029654, "grad_norm": 0.818253755569458, "learning_rate": 2.436626894823999e-05, "loss": 0.1147, "step": 21128 }, { "epoch": 6.459492509935799, "grad_norm": 0.5820049047470093, "learning_rate": 2.4365844337820053e-05, "loss": 0.0798, "step": 21129 }, { "epoch": 6.4597982268419445, "grad_norm": 0.46738970279693604, "learning_rate": 2.4365419727400112e-05, "loss": 0.0685, "step": 21130 }, { "epoch": 6.46010394374809, "grad_norm": 0.3602985143661499, "learning_rate": 2.436499511698017e-05, "loss": 0.0536, "step": 21131 }, { "epoch": 6.460409660654234, "grad_norm": 0.2999561131000519, "learning_rate": 2.4364570506560233e-05, "loss": 0.065, "step": 21132 }, { "epoch": 6.460715377560379, "grad_norm": 0.2027633786201477, "learning_rate": 2.436414589614029e-05, "loss": 0.0528, "step": 21133 }, { "epoch": 6.461021094466524, "grad_norm": 2.447208881378174, "learning_rate": 2.4363721285720353e-05, "loss": 0.1059, "step": 21134 }, { "epoch": 6.461326811372669, "grad_norm": 0.32255396246910095, "learning_rate": 2.4363296675300412e-05, "loss": 0.0531, "step": 21135 }, { "epoch": 6.461632528278813, "grad_norm": 0.31852099299430847, "learning_rate": 2.4362872064880474e-05, "loss": 0.0637, "step": 21136 }, { "epoch": 6.4619382451849585, "grad_norm": 0.498914897441864, "learning_rate": 2.4362447454460533e-05, "loss": 0.0734, "step": 21137 }, { "epoch": 6.462243962091104, "grad_norm": 0.4104301333427429, "learning_rate": 2.4362022844040595e-05, "loss": 0.0848, "step": 21138 }, { "epoch": 6.462549678997249, "grad_norm": 0.43690988421440125, "learning_rate": 2.4361598233620654e-05, "loss": 0.0864, "step": 21139 }, { "epoch": 6.462855395903394, "grad_norm": 0.9984493255615234, "learning_rate": 2.4361173623200716e-05, "loss": 0.1171, "step": 21140 }, { "epoch": 6.463161112809538, "grad_norm": 0.4175480902194977, "learning_rate": 2.4360749012780775e-05, "loss": 0.1446, "step": 21141 }, { "epoch": 6.463466829715683, "grad_norm": 0.6759471297264099, "learning_rate": 2.4360324402360833e-05, "loss": 0.1026, "step": 21142 }, { "epoch": 6.463772546621828, "grad_norm": 0.4860764145851135, "learning_rate": 2.4359899791940895e-05, "loss": 0.1478, "step": 21143 }, { "epoch": 6.464078263527973, "grad_norm": 0.5675286650657654, "learning_rate": 2.4359475181520954e-05, "loss": 0.1757, "step": 21144 }, { "epoch": 6.464383980434118, "grad_norm": 0.4308995008468628, "learning_rate": 2.4359050571101016e-05, "loss": 0.1356, "step": 21145 }, { "epoch": 6.464689697340263, "grad_norm": 0.8614185452461243, "learning_rate": 2.4358625960681075e-05, "loss": 0.1712, "step": 21146 }, { "epoch": 6.464995414246408, "grad_norm": 0.4983447194099426, "learning_rate": 2.4358201350261137e-05, "loss": 0.1906, "step": 21147 }, { "epoch": 6.465301131152553, "grad_norm": 0.6957868933677673, "learning_rate": 2.4357776739841196e-05, "loss": 0.1504, "step": 21148 }, { "epoch": 6.465606848058697, "grad_norm": 0.47280353307724, "learning_rate": 2.4357352129421258e-05, "loss": 0.1778, "step": 21149 }, { "epoch": 6.465912564964842, "grad_norm": 1.0648672580718994, "learning_rate": 2.4356927519001316e-05, "loss": 0.1645, "step": 21150 }, { "epoch": 6.4662182818709875, "grad_norm": 0.9849948287010193, "learning_rate": 2.435650290858138e-05, "loss": 0.2418, "step": 21151 }, { "epoch": 6.466523998777133, "grad_norm": 0.4942988157272339, "learning_rate": 2.4356078298161437e-05, "loss": 0.1279, "step": 21152 }, { "epoch": 6.466829715683278, "grad_norm": 0.6881023049354553, "learning_rate": 2.43556536877415e-05, "loss": 0.089, "step": 21153 }, { "epoch": 6.467135432589422, "grad_norm": 0.4248145818710327, "learning_rate": 2.4355229077321558e-05, "loss": 0.0684, "step": 21154 }, { "epoch": 6.467441149495567, "grad_norm": 0.5786542296409607, "learning_rate": 2.4354804466901617e-05, "loss": 0.0651, "step": 21155 }, { "epoch": 6.467746866401712, "grad_norm": 0.10637341439723969, "learning_rate": 2.435437985648168e-05, "loss": 0.0338, "step": 21156 }, { "epoch": 6.468052583307857, "grad_norm": 0.246757373213768, "learning_rate": 2.4353955246061737e-05, "loss": 0.0537, "step": 21157 }, { "epoch": 6.4683583002140015, "grad_norm": 0.12246604263782501, "learning_rate": 2.43535306356418e-05, "loss": 0.0349, "step": 21158 }, { "epoch": 6.468664017120147, "grad_norm": 0.21032683551311493, "learning_rate": 2.4353106025221858e-05, "loss": 0.0538, "step": 21159 }, { "epoch": 6.468969734026292, "grad_norm": 0.2084798365831375, "learning_rate": 2.435268141480192e-05, "loss": 0.0648, "step": 21160 }, { "epoch": 6.469275450932437, "grad_norm": 0.3045758605003357, "learning_rate": 2.435225680438198e-05, "loss": 0.0749, "step": 21161 }, { "epoch": 6.469581167838581, "grad_norm": 0.5170230269432068, "learning_rate": 2.435183219396204e-05, "loss": 0.1019, "step": 21162 }, { "epoch": 6.469886884744726, "grad_norm": 0.3225102722644806, "learning_rate": 2.43514075835421e-05, "loss": 0.0834, "step": 21163 }, { "epoch": 6.470192601650871, "grad_norm": 0.7961736917495728, "learning_rate": 2.4350982973122162e-05, "loss": 0.1314, "step": 21164 }, { "epoch": 6.470498318557016, "grad_norm": 1.5090020895004272, "learning_rate": 2.435055836270222e-05, "loss": 0.1189, "step": 21165 }, { "epoch": 6.4708040354631615, "grad_norm": 0.910624086856842, "learning_rate": 2.4350133752282283e-05, "loss": 0.127, "step": 21166 }, { "epoch": 6.471109752369306, "grad_norm": 0.41668635606765747, "learning_rate": 2.434970914186234e-05, "loss": 0.118, "step": 21167 }, { "epoch": 6.471415469275451, "grad_norm": 0.38670167326927185, "learning_rate": 2.43492845314424e-05, "loss": 0.1493, "step": 21168 }, { "epoch": 6.471721186181596, "grad_norm": 0.48303940892219543, "learning_rate": 2.4348859921022462e-05, "loss": 0.1749, "step": 21169 }, { "epoch": 6.472026903087741, "grad_norm": 0.7110071778297424, "learning_rate": 2.434843531060252e-05, "loss": 0.1674, "step": 21170 }, { "epoch": 6.472332619993885, "grad_norm": 0.7075230479240417, "learning_rate": 2.4348010700182583e-05, "loss": 0.1573, "step": 21171 }, { "epoch": 6.47263833690003, "grad_norm": 0.3755456507205963, "learning_rate": 2.434758608976264e-05, "loss": 0.1422, "step": 21172 }, { "epoch": 6.4729440538061755, "grad_norm": 0.7424768209457397, "learning_rate": 2.4347161479342704e-05, "loss": 0.1811, "step": 21173 }, { "epoch": 6.473249770712321, "grad_norm": 0.7596369981765747, "learning_rate": 2.4346736868922762e-05, "loss": 0.2176, "step": 21174 }, { "epoch": 6.473555487618465, "grad_norm": 1.09476637840271, "learning_rate": 2.4346312258502825e-05, "loss": 0.1803, "step": 21175 }, { "epoch": 6.47386120452461, "grad_norm": 1.1207994222640991, "learning_rate": 2.4345887648082883e-05, "loss": 0.2029, "step": 21176 }, { "epoch": 6.474166921430755, "grad_norm": 0.27188655734062195, "learning_rate": 2.4345463037662945e-05, "loss": 0.136, "step": 21177 }, { "epoch": 6.4744726383369, "grad_norm": 0.5620416402816772, "learning_rate": 2.4345038427243004e-05, "loss": 0.0645, "step": 21178 }, { "epoch": 6.474778355243045, "grad_norm": 0.24241016805171967, "learning_rate": 2.4344613816823066e-05, "loss": 0.0702, "step": 21179 }, { "epoch": 6.47508407214919, "grad_norm": 0.37449580430984497, "learning_rate": 2.4344189206403125e-05, "loss": 0.057, "step": 21180 }, { "epoch": 6.475389789055335, "grad_norm": 0.3061099350452423, "learning_rate": 2.4343764595983184e-05, "loss": 0.0703, "step": 21181 }, { "epoch": 6.47569550596148, "grad_norm": 0.3359372913837433, "learning_rate": 2.4343339985563246e-05, "loss": 0.0487, "step": 21182 }, { "epoch": 6.476001222867625, "grad_norm": 0.38199642300605774, "learning_rate": 2.4342915375143304e-05, "loss": 0.044, "step": 21183 }, { "epoch": 6.476306939773769, "grad_norm": 0.15816228091716766, "learning_rate": 2.4342490764723366e-05, "loss": 0.0493, "step": 21184 }, { "epoch": 6.476612656679914, "grad_norm": 0.3719354271888733, "learning_rate": 2.4342066154303425e-05, "loss": 0.0927, "step": 21185 }, { "epoch": 6.476918373586059, "grad_norm": 0.2232489287853241, "learning_rate": 2.4341641543883487e-05, "loss": 0.0775, "step": 21186 }, { "epoch": 6.4772240904922045, "grad_norm": 0.4626959562301636, "learning_rate": 2.4341216933463546e-05, "loss": 0.0683, "step": 21187 }, { "epoch": 6.477529807398349, "grad_norm": 0.2823159992694855, "learning_rate": 2.4340792323043608e-05, "loss": 0.0837, "step": 21188 }, { "epoch": 6.477835524304494, "grad_norm": 0.3208959996700287, "learning_rate": 2.4340367712623667e-05, "loss": 0.0876, "step": 21189 }, { "epoch": 6.478141241210639, "grad_norm": 0.4082961976528168, "learning_rate": 2.433994310220373e-05, "loss": 0.0963, "step": 21190 }, { "epoch": 6.478446958116784, "grad_norm": 1.0657689571380615, "learning_rate": 2.4339518491783787e-05, "loss": 0.1176, "step": 21191 }, { "epoch": 6.478752675022929, "grad_norm": 1.1735142469406128, "learning_rate": 2.433909388136385e-05, "loss": 0.1282, "step": 21192 }, { "epoch": 6.479058391929073, "grad_norm": 0.5139588117599487, "learning_rate": 2.4338669270943908e-05, "loss": 0.1455, "step": 21193 }, { "epoch": 6.4793641088352185, "grad_norm": 0.43525150418281555, "learning_rate": 2.4338244660523967e-05, "loss": 0.1366, "step": 21194 }, { "epoch": 6.479669825741364, "grad_norm": 1.1126691102981567, "learning_rate": 2.433782005010403e-05, "loss": 0.1411, "step": 21195 }, { "epoch": 6.479975542647509, "grad_norm": 0.7801110148429871, "learning_rate": 2.4337395439684088e-05, "loss": 0.1499, "step": 21196 }, { "epoch": 6.480281259553653, "grad_norm": 0.6157200932502747, "learning_rate": 2.433697082926415e-05, "loss": 0.1987, "step": 21197 }, { "epoch": 6.480586976459798, "grad_norm": 0.8228620886802673, "learning_rate": 2.433654621884421e-05, "loss": 0.1813, "step": 21198 }, { "epoch": 6.480892693365943, "grad_norm": 0.5352910161018372, "learning_rate": 2.433612160842427e-05, "loss": 0.1882, "step": 21199 }, { "epoch": 6.481198410272088, "grad_norm": 1.2654013633728027, "learning_rate": 2.433569699800433e-05, "loss": 0.2052, "step": 21200 }, { "epoch": 6.4815041271782325, "grad_norm": 1.320130705833435, "learning_rate": 2.433527238758439e-05, "loss": 0.2467, "step": 21201 }, { "epoch": 6.481809844084378, "grad_norm": 1.11849045753479, "learning_rate": 2.433484777716445e-05, "loss": 0.1198, "step": 21202 }, { "epoch": 6.482115560990523, "grad_norm": 0.32409968972206116, "learning_rate": 2.4334423166744512e-05, "loss": 0.0808, "step": 21203 }, { "epoch": 6.482421277896668, "grad_norm": 0.34061750769615173, "learning_rate": 2.4333998556324574e-05, "loss": 0.0804, "step": 21204 }, { "epoch": 6.482726994802813, "grad_norm": 0.30567029118537903, "learning_rate": 2.4333573945904636e-05, "loss": 0.0535, "step": 21205 }, { "epoch": 6.483032711708957, "grad_norm": 0.1899145394563675, "learning_rate": 2.4333149335484695e-05, "loss": 0.0551, "step": 21206 }, { "epoch": 6.483338428615102, "grad_norm": 0.3500620722770691, "learning_rate": 2.4332724725064754e-05, "loss": 0.0613, "step": 21207 }, { "epoch": 6.4836441455212475, "grad_norm": 0.19316251575946808, "learning_rate": 2.4332300114644816e-05, "loss": 0.0567, "step": 21208 }, { "epoch": 6.483949862427393, "grad_norm": 0.22644606232643127, "learning_rate": 2.4331875504224875e-05, "loss": 0.0468, "step": 21209 }, { "epoch": 6.484255579333537, "grad_norm": 0.529836893081665, "learning_rate": 2.4331450893804937e-05, "loss": 0.0665, "step": 21210 }, { "epoch": 6.484561296239682, "grad_norm": 0.4540751874446869, "learning_rate": 2.4331026283384995e-05, "loss": 0.0508, "step": 21211 }, { "epoch": 6.484867013145827, "grad_norm": 0.29796621203422546, "learning_rate": 2.4330601672965057e-05, "loss": 0.0584, "step": 21212 }, { "epoch": 6.485172730051972, "grad_norm": 0.45629793405532837, "learning_rate": 2.4330177062545116e-05, "loss": 0.0841, "step": 21213 }, { "epoch": 6.485478446958116, "grad_norm": 0.2577899396419525, "learning_rate": 2.4329752452125178e-05, "loss": 0.0936, "step": 21214 }, { "epoch": 6.4857841638642615, "grad_norm": 0.6413977146148682, "learning_rate": 2.4329327841705237e-05, "loss": 0.1044, "step": 21215 }, { "epoch": 6.486089880770407, "grad_norm": 0.6604660153388977, "learning_rate": 2.43289032312853e-05, "loss": 0.1266, "step": 21216 }, { "epoch": 6.486395597676552, "grad_norm": 0.39858943223953247, "learning_rate": 2.4328478620865358e-05, "loss": 0.1293, "step": 21217 }, { "epoch": 6.486701314582697, "grad_norm": 0.6352435946464539, "learning_rate": 2.432805401044542e-05, "loss": 0.1663, "step": 21218 }, { "epoch": 6.487007031488841, "grad_norm": 0.6919175982475281, "learning_rate": 2.432762940002548e-05, "loss": 0.1701, "step": 21219 }, { "epoch": 6.487312748394986, "grad_norm": 1.2496452331542969, "learning_rate": 2.4327204789605537e-05, "loss": 0.1711, "step": 21220 }, { "epoch": 6.487618465301131, "grad_norm": 0.5435243844985962, "learning_rate": 2.43267801791856e-05, "loss": 0.1499, "step": 21221 }, { "epoch": 6.487924182207276, "grad_norm": 0.7520875930786133, "learning_rate": 2.4326355568765658e-05, "loss": 0.1685, "step": 21222 }, { "epoch": 6.488229899113421, "grad_norm": 1.0640395879745483, "learning_rate": 2.432593095834572e-05, "loss": 0.1847, "step": 21223 }, { "epoch": 6.488535616019566, "grad_norm": 1.7650845050811768, "learning_rate": 2.432550634792578e-05, "loss": 0.2139, "step": 21224 }, { "epoch": 6.488841332925711, "grad_norm": 0.620752215385437, "learning_rate": 2.432508173750584e-05, "loss": 0.1595, "step": 21225 }, { "epoch": 6.489147049831856, "grad_norm": 3.0745434761047363, "learning_rate": 2.43246571270859e-05, "loss": 0.2209, "step": 21226 }, { "epoch": 6.489452766738, "grad_norm": 0.575003981590271, "learning_rate": 2.432423251666596e-05, "loss": 0.1609, "step": 21227 }, { "epoch": 6.489758483644145, "grad_norm": 0.484542578458786, "learning_rate": 2.432380790624602e-05, "loss": 0.066, "step": 21228 }, { "epoch": 6.49006420055029, "grad_norm": 0.9338580369949341, "learning_rate": 2.4323383295826082e-05, "loss": 0.0797, "step": 21229 }, { "epoch": 6.4903699174564355, "grad_norm": 0.1647675335407257, "learning_rate": 2.432295868540614e-05, "loss": 0.0596, "step": 21230 }, { "epoch": 6.490675634362581, "grad_norm": 0.23379327356815338, "learning_rate": 2.4322534074986203e-05, "loss": 0.0395, "step": 21231 }, { "epoch": 6.490981351268725, "grad_norm": 0.45197975635528564, "learning_rate": 2.4322109464566262e-05, "loss": 0.0551, "step": 21232 }, { "epoch": 6.49128706817487, "grad_norm": 0.15844079852104187, "learning_rate": 2.432168485414632e-05, "loss": 0.0372, "step": 21233 }, { "epoch": 6.491592785081015, "grad_norm": 0.5212318301200867, "learning_rate": 2.4321260243726383e-05, "loss": 0.0604, "step": 21234 }, { "epoch": 6.49189850198716, "grad_norm": 0.5201197862625122, "learning_rate": 2.432083563330644e-05, "loss": 0.0722, "step": 21235 }, { "epoch": 6.4922042188933045, "grad_norm": 0.16924042999744415, "learning_rate": 2.4320411022886503e-05, "loss": 0.0577, "step": 21236 }, { "epoch": 6.49250993579945, "grad_norm": 0.3039367198944092, "learning_rate": 2.4319986412466562e-05, "loss": 0.0859, "step": 21237 }, { "epoch": 6.492815652705595, "grad_norm": 0.8628252744674683, "learning_rate": 2.4319561802046624e-05, "loss": 0.0946, "step": 21238 }, { "epoch": 6.49312136961174, "grad_norm": 0.26428502798080444, "learning_rate": 2.4319137191626683e-05, "loss": 0.0772, "step": 21239 }, { "epoch": 6.493427086517884, "grad_norm": 0.4523508846759796, "learning_rate": 2.4318712581206745e-05, "loss": 0.1106, "step": 21240 }, { "epoch": 6.493732803424029, "grad_norm": 0.43505096435546875, "learning_rate": 2.4318287970786804e-05, "loss": 0.0938, "step": 21241 }, { "epoch": 6.494038520330174, "grad_norm": 0.8011338114738464, "learning_rate": 2.4317863360366866e-05, "loss": 0.1185, "step": 21242 }, { "epoch": 6.494344237236319, "grad_norm": 0.7792288661003113, "learning_rate": 2.4317438749946925e-05, "loss": 0.1268, "step": 21243 }, { "epoch": 6.4946499541424645, "grad_norm": 0.4829334318637848, "learning_rate": 2.4317014139526987e-05, "loss": 0.1249, "step": 21244 }, { "epoch": 6.494955671048609, "grad_norm": 1.6146488189697266, "learning_rate": 2.4316589529107045e-05, "loss": 0.1627, "step": 21245 }, { "epoch": 6.495261387954754, "grad_norm": 0.5600395798683167, "learning_rate": 2.4316164918687104e-05, "loss": 0.1604, "step": 21246 }, { "epoch": 6.495567104860899, "grad_norm": 1.3505244255065918, "learning_rate": 2.4315740308267166e-05, "loss": 0.1834, "step": 21247 }, { "epoch": 6.495872821767044, "grad_norm": 1.062747836112976, "learning_rate": 2.4315315697847225e-05, "loss": 0.1561, "step": 21248 }, { "epoch": 6.496178538673188, "grad_norm": 0.8435713052749634, "learning_rate": 2.4314891087427287e-05, "loss": 0.1564, "step": 21249 }, { "epoch": 6.496484255579333, "grad_norm": 0.6295806169509888, "learning_rate": 2.4314466477007346e-05, "loss": 0.192, "step": 21250 }, { "epoch": 6.4967899724854785, "grad_norm": 1.6335101127624512, "learning_rate": 2.4314041866587408e-05, "loss": 0.1866, "step": 21251 }, { "epoch": 6.497095689391624, "grad_norm": 1.2291994094848633, "learning_rate": 2.4313617256167466e-05, "loss": 0.1253, "step": 21252 }, { "epoch": 6.497401406297768, "grad_norm": 0.5105099678039551, "learning_rate": 2.431319264574753e-05, "loss": 0.0767, "step": 21253 }, { "epoch": 6.497707123203913, "grad_norm": 1.1104317903518677, "learning_rate": 2.4312768035327587e-05, "loss": 0.0834, "step": 21254 }, { "epoch": 6.498012840110058, "grad_norm": 0.2878754138946533, "learning_rate": 2.431234342490765e-05, "loss": 0.055, "step": 21255 }, { "epoch": 6.498318557016203, "grad_norm": 0.17920798063278198, "learning_rate": 2.4311918814487708e-05, "loss": 0.051, "step": 21256 }, { "epoch": 6.498624273922348, "grad_norm": 0.159335196018219, "learning_rate": 2.4311494204067767e-05, "loss": 0.0332, "step": 21257 }, { "epoch": 6.4989299908284925, "grad_norm": 0.32308679819107056, "learning_rate": 2.431106959364783e-05, "loss": 0.0701, "step": 21258 }, { "epoch": 6.499235707734638, "grad_norm": 0.39146706461906433, "learning_rate": 2.4310644983227887e-05, "loss": 0.0579, "step": 21259 }, { "epoch": 6.499541424640783, "grad_norm": 0.30655017495155334, "learning_rate": 2.431022037280795e-05, "loss": 0.0692, "step": 21260 }, { "epoch": 6.499847141546928, "grad_norm": 1.0392687320709229, "learning_rate": 2.4309795762388008e-05, "loss": 0.0618, "step": 21261 }, { "epoch": 6.500152858453072, "grad_norm": 0.35413846373558044, "learning_rate": 2.430937115196807e-05, "loss": 0.0883, "step": 21262 }, { "epoch": 6.500458575359217, "grad_norm": 0.5541326403617859, "learning_rate": 2.430894654154813e-05, "loss": 0.1287, "step": 21263 }, { "epoch": 6.500764292265362, "grad_norm": 0.4550943374633789, "learning_rate": 2.430852193112819e-05, "loss": 0.0923, "step": 21264 }, { "epoch": 6.5010700091715075, "grad_norm": 0.4642747640609741, "learning_rate": 2.430809732070825e-05, "loss": 0.118, "step": 21265 }, { "epoch": 6.501375726077653, "grad_norm": 0.8264188766479492, "learning_rate": 2.4307672710288312e-05, "loss": 0.1422, "step": 21266 }, { "epoch": 6.501681442983797, "grad_norm": 1.0834002494812012, "learning_rate": 2.430724809986837e-05, "loss": 0.1301, "step": 21267 }, { "epoch": 6.501987159889942, "grad_norm": 1.0856863260269165, "learning_rate": 2.4306823489448433e-05, "loss": 0.1421, "step": 21268 }, { "epoch": 6.502292876796087, "grad_norm": 0.5686683058738708, "learning_rate": 2.430639887902849e-05, "loss": 0.13, "step": 21269 }, { "epoch": 6.502598593702231, "grad_norm": 0.6909832954406738, "learning_rate": 2.430597426860855e-05, "loss": 0.1751, "step": 21270 }, { "epoch": 6.502904310608376, "grad_norm": 1.7519723176956177, "learning_rate": 2.4305549658188612e-05, "loss": 0.1588, "step": 21271 }, { "epoch": 6.5032100275145215, "grad_norm": 0.5823848843574524, "learning_rate": 2.430512504776867e-05, "loss": 0.1686, "step": 21272 }, { "epoch": 6.503515744420667, "grad_norm": 0.8931500911712646, "learning_rate": 2.4304700437348733e-05, "loss": 0.2184, "step": 21273 }, { "epoch": 6.503821461326812, "grad_norm": 1.0608608722686768, "learning_rate": 2.430427582692879e-05, "loss": 0.1696, "step": 21274 }, { "epoch": 6.504127178232956, "grad_norm": 0.6984303593635559, "learning_rate": 2.4303851216508854e-05, "loss": 0.195, "step": 21275 }, { "epoch": 6.504432895139101, "grad_norm": 2.050719976425171, "learning_rate": 2.4303426606088912e-05, "loss": 0.226, "step": 21276 }, { "epoch": 6.504738612045246, "grad_norm": 0.47581538558006287, "learning_rate": 2.4303001995668975e-05, "loss": 0.1441, "step": 21277 }, { "epoch": 6.505044328951391, "grad_norm": 0.2965508699417114, "learning_rate": 2.4302577385249033e-05, "loss": 0.0787, "step": 21278 }, { "epoch": 6.505350045857536, "grad_norm": 0.2969473898410797, "learning_rate": 2.4302152774829095e-05, "loss": 0.0651, "step": 21279 }, { "epoch": 6.505655762763681, "grad_norm": 0.5973469018936157, "learning_rate": 2.4301728164409154e-05, "loss": 0.0664, "step": 21280 }, { "epoch": 6.505961479669826, "grad_norm": 0.2629515528678894, "learning_rate": 2.4301303553989216e-05, "loss": 0.0564, "step": 21281 }, { "epoch": 6.506267196575971, "grad_norm": 0.4030068814754486, "learning_rate": 2.4300878943569275e-05, "loss": 0.0699, "step": 21282 }, { "epoch": 6.506572913482115, "grad_norm": 0.35698139667510986, "learning_rate": 2.4300454333149334e-05, "loss": 0.0609, "step": 21283 }, { "epoch": 6.50687863038826, "grad_norm": 0.21540048718452454, "learning_rate": 2.4300029722729396e-05, "loss": 0.0437, "step": 21284 }, { "epoch": 6.507184347294405, "grad_norm": 4.12805700302124, "learning_rate": 2.4299605112309454e-05, "loss": 0.0624, "step": 21285 }, { "epoch": 6.50749006420055, "grad_norm": 0.5450907349586487, "learning_rate": 2.4299180501889516e-05, "loss": 0.0567, "step": 21286 }, { "epoch": 6.5077957811066955, "grad_norm": 0.7911312580108643, "learning_rate": 2.4298755891469575e-05, "loss": 0.0937, "step": 21287 }, { "epoch": 6.50810149801284, "grad_norm": 0.6156153678894043, "learning_rate": 2.4298331281049637e-05, "loss": 0.0665, "step": 21288 }, { "epoch": 6.508407214918985, "grad_norm": 0.399783194065094, "learning_rate": 2.4297906670629696e-05, "loss": 0.0802, "step": 21289 }, { "epoch": 6.50871293182513, "grad_norm": 0.6011471152305603, "learning_rate": 2.4297482060209758e-05, "loss": 0.0866, "step": 21290 }, { "epoch": 6.509018648731275, "grad_norm": 0.41271764039993286, "learning_rate": 2.4297057449789817e-05, "loss": 0.0928, "step": 21291 }, { "epoch": 6.50932436563742, "grad_norm": 0.6951785087585449, "learning_rate": 2.429663283936988e-05, "loss": 0.1183, "step": 21292 }, { "epoch": 6.5096300825435645, "grad_norm": 0.8248987793922424, "learning_rate": 2.4296208228949937e-05, "loss": 0.1587, "step": 21293 }, { "epoch": 6.50993579944971, "grad_norm": 0.9063481688499451, "learning_rate": 2.429578361853e-05, "loss": 0.1809, "step": 21294 }, { "epoch": 6.510241516355855, "grad_norm": 1.0517616271972656, "learning_rate": 2.4295359008110058e-05, "loss": 0.1542, "step": 21295 }, { "epoch": 6.510547233261999, "grad_norm": 0.7043637633323669, "learning_rate": 2.4294934397690117e-05, "loss": 0.1602, "step": 21296 }, { "epoch": 6.510852950168144, "grad_norm": 2.0658576488494873, "learning_rate": 2.429450978727018e-05, "loss": 0.1789, "step": 21297 }, { "epoch": 6.511158667074289, "grad_norm": 1.6657620668411255, "learning_rate": 2.4294085176850238e-05, "loss": 0.1666, "step": 21298 }, { "epoch": 6.511464383980434, "grad_norm": 1.2682284116744995, "learning_rate": 2.42936605664303e-05, "loss": 0.207, "step": 21299 }, { "epoch": 6.511770100886579, "grad_norm": 1.8207837343215942, "learning_rate": 2.429323595601036e-05, "loss": 0.2268, "step": 21300 }, { "epoch": 6.512075817792724, "grad_norm": 3.9021546840667725, "learning_rate": 2.429281134559042e-05, "loss": 0.1879, "step": 21301 }, { "epoch": 6.512381534698869, "grad_norm": 0.5370945930480957, "learning_rate": 2.429238673517048e-05, "loss": 0.1364, "step": 21302 }, { "epoch": 6.512687251605014, "grad_norm": 0.2831796705722809, "learning_rate": 2.429196212475054e-05, "loss": 0.0745, "step": 21303 }, { "epoch": 6.512992968511159, "grad_norm": 0.3298843502998352, "learning_rate": 2.42915375143306e-05, "loss": 0.057, "step": 21304 }, { "epoch": 6.513298685417304, "grad_norm": 0.38060247898101807, "learning_rate": 2.4291112903910662e-05, "loss": 0.0912, "step": 21305 }, { "epoch": 6.513604402323448, "grad_norm": 0.17393378913402557, "learning_rate": 2.4290688293490724e-05, "loss": 0.0404, "step": 21306 }, { "epoch": 6.513910119229593, "grad_norm": 0.3149067163467407, "learning_rate": 2.4290263683070786e-05, "loss": 0.0644, "step": 21307 }, { "epoch": 6.5142158361357385, "grad_norm": 0.23550021648406982, "learning_rate": 2.4289839072650845e-05, "loss": 0.0414, "step": 21308 }, { "epoch": 6.514521553041883, "grad_norm": 0.24733427166938782, "learning_rate": 2.4289414462230904e-05, "loss": 0.0658, "step": 21309 }, { "epoch": 6.514827269948028, "grad_norm": 0.261210173368454, "learning_rate": 2.4288989851810966e-05, "loss": 0.0654, "step": 21310 }, { "epoch": 6.515132986854173, "grad_norm": 0.25127944350242615, "learning_rate": 2.4288565241391025e-05, "loss": 0.074, "step": 21311 }, { "epoch": 6.515438703760318, "grad_norm": 0.4696040153503418, "learning_rate": 2.4288140630971087e-05, "loss": 0.0688, "step": 21312 }, { "epoch": 6.515744420666463, "grad_norm": 0.29302966594696045, "learning_rate": 2.4287716020551145e-05, "loss": 0.1068, "step": 21313 }, { "epoch": 6.516050137572607, "grad_norm": 0.384807288646698, "learning_rate": 2.4287291410131207e-05, "loss": 0.0888, "step": 21314 }, { "epoch": 6.5163558544787525, "grad_norm": 0.6312841176986694, "learning_rate": 2.4286866799711266e-05, "loss": 0.0907, "step": 21315 }, { "epoch": 6.516661571384898, "grad_norm": 0.4344030022621155, "learning_rate": 2.4286442189291328e-05, "loss": 0.138, "step": 21316 }, { "epoch": 6.516967288291043, "grad_norm": 0.6529787182807922, "learning_rate": 2.4286017578871387e-05, "loss": 0.1314, "step": 21317 }, { "epoch": 6.517273005197188, "grad_norm": 0.9567509889602661, "learning_rate": 2.428559296845145e-05, "loss": 0.138, "step": 21318 }, { "epoch": 6.517578722103332, "grad_norm": 0.4814702868461609, "learning_rate": 2.4285168358031508e-05, "loss": 0.1332, "step": 21319 }, { "epoch": 6.517884439009477, "grad_norm": 0.5687755942344666, "learning_rate": 2.428474374761157e-05, "loss": 0.1647, "step": 21320 }, { "epoch": 6.518190155915622, "grad_norm": 1.1843624114990234, "learning_rate": 2.428431913719163e-05, "loss": 0.1712, "step": 21321 }, { "epoch": 6.518495872821767, "grad_norm": 1.0355885028839111, "learning_rate": 2.4283894526771687e-05, "loss": 0.1726, "step": 21322 }, { "epoch": 6.518801589727912, "grad_norm": 0.5939488410949707, "learning_rate": 2.428346991635175e-05, "loss": 0.1509, "step": 21323 }, { "epoch": 6.519107306634057, "grad_norm": 0.5657289624214172, "learning_rate": 2.4283045305931808e-05, "loss": 0.1623, "step": 21324 }, { "epoch": 6.519413023540202, "grad_norm": 0.5840144753456116, "learning_rate": 2.428262069551187e-05, "loss": 0.1649, "step": 21325 }, { "epoch": 6.519718740446347, "grad_norm": 0.8190557360649109, "learning_rate": 2.428219608509193e-05, "loss": 0.2276, "step": 21326 }, { "epoch": 6.520024457352491, "grad_norm": 0.6172247529029846, "learning_rate": 2.428177147467199e-05, "loss": 0.1471, "step": 21327 }, { "epoch": 6.520330174258636, "grad_norm": 0.47862452268600464, "learning_rate": 2.428134686425205e-05, "loss": 0.079, "step": 21328 }, { "epoch": 6.5206358911647815, "grad_norm": 0.3313983082771301, "learning_rate": 2.428092225383211e-05, "loss": 0.0802, "step": 21329 }, { "epoch": 6.520941608070927, "grad_norm": 0.4201260507106781, "learning_rate": 2.428049764341217e-05, "loss": 0.0624, "step": 21330 }, { "epoch": 6.521247324977072, "grad_norm": 0.2751399874687195, "learning_rate": 2.4280073032992232e-05, "loss": 0.0476, "step": 21331 }, { "epoch": 6.521553041883216, "grad_norm": 0.3962589502334595, "learning_rate": 2.427964842257229e-05, "loss": 0.0374, "step": 21332 }, { "epoch": 6.521858758789361, "grad_norm": 0.7066413760185242, "learning_rate": 2.4279223812152353e-05, "loss": 0.0529, "step": 21333 }, { "epoch": 6.522164475695506, "grad_norm": 0.22119416296482086, "learning_rate": 2.4278799201732412e-05, "loss": 0.0763, "step": 21334 }, { "epoch": 6.52247019260165, "grad_norm": 0.2598560154438019, "learning_rate": 2.427837459131247e-05, "loss": 0.0623, "step": 21335 }, { "epoch": 6.5227759095077955, "grad_norm": 0.3194124698638916, "learning_rate": 2.4277949980892533e-05, "loss": 0.0505, "step": 21336 }, { "epoch": 6.523081626413941, "grad_norm": 0.35980331897735596, "learning_rate": 2.427752537047259e-05, "loss": 0.0679, "step": 21337 }, { "epoch": 6.523387343320086, "grad_norm": 0.15765802562236786, "learning_rate": 2.4277100760052654e-05, "loss": 0.0566, "step": 21338 }, { "epoch": 6.523693060226231, "grad_norm": 0.5193417072296143, "learning_rate": 2.4276676149632712e-05, "loss": 0.0969, "step": 21339 }, { "epoch": 6.523998777132375, "grad_norm": 0.7768480777740479, "learning_rate": 2.4276251539212774e-05, "loss": 0.1059, "step": 21340 }, { "epoch": 6.52430449403852, "grad_norm": 0.5310235023498535, "learning_rate": 2.4275826928792833e-05, "loss": 0.1596, "step": 21341 }, { "epoch": 6.524610210944665, "grad_norm": 0.5618026256561279, "learning_rate": 2.4275402318372895e-05, "loss": 0.1461, "step": 21342 }, { "epoch": 6.52491592785081, "grad_norm": 0.5595186352729797, "learning_rate": 2.4274977707952954e-05, "loss": 0.1554, "step": 21343 }, { "epoch": 6.5252216447569555, "grad_norm": 1.3093352317810059, "learning_rate": 2.4274553097533016e-05, "loss": 0.1391, "step": 21344 }, { "epoch": 6.5255273616631, "grad_norm": 1.5242550373077393, "learning_rate": 2.4274128487113075e-05, "loss": 0.166, "step": 21345 }, { "epoch": 6.525833078569245, "grad_norm": 0.8524599075317383, "learning_rate": 2.4273703876693137e-05, "loss": 0.2134, "step": 21346 }, { "epoch": 6.52613879547539, "grad_norm": 0.5898444056510925, "learning_rate": 2.4273279266273195e-05, "loss": 0.1576, "step": 21347 }, { "epoch": 6.526444512381534, "grad_norm": 0.881070077419281, "learning_rate": 2.4272854655853254e-05, "loss": 0.1987, "step": 21348 }, { "epoch": 6.526750229287679, "grad_norm": 0.9966383576393127, "learning_rate": 2.4272430045433316e-05, "loss": 0.1517, "step": 21349 }, { "epoch": 6.5270559461938245, "grad_norm": 1.118078589439392, "learning_rate": 2.4272005435013375e-05, "loss": 0.2021, "step": 21350 }, { "epoch": 6.52736166309997, "grad_norm": 1.0858256816864014, "learning_rate": 2.4271580824593437e-05, "loss": 0.1997, "step": 21351 }, { "epoch": 6.527667380006115, "grad_norm": 0.5372995734214783, "learning_rate": 2.4271156214173496e-05, "loss": 0.1345, "step": 21352 }, { "epoch": 6.527973096912259, "grad_norm": 0.45839589834213257, "learning_rate": 2.4270731603753558e-05, "loss": 0.077, "step": 21353 }, { "epoch": 6.528278813818404, "grad_norm": 0.3019616901874542, "learning_rate": 2.4270306993333616e-05, "loss": 0.0628, "step": 21354 }, { "epoch": 6.528584530724549, "grad_norm": 0.2061017006635666, "learning_rate": 2.426988238291368e-05, "loss": 0.0737, "step": 21355 }, { "epoch": 6.528890247630694, "grad_norm": 0.2577838897705078, "learning_rate": 2.4269457772493737e-05, "loss": 0.0617, "step": 21356 }, { "epoch": 6.529195964536839, "grad_norm": 0.22829601168632507, "learning_rate": 2.42690331620738e-05, "loss": 0.0573, "step": 21357 }, { "epoch": 6.529501681442984, "grad_norm": 0.34275704622268677, "learning_rate": 2.4268608551653858e-05, "loss": 0.0589, "step": 21358 }, { "epoch": 6.529807398349129, "grad_norm": 0.3100472390651703, "learning_rate": 2.426818394123392e-05, "loss": 0.051, "step": 21359 }, { "epoch": 6.530113115255274, "grad_norm": 0.41700103878974915, "learning_rate": 2.426775933081398e-05, "loss": 0.0707, "step": 21360 }, { "epoch": 6.530418832161418, "grad_norm": 0.41712313890457153, "learning_rate": 2.4267334720394037e-05, "loss": 0.0627, "step": 21361 }, { "epoch": 6.530724549067563, "grad_norm": 0.18496577441692352, "learning_rate": 2.42669101099741e-05, "loss": 0.0694, "step": 21362 }, { "epoch": 6.531030265973708, "grad_norm": 0.2805708050727844, "learning_rate": 2.4266485499554158e-05, "loss": 0.0687, "step": 21363 }, { "epoch": 6.531335982879853, "grad_norm": 0.30385440587997437, "learning_rate": 2.426606088913422e-05, "loss": 0.0945, "step": 21364 }, { "epoch": 6.5316416997859985, "grad_norm": 0.5005346536636353, "learning_rate": 2.426563627871428e-05, "loss": 0.1201, "step": 21365 }, { "epoch": 6.531947416692143, "grad_norm": 0.8626431226730347, "learning_rate": 2.426521166829434e-05, "loss": 0.0956, "step": 21366 }, { "epoch": 6.532253133598288, "grad_norm": 0.3460749685764313, "learning_rate": 2.42647870578744e-05, "loss": 0.117, "step": 21367 }, { "epoch": 6.532558850504433, "grad_norm": 0.6125866174697876, "learning_rate": 2.4264362447454462e-05, "loss": 0.1184, "step": 21368 }, { "epoch": 6.532864567410578, "grad_norm": 0.5906453132629395, "learning_rate": 2.426393783703452e-05, "loss": 0.1606, "step": 21369 }, { "epoch": 6.533170284316723, "grad_norm": 1.1285871267318726, "learning_rate": 2.4263513226614583e-05, "loss": 0.1824, "step": 21370 }, { "epoch": 6.533476001222867, "grad_norm": 0.41077136993408203, "learning_rate": 2.426308861619464e-05, "loss": 0.159, "step": 21371 }, { "epoch": 6.5337817181290125, "grad_norm": 0.9351300001144409, "learning_rate": 2.42626640057747e-05, "loss": 0.1983, "step": 21372 }, { "epoch": 6.534087435035158, "grad_norm": 0.5744179487228394, "learning_rate": 2.4262239395354762e-05, "loss": 0.2129, "step": 21373 }, { "epoch": 6.534393151941302, "grad_norm": 1.060882329940796, "learning_rate": 2.426181478493482e-05, "loss": 0.2115, "step": 21374 }, { "epoch": 6.534698868847447, "grad_norm": 7.971945762634277, "learning_rate": 2.4261390174514883e-05, "loss": 0.1933, "step": 21375 }, { "epoch": 6.535004585753592, "grad_norm": 0.8472995162010193, "learning_rate": 2.4260965564094942e-05, "loss": 0.2243, "step": 21376 }, { "epoch": 6.535310302659737, "grad_norm": 0.7216202020645142, "learning_rate": 2.4260540953675004e-05, "loss": 0.1609, "step": 21377 }, { "epoch": 6.535616019565882, "grad_norm": 0.25469517707824707, "learning_rate": 2.4260116343255063e-05, "loss": 0.0846, "step": 21378 }, { "epoch": 6.535921736472027, "grad_norm": 0.17780576646327972, "learning_rate": 2.4259691732835125e-05, "loss": 0.0576, "step": 21379 }, { "epoch": 6.536227453378172, "grad_norm": 0.6957874894142151, "learning_rate": 2.4259267122415183e-05, "loss": 0.0484, "step": 21380 }, { "epoch": 6.536533170284317, "grad_norm": 0.3188183903694153, "learning_rate": 2.4258842511995245e-05, "loss": 0.0709, "step": 21381 }, { "epoch": 6.536838887190462, "grad_norm": 0.4460086524486542, "learning_rate": 2.4258417901575304e-05, "loss": 0.0561, "step": 21382 }, { "epoch": 6.537144604096607, "grad_norm": 0.3193986713886261, "learning_rate": 2.4257993291155366e-05, "loss": 0.0584, "step": 21383 }, { "epoch": 6.537450321002751, "grad_norm": 0.7942701578140259, "learning_rate": 2.4257568680735425e-05, "loss": 0.056, "step": 21384 }, { "epoch": 6.537756037908896, "grad_norm": 0.7319483160972595, "learning_rate": 2.4257144070315484e-05, "loss": 0.0599, "step": 21385 }, { "epoch": 6.5380617548150415, "grad_norm": 0.20501072704792023, "learning_rate": 2.4256719459895546e-05, "loss": 0.052, "step": 21386 }, { "epoch": 6.538367471721186, "grad_norm": 0.3337918221950531, "learning_rate": 2.4256294849475604e-05, "loss": 0.1284, "step": 21387 }, { "epoch": 6.538673188627331, "grad_norm": 0.2443014234304428, "learning_rate": 2.4255870239055666e-05, "loss": 0.0647, "step": 21388 }, { "epoch": 6.538978905533476, "grad_norm": 0.2982514500617981, "learning_rate": 2.4255445628635725e-05, "loss": 0.0789, "step": 21389 }, { "epoch": 6.539284622439621, "grad_norm": 0.3146127462387085, "learning_rate": 2.4255021018215787e-05, "loss": 0.1026, "step": 21390 }, { "epoch": 6.539590339345766, "grad_norm": 0.7553523182868958, "learning_rate": 2.4254596407795846e-05, "loss": 0.1052, "step": 21391 }, { "epoch": 6.53989605625191, "grad_norm": 1.1047742366790771, "learning_rate": 2.4254171797375908e-05, "loss": 0.1814, "step": 21392 }, { "epoch": 6.5402017731580555, "grad_norm": 0.9679626226425171, "learning_rate": 2.4253747186955967e-05, "loss": 0.1488, "step": 21393 }, { "epoch": 6.540507490064201, "grad_norm": 0.6713182330131531, "learning_rate": 2.425332257653603e-05, "loss": 0.1889, "step": 21394 }, { "epoch": 6.540813206970346, "grad_norm": 0.7543385028839111, "learning_rate": 2.4252897966116088e-05, "loss": 0.1205, "step": 21395 }, { "epoch": 6.541118923876491, "grad_norm": 0.6076797246932983, "learning_rate": 2.425247335569615e-05, "loss": 0.1594, "step": 21396 }, { "epoch": 6.541424640782635, "grad_norm": 0.3921612501144409, "learning_rate": 2.4252048745276208e-05, "loss": 0.1625, "step": 21397 }, { "epoch": 6.54173035768878, "grad_norm": 0.6033971905708313, "learning_rate": 2.4251624134856267e-05, "loss": 0.1797, "step": 21398 }, { "epoch": 6.542036074594925, "grad_norm": 1.5462969541549683, "learning_rate": 2.425119952443633e-05, "loss": 0.2951, "step": 21399 }, { "epoch": 6.5423417915010695, "grad_norm": 1.0718393325805664, "learning_rate": 2.4250774914016388e-05, "loss": 0.1617, "step": 21400 }, { "epoch": 6.542647508407215, "grad_norm": 1.254709005355835, "learning_rate": 2.425035030359645e-05, "loss": 0.1964, "step": 21401 }, { "epoch": 6.54295322531336, "grad_norm": 0.4664350152015686, "learning_rate": 2.424992569317651e-05, "loss": 0.1407, "step": 21402 }, { "epoch": 6.543258942219505, "grad_norm": 0.23865920305252075, "learning_rate": 2.424950108275657e-05, "loss": 0.0702, "step": 21403 }, { "epoch": 6.54356465912565, "grad_norm": 0.32319432497024536, "learning_rate": 2.424907647233663e-05, "loss": 0.0558, "step": 21404 }, { "epoch": 6.543870376031794, "grad_norm": 0.205963134765625, "learning_rate": 2.424865186191669e-05, "loss": 0.0597, "step": 21405 }, { "epoch": 6.544176092937939, "grad_norm": 0.4830262064933777, "learning_rate": 2.424822725149675e-05, "loss": 0.0532, "step": 21406 }, { "epoch": 6.5444818098440845, "grad_norm": 0.2058921456336975, "learning_rate": 2.4247802641076812e-05, "loss": 0.0636, "step": 21407 }, { "epoch": 6.54478752675023, "grad_norm": 1.0758419036865234, "learning_rate": 2.4247378030656874e-05, "loss": 0.072, "step": 21408 }, { "epoch": 6.545093243656375, "grad_norm": 0.248408243060112, "learning_rate": 2.4246953420236936e-05, "loss": 0.0561, "step": 21409 }, { "epoch": 6.545398960562519, "grad_norm": 0.31666430830955505, "learning_rate": 2.4246528809816995e-05, "loss": 0.0928, "step": 21410 }, { "epoch": 6.545704677468664, "grad_norm": 0.20483221113681793, "learning_rate": 2.4246104199397054e-05, "loss": 0.0799, "step": 21411 }, { "epoch": 6.546010394374809, "grad_norm": 0.4147005081176758, "learning_rate": 2.4245679588977116e-05, "loss": 0.0752, "step": 21412 }, { "epoch": 6.546316111280953, "grad_norm": 0.5322294235229492, "learning_rate": 2.4245254978557175e-05, "loss": 0.0826, "step": 21413 }, { "epoch": 6.5466218281870985, "grad_norm": 0.29105985164642334, "learning_rate": 2.4244830368137237e-05, "loss": 0.0923, "step": 21414 }, { "epoch": 6.546927545093244, "grad_norm": 0.7869717478752136, "learning_rate": 2.4244405757717295e-05, "loss": 0.0991, "step": 21415 }, { "epoch": 6.547233261999389, "grad_norm": 0.6516363620758057, "learning_rate": 2.4243981147297357e-05, "loss": 0.1393, "step": 21416 }, { "epoch": 6.547538978905534, "grad_norm": 0.4654444754123688, "learning_rate": 2.4243556536877416e-05, "loss": 0.1177, "step": 21417 }, { "epoch": 6.547844695811678, "grad_norm": 0.8364895582199097, "learning_rate": 2.4243131926457478e-05, "loss": 0.162, "step": 21418 }, { "epoch": 6.548150412717823, "grad_norm": 1.0504887104034424, "learning_rate": 2.4242707316037537e-05, "loss": 0.152, "step": 21419 }, { "epoch": 6.548456129623968, "grad_norm": 0.6694566607475281, "learning_rate": 2.42422827056176e-05, "loss": 0.1607, "step": 21420 }, { "epoch": 6.548761846530113, "grad_norm": 1.5516300201416016, "learning_rate": 2.4241858095197658e-05, "loss": 0.2039, "step": 21421 }, { "epoch": 6.5490675634362585, "grad_norm": 0.7954932451248169, "learning_rate": 2.424143348477772e-05, "loss": 0.1763, "step": 21422 }, { "epoch": 6.549373280342403, "grad_norm": 0.9332907199859619, "learning_rate": 2.424100887435778e-05, "loss": 0.1739, "step": 21423 }, { "epoch": 6.549678997248548, "grad_norm": 1.1822810173034668, "learning_rate": 2.4240584263937837e-05, "loss": 0.2194, "step": 21424 }, { "epoch": 6.549984714154693, "grad_norm": 0.6618886590003967, "learning_rate": 2.42401596535179e-05, "loss": 0.1702, "step": 21425 }, { "epoch": 6.550290431060837, "grad_norm": 0.8764485120773315, "learning_rate": 2.4239735043097958e-05, "loss": 0.2548, "step": 21426 }, { "epoch": 6.550596147966982, "grad_norm": 0.5309696793556213, "learning_rate": 2.423931043267802e-05, "loss": 0.1564, "step": 21427 }, { "epoch": 6.550901864873127, "grad_norm": 0.31977733969688416, "learning_rate": 2.423888582225808e-05, "loss": 0.0859, "step": 21428 }, { "epoch": 6.5512075817792725, "grad_norm": 0.27919232845306396, "learning_rate": 2.423846121183814e-05, "loss": 0.0861, "step": 21429 }, { "epoch": 6.551513298685418, "grad_norm": 0.14627225697040558, "learning_rate": 2.42380366014182e-05, "loss": 0.0401, "step": 21430 }, { "epoch": 6.551819015591562, "grad_norm": 0.29828986525535583, "learning_rate": 2.423761199099826e-05, "loss": 0.0763, "step": 21431 }, { "epoch": 6.552124732497707, "grad_norm": 0.4607507884502411, "learning_rate": 2.423718738057832e-05, "loss": 0.0612, "step": 21432 }, { "epoch": 6.552430449403852, "grad_norm": 0.21044307947158813, "learning_rate": 2.4236762770158382e-05, "loss": 0.0595, "step": 21433 }, { "epoch": 6.552736166309997, "grad_norm": 0.2374926060438156, "learning_rate": 2.423633815973844e-05, "loss": 0.0702, "step": 21434 }, { "epoch": 6.553041883216142, "grad_norm": 0.7816290259361267, "learning_rate": 2.4235913549318503e-05, "loss": 0.1025, "step": 21435 }, { "epoch": 6.553347600122287, "grad_norm": 0.25926247239112854, "learning_rate": 2.4235488938898562e-05, "loss": 0.0783, "step": 21436 }, { "epoch": 6.553653317028432, "grad_norm": 0.898669421672821, "learning_rate": 2.423506432847862e-05, "loss": 0.1018, "step": 21437 }, { "epoch": 6.553959033934577, "grad_norm": 0.5319616198539734, "learning_rate": 2.4234639718058683e-05, "loss": 0.0668, "step": 21438 }, { "epoch": 6.554264750840721, "grad_norm": 0.41189685463905334, "learning_rate": 2.423421510763874e-05, "loss": 0.1086, "step": 21439 }, { "epoch": 6.554570467746866, "grad_norm": 0.5618849396705627, "learning_rate": 2.4233790497218804e-05, "loss": 0.1191, "step": 21440 }, { "epoch": 6.554876184653011, "grad_norm": 0.5041928291320801, "learning_rate": 2.4233365886798862e-05, "loss": 0.1072, "step": 21441 }, { "epoch": 6.555181901559156, "grad_norm": 1.7797802686691284, "learning_rate": 2.4232941276378924e-05, "loss": 0.1461, "step": 21442 }, { "epoch": 6.5554876184653015, "grad_norm": 0.471292108297348, "learning_rate": 2.4232516665958983e-05, "loss": 0.1519, "step": 21443 }, { "epoch": 6.555793335371446, "grad_norm": 0.5895012617111206, "learning_rate": 2.4232092055539045e-05, "loss": 0.1789, "step": 21444 }, { "epoch": 6.556099052277591, "grad_norm": 0.5048537254333496, "learning_rate": 2.4231667445119104e-05, "loss": 0.1409, "step": 21445 }, { "epoch": 6.556404769183736, "grad_norm": 0.9241306185722351, "learning_rate": 2.4231242834699166e-05, "loss": 0.1507, "step": 21446 }, { "epoch": 6.556710486089881, "grad_norm": 0.4468935430049896, "learning_rate": 2.4230818224279225e-05, "loss": 0.141, "step": 21447 }, { "epoch": 6.557016202996026, "grad_norm": 1.6656941175460815, "learning_rate": 2.4230393613859287e-05, "loss": 0.1983, "step": 21448 }, { "epoch": 6.55732191990217, "grad_norm": 2.4706084728240967, "learning_rate": 2.4229969003439345e-05, "loss": 0.1496, "step": 21449 }, { "epoch": 6.5576276368083155, "grad_norm": 0.7512071132659912, "learning_rate": 2.4229544393019404e-05, "loss": 0.1815, "step": 21450 }, { "epoch": 6.557933353714461, "grad_norm": 1.999038577079773, "learning_rate": 2.4229119782599466e-05, "loss": 0.2096, "step": 21451 }, { "epoch": 6.558239070620605, "grad_norm": 0.6486340165138245, "learning_rate": 2.4228695172179525e-05, "loss": 0.1047, "step": 21452 }, { "epoch": 6.55854478752675, "grad_norm": 1.2017101049423218, "learning_rate": 2.4228270561759587e-05, "loss": 0.0651, "step": 21453 }, { "epoch": 6.558850504432895, "grad_norm": 0.32120513916015625, "learning_rate": 2.4227845951339646e-05, "loss": 0.0586, "step": 21454 }, { "epoch": 6.55915622133904, "grad_norm": 0.19033850729465485, "learning_rate": 2.4227421340919708e-05, "loss": 0.0608, "step": 21455 }, { "epoch": 6.559461938245185, "grad_norm": 0.33510830998420715, "learning_rate": 2.4226996730499766e-05, "loss": 0.0574, "step": 21456 }, { "epoch": 6.5597676551513295, "grad_norm": 0.14582563936710358, "learning_rate": 2.422657212007983e-05, "loss": 0.0371, "step": 21457 }, { "epoch": 6.560073372057475, "grad_norm": 0.21201486885547638, "learning_rate": 2.4226147509659887e-05, "loss": 0.0554, "step": 21458 }, { "epoch": 6.56037908896362, "grad_norm": 0.18885989487171173, "learning_rate": 2.422572289923995e-05, "loss": 0.0449, "step": 21459 }, { "epoch": 6.560684805869765, "grad_norm": 0.3291248381137848, "learning_rate": 2.4225298288820008e-05, "loss": 0.0877, "step": 21460 }, { "epoch": 6.56099052277591, "grad_norm": 0.23065690696239471, "learning_rate": 2.422487367840007e-05, "loss": 0.056, "step": 21461 }, { "epoch": 6.561296239682054, "grad_norm": 0.2912161648273468, "learning_rate": 2.422444906798013e-05, "loss": 0.0771, "step": 21462 }, { "epoch": 6.561601956588199, "grad_norm": 1.230757236480713, "learning_rate": 2.4224024457560188e-05, "loss": 0.0692, "step": 21463 }, { "epoch": 6.5619076734943445, "grad_norm": 0.45426085591316223, "learning_rate": 2.422359984714025e-05, "loss": 0.1054, "step": 21464 }, { "epoch": 6.562213390400489, "grad_norm": 0.9157555103302002, "learning_rate": 2.422317523672031e-05, "loss": 0.1252, "step": 21465 }, { "epoch": 6.562519107306634, "grad_norm": 0.5859290957450867, "learning_rate": 2.422275062630037e-05, "loss": 0.1121, "step": 21466 }, { "epoch": 6.562824824212779, "grad_norm": 0.9746419787406921, "learning_rate": 2.422232601588043e-05, "loss": 0.159, "step": 21467 }, { "epoch": 6.563130541118924, "grad_norm": 0.34775224328041077, "learning_rate": 2.422190140546049e-05, "loss": 0.1263, "step": 21468 }, { "epoch": 6.563436258025069, "grad_norm": 0.3219265937805176, "learning_rate": 2.422147679504055e-05, "loss": 0.1371, "step": 21469 }, { "epoch": 6.563741974931213, "grad_norm": 0.9737737774848938, "learning_rate": 2.4221052184620612e-05, "loss": 0.1531, "step": 21470 }, { "epoch": 6.5640476918373585, "grad_norm": 0.5295964479446411, "learning_rate": 2.422062757420067e-05, "loss": 0.1634, "step": 21471 }, { "epoch": 6.564353408743504, "grad_norm": 0.8467267751693726, "learning_rate": 2.4220202963780733e-05, "loss": 0.1774, "step": 21472 }, { "epoch": 6.564659125649649, "grad_norm": 1.0272302627563477, "learning_rate": 2.421977835336079e-05, "loss": 0.1744, "step": 21473 }, { "epoch": 6.564964842555794, "grad_norm": 1.071305513381958, "learning_rate": 2.4219353742940854e-05, "loss": 0.1955, "step": 21474 }, { "epoch": 6.565270559461938, "grad_norm": 7.092102527618408, "learning_rate": 2.4218929132520912e-05, "loss": 0.1699, "step": 21475 }, { "epoch": 6.565576276368083, "grad_norm": 2.2878289222717285, "learning_rate": 2.421850452210097e-05, "loss": 0.2555, "step": 21476 }, { "epoch": 6.565881993274228, "grad_norm": 0.29457756876945496, "learning_rate": 2.4218079911681033e-05, "loss": 0.1374, "step": 21477 }, { "epoch": 6.5661877101803725, "grad_norm": 0.20888184010982513, "learning_rate": 2.4217655301261092e-05, "loss": 0.0811, "step": 21478 }, { "epoch": 6.566493427086518, "grad_norm": 0.31152525544166565, "learning_rate": 2.4217230690841154e-05, "loss": 0.0913, "step": 21479 }, { "epoch": 6.566799143992663, "grad_norm": 0.310515820980072, "learning_rate": 2.4216806080421213e-05, "loss": 0.0511, "step": 21480 }, { "epoch": 6.567104860898808, "grad_norm": 0.3404650390148163, "learning_rate": 2.4216381470001275e-05, "loss": 0.0583, "step": 21481 }, { "epoch": 6.567410577804953, "grad_norm": 0.19521306455135345, "learning_rate": 2.4215956859581333e-05, "loss": 0.0384, "step": 21482 }, { "epoch": 6.567716294711097, "grad_norm": 0.5852373838424683, "learning_rate": 2.4215532249161395e-05, "loss": 0.0561, "step": 21483 }, { "epoch": 6.568022011617242, "grad_norm": 0.18953937292099, "learning_rate": 2.4215107638741454e-05, "loss": 0.045, "step": 21484 }, { "epoch": 6.568327728523387, "grad_norm": 0.31114041805267334, "learning_rate": 2.4214683028321516e-05, "loss": 0.0545, "step": 21485 }, { "epoch": 6.5686334454295325, "grad_norm": 0.31017008423805237, "learning_rate": 2.4214258417901575e-05, "loss": 0.0723, "step": 21486 }, { "epoch": 6.568939162335678, "grad_norm": 0.24035640060901642, "learning_rate": 2.4213833807481634e-05, "loss": 0.0709, "step": 21487 }, { "epoch": 6.569244879241822, "grad_norm": 0.45373982191085815, "learning_rate": 2.4213409197061696e-05, "loss": 0.0829, "step": 21488 }, { "epoch": 6.569550596147967, "grad_norm": 0.610776424407959, "learning_rate": 2.4212984586641754e-05, "loss": 0.0911, "step": 21489 }, { "epoch": 6.569856313054112, "grad_norm": 0.4006282389163971, "learning_rate": 2.4212559976221816e-05, "loss": 0.1106, "step": 21490 }, { "epoch": 6.570162029960256, "grad_norm": 0.4460471272468567, "learning_rate": 2.4212135365801875e-05, "loss": 0.1237, "step": 21491 }, { "epoch": 6.5704677468664014, "grad_norm": 0.40771064162254333, "learning_rate": 2.4211710755381937e-05, "loss": 0.1159, "step": 21492 }, { "epoch": 6.570773463772547, "grad_norm": 0.48671603202819824, "learning_rate": 2.4211286144961996e-05, "loss": 0.1278, "step": 21493 }, { "epoch": 6.571079180678692, "grad_norm": 0.5469728708267212, "learning_rate": 2.4210861534542058e-05, "loss": 0.1146, "step": 21494 }, { "epoch": 6.571384897584837, "grad_norm": 1.141882300376892, "learning_rate": 2.4210436924122117e-05, "loss": 0.1208, "step": 21495 }, { "epoch": 6.571690614490981, "grad_norm": 0.9741637706756592, "learning_rate": 2.421001231370218e-05, "loss": 0.1516, "step": 21496 }, { "epoch": 6.571996331397126, "grad_norm": 1.3839057683944702, "learning_rate": 2.4209587703282238e-05, "loss": 0.1293, "step": 21497 }, { "epoch": 6.572302048303271, "grad_norm": 2.0233702659606934, "learning_rate": 2.42091630928623e-05, "loss": 0.1784, "step": 21498 }, { "epoch": 6.572607765209416, "grad_norm": 1.203979730606079, "learning_rate": 2.420873848244236e-05, "loss": 0.1907, "step": 21499 }, { "epoch": 6.5729134821155615, "grad_norm": 2.6000478267669678, "learning_rate": 2.4208313872022417e-05, "loss": 0.1762, "step": 21500 }, { "epoch": 6.573219199021706, "grad_norm": 1.0578478574752808, "learning_rate": 2.420788926160248e-05, "loss": 0.2184, "step": 21501 }, { "epoch": 6.573524915927851, "grad_norm": 0.8424854278564453, "learning_rate": 2.4207464651182538e-05, "loss": 0.1538, "step": 21502 }, { "epoch": 6.573830632833996, "grad_norm": 0.6758025288581848, "learning_rate": 2.42070400407626e-05, "loss": 0.0862, "step": 21503 }, { "epoch": 6.57413634974014, "grad_norm": 0.22588768601417542, "learning_rate": 2.420661543034266e-05, "loss": 0.0815, "step": 21504 }, { "epoch": 6.574442066646285, "grad_norm": 1.0673227310180664, "learning_rate": 2.420619081992272e-05, "loss": 0.0851, "step": 21505 }, { "epoch": 6.57474778355243, "grad_norm": 0.387033075094223, "learning_rate": 2.420576620950278e-05, "loss": 0.0742, "step": 21506 }, { "epoch": 6.5750535004585755, "grad_norm": 0.14075063169002533, "learning_rate": 2.420534159908284e-05, "loss": 0.0445, "step": 21507 }, { "epoch": 6.575359217364721, "grad_norm": 0.230246901512146, "learning_rate": 2.42049169886629e-05, "loss": 0.0511, "step": 21508 }, { "epoch": 6.575664934270865, "grad_norm": 0.47865694761276245, "learning_rate": 2.4204492378242962e-05, "loss": 0.0405, "step": 21509 }, { "epoch": 6.57597065117701, "grad_norm": 0.4013589322566986, "learning_rate": 2.4204067767823024e-05, "loss": 0.0886, "step": 21510 }, { "epoch": 6.576276368083155, "grad_norm": 1.526576042175293, "learning_rate": 2.4203643157403086e-05, "loss": 0.0675, "step": 21511 }, { "epoch": 6.5765820849893, "grad_norm": 0.5929735898971558, "learning_rate": 2.4203218546983145e-05, "loss": 0.0831, "step": 21512 }, { "epoch": 6.576887801895445, "grad_norm": 0.3228267729282379, "learning_rate": 2.4202793936563204e-05, "loss": 0.1094, "step": 21513 }, { "epoch": 6.5771935188015895, "grad_norm": 0.39392393827438354, "learning_rate": 2.4202369326143266e-05, "loss": 0.1127, "step": 21514 }, { "epoch": 6.577499235707735, "grad_norm": 0.2872825264930725, "learning_rate": 2.4201944715723325e-05, "loss": 0.089, "step": 21515 }, { "epoch": 6.57780495261388, "grad_norm": 1.4553325176239014, "learning_rate": 2.4201520105303387e-05, "loss": 0.1507, "step": 21516 }, { "epoch": 6.578110669520024, "grad_norm": 0.39942997694015503, "learning_rate": 2.4201095494883445e-05, "loss": 0.1141, "step": 21517 }, { "epoch": 6.578416386426169, "grad_norm": 0.4784875810146332, "learning_rate": 2.4200670884463508e-05, "loss": 0.1765, "step": 21518 }, { "epoch": 6.578722103332314, "grad_norm": 2.973201274871826, "learning_rate": 2.4200246274043566e-05, "loss": 0.1271, "step": 21519 }, { "epoch": 6.579027820238459, "grad_norm": 1.028002142906189, "learning_rate": 2.4199821663623628e-05, "loss": 0.1381, "step": 21520 }, { "epoch": 6.5793335371446044, "grad_norm": 0.5714580416679382, "learning_rate": 2.4199397053203687e-05, "loss": 0.1765, "step": 21521 }, { "epoch": 6.579639254050749, "grad_norm": 0.580322802066803, "learning_rate": 2.419897244278375e-05, "loss": 0.1605, "step": 21522 }, { "epoch": 6.579944970956894, "grad_norm": 0.9143041372299194, "learning_rate": 2.4198547832363808e-05, "loss": 0.1885, "step": 21523 }, { "epoch": 6.580250687863039, "grad_norm": 1.3823963403701782, "learning_rate": 2.419812322194387e-05, "loss": 0.1681, "step": 21524 }, { "epoch": 6.580556404769184, "grad_norm": 2.1119890213012695, "learning_rate": 2.419769861152393e-05, "loss": 0.1898, "step": 21525 }, { "epoch": 6.580862121675329, "grad_norm": 3.6182100772857666, "learning_rate": 2.4197274001103987e-05, "loss": 0.2353, "step": 21526 }, { "epoch": 6.581167838581473, "grad_norm": 0.27683085203170776, "learning_rate": 2.419684939068405e-05, "loss": 0.1125, "step": 21527 }, { "epoch": 6.5814735554876185, "grad_norm": 0.19568634033203125, "learning_rate": 2.4196424780264108e-05, "loss": 0.0593, "step": 21528 }, { "epoch": 6.581779272393764, "grad_norm": 0.2849598228931427, "learning_rate": 2.419600016984417e-05, "loss": 0.0873, "step": 21529 }, { "epoch": 6.582084989299908, "grad_norm": 0.17895324528217316, "learning_rate": 2.419557555942423e-05, "loss": 0.0736, "step": 21530 }, { "epoch": 6.582390706206053, "grad_norm": 0.2753433883190155, "learning_rate": 2.419515094900429e-05, "loss": 0.0462, "step": 21531 }, { "epoch": 6.582696423112198, "grad_norm": 0.34650832414627075, "learning_rate": 2.419472633858435e-05, "loss": 0.0455, "step": 21532 }, { "epoch": 6.583002140018343, "grad_norm": 0.6152823567390442, "learning_rate": 2.4194301728164412e-05, "loss": 0.0605, "step": 21533 }, { "epoch": 6.583307856924488, "grad_norm": 0.29658499360084534, "learning_rate": 2.419387711774447e-05, "loss": 0.0674, "step": 21534 }, { "epoch": 6.5836135738306325, "grad_norm": 0.4074154496192932, "learning_rate": 2.4193452507324533e-05, "loss": 0.0596, "step": 21535 }, { "epoch": 6.583919290736778, "grad_norm": 0.23984475433826447, "learning_rate": 2.419302789690459e-05, "loss": 0.0502, "step": 21536 }, { "epoch": 6.584225007642923, "grad_norm": 0.7354786396026611, "learning_rate": 2.4192603286484653e-05, "loss": 0.1193, "step": 21537 }, { "epoch": 6.584530724549068, "grad_norm": 0.3334639370441437, "learning_rate": 2.4192178676064712e-05, "loss": 0.0764, "step": 21538 }, { "epoch": 6.584836441455213, "grad_norm": 0.3388093113899231, "learning_rate": 2.419175406564477e-05, "loss": 0.1035, "step": 21539 }, { "epoch": 6.585142158361357, "grad_norm": 0.4363846480846405, "learning_rate": 2.4191329455224833e-05, "loss": 0.0809, "step": 21540 }, { "epoch": 6.585447875267502, "grad_norm": 0.7034065127372742, "learning_rate": 2.419090484480489e-05, "loss": 0.1491, "step": 21541 }, { "epoch": 6.585753592173647, "grad_norm": 1.4187690019607544, "learning_rate": 2.4190480234384954e-05, "loss": 0.1163, "step": 21542 }, { "epoch": 6.586059309079792, "grad_norm": 0.9009736180305481, "learning_rate": 2.4190055623965012e-05, "loss": 0.1708, "step": 21543 }, { "epoch": 6.586365025985937, "grad_norm": 1.2943003177642822, "learning_rate": 2.4189631013545074e-05, "loss": 0.1745, "step": 21544 }, { "epoch": 6.586670742892082, "grad_norm": 0.5051141977310181, "learning_rate": 2.4189206403125133e-05, "loss": 0.1367, "step": 21545 }, { "epoch": 6.586976459798227, "grad_norm": 0.9987176060676575, "learning_rate": 2.4188781792705195e-05, "loss": 0.1477, "step": 21546 }, { "epoch": 6.587282176704372, "grad_norm": 0.9167550206184387, "learning_rate": 2.4188357182285254e-05, "loss": 0.1448, "step": 21547 }, { "epoch": 6.587587893610516, "grad_norm": 1.0655641555786133, "learning_rate": 2.4187932571865316e-05, "loss": 0.1866, "step": 21548 }, { "epoch": 6.5878936105166614, "grad_norm": 1.0270944833755493, "learning_rate": 2.4187507961445375e-05, "loss": 0.1941, "step": 21549 }, { "epoch": 6.588199327422807, "grad_norm": 1.560840129852295, "learning_rate": 2.4187083351025437e-05, "loss": 0.1861, "step": 21550 }, { "epoch": 6.588505044328952, "grad_norm": 4.310232639312744, "learning_rate": 2.4186658740605495e-05, "loss": 0.2073, "step": 21551 }, { "epoch": 6.588810761235097, "grad_norm": 0.47541362047195435, "learning_rate": 2.4186234130185554e-05, "loss": 0.131, "step": 21552 }, { "epoch": 6.589116478141241, "grad_norm": 0.603796124458313, "learning_rate": 2.4185809519765616e-05, "loss": 0.069, "step": 21553 }, { "epoch": 6.589422195047386, "grad_norm": 0.32061660289764404, "learning_rate": 2.4185384909345675e-05, "loss": 0.071, "step": 21554 }, { "epoch": 6.589727911953531, "grad_norm": 0.19017764925956726, "learning_rate": 2.4184960298925737e-05, "loss": 0.0515, "step": 21555 }, { "epoch": 6.5900336288596755, "grad_norm": 0.2091628760099411, "learning_rate": 2.4184535688505796e-05, "loss": 0.0583, "step": 21556 }, { "epoch": 6.590339345765821, "grad_norm": 0.3118257522583008, "learning_rate": 2.4184111078085858e-05, "loss": 0.0724, "step": 21557 }, { "epoch": 6.590645062671966, "grad_norm": 0.22996442019939423, "learning_rate": 2.4183686467665916e-05, "loss": 0.073, "step": 21558 }, { "epoch": 6.590950779578111, "grad_norm": 0.4352387487888336, "learning_rate": 2.418326185724598e-05, "loss": 0.0688, "step": 21559 }, { "epoch": 6.591256496484256, "grad_norm": 0.7760034203529358, "learning_rate": 2.4182837246826037e-05, "loss": 0.0749, "step": 21560 }, { "epoch": 6.5915622133904, "grad_norm": 0.2411978840827942, "learning_rate": 2.41824126364061e-05, "loss": 0.0541, "step": 21561 }, { "epoch": 6.591867930296545, "grad_norm": 0.48004916310310364, "learning_rate": 2.4181988025986158e-05, "loss": 0.084, "step": 21562 }, { "epoch": 6.59217364720269, "grad_norm": 0.3816184401512146, "learning_rate": 2.418156341556622e-05, "loss": 0.0855, "step": 21563 }, { "epoch": 6.5924793641088355, "grad_norm": 0.7620307207107544, "learning_rate": 2.418113880514628e-05, "loss": 0.086, "step": 21564 }, { "epoch": 6.592785081014981, "grad_norm": 3.3999242782592773, "learning_rate": 2.4180714194726338e-05, "loss": 0.132, "step": 21565 }, { "epoch": 6.593090797921125, "grad_norm": 0.8779677152633667, "learning_rate": 2.41802895843064e-05, "loss": 0.1381, "step": 21566 }, { "epoch": 6.59339651482727, "grad_norm": 0.5333719849586487, "learning_rate": 2.417986497388646e-05, "loss": 0.126, "step": 21567 }, { "epoch": 6.593702231733415, "grad_norm": 0.6046276688575745, "learning_rate": 2.417944036346652e-05, "loss": 0.1478, "step": 21568 }, { "epoch": 6.594007948639559, "grad_norm": 0.6983715295791626, "learning_rate": 2.417901575304658e-05, "loss": 0.158, "step": 21569 }, { "epoch": 6.594313665545704, "grad_norm": 0.38303807377815247, "learning_rate": 2.417859114262664e-05, "loss": 0.1439, "step": 21570 }, { "epoch": 6.5946193824518495, "grad_norm": 2.73012113571167, "learning_rate": 2.41781665322067e-05, "loss": 0.1313, "step": 21571 }, { "epoch": 6.594925099357995, "grad_norm": 0.46916577219963074, "learning_rate": 2.4177741921786762e-05, "loss": 0.1637, "step": 21572 }, { "epoch": 6.59523081626414, "grad_norm": 0.6644063591957092, "learning_rate": 2.417731731136682e-05, "loss": 0.1773, "step": 21573 }, { "epoch": 6.595536533170284, "grad_norm": 0.7125985026359558, "learning_rate": 2.4176892700946883e-05, "loss": 0.1762, "step": 21574 }, { "epoch": 6.595842250076429, "grad_norm": 0.8260692954063416, "learning_rate": 2.417646809052694e-05, "loss": 0.2278, "step": 21575 }, { "epoch": 6.596147966982574, "grad_norm": 1.1127216815948486, "learning_rate": 2.4176043480107004e-05, "loss": 0.2422, "step": 21576 }, { "epoch": 6.596453683888719, "grad_norm": 0.3591744005680084, "learning_rate": 2.4175618869687062e-05, "loss": 0.1226, "step": 21577 }, { "epoch": 6.5967594007948644, "grad_norm": 0.3200613856315613, "learning_rate": 2.417519425926712e-05, "loss": 0.0762, "step": 21578 }, { "epoch": 6.597065117701009, "grad_norm": 0.5043635368347168, "learning_rate": 2.4174769648847183e-05, "loss": 0.0541, "step": 21579 }, { "epoch": 6.597370834607154, "grad_norm": 0.5076474547386169, "learning_rate": 2.4174345038427242e-05, "loss": 0.0513, "step": 21580 }, { "epoch": 6.597676551513299, "grad_norm": 0.21509021520614624, "learning_rate": 2.4173920428007304e-05, "loss": 0.0479, "step": 21581 }, { "epoch": 6.597982268419443, "grad_norm": 0.17655795812606812, "learning_rate": 2.4173495817587363e-05, "loss": 0.0316, "step": 21582 }, { "epoch": 6.598287985325588, "grad_norm": 0.2778565585613251, "learning_rate": 2.4173071207167425e-05, "loss": 0.0732, "step": 21583 }, { "epoch": 6.598593702231733, "grad_norm": 0.46470415592193604, "learning_rate": 2.4172646596747483e-05, "loss": 0.0887, "step": 21584 }, { "epoch": 6.5988994191378785, "grad_norm": 0.25365278124809265, "learning_rate": 2.4172221986327545e-05, "loss": 0.0513, "step": 21585 }, { "epoch": 6.599205136044024, "grad_norm": 6.110572338104248, "learning_rate": 2.4171797375907604e-05, "loss": 0.0605, "step": 21586 }, { "epoch": 6.599510852950168, "grad_norm": 0.4483489394187927, "learning_rate": 2.4171372765487666e-05, "loss": 0.088, "step": 21587 }, { "epoch": 6.599816569856313, "grad_norm": 0.4255867600440979, "learning_rate": 2.4170948155067725e-05, "loss": 0.0817, "step": 21588 }, { "epoch": 6.600122286762458, "grad_norm": 0.41618612408638, "learning_rate": 2.4170523544647787e-05, "loss": 0.0682, "step": 21589 }, { "epoch": 6.600428003668603, "grad_norm": 0.7474077343940735, "learning_rate": 2.4170098934227846e-05, "loss": 0.0782, "step": 21590 }, { "epoch": 6.600733720574748, "grad_norm": 0.25108397006988525, "learning_rate": 2.4169674323807904e-05, "loss": 0.0908, "step": 21591 }, { "epoch": 6.6010394374808925, "grad_norm": 0.8481459617614746, "learning_rate": 2.4169249713387967e-05, "loss": 0.1145, "step": 21592 }, { "epoch": 6.601345154387038, "grad_norm": 0.6898753643035889, "learning_rate": 2.4168825102968025e-05, "loss": 0.1456, "step": 21593 }, { "epoch": 6.601650871293183, "grad_norm": 1.1915109157562256, "learning_rate": 2.4168400492548087e-05, "loss": 0.1585, "step": 21594 }, { "epoch": 6.601956588199327, "grad_norm": 0.5611310601234436, "learning_rate": 2.4167975882128146e-05, "loss": 0.1603, "step": 21595 }, { "epoch": 6.602262305105472, "grad_norm": 0.6918768882751465, "learning_rate": 2.4167551271708208e-05, "loss": 0.198, "step": 21596 }, { "epoch": 6.602568022011617, "grad_norm": 0.6960002183914185, "learning_rate": 2.4167126661288267e-05, "loss": 0.1561, "step": 21597 }, { "epoch": 6.602873738917762, "grad_norm": 0.5619196891784668, "learning_rate": 2.416670205086833e-05, "loss": 0.1751, "step": 21598 }, { "epoch": 6.603179455823907, "grad_norm": 0.6010273098945618, "learning_rate": 2.4166277440448388e-05, "loss": 0.1766, "step": 21599 }, { "epoch": 6.603485172730052, "grad_norm": 4.721993446350098, "learning_rate": 2.416585283002845e-05, "loss": 0.1783, "step": 21600 }, { "epoch": 6.603790889636197, "grad_norm": 0.855708658695221, "learning_rate": 2.416542821960851e-05, "loss": 0.1828, "step": 21601 }, { "epoch": 6.604096606542342, "grad_norm": 0.6397972106933594, "learning_rate": 2.4165003609188567e-05, "loss": 0.1479, "step": 21602 }, { "epoch": 6.604402323448487, "grad_norm": 0.19229066371917725, "learning_rate": 2.416457899876863e-05, "loss": 0.0843, "step": 21603 }, { "epoch": 6.604708040354632, "grad_norm": 0.14537721872329712, "learning_rate": 2.4164154388348688e-05, "loss": 0.06, "step": 21604 }, { "epoch": 6.605013757260776, "grad_norm": 0.19200102984905243, "learning_rate": 2.416372977792875e-05, "loss": 0.0495, "step": 21605 }, { "epoch": 6.6053194741669214, "grad_norm": 0.20212720334529877, "learning_rate": 2.416330516750881e-05, "loss": 0.0541, "step": 21606 }, { "epoch": 6.605625191073067, "grad_norm": 0.5325552225112915, "learning_rate": 2.416288055708887e-05, "loss": 0.0653, "step": 21607 }, { "epoch": 6.605930907979211, "grad_norm": 0.5626130104064941, "learning_rate": 2.416245594666893e-05, "loss": 0.0569, "step": 21608 }, { "epoch": 6.606236624885356, "grad_norm": 0.2726353704929352, "learning_rate": 2.416203133624899e-05, "loss": 0.0796, "step": 21609 }, { "epoch": 6.606542341791501, "grad_norm": 0.2902885973453522, "learning_rate": 2.416160672582905e-05, "loss": 0.0754, "step": 21610 }, { "epoch": 6.606848058697646, "grad_norm": 0.7728516459465027, "learning_rate": 2.4161182115409112e-05, "loss": 0.0608, "step": 21611 }, { "epoch": 6.607153775603791, "grad_norm": 0.30785733461380005, "learning_rate": 2.416075750498917e-05, "loss": 0.092, "step": 21612 }, { "epoch": 6.6074594925099355, "grad_norm": 0.30984216928482056, "learning_rate": 2.4160332894569233e-05, "loss": 0.0753, "step": 21613 }, { "epoch": 6.607765209416081, "grad_norm": 0.26772573590278625, "learning_rate": 2.4159908284149295e-05, "loss": 0.0958, "step": 21614 }, { "epoch": 6.608070926322226, "grad_norm": 0.2995551824569702, "learning_rate": 2.4159483673729354e-05, "loss": 0.1182, "step": 21615 }, { "epoch": 6.608376643228371, "grad_norm": 0.48089155554771423, "learning_rate": 2.4159059063309416e-05, "loss": 0.122, "step": 21616 }, { "epoch": 6.608682360134516, "grad_norm": 0.5699353218078613, "learning_rate": 2.4158634452889475e-05, "loss": 0.1266, "step": 21617 }, { "epoch": 6.60898807704066, "grad_norm": 0.41191884875297546, "learning_rate": 2.4158209842469537e-05, "loss": 0.1494, "step": 21618 }, { "epoch": 6.609293793946805, "grad_norm": 0.7672383189201355, "learning_rate": 2.4157785232049595e-05, "loss": 0.1887, "step": 21619 }, { "epoch": 6.60959951085295, "grad_norm": 0.8424311280250549, "learning_rate": 2.4157360621629658e-05, "loss": 0.153, "step": 21620 }, { "epoch": 6.609905227759095, "grad_norm": 0.5676297545433044, "learning_rate": 2.4156936011209716e-05, "loss": 0.1781, "step": 21621 }, { "epoch": 6.61021094466524, "grad_norm": 0.7394131422042847, "learning_rate": 2.415651140078978e-05, "loss": 0.1546, "step": 21622 }, { "epoch": 6.610516661571385, "grad_norm": 0.9365368485450745, "learning_rate": 2.4156086790369837e-05, "loss": 0.166, "step": 21623 }, { "epoch": 6.61082237847753, "grad_norm": 2.055239200592041, "learning_rate": 2.41556621799499e-05, "loss": 0.1578, "step": 21624 }, { "epoch": 6.611128095383675, "grad_norm": 1.6923191547393799, "learning_rate": 2.4155237569529958e-05, "loss": 0.1819, "step": 21625 }, { "epoch": 6.611433812289819, "grad_norm": 1.1597747802734375, "learning_rate": 2.415481295911002e-05, "loss": 0.2094, "step": 21626 }, { "epoch": 6.611739529195964, "grad_norm": 0.49812954664230347, "learning_rate": 2.415438834869008e-05, "loss": 0.1374, "step": 21627 }, { "epoch": 6.6120452461021095, "grad_norm": 0.3382442891597748, "learning_rate": 2.4153963738270137e-05, "loss": 0.0828, "step": 21628 }, { "epoch": 6.612350963008255, "grad_norm": 0.4538422226905823, "learning_rate": 2.41535391278502e-05, "loss": 0.0953, "step": 21629 }, { "epoch": 6.6126566799144, "grad_norm": 0.2098471075296402, "learning_rate": 2.4153114517430258e-05, "loss": 0.0609, "step": 21630 }, { "epoch": 6.612962396820544, "grad_norm": 0.1632542610168457, "learning_rate": 2.415268990701032e-05, "loss": 0.0469, "step": 21631 }, { "epoch": 6.613268113726689, "grad_norm": 0.19470670819282532, "learning_rate": 2.415226529659038e-05, "loss": 0.0637, "step": 21632 }, { "epoch": 6.613573830632834, "grad_norm": 0.4483107924461365, "learning_rate": 2.415184068617044e-05, "loss": 0.0593, "step": 21633 }, { "epoch": 6.613879547538978, "grad_norm": 0.4152931869029999, "learning_rate": 2.41514160757505e-05, "loss": 0.0446, "step": 21634 }, { "epoch": 6.614185264445124, "grad_norm": 0.3251768946647644, "learning_rate": 2.4150991465330562e-05, "loss": 0.1034, "step": 21635 }, { "epoch": 6.614490981351269, "grad_norm": 0.2542549967765808, "learning_rate": 2.415056685491062e-05, "loss": 0.0769, "step": 21636 }, { "epoch": 6.614796698257414, "grad_norm": 0.23267540335655212, "learning_rate": 2.4150142244490683e-05, "loss": 0.064, "step": 21637 }, { "epoch": 6.615102415163559, "grad_norm": 0.44387346506118774, "learning_rate": 2.414971763407074e-05, "loss": 0.1147, "step": 21638 }, { "epoch": 6.615408132069703, "grad_norm": 0.3220040500164032, "learning_rate": 2.4149293023650803e-05, "loss": 0.1068, "step": 21639 }, { "epoch": 6.615713848975848, "grad_norm": 0.4936688244342804, "learning_rate": 2.4148868413230862e-05, "loss": 0.1051, "step": 21640 }, { "epoch": 6.616019565881993, "grad_norm": 0.30316832661628723, "learning_rate": 2.414844380281092e-05, "loss": 0.0931, "step": 21641 }, { "epoch": 6.6163252827881385, "grad_norm": 0.8138030171394348, "learning_rate": 2.4148019192390983e-05, "loss": 0.1328, "step": 21642 }, { "epoch": 6.616630999694284, "grad_norm": 0.7916926145553589, "learning_rate": 2.414759458197104e-05, "loss": 0.138, "step": 21643 }, { "epoch": 6.616936716600428, "grad_norm": 0.4715564250946045, "learning_rate": 2.4147169971551104e-05, "loss": 0.1229, "step": 21644 }, { "epoch": 6.617242433506573, "grad_norm": 0.9742826223373413, "learning_rate": 2.4146745361131162e-05, "loss": 0.1583, "step": 21645 }, { "epoch": 6.617548150412718, "grad_norm": 1.0188014507293701, "learning_rate": 2.4146320750711224e-05, "loss": 0.1624, "step": 21646 }, { "epoch": 6.617853867318862, "grad_norm": 0.7024531364440918, "learning_rate": 2.4145896140291283e-05, "loss": 0.2007, "step": 21647 }, { "epoch": 6.618159584225007, "grad_norm": 2.1142356395721436, "learning_rate": 2.4145471529871345e-05, "loss": 0.1783, "step": 21648 }, { "epoch": 6.6184653011311525, "grad_norm": 0.6243417859077454, "learning_rate": 2.4145046919451404e-05, "loss": 0.1612, "step": 21649 }, { "epoch": 6.618771018037298, "grad_norm": 0.8518993854522705, "learning_rate": 2.4144622309031466e-05, "loss": 0.1933, "step": 21650 }, { "epoch": 6.619076734943443, "grad_norm": 1.0408016443252563, "learning_rate": 2.4144197698611525e-05, "loss": 0.2026, "step": 21651 }, { "epoch": 6.619382451849587, "grad_norm": 4.364480495452881, "learning_rate": 2.4143773088191587e-05, "loss": 0.127, "step": 21652 }, { "epoch": 6.619688168755732, "grad_norm": 0.3296370506286621, "learning_rate": 2.4143348477771645e-05, "loss": 0.072, "step": 21653 }, { "epoch": 6.619993885661877, "grad_norm": 0.299274742603302, "learning_rate": 2.4142923867351704e-05, "loss": 0.0554, "step": 21654 }, { "epoch": 6.620299602568022, "grad_norm": 0.28117209672927856, "learning_rate": 2.4142499256931766e-05, "loss": 0.0702, "step": 21655 }, { "epoch": 6.620605319474167, "grad_norm": 0.2306010127067566, "learning_rate": 2.4142074646511825e-05, "loss": 0.0439, "step": 21656 }, { "epoch": 6.620911036380312, "grad_norm": 0.21624495089054108, "learning_rate": 2.4141650036091887e-05, "loss": 0.0648, "step": 21657 }, { "epoch": 6.621216753286457, "grad_norm": 0.5410044193267822, "learning_rate": 2.4141225425671946e-05, "loss": 0.0564, "step": 21658 }, { "epoch": 6.621522470192602, "grad_norm": 0.18436786532402039, "learning_rate": 2.4140800815252008e-05, "loss": 0.0551, "step": 21659 }, { "epoch": 6.621828187098746, "grad_norm": 0.30639299750328064, "learning_rate": 2.4140376204832067e-05, "loss": 0.0862, "step": 21660 }, { "epoch": 6.622133904004891, "grad_norm": 0.7067753076553345, "learning_rate": 2.413995159441213e-05, "loss": 0.0947, "step": 21661 }, { "epoch": 6.622439620911036, "grad_norm": 0.3001381456851959, "learning_rate": 2.4139526983992187e-05, "loss": 0.0567, "step": 21662 }, { "epoch": 6.622745337817181, "grad_norm": 1.20539128780365, "learning_rate": 2.413910237357225e-05, "loss": 0.0674, "step": 21663 }, { "epoch": 6.623051054723327, "grad_norm": 0.4753836393356323, "learning_rate": 2.4138677763152308e-05, "loss": 0.0991, "step": 21664 }, { "epoch": 6.623356771629471, "grad_norm": 0.4635782539844513, "learning_rate": 2.413825315273237e-05, "loss": 0.1133, "step": 21665 }, { "epoch": 6.623662488535616, "grad_norm": 0.972466230392456, "learning_rate": 2.413782854231243e-05, "loss": 0.1371, "step": 21666 }, { "epoch": 6.623968205441761, "grad_norm": 2.0380260944366455, "learning_rate": 2.4137403931892488e-05, "loss": 0.1609, "step": 21667 }, { "epoch": 6.624273922347906, "grad_norm": 3.2759337425231934, "learning_rate": 2.413697932147255e-05, "loss": 0.1556, "step": 21668 }, { "epoch": 6.624579639254051, "grad_norm": 1.8520342111587524, "learning_rate": 2.413655471105261e-05, "loss": 0.1307, "step": 21669 }, { "epoch": 6.6248853561601955, "grad_norm": 0.4597774147987366, "learning_rate": 2.413613010063267e-05, "loss": 0.1535, "step": 21670 }, { "epoch": 6.625191073066341, "grad_norm": 3.982389450073242, "learning_rate": 2.413570549021273e-05, "loss": 0.1551, "step": 21671 }, { "epoch": 6.625496789972486, "grad_norm": 1.3138524293899536, "learning_rate": 2.413528087979279e-05, "loss": 0.1585, "step": 21672 }, { "epoch": 6.62580250687863, "grad_norm": 0.5275397896766663, "learning_rate": 2.413485626937285e-05, "loss": 0.1497, "step": 21673 }, { "epoch": 6.626108223784775, "grad_norm": 1.9998294115066528, "learning_rate": 2.4134431658952912e-05, "loss": 0.1902, "step": 21674 }, { "epoch": 6.62641394069092, "grad_norm": 2.437983512878418, "learning_rate": 2.413400704853297e-05, "loss": 0.1944, "step": 21675 }, { "epoch": 6.626719657597065, "grad_norm": 4.074017524719238, "learning_rate": 2.4133582438113033e-05, "loss": 0.24, "step": 21676 }, { "epoch": 6.62702537450321, "grad_norm": 0.6541531085968018, "learning_rate": 2.413315782769309e-05, "loss": 0.1318, "step": 21677 }, { "epoch": 6.627331091409355, "grad_norm": 0.3601188361644745, "learning_rate": 2.4132733217273154e-05, "loss": 0.0691, "step": 21678 }, { "epoch": 6.6276368083155, "grad_norm": 0.44040414690971375, "learning_rate": 2.4132308606853212e-05, "loss": 0.0536, "step": 21679 }, { "epoch": 6.627942525221645, "grad_norm": 0.3304639756679535, "learning_rate": 2.413188399643327e-05, "loss": 0.0722, "step": 21680 }, { "epoch": 6.62824824212779, "grad_norm": 0.3026779890060425, "learning_rate": 2.4131459386013333e-05, "loss": 0.0507, "step": 21681 }, { "epoch": 6.628553959033935, "grad_norm": 0.20219457149505615, "learning_rate": 2.4131034775593392e-05, "loss": 0.0483, "step": 21682 }, { "epoch": 6.628859675940079, "grad_norm": 0.24288013577461243, "learning_rate": 2.4130610165173454e-05, "loss": 0.056, "step": 21683 }, { "epoch": 6.629165392846224, "grad_norm": 0.4576403796672821, "learning_rate": 2.4130185554753513e-05, "loss": 0.0448, "step": 21684 }, { "epoch": 6.6294711097523695, "grad_norm": 0.5578054189682007, "learning_rate": 2.4129760944333575e-05, "loss": 0.0657, "step": 21685 }, { "epoch": 6.629776826658514, "grad_norm": 0.6176365613937378, "learning_rate": 2.4129336333913633e-05, "loss": 0.0579, "step": 21686 }, { "epoch": 6.630082543564659, "grad_norm": 0.4306028187274933, "learning_rate": 2.4128911723493695e-05, "loss": 0.1013, "step": 21687 }, { "epoch": 6.630388260470804, "grad_norm": 0.47037020325660706, "learning_rate": 2.4128487113073754e-05, "loss": 0.0797, "step": 21688 }, { "epoch": 6.630693977376949, "grad_norm": 0.31224387884140015, "learning_rate": 2.4128062502653816e-05, "loss": 0.0749, "step": 21689 }, { "epoch": 6.630999694283094, "grad_norm": 0.5362979769706726, "learning_rate": 2.4127637892233875e-05, "loss": 0.1319, "step": 21690 }, { "epoch": 6.631305411189238, "grad_norm": 0.5382906198501587, "learning_rate": 2.4127213281813937e-05, "loss": 0.1014, "step": 21691 }, { "epoch": 6.631611128095384, "grad_norm": 1.2310545444488525, "learning_rate": 2.4126788671393996e-05, "loss": 0.1126, "step": 21692 }, { "epoch": 6.631916845001529, "grad_norm": 0.6270748376846313, "learning_rate": 2.4126364060974054e-05, "loss": 0.149, "step": 21693 }, { "epoch": 6.632222561907674, "grad_norm": 1.563722848892212, "learning_rate": 2.4125939450554117e-05, "loss": 0.1664, "step": 21694 }, { "epoch": 6.632528278813819, "grad_norm": 0.7096534967422485, "learning_rate": 2.4125514840134175e-05, "loss": 0.15, "step": 21695 }, { "epoch": 6.632833995719963, "grad_norm": 0.6214866638183594, "learning_rate": 2.4125090229714237e-05, "loss": 0.1402, "step": 21696 }, { "epoch": 6.633139712626108, "grad_norm": 0.946815550327301, "learning_rate": 2.4124665619294296e-05, "loss": 0.1939, "step": 21697 }, { "epoch": 6.633445429532253, "grad_norm": 1.7273716926574707, "learning_rate": 2.4124241008874358e-05, "loss": 0.1816, "step": 21698 }, { "epoch": 6.633751146438398, "grad_norm": 0.7441437244415283, "learning_rate": 2.4123816398454417e-05, "loss": 0.1856, "step": 21699 }, { "epoch": 6.634056863344543, "grad_norm": 3.7222137451171875, "learning_rate": 2.412339178803448e-05, "loss": 0.2067, "step": 21700 }, { "epoch": 6.634362580250688, "grad_norm": 1.3584599494934082, "learning_rate": 2.4122967177614538e-05, "loss": 0.1966, "step": 21701 }, { "epoch": 6.634668297156833, "grad_norm": 0.35887810587882996, "learning_rate": 2.41225425671946e-05, "loss": 0.1311, "step": 21702 }, { "epoch": 6.634974014062978, "grad_norm": 1.1691946983337402, "learning_rate": 2.412211795677466e-05, "loss": 0.0913, "step": 21703 }, { "epoch": 6.635279730969122, "grad_norm": 0.24720627069473267, "learning_rate": 2.412169334635472e-05, "loss": 0.0658, "step": 21704 }, { "epoch": 6.635585447875267, "grad_norm": 0.7558805346488953, "learning_rate": 2.412126873593478e-05, "loss": 0.071, "step": 21705 }, { "epoch": 6.6358911647814125, "grad_norm": 0.9626797437667847, "learning_rate": 2.4120844125514838e-05, "loss": 0.05, "step": 21706 }, { "epoch": 6.636196881687558, "grad_norm": 0.26505157351493835, "learning_rate": 2.41204195150949e-05, "loss": 0.0433, "step": 21707 }, { "epoch": 6.636502598593703, "grad_norm": 0.40145254135131836, "learning_rate": 2.411999490467496e-05, "loss": 0.0532, "step": 21708 }, { "epoch": 6.636808315499847, "grad_norm": 0.205316960811615, "learning_rate": 2.411957029425502e-05, "loss": 0.0529, "step": 21709 }, { "epoch": 6.637114032405992, "grad_norm": 0.1969393640756607, "learning_rate": 2.411914568383508e-05, "loss": 0.053, "step": 21710 }, { "epoch": 6.637419749312137, "grad_norm": 0.18551398813724518, "learning_rate": 2.411872107341514e-05, "loss": 0.0465, "step": 21711 }, { "epoch": 6.637725466218281, "grad_norm": 0.44852349162101746, "learning_rate": 2.41182964629952e-05, "loss": 0.1055, "step": 21712 }, { "epoch": 6.6380311831244265, "grad_norm": 1.126994252204895, "learning_rate": 2.4117871852575262e-05, "loss": 0.0633, "step": 21713 }, { "epoch": 6.638336900030572, "grad_norm": 0.6398458480834961, "learning_rate": 2.411744724215532e-05, "loss": 0.0926, "step": 21714 }, { "epoch": 6.638642616936717, "grad_norm": 0.5522647500038147, "learning_rate": 2.4117022631735383e-05, "loss": 0.0876, "step": 21715 }, { "epoch": 6.638948333842862, "grad_norm": 0.5741357803344727, "learning_rate": 2.4116598021315445e-05, "loss": 0.1141, "step": 21716 }, { "epoch": 6.639254050749006, "grad_norm": 1.6042684316635132, "learning_rate": 2.4116173410895507e-05, "loss": 0.1211, "step": 21717 }, { "epoch": 6.639559767655151, "grad_norm": 0.46244460344314575, "learning_rate": 2.4115748800475566e-05, "loss": 0.1428, "step": 21718 }, { "epoch": 6.639865484561296, "grad_norm": 0.7853929996490479, "learning_rate": 2.4115324190055625e-05, "loss": 0.1302, "step": 21719 }, { "epoch": 6.640171201467441, "grad_norm": 0.6518571376800537, "learning_rate": 2.4114899579635687e-05, "loss": 0.1752, "step": 21720 }, { "epoch": 6.640476918373587, "grad_norm": 0.7106354236602783, "learning_rate": 2.4114474969215745e-05, "loss": 0.1625, "step": 21721 }, { "epoch": 6.640782635279731, "grad_norm": 0.6671791672706604, "learning_rate": 2.4114050358795808e-05, "loss": 0.1796, "step": 21722 }, { "epoch": 6.641088352185876, "grad_norm": 0.774836540222168, "learning_rate": 2.4113625748375866e-05, "loss": 0.1741, "step": 21723 }, { "epoch": 6.641394069092021, "grad_norm": 1.2667759656906128, "learning_rate": 2.411320113795593e-05, "loss": 0.1589, "step": 21724 }, { "epoch": 6.641699785998165, "grad_norm": 5.063528537750244, "learning_rate": 2.4112776527535987e-05, "loss": 0.1959, "step": 21725 }, { "epoch": 6.64200550290431, "grad_norm": 1.0008677244186401, "learning_rate": 2.411235191711605e-05, "loss": 0.2006, "step": 21726 }, { "epoch": 6.6423112198104555, "grad_norm": 0.5071481466293335, "learning_rate": 2.4111927306696108e-05, "loss": 0.1249, "step": 21727 }, { "epoch": 6.642616936716601, "grad_norm": 0.7090135216712952, "learning_rate": 2.411150269627617e-05, "loss": 0.0811, "step": 21728 }, { "epoch": 6.642922653622746, "grad_norm": 0.339537113904953, "learning_rate": 2.411107808585623e-05, "loss": 0.0624, "step": 21729 }, { "epoch": 6.64322837052889, "grad_norm": 0.7477615475654602, "learning_rate": 2.4110653475436287e-05, "loss": 0.0787, "step": 21730 }, { "epoch": 6.643534087435035, "grad_norm": 0.25704875588417053, "learning_rate": 2.411022886501635e-05, "loss": 0.0487, "step": 21731 }, { "epoch": 6.64383980434118, "grad_norm": 0.1700727343559265, "learning_rate": 2.4109804254596408e-05, "loss": 0.0425, "step": 21732 }, { "epoch": 6.644145521247325, "grad_norm": 0.22494718432426453, "learning_rate": 2.410937964417647e-05, "loss": 0.0587, "step": 21733 }, { "epoch": 6.64445123815347, "grad_norm": 0.4211888313293457, "learning_rate": 2.410895503375653e-05, "loss": 0.075, "step": 21734 }, { "epoch": 6.644756955059615, "grad_norm": 0.41521844267845154, "learning_rate": 2.410853042333659e-05, "loss": 0.0919, "step": 21735 }, { "epoch": 6.64506267196576, "grad_norm": 0.3644210696220398, "learning_rate": 2.410810581291665e-05, "loss": 0.0458, "step": 21736 }, { "epoch": 6.645368388871905, "grad_norm": 0.4225381314754486, "learning_rate": 2.4107681202496712e-05, "loss": 0.1243, "step": 21737 }, { "epoch": 6.645674105778049, "grad_norm": 1.9731496572494507, "learning_rate": 2.410725659207677e-05, "loss": 0.0932, "step": 21738 }, { "epoch": 6.645979822684194, "grad_norm": 0.9362351894378662, "learning_rate": 2.4106831981656833e-05, "loss": 0.0629, "step": 21739 }, { "epoch": 6.646285539590339, "grad_norm": 0.8652750253677368, "learning_rate": 2.410640737123689e-05, "loss": 0.1203, "step": 21740 }, { "epoch": 6.646591256496484, "grad_norm": 0.7160530686378479, "learning_rate": 2.4105982760816953e-05, "loss": 0.0826, "step": 21741 }, { "epoch": 6.6468969734026295, "grad_norm": 1.7904523611068726, "learning_rate": 2.4105558150397012e-05, "loss": 0.1559, "step": 21742 }, { "epoch": 6.647202690308774, "grad_norm": 0.9798429608345032, "learning_rate": 2.410513353997707e-05, "loss": 0.1387, "step": 21743 }, { "epoch": 6.647508407214919, "grad_norm": 0.767403244972229, "learning_rate": 2.4104708929557133e-05, "loss": 0.1339, "step": 21744 }, { "epoch": 6.647814124121064, "grad_norm": 0.6647427678108215, "learning_rate": 2.410428431913719e-05, "loss": 0.1628, "step": 21745 }, { "epoch": 6.648119841027209, "grad_norm": 0.6129042506217957, "learning_rate": 2.4103859708717254e-05, "loss": 0.1629, "step": 21746 }, { "epoch": 6.648425557933354, "grad_norm": 0.5802541375160217, "learning_rate": 2.4103435098297312e-05, "loss": 0.1492, "step": 21747 }, { "epoch": 6.648731274839498, "grad_norm": 3.020630359649658, "learning_rate": 2.4103010487877374e-05, "loss": 0.138, "step": 21748 }, { "epoch": 6.6490369917456436, "grad_norm": 1.3876183032989502, "learning_rate": 2.4102585877457433e-05, "loss": 0.213, "step": 21749 }, { "epoch": 6.649342708651789, "grad_norm": 0.6166839599609375, "learning_rate": 2.4102161267037495e-05, "loss": 0.184, "step": 21750 }, { "epoch": 6.649648425557933, "grad_norm": 1.9879498481750488, "learning_rate": 2.4101736656617554e-05, "loss": 0.2174, "step": 21751 }, { "epoch": 6.649954142464078, "grad_norm": 0.7113563418388367, "learning_rate": 2.4101312046197616e-05, "loss": 0.1633, "step": 21752 }, { "epoch": 6.650259859370223, "grad_norm": 0.5670984387397766, "learning_rate": 2.4100887435777675e-05, "loss": 0.1019, "step": 21753 }, { "epoch": 6.650565576276368, "grad_norm": 0.21612633764743805, "learning_rate": 2.4100462825357737e-05, "loss": 0.0509, "step": 21754 }, { "epoch": 6.650871293182513, "grad_norm": 0.24852485954761505, "learning_rate": 2.4100038214937795e-05, "loss": 0.0505, "step": 21755 }, { "epoch": 6.651177010088658, "grad_norm": 0.7454033493995667, "learning_rate": 2.4099613604517854e-05, "loss": 0.0538, "step": 21756 }, { "epoch": 6.651482726994803, "grad_norm": 0.1513039767742157, "learning_rate": 2.4099188994097916e-05, "loss": 0.0429, "step": 21757 }, { "epoch": 6.651788443900948, "grad_norm": 1.1063891649246216, "learning_rate": 2.4098764383677975e-05, "loss": 0.0624, "step": 21758 }, { "epoch": 6.652094160807093, "grad_norm": 0.24007967114448547, "learning_rate": 2.4098339773258037e-05, "loss": 0.0679, "step": 21759 }, { "epoch": 6.652399877713238, "grad_norm": 0.316578209400177, "learning_rate": 2.4097915162838096e-05, "loss": 0.0532, "step": 21760 }, { "epoch": 6.652705594619382, "grad_norm": 0.2191687971353531, "learning_rate": 2.4097490552418158e-05, "loss": 0.0605, "step": 21761 }, { "epoch": 6.653011311525527, "grad_norm": 0.21981583535671234, "learning_rate": 2.4097065941998217e-05, "loss": 0.0874, "step": 21762 }, { "epoch": 6.6533170284316725, "grad_norm": 0.3880472481250763, "learning_rate": 2.409664133157828e-05, "loss": 0.0813, "step": 21763 }, { "epoch": 6.653622745337817, "grad_norm": 0.37838855385780334, "learning_rate": 2.4096216721158337e-05, "loss": 0.0839, "step": 21764 }, { "epoch": 6.653928462243962, "grad_norm": 0.5892343521118164, "learning_rate": 2.40957921107384e-05, "loss": 0.1109, "step": 21765 }, { "epoch": 6.654234179150107, "grad_norm": 0.6723191142082214, "learning_rate": 2.4095367500318458e-05, "loss": 0.1323, "step": 21766 }, { "epoch": 6.654539896056252, "grad_norm": 0.5652338862419128, "learning_rate": 2.409494288989852e-05, "loss": 0.1262, "step": 21767 }, { "epoch": 6.654845612962397, "grad_norm": 0.34880802035331726, "learning_rate": 2.409451827947858e-05, "loss": 0.1352, "step": 21768 }, { "epoch": 6.655151329868541, "grad_norm": 0.5709972977638245, "learning_rate": 2.4094093669058638e-05, "loss": 0.16, "step": 21769 }, { "epoch": 6.6554570467746865, "grad_norm": 0.7150827646255493, "learning_rate": 2.40936690586387e-05, "loss": 0.137, "step": 21770 }, { "epoch": 6.655762763680832, "grad_norm": 1.7482396364212036, "learning_rate": 2.409324444821876e-05, "loss": 0.1405, "step": 21771 }, { "epoch": 6.656068480586977, "grad_norm": 1.2394611835479736, "learning_rate": 2.409281983779882e-05, "loss": 0.2766, "step": 21772 }, { "epoch": 6.656374197493121, "grad_norm": 1.9075334072113037, "learning_rate": 2.409239522737888e-05, "loss": 0.1658, "step": 21773 }, { "epoch": 6.656679914399266, "grad_norm": 0.7392836213111877, "learning_rate": 2.409197061695894e-05, "loss": 0.1759, "step": 21774 }, { "epoch": 6.656985631305411, "grad_norm": 0.8276727199554443, "learning_rate": 2.4091546006539e-05, "loss": 0.1903, "step": 21775 }, { "epoch": 6.657291348211556, "grad_norm": 1.6453136205673218, "learning_rate": 2.4091121396119062e-05, "loss": 0.2212, "step": 21776 }, { "epoch": 6.6575970651177006, "grad_norm": 0.485230416059494, "learning_rate": 2.409069678569912e-05, "loss": 0.1302, "step": 21777 }, { "epoch": 6.657902782023846, "grad_norm": 0.574414074420929, "learning_rate": 2.4090272175279183e-05, "loss": 0.0752, "step": 21778 }, { "epoch": 6.658208498929991, "grad_norm": 0.3485560715198517, "learning_rate": 2.408984756485924e-05, "loss": 0.0647, "step": 21779 }, { "epoch": 6.658514215836136, "grad_norm": 0.19509734213352203, "learning_rate": 2.4089422954439304e-05, "loss": 0.0397, "step": 21780 }, { "epoch": 6.658819932742281, "grad_norm": 0.30990204215049744, "learning_rate": 2.4088998344019362e-05, "loss": 0.0713, "step": 21781 }, { "epoch": 6.659125649648425, "grad_norm": 0.5179685950279236, "learning_rate": 2.408857373359942e-05, "loss": 0.0528, "step": 21782 }, { "epoch": 6.65943136655457, "grad_norm": 0.2916179895401001, "learning_rate": 2.4088149123179483e-05, "loss": 0.0516, "step": 21783 }, { "epoch": 6.6597370834607155, "grad_norm": 0.5586234927177429, "learning_rate": 2.4087724512759542e-05, "loss": 0.0445, "step": 21784 }, { "epoch": 6.660042800366861, "grad_norm": 0.3386734127998352, "learning_rate": 2.4087299902339604e-05, "loss": 0.0672, "step": 21785 }, { "epoch": 6.660348517273005, "grad_norm": 0.22865769267082214, "learning_rate": 2.4086875291919663e-05, "loss": 0.0574, "step": 21786 }, { "epoch": 6.66065423417915, "grad_norm": 0.4425964951515198, "learning_rate": 2.4086450681499725e-05, "loss": 0.0867, "step": 21787 }, { "epoch": 6.660959951085295, "grad_norm": 0.36340224742889404, "learning_rate": 2.4086026071079783e-05, "loss": 0.0817, "step": 21788 }, { "epoch": 6.66126566799144, "grad_norm": 0.3704182505607605, "learning_rate": 2.4085601460659845e-05, "loss": 0.1056, "step": 21789 }, { "epoch": 6.661571384897584, "grad_norm": 0.948585569858551, "learning_rate": 2.4085176850239904e-05, "loss": 0.1229, "step": 21790 }, { "epoch": 6.6618771018037295, "grad_norm": 0.3359985053539276, "learning_rate": 2.4084752239819966e-05, "loss": 0.0906, "step": 21791 }, { "epoch": 6.662182818709875, "grad_norm": 0.5376591682434082, "learning_rate": 2.4084327629400025e-05, "loss": 0.1391, "step": 21792 }, { "epoch": 6.66248853561602, "grad_norm": 0.7107796669006348, "learning_rate": 2.4083903018980087e-05, "loss": 0.1557, "step": 21793 }, { "epoch": 6.662794252522165, "grad_norm": 0.4397164583206177, "learning_rate": 2.4083478408560146e-05, "loss": 0.1807, "step": 21794 }, { "epoch": 6.663099969428309, "grad_norm": 0.32155394554138184, "learning_rate": 2.4083053798140204e-05, "loss": 0.1543, "step": 21795 }, { "epoch": 6.663405686334454, "grad_norm": 1.3398582935333252, "learning_rate": 2.4082629187720267e-05, "loss": 0.1684, "step": 21796 }, { "epoch": 6.663711403240599, "grad_norm": 1.1320463418960571, "learning_rate": 2.4082204577300325e-05, "loss": 0.1681, "step": 21797 }, { "epoch": 6.664017120146744, "grad_norm": 0.7108551263809204, "learning_rate": 2.4081779966880387e-05, "loss": 0.1733, "step": 21798 }, { "epoch": 6.664322837052889, "grad_norm": 1.1951167583465576, "learning_rate": 2.4081355356460446e-05, "loss": 0.1886, "step": 21799 }, { "epoch": 6.664628553959034, "grad_norm": 0.7022933959960938, "learning_rate": 2.4080930746040508e-05, "loss": 0.2236, "step": 21800 }, { "epoch": 6.664934270865179, "grad_norm": 0.9425702691078186, "learning_rate": 2.4080506135620567e-05, "loss": 0.2022, "step": 21801 }, { "epoch": 6.665239987771324, "grad_norm": 0.5169018507003784, "learning_rate": 2.408008152520063e-05, "loss": 0.1387, "step": 21802 }, { "epoch": 6.665545704677468, "grad_norm": 0.36128780245780945, "learning_rate": 2.4079656914780688e-05, "loss": 0.1106, "step": 21803 }, { "epoch": 6.665851421583613, "grad_norm": 0.268698513507843, "learning_rate": 2.407923230436075e-05, "loss": 0.0571, "step": 21804 }, { "epoch": 6.666157138489758, "grad_norm": 0.3949039876461029, "learning_rate": 2.407880769394081e-05, "loss": 0.0498, "step": 21805 }, { "epoch": 6.6664628553959036, "grad_norm": 0.278842955827713, "learning_rate": 2.407838308352087e-05, "loss": 0.0564, "step": 21806 }, { "epoch": 6.666768572302049, "grad_norm": 0.23846977949142456, "learning_rate": 2.407795847310093e-05, "loss": 0.0881, "step": 21807 }, { "epoch": 6.667074289208193, "grad_norm": 0.1927356868982315, "learning_rate": 2.4077533862680988e-05, "loss": 0.0331, "step": 21808 }, { "epoch": 6.667380006114338, "grad_norm": 0.2911524176597595, "learning_rate": 2.407710925226105e-05, "loss": 0.0625, "step": 21809 }, { "epoch": 6.667685723020483, "grad_norm": 0.6784061789512634, "learning_rate": 2.407668464184111e-05, "loss": 0.0803, "step": 21810 }, { "epoch": 6.667991439926628, "grad_norm": 0.38640639185905457, "learning_rate": 2.407626003142117e-05, "loss": 0.0597, "step": 21811 }, { "epoch": 6.6682971568327725, "grad_norm": 0.4190351963043213, "learning_rate": 2.407583542100123e-05, "loss": 0.0919, "step": 21812 }, { "epoch": 6.668602873738918, "grad_norm": 0.40844351053237915, "learning_rate": 2.407541081058129e-05, "loss": 0.086, "step": 21813 }, { "epoch": 6.668908590645063, "grad_norm": 0.2919502854347229, "learning_rate": 2.407498620016135e-05, "loss": 0.0917, "step": 21814 }, { "epoch": 6.669214307551208, "grad_norm": 0.41853970289230347, "learning_rate": 2.4074561589741412e-05, "loss": 0.0987, "step": 21815 }, { "epoch": 6.669520024457352, "grad_norm": 0.34706053137779236, "learning_rate": 2.407413697932147e-05, "loss": 0.1422, "step": 21816 }, { "epoch": 6.669825741363497, "grad_norm": 0.31146177649497986, "learning_rate": 2.4073712368901533e-05, "loss": 0.1352, "step": 21817 }, { "epoch": 6.670131458269642, "grad_norm": 0.5506784915924072, "learning_rate": 2.4073287758481595e-05, "loss": 0.1391, "step": 21818 }, { "epoch": 6.670437175175787, "grad_norm": 0.502693235874176, "learning_rate": 2.4072863148061657e-05, "loss": 0.1428, "step": 21819 }, { "epoch": 6.6707428920819325, "grad_norm": 0.5051175951957703, "learning_rate": 2.4072438537641716e-05, "loss": 0.1626, "step": 21820 }, { "epoch": 6.671048608988077, "grad_norm": 0.7651900053024292, "learning_rate": 2.4072013927221775e-05, "loss": 0.1574, "step": 21821 }, { "epoch": 6.671354325894222, "grad_norm": 0.5774397850036621, "learning_rate": 2.4071589316801837e-05, "loss": 0.165, "step": 21822 }, { "epoch": 6.671660042800367, "grad_norm": 0.6043558120727539, "learning_rate": 2.4071164706381896e-05, "loss": 0.1811, "step": 21823 }, { "epoch": 6.671965759706512, "grad_norm": 0.7292405962944031, "learning_rate": 2.4070740095961958e-05, "loss": 0.1764, "step": 21824 }, { "epoch": 6.672271476612656, "grad_norm": 0.7698682546615601, "learning_rate": 2.4070315485542016e-05, "loss": 0.2233, "step": 21825 }, { "epoch": 6.672577193518801, "grad_norm": 0.9948698878288269, "learning_rate": 2.406989087512208e-05, "loss": 0.2292, "step": 21826 }, { "epoch": 6.6728829104249465, "grad_norm": 0.6994957327842712, "learning_rate": 2.4069466264702137e-05, "loss": 0.1167, "step": 21827 }, { "epoch": 6.673188627331092, "grad_norm": 0.2613848149776459, "learning_rate": 2.40690416542822e-05, "loss": 0.0783, "step": 21828 }, { "epoch": 6.673494344237236, "grad_norm": 0.30914878845214844, "learning_rate": 2.4068617043862258e-05, "loss": 0.0632, "step": 21829 }, { "epoch": 6.673800061143381, "grad_norm": 0.32807478308677673, "learning_rate": 2.406819243344232e-05, "loss": 0.0552, "step": 21830 }, { "epoch": 6.674105778049526, "grad_norm": 0.2595452070236206, "learning_rate": 2.406776782302238e-05, "loss": 0.0685, "step": 21831 }, { "epoch": 6.674411494955671, "grad_norm": 0.28972128033638, "learning_rate": 2.406734321260244e-05, "loss": 0.04, "step": 21832 }, { "epoch": 6.674717211861816, "grad_norm": 0.4344096779823303, "learning_rate": 2.40669186021825e-05, "loss": 0.0566, "step": 21833 }, { "epoch": 6.6750229287679606, "grad_norm": 0.5077471733093262, "learning_rate": 2.4066493991762558e-05, "loss": 0.0557, "step": 21834 }, { "epoch": 6.675328645674106, "grad_norm": 0.2340615838766098, "learning_rate": 2.406606938134262e-05, "loss": 0.0694, "step": 21835 }, { "epoch": 6.675634362580251, "grad_norm": 0.17894800007343292, "learning_rate": 2.406564477092268e-05, "loss": 0.0558, "step": 21836 }, { "epoch": 6.675940079486396, "grad_norm": 0.2563183605670929, "learning_rate": 2.406522016050274e-05, "loss": 0.1019, "step": 21837 }, { "epoch": 6.67624579639254, "grad_norm": 0.5026403665542603, "learning_rate": 2.40647955500828e-05, "loss": 0.0688, "step": 21838 }, { "epoch": 6.676551513298685, "grad_norm": 0.2048358917236328, "learning_rate": 2.4064370939662862e-05, "loss": 0.0972, "step": 21839 }, { "epoch": 6.67685723020483, "grad_norm": 0.40412160754203796, "learning_rate": 2.406394632924292e-05, "loss": 0.0941, "step": 21840 }, { "epoch": 6.6771629471109755, "grad_norm": 0.5487343668937683, "learning_rate": 2.4063521718822983e-05, "loss": 0.1165, "step": 21841 }, { "epoch": 6.67746866401712, "grad_norm": 0.49290695786476135, "learning_rate": 2.406309710840304e-05, "loss": 0.1369, "step": 21842 }, { "epoch": 6.677774380923265, "grad_norm": 1.3618249893188477, "learning_rate": 2.4062672497983103e-05, "loss": 0.1192, "step": 21843 }, { "epoch": 6.67808009782941, "grad_norm": 0.43141892552375793, "learning_rate": 2.4062247887563162e-05, "loss": 0.1604, "step": 21844 }, { "epoch": 6.678385814735555, "grad_norm": 0.7352354526519775, "learning_rate": 2.406182327714322e-05, "loss": 0.1383, "step": 21845 }, { "epoch": 6.6786915316417, "grad_norm": 0.566762387752533, "learning_rate": 2.4061398666723283e-05, "loss": 0.1656, "step": 21846 }, { "epoch": 6.678997248547844, "grad_norm": 0.5814469456672668, "learning_rate": 2.406097405630334e-05, "loss": 0.1486, "step": 21847 }, { "epoch": 6.6793029654539895, "grad_norm": 1.1848902702331543, "learning_rate": 2.4060549445883404e-05, "loss": 0.1761, "step": 21848 }, { "epoch": 6.679608682360135, "grad_norm": 0.7323266863822937, "learning_rate": 2.4060124835463462e-05, "loss": 0.1616, "step": 21849 }, { "epoch": 6.67991439926628, "grad_norm": 0.8444457054138184, "learning_rate": 2.4059700225043524e-05, "loss": 0.148, "step": 21850 }, { "epoch": 6.680220116172424, "grad_norm": 1.3368315696716309, "learning_rate": 2.4059275614623583e-05, "loss": 0.2262, "step": 21851 }, { "epoch": 6.680525833078569, "grad_norm": 0.5779988169670105, "learning_rate": 2.4058851004203645e-05, "loss": 0.105, "step": 21852 }, { "epoch": 6.680831549984714, "grad_norm": 0.2768917679786682, "learning_rate": 2.4058426393783704e-05, "loss": 0.0817, "step": 21853 }, { "epoch": 6.681137266890859, "grad_norm": 0.1917112171649933, "learning_rate": 2.4058001783363766e-05, "loss": 0.0712, "step": 21854 }, { "epoch": 6.6814429837970035, "grad_norm": 0.3815167248249054, "learning_rate": 2.4057577172943825e-05, "loss": 0.0692, "step": 21855 }, { "epoch": 6.681748700703149, "grad_norm": 0.2192154824733734, "learning_rate": 2.4057152562523887e-05, "loss": 0.0579, "step": 21856 }, { "epoch": 6.682054417609294, "grad_norm": 0.15302202105522156, "learning_rate": 2.4056727952103946e-05, "loss": 0.0489, "step": 21857 }, { "epoch": 6.682360134515439, "grad_norm": 0.4030667841434479, "learning_rate": 2.4056303341684004e-05, "loss": 0.061, "step": 21858 }, { "epoch": 6.682665851421584, "grad_norm": 0.5187489986419678, "learning_rate": 2.4055878731264066e-05, "loss": 0.0463, "step": 21859 }, { "epoch": 6.682971568327728, "grad_norm": 0.21951895952224731, "learning_rate": 2.4055454120844125e-05, "loss": 0.06, "step": 21860 }, { "epoch": 6.683277285233873, "grad_norm": 0.16147589683532715, "learning_rate": 2.4055029510424187e-05, "loss": 0.0446, "step": 21861 }, { "epoch": 6.683583002140018, "grad_norm": 0.8229557275772095, "learning_rate": 2.4054604900004246e-05, "loss": 0.0895, "step": 21862 }, { "epoch": 6.6838887190461636, "grad_norm": 0.3693799674510956, "learning_rate": 2.4054180289584308e-05, "loss": 0.0758, "step": 21863 }, { "epoch": 6.684194435952308, "grad_norm": 0.36773839592933655, "learning_rate": 2.4053755679164367e-05, "loss": 0.0901, "step": 21864 }, { "epoch": 6.684500152858453, "grad_norm": 0.35851338505744934, "learning_rate": 2.405333106874443e-05, "loss": 0.1022, "step": 21865 }, { "epoch": 6.684805869764598, "grad_norm": 0.31193965673446655, "learning_rate": 2.4052906458324487e-05, "loss": 0.1198, "step": 21866 }, { "epoch": 6.685111586670743, "grad_norm": 0.5726253390312195, "learning_rate": 2.405248184790455e-05, "loss": 0.1677, "step": 21867 }, { "epoch": 6.685417303576887, "grad_norm": 0.45579859614372253, "learning_rate": 2.4052057237484608e-05, "loss": 0.1285, "step": 21868 }, { "epoch": 6.6857230204830325, "grad_norm": 0.7376073598861694, "learning_rate": 2.405163262706467e-05, "loss": 0.1448, "step": 21869 }, { "epoch": 6.686028737389178, "grad_norm": 2.941974639892578, "learning_rate": 2.405120801664473e-05, "loss": 0.1383, "step": 21870 }, { "epoch": 6.686334454295323, "grad_norm": 0.6157527565956116, "learning_rate": 2.4050783406224788e-05, "loss": 0.1647, "step": 21871 }, { "epoch": 6.686640171201468, "grad_norm": 1.1522812843322754, "learning_rate": 2.405035879580485e-05, "loss": 0.157, "step": 21872 }, { "epoch": 6.686945888107612, "grad_norm": 0.7107307314872742, "learning_rate": 2.404993418538491e-05, "loss": 0.1909, "step": 21873 }, { "epoch": 6.687251605013757, "grad_norm": 0.7929272055625916, "learning_rate": 2.404950957496497e-05, "loss": 0.1621, "step": 21874 }, { "epoch": 6.687557321919902, "grad_norm": 0.9361541271209717, "learning_rate": 2.404908496454503e-05, "loss": 0.1649, "step": 21875 }, { "epoch": 6.687863038826047, "grad_norm": 1.317588210105896, "learning_rate": 2.404866035412509e-05, "loss": 0.3191, "step": 21876 }, { "epoch": 6.688168755732192, "grad_norm": 0.4694579243659973, "learning_rate": 2.404823574370515e-05, "loss": 0.1442, "step": 21877 }, { "epoch": 6.688474472638337, "grad_norm": 0.20942877233028412, "learning_rate": 2.4047811133285212e-05, "loss": 0.0711, "step": 21878 }, { "epoch": 6.688780189544482, "grad_norm": 0.1803780198097229, "learning_rate": 2.404738652286527e-05, "loss": 0.0612, "step": 21879 }, { "epoch": 6.689085906450627, "grad_norm": 0.2790502905845642, "learning_rate": 2.4046961912445333e-05, "loss": 0.0685, "step": 21880 }, { "epoch": 6.689391623356771, "grad_norm": 0.14913009107112885, "learning_rate": 2.404653730202539e-05, "loss": 0.05, "step": 21881 }, { "epoch": 6.689697340262916, "grad_norm": 0.6090280413627625, "learning_rate": 2.4046112691605454e-05, "loss": 0.0679, "step": 21882 }, { "epoch": 6.690003057169061, "grad_norm": 0.2813650667667389, "learning_rate": 2.4045688081185512e-05, "loss": 0.0561, "step": 21883 }, { "epoch": 6.6903087740752065, "grad_norm": 0.12673963606357574, "learning_rate": 2.404526347076557e-05, "loss": 0.0454, "step": 21884 }, { "epoch": 6.690614490981352, "grad_norm": 0.27891165018081665, "learning_rate": 2.4044838860345633e-05, "loss": 0.0651, "step": 21885 }, { "epoch": 6.690920207887496, "grad_norm": 0.18411973118782043, "learning_rate": 2.4044414249925692e-05, "loss": 0.0544, "step": 21886 }, { "epoch": 6.691225924793641, "grad_norm": 0.23135703802108765, "learning_rate": 2.4043989639505754e-05, "loss": 0.0717, "step": 21887 }, { "epoch": 6.691531641699786, "grad_norm": 0.23994702100753784, "learning_rate": 2.4043565029085813e-05, "loss": 0.0793, "step": 21888 }, { "epoch": 6.691837358605931, "grad_norm": 0.6455984115600586, "learning_rate": 2.4043140418665875e-05, "loss": 0.094, "step": 21889 }, { "epoch": 6.692143075512075, "grad_norm": 0.36362460255622864, "learning_rate": 2.4042715808245933e-05, "loss": 0.1079, "step": 21890 }, { "epoch": 6.6924487924182205, "grad_norm": 0.8531476855278015, "learning_rate": 2.4042291197825996e-05, "loss": 0.0996, "step": 21891 }, { "epoch": 6.692754509324366, "grad_norm": 1.6739939451217651, "learning_rate": 2.4041866587406054e-05, "loss": 0.1212, "step": 21892 }, { "epoch": 6.693060226230511, "grad_norm": 1.3884636163711548, "learning_rate": 2.4041441976986116e-05, "loss": 0.1543, "step": 21893 }, { "epoch": 6.693365943136655, "grad_norm": 0.8089674115180969, "learning_rate": 2.4041017366566175e-05, "loss": 0.1722, "step": 21894 }, { "epoch": 6.6936716600428, "grad_norm": 1.182071566581726, "learning_rate": 2.4040592756146237e-05, "loss": 0.1769, "step": 21895 }, { "epoch": 6.693977376948945, "grad_norm": 0.6205013990402222, "learning_rate": 2.4040168145726296e-05, "loss": 0.1591, "step": 21896 }, { "epoch": 6.69428309385509, "grad_norm": 1.4086904525756836, "learning_rate": 2.4039743535306355e-05, "loss": 0.1465, "step": 21897 }, { "epoch": 6.6945888107612355, "grad_norm": 0.5664488077163696, "learning_rate": 2.4039318924886417e-05, "loss": 0.2092, "step": 21898 }, { "epoch": 6.69489452766738, "grad_norm": 0.7574003338813782, "learning_rate": 2.4038894314466475e-05, "loss": 0.1728, "step": 21899 }, { "epoch": 6.695200244573525, "grad_norm": 1.613892912864685, "learning_rate": 2.4038469704046537e-05, "loss": 0.1665, "step": 21900 }, { "epoch": 6.69550596147967, "grad_norm": 0.9728577733039856, "learning_rate": 2.4038045093626596e-05, "loss": 0.1954, "step": 21901 }, { "epoch": 6.695811678385815, "grad_norm": 0.6235082149505615, "learning_rate": 2.4037620483206658e-05, "loss": 0.1435, "step": 21902 }, { "epoch": 6.696117395291959, "grad_norm": 0.408829003572464, "learning_rate": 2.4037195872786717e-05, "loss": 0.0991, "step": 21903 }, { "epoch": 6.696423112198104, "grad_norm": 0.16643430292606354, "learning_rate": 2.403677126236678e-05, "loss": 0.0672, "step": 21904 }, { "epoch": 6.6967288291042495, "grad_norm": 0.1839747428894043, "learning_rate": 2.4036346651946838e-05, "loss": 0.0635, "step": 21905 }, { "epoch": 6.697034546010395, "grad_norm": 0.18919000029563904, "learning_rate": 2.40359220415269e-05, "loss": 0.0598, "step": 21906 }, { "epoch": 6.697340262916539, "grad_norm": 0.31339192390441895, "learning_rate": 2.403549743110696e-05, "loss": 0.0676, "step": 21907 }, { "epoch": 6.697645979822684, "grad_norm": 0.46282345056533813, "learning_rate": 2.403507282068702e-05, "loss": 0.0563, "step": 21908 }, { "epoch": 6.697951696728829, "grad_norm": 0.30597540736198425, "learning_rate": 2.403464821026708e-05, "loss": 0.0632, "step": 21909 }, { "epoch": 6.698257413634974, "grad_norm": 0.23378053307533264, "learning_rate": 2.4034223599847138e-05, "loss": 0.1051, "step": 21910 }, { "epoch": 6.698563130541119, "grad_norm": 0.3712295591831207, "learning_rate": 2.40337989894272e-05, "loss": 0.0575, "step": 21911 }, { "epoch": 6.6988688474472635, "grad_norm": 0.4985766112804413, "learning_rate": 2.403337437900726e-05, "loss": 0.0755, "step": 21912 }, { "epoch": 6.699174564353409, "grad_norm": 0.3408149480819702, "learning_rate": 2.403294976858732e-05, "loss": 0.0594, "step": 21913 }, { "epoch": 6.699480281259554, "grad_norm": 0.4229801893234253, "learning_rate": 2.403252515816738e-05, "loss": 0.068, "step": 21914 }, { "epoch": 6.699785998165699, "grad_norm": 0.40228286385536194, "learning_rate": 2.403210054774744e-05, "loss": 0.1263, "step": 21915 }, { "epoch": 6.700091715071843, "grad_norm": 0.6283367276191711, "learning_rate": 2.40316759373275e-05, "loss": 0.112, "step": 21916 }, { "epoch": 6.700397431977988, "grad_norm": 0.5225504040718079, "learning_rate": 2.4031251326907562e-05, "loss": 0.1232, "step": 21917 }, { "epoch": 6.700703148884133, "grad_norm": 2.6431403160095215, "learning_rate": 2.403082671648762e-05, "loss": 0.1358, "step": 21918 }, { "epoch": 6.701008865790278, "grad_norm": 0.8832088112831116, "learning_rate": 2.4030402106067683e-05, "loss": 0.1161, "step": 21919 }, { "epoch": 6.701314582696423, "grad_norm": 1.104292869567871, "learning_rate": 2.4029977495647745e-05, "loss": 0.137, "step": 21920 }, { "epoch": 6.701620299602568, "grad_norm": 0.6418956518173218, "learning_rate": 2.4029552885227807e-05, "loss": 0.1723, "step": 21921 }, { "epoch": 6.701926016508713, "grad_norm": 0.7084742188453674, "learning_rate": 2.4029128274807866e-05, "loss": 0.1904, "step": 21922 }, { "epoch": 6.702231733414858, "grad_norm": 0.8131171464920044, "learning_rate": 2.4028703664387925e-05, "loss": 0.1558, "step": 21923 }, { "epoch": 6.702537450321003, "grad_norm": 2.9835739135742188, "learning_rate": 2.4028279053967987e-05, "loss": 0.2141, "step": 21924 }, { "epoch": 6.702843167227147, "grad_norm": 0.9444133043289185, "learning_rate": 2.4027854443548046e-05, "loss": 0.1979, "step": 21925 }, { "epoch": 6.7031488841332925, "grad_norm": 0.8586975336074829, "learning_rate": 2.4027429833128108e-05, "loss": 0.1911, "step": 21926 }, { "epoch": 6.703454601039438, "grad_norm": 1.3980692625045776, "learning_rate": 2.4027005222708166e-05, "loss": 0.1596, "step": 21927 }, { "epoch": 6.703760317945583, "grad_norm": 0.39599424600601196, "learning_rate": 2.402658061228823e-05, "loss": 0.094, "step": 21928 }, { "epoch": 6.704066034851727, "grad_norm": 0.32087981700897217, "learning_rate": 2.4026156001868287e-05, "loss": 0.0571, "step": 21929 }, { "epoch": 6.704371751757872, "grad_norm": 0.4590475559234619, "learning_rate": 2.402573139144835e-05, "loss": 0.0524, "step": 21930 }, { "epoch": 6.704677468664017, "grad_norm": 0.24423855543136597, "learning_rate": 2.4025306781028408e-05, "loss": 0.0525, "step": 21931 }, { "epoch": 6.704983185570162, "grad_norm": 0.13489428162574768, "learning_rate": 2.402488217060847e-05, "loss": 0.0586, "step": 21932 }, { "epoch": 6.7052889024763065, "grad_norm": 0.30189892649650574, "learning_rate": 2.402445756018853e-05, "loss": 0.0768, "step": 21933 }, { "epoch": 6.705594619382452, "grad_norm": 0.3413833975791931, "learning_rate": 2.402403294976859e-05, "loss": 0.0773, "step": 21934 }, { "epoch": 6.705900336288597, "grad_norm": 0.31056466698646545, "learning_rate": 2.402360833934865e-05, "loss": 0.0521, "step": 21935 }, { "epoch": 6.706206053194742, "grad_norm": 0.24053989350795746, "learning_rate": 2.4023183728928708e-05, "loss": 0.0876, "step": 21936 }, { "epoch": 6.706511770100887, "grad_norm": 0.2405046671628952, "learning_rate": 2.402275911850877e-05, "loss": 0.084, "step": 21937 }, { "epoch": 6.706817487007031, "grad_norm": 0.22709541022777557, "learning_rate": 2.402233450808883e-05, "loss": 0.0811, "step": 21938 }, { "epoch": 6.707123203913176, "grad_norm": 0.3464573621749878, "learning_rate": 2.402190989766889e-05, "loss": 0.0661, "step": 21939 }, { "epoch": 6.707428920819321, "grad_norm": 0.30401545763015747, "learning_rate": 2.402148528724895e-05, "loss": 0.1043, "step": 21940 }, { "epoch": 6.7077346377254665, "grad_norm": 0.41404590010643005, "learning_rate": 2.4021060676829012e-05, "loss": 0.1399, "step": 21941 }, { "epoch": 6.708040354631611, "grad_norm": 0.5611311197280884, "learning_rate": 2.402063606640907e-05, "loss": 0.1736, "step": 21942 }, { "epoch": 6.708346071537756, "grad_norm": 0.45839402079582214, "learning_rate": 2.4020211455989133e-05, "loss": 0.1441, "step": 21943 }, { "epoch": 6.708651788443901, "grad_norm": 0.5154041051864624, "learning_rate": 2.401978684556919e-05, "loss": 0.1772, "step": 21944 }, { "epoch": 6.708957505350046, "grad_norm": 0.36520662903785706, "learning_rate": 2.4019362235149253e-05, "loss": 0.1687, "step": 21945 }, { "epoch": 6.70926322225619, "grad_norm": 1.1324902772903442, "learning_rate": 2.4018937624729312e-05, "loss": 0.212, "step": 21946 }, { "epoch": 6.709568939162335, "grad_norm": 0.6549566388130188, "learning_rate": 2.4018513014309374e-05, "loss": 0.1552, "step": 21947 }, { "epoch": 6.7098746560684805, "grad_norm": 0.7213472723960876, "learning_rate": 2.4018088403889433e-05, "loss": 0.1655, "step": 21948 }, { "epoch": 6.710180372974626, "grad_norm": 1.1576792001724243, "learning_rate": 2.401766379346949e-05, "loss": 0.1757, "step": 21949 }, { "epoch": 6.710486089880771, "grad_norm": NaN, "learning_rate": 2.401766379346949e-05, "loss": 0.2087, "step": 21950 }, { "epoch": 6.710791806786915, "grad_norm": 0.6904534101486206, "learning_rate": 2.4017239183049554e-05, "loss": 0.1924, "step": 21951 }, { "epoch": 6.71109752369306, "grad_norm": 0.33691805601119995, "learning_rate": 2.4016814572629612e-05, "loss": 0.1448, "step": 21952 }, { "epoch": 6.711403240599205, "grad_norm": 0.38677871227264404, "learning_rate": 2.4016389962209674e-05, "loss": 0.0798, "step": 21953 }, { "epoch": 6.71170895750535, "grad_norm": 0.25880950689315796, "learning_rate": 2.4015965351789733e-05, "loss": 0.0603, "step": 21954 }, { "epoch": 6.712014674411495, "grad_norm": 0.22804765403270721, "learning_rate": 2.4015540741369795e-05, "loss": 0.0725, "step": 21955 }, { "epoch": 6.71232039131764, "grad_norm": 0.14330996572971344, "learning_rate": 2.4015116130949854e-05, "loss": 0.0551, "step": 21956 }, { "epoch": 6.712626108223785, "grad_norm": 0.450147807598114, "learning_rate": 2.4014691520529916e-05, "loss": 0.0709, "step": 21957 }, { "epoch": 6.71293182512993, "grad_norm": 0.2138645350933075, "learning_rate": 2.4014266910109975e-05, "loss": 0.0336, "step": 21958 }, { "epoch": 6.713237542036074, "grad_norm": 0.181799054145813, "learning_rate": 2.4013842299690037e-05, "loss": 0.0441, "step": 21959 }, { "epoch": 6.713543258942219, "grad_norm": 0.2695091962814331, "learning_rate": 2.4013417689270096e-05, "loss": 0.0688, "step": 21960 }, { "epoch": 6.713848975848364, "grad_norm": 0.2523859143257141, "learning_rate": 2.4012993078850154e-05, "loss": 0.0645, "step": 21961 }, { "epoch": 6.7141546927545095, "grad_norm": 0.30852559208869934, "learning_rate": 2.4012568468430216e-05, "loss": 0.0922, "step": 21962 }, { "epoch": 6.714460409660655, "grad_norm": 0.2554326355457306, "learning_rate": 2.4012143858010275e-05, "loss": 0.0703, "step": 21963 }, { "epoch": 6.714766126566799, "grad_norm": 0.37584686279296875, "learning_rate": 2.4011719247590337e-05, "loss": 0.1311, "step": 21964 }, { "epoch": 6.715071843472944, "grad_norm": 0.5567603707313538, "learning_rate": 2.4011294637170396e-05, "loss": 0.1019, "step": 21965 }, { "epoch": 6.715377560379089, "grad_norm": 0.5890644788742065, "learning_rate": 2.4010870026750458e-05, "loss": 0.1577, "step": 21966 }, { "epoch": 6.715683277285234, "grad_norm": 0.2942119538784027, "learning_rate": 2.4010445416330517e-05, "loss": 0.1754, "step": 21967 }, { "epoch": 6.715988994191378, "grad_norm": 0.8637819886207581, "learning_rate": 2.401002080591058e-05, "loss": 0.1585, "step": 21968 }, { "epoch": 6.7162947110975235, "grad_norm": 0.5321954488754272, "learning_rate": 2.4009596195490637e-05, "loss": 0.1808, "step": 21969 }, { "epoch": 6.716600428003669, "grad_norm": 0.5478083491325378, "learning_rate": 2.40091715850707e-05, "loss": 0.1549, "step": 21970 }, { "epoch": 6.716906144909814, "grad_norm": 0.39137616753578186, "learning_rate": 2.4008746974650758e-05, "loss": 0.1648, "step": 21971 }, { "epoch": 6.717211861815958, "grad_norm": 0.9547986388206482, "learning_rate": 2.400832236423082e-05, "loss": 0.1721, "step": 21972 }, { "epoch": 6.717517578722103, "grad_norm": 0.4797540009021759, "learning_rate": 2.400789775381088e-05, "loss": 0.1504, "step": 21973 }, { "epoch": 6.717823295628248, "grad_norm": 0.679997444152832, "learning_rate": 2.4007473143390938e-05, "loss": 0.2277, "step": 21974 }, { "epoch": 6.718129012534393, "grad_norm": 0.9945102334022522, "learning_rate": 2.4007048532971e-05, "loss": 0.1977, "step": 21975 }, { "epoch": 6.718434729440538, "grad_norm": 0.9385262727737427, "learning_rate": 2.400662392255106e-05, "loss": 0.2104, "step": 21976 }, { "epoch": 6.718740446346683, "grad_norm": 0.2917172312736511, "learning_rate": 2.400619931213112e-05, "loss": 0.1253, "step": 21977 }, { "epoch": 6.719046163252828, "grad_norm": 0.5217665433883667, "learning_rate": 2.400577470171118e-05, "loss": 0.07, "step": 21978 }, { "epoch": 6.719351880158973, "grad_norm": 0.1605129987001419, "learning_rate": 2.400535009129124e-05, "loss": 0.0525, "step": 21979 }, { "epoch": 6.719657597065118, "grad_norm": 0.20502214133739471, "learning_rate": 2.40049254808713e-05, "loss": 0.0838, "step": 21980 }, { "epoch": 6.719963313971262, "grad_norm": 0.2536808252334595, "learning_rate": 2.4004500870451362e-05, "loss": 0.06, "step": 21981 }, { "epoch": 6.720269030877407, "grad_norm": 0.2720838785171509, "learning_rate": 2.400407626003142e-05, "loss": 0.0844, "step": 21982 }, { "epoch": 6.7205747477835525, "grad_norm": 0.6003649234771729, "learning_rate": 2.4003651649611483e-05, "loss": 0.0718, "step": 21983 }, { "epoch": 6.720880464689698, "grad_norm": 0.33081555366516113, "learning_rate": 2.400322703919154e-05, "loss": 0.057, "step": 21984 }, { "epoch": 6.721186181595842, "grad_norm": 0.4012954831123352, "learning_rate": 2.4002802428771604e-05, "loss": 0.0793, "step": 21985 }, { "epoch": 6.721491898501987, "grad_norm": 0.2759213447570801, "learning_rate": 2.4002377818351662e-05, "loss": 0.0529, "step": 21986 }, { "epoch": 6.721797615408132, "grad_norm": 0.928045392036438, "learning_rate": 2.400195320793172e-05, "loss": 0.0715, "step": 21987 }, { "epoch": 6.722103332314277, "grad_norm": 0.34352660179138184, "learning_rate": 2.4001528597511783e-05, "loss": 0.0533, "step": 21988 }, { "epoch": 6.722409049220422, "grad_norm": 0.22159846127033234, "learning_rate": 2.4001103987091842e-05, "loss": 0.1019, "step": 21989 }, { "epoch": 6.7227147661265665, "grad_norm": 0.3065022826194763, "learning_rate": 2.4000679376671904e-05, "loss": 0.1223, "step": 21990 }, { "epoch": 6.723020483032712, "grad_norm": 0.5063134431838989, "learning_rate": 2.4000254766251963e-05, "loss": 0.1252, "step": 21991 }, { "epoch": 6.723326199938857, "grad_norm": 0.26812851428985596, "learning_rate": 2.3999830155832025e-05, "loss": 0.1101, "step": 21992 }, { "epoch": 6.723631916845002, "grad_norm": 0.3818185031414032, "learning_rate": 2.3999405545412083e-05, "loss": 0.1379, "step": 21993 }, { "epoch": 6.723937633751146, "grad_norm": 0.684481680393219, "learning_rate": 2.3998980934992146e-05, "loss": 0.1319, "step": 21994 }, { "epoch": 6.724243350657291, "grad_norm": 0.6985824108123779, "learning_rate": 2.3998556324572204e-05, "loss": 0.1724, "step": 21995 }, { "epoch": 6.724549067563436, "grad_norm": 0.8153463006019592, "learning_rate": 2.3998131714152266e-05, "loss": 0.1666, "step": 21996 }, { "epoch": 6.724854784469581, "grad_norm": 0.569279670715332, "learning_rate": 2.3997707103732325e-05, "loss": 0.1555, "step": 21997 }, { "epoch": 6.725160501375726, "grad_norm": 0.6187518835067749, "learning_rate": 2.3997282493312387e-05, "loss": 0.1597, "step": 21998 }, { "epoch": 6.725466218281871, "grad_norm": 0.544073760509491, "learning_rate": 2.3996857882892446e-05, "loss": 0.1729, "step": 21999 }, { "epoch": 6.725771935188016, "grad_norm": 1.339949369430542, "learning_rate": 2.3996433272472505e-05, "loss": 0.159, "step": 22000 }, { "epoch": 6.725771935188016, "eval_cer": 0.1872879229266909, "eval_loss": 0.23102498054504395, "eval_runtime": 19.4266, "eval_samples_per_second": 233.597, "eval_steps_per_second": 0.772, "eval_wer": 0.3243205497053155, "step": 22000 }, { "epoch": 6.726077652094161, "grad_norm": 1.2415074110031128, "learning_rate": 2.3996008662052567e-05, "loss": 0.2281, "step": 22001 }, { "epoch": 6.726383369000306, "grad_norm": 0.4775652289390564, "learning_rate": 2.3995584051632625e-05, "loss": 0.1484, "step": 22002 }, { "epoch": 6.72668908590645, "grad_norm": 1.5133097171783447, "learning_rate": 2.3995159441212687e-05, "loss": 0.07, "step": 22003 }, { "epoch": 6.726994802812595, "grad_norm": 0.18260619044303894, "learning_rate": 2.3994734830792746e-05, "loss": 0.0605, "step": 22004 }, { "epoch": 6.7273005197187405, "grad_norm": 0.1623542755842209, "learning_rate": 2.3994310220372808e-05, "loss": 0.0542, "step": 22005 }, { "epoch": 6.727606236624886, "grad_norm": 0.41701072454452515, "learning_rate": 2.3993885609952867e-05, "loss": 0.0638, "step": 22006 }, { "epoch": 6.72791195353103, "grad_norm": 0.16947892308235168, "learning_rate": 2.399346099953293e-05, "loss": 0.0499, "step": 22007 }, { "epoch": 6.728217670437175, "grad_norm": 0.44637343287467957, "learning_rate": 2.3993036389112988e-05, "loss": 0.0745, "step": 22008 }, { "epoch": 6.72852338734332, "grad_norm": 0.2224861979484558, "learning_rate": 2.399261177869305e-05, "loss": 0.074, "step": 22009 }, { "epoch": 6.728829104249465, "grad_norm": 0.2776935398578644, "learning_rate": 2.399218716827311e-05, "loss": 0.0695, "step": 22010 }, { "epoch": 6.7291348211556095, "grad_norm": 0.3001580536365509, "learning_rate": 2.399176255785317e-05, "loss": 0.0716, "step": 22011 }, { "epoch": 6.729440538061755, "grad_norm": 0.591814398765564, "learning_rate": 2.399133794743323e-05, "loss": 0.1077, "step": 22012 }, { "epoch": 6.7297462549679, "grad_norm": 0.6155014634132385, "learning_rate": 2.3990913337013288e-05, "loss": 0.0729, "step": 22013 }, { "epoch": 6.730051971874045, "grad_norm": 0.44223490357398987, "learning_rate": 2.399048872659335e-05, "loss": 0.0855, "step": 22014 }, { "epoch": 6.73035768878019, "grad_norm": 0.6856004595756531, "learning_rate": 2.399006411617341e-05, "loss": 0.1052, "step": 22015 }, { "epoch": 6.730663405686334, "grad_norm": 0.408417284488678, "learning_rate": 2.398963950575347e-05, "loss": 0.1476, "step": 22016 }, { "epoch": 6.730969122592479, "grad_norm": 0.39184409379959106, "learning_rate": 2.398921489533353e-05, "loss": 0.1275, "step": 22017 }, { "epoch": 6.731274839498624, "grad_norm": 0.38065800070762634, "learning_rate": 2.398879028491359e-05, "loss": 0.1298, "step": 22018 }, { "epoch": 6.7315805564047695, "grad_norm": 0.645467221736908, "learning_rate": 2.398836567449365e-05, "loss": 0.1753, "step": 22019 }, { "epoch": 6.731886273310914, "grad_norm": 0.38796466588974, "learning_rate": 2.3987941064073712e-05, "loss": 0.1779, "step": 22020 }, { "epoch": 6.732191990217059, "grad_norm": 1.6639348268508911, "learning_rate": 2.398751645365377e-05, "loss": 0.1989, "step": 22021 }, { "epoch": 6.732497707123204, "grad_norm": 0.9327749013900757, "learning_rate": 2.3987091843233833e-05, "loss": 0.1742, "step": 22022 }, { "epoch": 6.732803424029349, "grad_norm": 0.8240117430686951, "learning_rate": 2.3986667232813895e-05, "loss": 0.1522, "step": 22023 }, { "epoch": 6.733109140935493, "grad_norm": 1.173445701599121, "learning_rate": 2.3986242622393957e-05, "loss": 0.2037, "step": 22024 }, { "epoch": 6.733414857841638, "grad_norm": 1.0768722295761108, "learning_rate": 2.3985818011974016e-05, "loss": 0.1862, "step": 22025 }, { "epoch": 6.7337205747477835, "grad_norm": 1.9875823259353638, "learning_rate": 2.3985393401554075e-05, "loss": 0.2023, "step": 22026 }, { "epoch": 6.734026291653929, "grad_norm": 0.3512042164802551, "learning_rate": 2.3984968791134137e-05, "loss": 0.1477, "step": 22027 }, { "epoch": 6.734332008560074, "grad_norm": 0.5740318298339844, "learning_rate": 2.3984544180714196e-05, "loss": 0.07, "step": 22028 }, { "epoch": 6.734637725466218, "grad_norm": 0.3794001340866089, "learning_rate": 2.3984119570294258e-05, "loss": 0.0632, "step": 22029 }, { "epoch": 6.734943442372363, "grad_norm": 0.15018558502197266, "learning_rate": 2.3983694959874316e-05, "loss": 0.0415, "step": 22030 }, { "epoch": 6.735249159278508, "grad_norm": 0.29959091544151306, "learning_rate": 2.398327034945438e-05, "loss": 0.0497, "step": 22031 }, { "epoch": 6.735554876184653, "grad_norm": 0.18461142480373383, "learning_rate": 2.3982845739034437e-05, "loss": 0.0591, "step": 22032 }, { "epoch": 6.7358605930907975, "grad_norm": 0.22369159758090973, "learning_rate": 2.39824211286145e-05, "loss": 0.0512, "step": 22033 }, { "epoch": 6.736166309996943, "grad_norm": 0.23237556219100952, "learning_rate": 2.3981996518194558e-05, "loss": 0.0574, "step": 22034 }, { "epoch": 6.736472026903088, "grad_norm": 1.0590800046920776, "learning_rate": 2.398157190777462e-05, "loss": 0.0497, "step": 22035 }, { "epoch": 6.736777743809233, "grad_norm": 0.27768170833587646, "learning_rate": 2.398114729735468e-05, "loss": 0.0607, "step": 22036 }, { "epoch": 6.737083460715377, "grad_norm": 0.4789730906486511, "learning_rate": 2.398072268693474e-05, "loss": 0.146, "step": 22037 }, { "epoch": 6.737389177621522, "grad_norm": 0.8871320486068726, "learning_rate": 2.39802980765148e-05, "loss": 0.048, "step": 22038 }, { "epoch": 6.737694894527667, "grad_norm": 0.266340970993042, "learning_rate": 2.3979873466094858e-05, "loss": 0.0932, "step": 22039 }, { "epoch": 6.7380006114338125, "grad_norm": 0.556869387626648, "learning_rate": 2.397944885567492e-05, "loss": 0.0905, "step": 22040 }, { "epoch": 6.738306328339958, "grad_norm": 0.46392861008644104, "learning_rate": 2.397902424525498e-05, "loss": 0.1187, "step": 22041 }, { "epoch": 6.738612045246102, "grad_norm": 1.2892489433288574, "learning_rate": 2.397859963483504e-05, "loss": 0.1568, "step": 22042 }, { "epoch": 6.738917762152247, "grad_norm": 0.41729938983917236, "learning_rate": 2.39781750244151e-05, "loss": 0.1505, "step": 22043 }, { "epoch": 6.739223479058392, "grad_norm": 1.3890869617462158, "learning_rate": 2.3977750413995162e-05, "loss": 0.1413, "step": 22044 }, { "epoch": 6.739529195964537, "grad_norm": 0.5887420773506165, "learning_rate": 2.397732580357522e-05, "loss": 0.1689, "step": 22045 }, { "epoch": 6.739834912870681, "grad_norm": 0.6325968503952026, "learning_rate": 2.3976901193155283e-05, "loss": 0.155, "step": 22046 }, { "epoch": 6.7401406297768265, "grad_norm": 1.3345471620559692, "learning_rate": 2.397647658273534e-05, "loss": 0.1819, "step": 22047 }, { "epoch": 6.740446346682972, "grad_norm": 0.470140278339386, "learning_rate": 2.3976051972315403e-05, "loss": 0.1492, "step": 22048 }, { "epoch": 6.740752063589117, "grad_norm": 1.0266964435577393, "learning_rate": 2.3975627361895462e-05, "loss": 0.2061, "step": 22049 }, { "epoch": 6.741057780495261, "grad_norm": 0.5923460721969604, "learning_rate": 2.3975202751475524e-05, "loss": 0.1712, "step": 22050 }, { "epoch": 6.741363497401406, "grad_norm": 2.8327434062957764, "learning_rate": 2.3974778141055583e-05, "loss": 0.2446, "step": 22051 }, { "epoch": 6.741669214307551, "grad_norm": 0.682319164276123, "learning_rate": 2.397435353063564e-05, "loss": 0.1525, "step": 22052 }, { "epoch": 6.741974931213696, "grad_norm": 0.21207718551158905, "learning_rate": 2.3973928920215704e-05, "loss": 0.0779, "step": 22053 }, { "epoch": 6.742280648119841, "grad_norm": 0.9463611245155334, "learning_rate": 2.3973504309795762e-05, "loss": 0.1086, "step": 22054 }, { "epoch": 6.742586365025986, "grad_norm": 0.18615666031837463, "learning_rate": 2.3973079699375825e-05, "loss": 0.0517, "step": 22055 }, { "epoch": 6.742892081932131, "grad_norm": 0.6164522171020508, "learning_rate": 2.3972655088955883e-05, "loss": 0.0624, "step": 22056 }, { "epoch": 6.743197798838276, "grad_norm": 0.22773179411888123, "learning_rate": 2.3972230478535945e-05, "loss": 0.0435, "step": 22057 }, { "epoch": 6.743503515744421, "grad_norm": 0.5702065825462341, "learning_rate": 2.3971805868116004e-05, "loss": 0.0513, "step": 22058 }, { "epoch": 6.743809232650565, "grad_norm": 0.17938461899757385, "learning_rate": 2.3971381257696066e-05, "loss": 0.0518, "step": 22059 }, { "epoch": 6.74411494955671, "grad_norm": 0.2977841794490814, "learning_rate": 2.3970956647276125e-05, "loss": 0.0592, "step": 22060 }, { "epoch": 6.744420666462855, "grad_norm": 0.2356501817703247, "learning_rate": 2.3970532036856187e-05, "loss": 0.0617, "step": 22061 }, { "epoch": 6.7447263833690005, "grad_norm": 0.5651552677154541, "learning_rate": 2.3970107426436246e-05, "loss": 0.1037, "step": 22062 }, { "epoch": 6.745032100275145, "grad_norm": 0.20315955579280853, "learning_rate": 2.3969682816016308e-05, "loss": 0.0732, "step": 22063 }, { "epoch": 6.74533781718129, "grad_norm": 0.3765391707420349, "learning_rate": 2.3969258205596366e-05, "loss": 0.0999, "step": 22064 }, { "epoch": 6.745643534087435, "grad_norm": 0.24462711811065674, "learning_rate": 2.3968833595176425e-05, "loss": 0.0932, "step": 22065 }, { "epoch": 6.74594925099358, "grad_norm": 0.3502673804759979, "learning_rate": 2.3968408984756487e-05, "loss": 0.1213, "step": 22066 }, { "epoch": 6.746254967899725, "grad_norm": 0.30101361870765686, "learning_rate": 2.3967984374336546e-05, "loss": 0.1107, "step": 22067 }, { "epoch": 6.7465606848058695, "grad_norm": 0.3462139070034027, "learning_rate": 2.3967559763916608e-05, "loss": 0.1549, "step": 22068 }, { "epoch": 6.746866401712015, "grad_norm": 0.7049499750137329, "learning_rate": 2.3967135153496667e-05, "loss": 0.1153, "step": 22069 }, { "epoch": 6.74717211861816, "grad_norm": 0.6949379444122314, "learning_rate": 2.396671054307673e-05, "loss": 0.1465, "step": 22070 }, { "epoch": 6.747477835524305, "grad_norm": 1.1516047716140747, "learning_rate": 2.3966285932656787e-05, "loss": 0.158, "step": 22071 }, { "epoch": 6.747783552430449, "grad_norm": 0.4526146352291107, "learning_rate": 2.396586132223685e-05, "loss": 0.1539, "step": 22072 }, { "epoch": 6.748089269336594, "grad_norm": 1.1086034774780273, "learning_rate": 2.3965436711816908e-05, "loss": 0.1667, "step": 22073 }, { "epoch": 6.748394986242739, "grad_norm": 0.8599976301193237, "learning_rate": 2.396501210139697e-05, "loss": 0.1837, "step": 22074 }, { "epoch": 6.748700703148884, "grad_norm": 0.7905387282371521, "learning_rate": 2.396458749097703e-05, "loss": 0.2332, "step": 22075 }, { "epoch": 6.749006420055029, "grad_norm": 1.7540920972824097, "learning_rate": 2.3964162880557088e-05, "loss": 0.1813, "step": 22076 }, { "epoch": 6.749312136961174, "grad_norm": 0.4914014935493469, "learning_rate": 2.396373827013715e-05, "loss": 0.1171, "step": 22077 }, { "epoch": 6.749617853867319, "grad_norm": 0.47751086950302124, "learning_rate": 2.396331365971721e-05, "loss": 0.0805, "step": 22078 }, { "epoch": 6.749923570773464, "grad_norm": 0.16971881687641144, "learning_rate": 2.396288904929727e-05, "loss": 0.0643, "step": 22079 }, { "epoch": 6.750229287679609, "grad_norm": 0.15878702700138092, "learning_rate": 2.396246443887733e-05, "loss": 0.0522, "step": 22080 }, { "epoch": 6.750535004585753, "grad_norm": 0.20208881795406342, "learning_rate": 2.396203982845739e-05, "loss": 0.0568, "step": 22081 }, { "epoch": 6.750840721491898, "grad_norm": 0.27959853410720825, "learning_rate": 2.396161521803745e-05, "loss": 0.046, "step": 22082 }, { "epoch": 6.7511464383980435, "grad_norm": 0.25937163829803467, "learning_rate": 2.3961190607617512e-05, "loss": 0.0457, "step": 22083 }, { "epoch": 6.751452155304189, "grad_norm": 0.23937684297561646, "learning_rate": 2.396076599719757e-05, "loss": 0.0452, "step": 22084 }, { "epoch": 6.751757872210333, "grad_norm": 0.2931431829929352, "learning_rate": 2.3960341386777633e-05, "loss": 0.0548, "step": 22085 }, { "epoch": 6.752063589116478, "grad_norm": 0.39840444922447205, "learning_rate": 2.395991677635769e-05, "loss": 0.0653, "step": 22086 }, { "epoch": 6.752369306022623, "grad_norm": 0.36240506172180176, "learning_rate": 2.3959492165937754e-05, "loss": 0.0833, "step": 22087 }, { "epoch": 6.752675022928768, "grad_norm": 0.2338925153017044, "learning_rate": 2.3959067555517812e-05, "loss": 0.0704, "step": 22088 }, { "epoch": 6.752980739834912, "grad_norm": 0.3176068365573883, "learning_rate": 2.395864294509787e-05, "loss": 0.0629, "step": 22089 }, { "epoch": 6.7532864567410575, "grad_norm": 0.4133112132549286, "learning_rate": 2.3958218334677933e-05, "loss": 0.1019, "step": 22090 }, { "epoch": 6.753592173647203, "grad_norm": 1.584756851196289, "learning_rate": 2.3957793724257992e-05, "loss": 0.1093, "step": 22091 }, { "epoch": 6.753897890553348, "grad_norm": 0.317825049161911, "learning_rate": 2.3957369113838054e-05, "loss": 0.1314, "step": 22092 }, { "epoch": 6.754203607459493, "grad_norm": 1.14395272731781, "learning_rate": 2.3956944503418113e-05, "loss": 0.1978, "step": 22093 }, { "epoch": 6.754509324365637, "grad_norm": 0.31262269616127014, "learning_rate": 2.3956519892998175e-05, "loss": 0.1494, "step": 22094 }, { "epoch": 6.754815041271782, "grad_norm": 1.5282357931137085, "learning_rate": 2.3956095282578233e-05, "loss": 0.1609, "step": 22095 }, { "epoch": 6.755120758177927, "grad_norm": 0.8211148381233215, "learning_rate": 2.3955670672158296e-05, "loss": 0.1601, "step": 22096 }, { "epoch": 6.7554264750840725, "grad_norm": 0.4980211853981018, "learning_rate": 2.3955246061738354e-05, "loss": 0.164, "step": 22097 }, { "epoch": 6.755732191990217, "grad_norm": 0.4649371802806854, "learning_rate": 2.3954821451318416e-05, "loss": 0.1794, "step": 22098 }, { "epoch": 6.756037908896362, "grad_norm": 1.056717038154602, "learning_rate": 2.3954396840898475e-05, "loss": 0.1612, "step": 22099 }, { "epoch": 6.756343625802507, "grad_norm": 0.5079130530357361, "learning_rate": 2.3953972230478537e-05, "loss": 0.1703, "step": 22100 }, { "epoch": 6.756649342708652, "grad_norm": 0.8013178706169128, "learning_rate": 2.3953547620058596e-05, "loss": 0.2036, "step": 22101 }, { "epoch": 6.756955059614796, "grad_norm": 0.4753950238227844, "learning_rate": 2.3953123009638655e-05, "loss": 0.1328, "step": 22102 }, { "epoch": 6.757260776520941, "grad_norm": 0.29855895042419434, "learning_rate": 2.3952698399218717e-05, "loss": 0.0997, "step": 22103 }, { "epoch": 6.7575664934270865, "grad_norm": 0.583743155002594, "learning_rate": 2.3952273788798775e-05, "loss": 0.0508, "step": 22104 }, { "epoch": 6.757872210333232, "grad_norm": 0.1324227899312973, "learning_rate": 2.3951849178378837e-05, "loss": 0.0541, "step": 22105 }, { "epoch": 6.758177927239377, "grad_norm": 0.17015790939331055, "learning_rate": 2.3951424567958896e-05, "loss": 0.0533, "step": 22106 }, { "epoch": 6.758483644145521, "grad_norm": 0.23280058801174164, "learning_rate": 2.3950999957538958e-05, "loss": 0.0583, "step": 22107 }, { "epoch": 6.758789361051666, "grad_norm": 0.202836811542511, "learning_rate": 2.3950575347119017e-05, "loss": 0.0478, "step": 22108 }, { "epoch": 6.759095077957811, "grad_norm": 0.1993943601846695, "learning_rate": 2.395015073669908e-05, "loss": 0.0713, "step": 22109 }, { "epoch": 6.759400794863956, "grad_norm": 0.761286199092865, "learning_rate": 2.3949726126279138e-05, "loss": 0.0675, "step": 22110 }, { "epoch": 6.7597065117701005, "grad_norm": 0.4725548326969147, "learning_rate": 2.39493015158592e-05, "loss": 0.0663, "step": 22111 }, { "epoch": 6.760012228676246, "grad_norm": 0.3985122740268707, "learning_rate": 2.394887690543926e-05, "loss": 0.0938, "step": 22112 }, { "epoch": 6.760317945582391, "grad_norm": 0.24228695034980774, "learning_rate": 2.394845229501932e-05, "loss": 0.1001, "step": 22113 }, { "epoch": 6.760623662488536, "grad_norm": 0.5579488277435303, "learning_rate": 2.394802768459938e-05, "loss": 0.1055, "step": 22114 }, { "epoch": 6.76092937939468, "grad_norm": 0.7231791615486145, "learning_rate": 2.3947603074179438e-05, "loss": 0.076, "step": 22115 }, { "epoch": 6.761235096300825, "grad_norm": 1.103756070137024, "learning_rate": 2.39471784637595e-05, "loss": 0.1316, "step": 22116 }, { "epoch": 6.76154081320697, "grad_norm": 1.5581403970718384, "learning_rate": 2.394675385333956e-05, "loss": 0.1299, "step": 22117 }, { "epoch": 6.761846530113115, "grad_norm": 0.4583662748336792, "learning_rate": 2.394632924291962e-05, "loss": 0.1309, "step": 22118 }, { "epoch": 6.7621522470192605, "grad_norm": 0.34385061264038086, "learning_rate": 2.394590463249968e-05, "loss": 0.1276, "step": 22119 }, { "epoch": 6.762457963925405, "grad_norm": 0.48499980568885803, "learning_rate": 2.394548002207974e-05, "loss": 0.1389, "step": 22120 }, { "epoch": 6.76276368083155, "grad_norm": 0.3256663382053375, "learning_rate": 2.39450554116598e-05, "loss": 0.1445, "step": 22121 }, { "epoch": 6.763069397737695, "grad_norm": 0.851608157157898, "learning_rate": 2.3944630801239862e-05, "loss": 0.1771, "step": 22122 }, { "epoch": 6.76337511464384, "grad_norm": 0.5936332941055298, "learning_rate": 2.394420619081992e-05, "loss": 0.1497, "step": 22123 }, { "epoch": 6.763680831549984, "grad_norm": 0.5006363391876221, "learning_rate": 2.3943781580399983e-05, "loss": 0.1616, "step": 22124 }, { "epoch": 6.7639865484561295, "grad_norm": 1.0667184591293335, "learning_rate": 2.3943356969980045e-05, "loss": 0.1585, "step": 22125 }, { "epoch": 6.764292265362275, "grad_norm": 2.4724884033203125, "learning_rate": 2.3942932359560107e-05, "loss": 0.2038, "step": 22126 }, { "epoch": 6.76459798226842, "grad_norm": 0.4144921600818634, "learning_rate": 2.3942507749140166e-05, "loss": 0.1409, "step": 22127 }, { "epoch": 6.764903699174564, "grad_norm": 0.3059692084789276, "learning_rate": 2.3942083138720225e-05, "loss": 0.0771, "step": 22128 }, { "epoch": 6.765209416080709, "grad_norm": 0.722633957862854, "learning_rate": 2.3941658528300287e-05, "loss": 0.0774, "step": 22129 }, { "epoch": 6.765515132986854, "grad_norm": 0.2799307107925415, "learning_rate": 2.3941233917880346e-05, "loss": 0.0516, "step": 22130 }, { "epoch": 6.765820849892999, "grad_norm": 0.2800896465778351, "learning_rate": 2.3940809307460408e-05, "loss": 0.0606, "step": 22131 }, { "epoch": 6.766126566799144, "grad_norm": 0.24451768398284912, "learning_rate": 2.3940384697040466e-05, "loss": 0.0556, "step": 22132 }, { "epoch": 6.766432283705289, "grad_norm": 0.20367498695850372, "learning_rate": 2.393996008662053e-05, "loss": 0.0556, "step": 22133 }, { "epoch": 6.766738000611434, "grad_norm": 0.2766670286655426, "learning_rate": 2.3939535476200587e-05, "loss": 0.0679, "step": 22134 }, { "epoch": 6.767043717517579, "grad_norm": 0.2938487231731415, "learning_rate": 2.393911086578065e-05, "loss": 0.0518, "step": 22135 }, { "epoch": 6.767349434423724, "grad_norm": 0.4176516532897949, "learning_rate": 2.3938686255360708e-05, "loss": 0.0608, "step": 22136 }, { "epoch": 6.767655151329868, "grad_norm": 0.385861873626709, "learning_rate": 2.393826164494077e-05, "loss": 0.1142, "step": 22137 }, { "epoch": 6.767960868236013, "grad_norm": 0.9565507173538208, "learning_rate": 2.393783703452083e-05, "loss": 0.0755, "step": 22138 }, { "epoch": 6.768266585142158, "grad_norm": 0.38155287504196167, "learning_rate": 2.393741242410089e-05, "loss": 0.1669, "step": 22139 }, { "epoch": 6.7685723020483035, "grad_norm": 0.42059633135795593, "learning_rate": 2.393698781368095e-05, "loss": 0.1036, "step": 22140 }, { "epoch": 6.768878018954448, "grad_norm": 0.49113553762435913, "learning_rate": 2.3936563203261008e-05, "loss": 0.1182, "step": 22141 }, { "epoch": 6.769183735860593, "grad_norm": 0.8836022019386292, "learning_rate": 2.393613859284107e-05, "loss": 0.1381, "step": 22142 }, { "epoch": 6.769489452766738, "grad_norm": 0.7976353764533997, "learning_rate": 2.393571398242113e-05, "loss": 0.1259, "step": 22143 }, { "epoch": 6.769795169672883, "grad_norm": 0.49823397397994995, "learning_rate": 2.393528937200119e-05, "loss": 0.1761, "step": 22144 }, { "epoch": 6.770100886579028, "grad_norm": 0.8388669490814209, "learning_rate": 2.393486476158125e-05, "loss": 0.1385, "step": 22145 }, { "epoch": 6.770406603485172, "grad_norm": 1.290209174156189, "learning_rate": 2.3934440151161312e-05, "loss": 0.1696, "step": 22146 }, { "epoch": 6.7707123203913175, "grad_norm": 0.6921802163124084, "learning_rate": 2.393401554074137e-05, "loss": 0.148, "step": 22147 }, { "epoch": 6.771018037297463, "grad_norm": 1.8855782747268677, "learning_rate": 2.3933590930321433e-05, "loss": 0.2011, "step": 22148 }, { "epoch": 6.771323754203608, "grad_norm": 0.5009734034538269, "learning_rate": 2.393316631990149e-05, "loss": 0.1672, "step": 22149 }, { "epoch": 6.771629471109752, "grad_norm": 1.3163466453552246, "learning_rate": 2.3932741709481553e-05, "loss": 0.1709, "step": 22150 }, { "epoch": 6.771935188015897, "grad_norm": 0.8382934927940369, "learning_rate": 2.3932317099061612e-05, "loss": 0.2029, "step": 22151 }, { "epoch": 6.772240904922042, "grad_norm": 0.4371887147426605, "learning_rate": 2.3931892488641674e-05, "loss": 0.1335, "step": 22152 }, { "epoch": 6.772546621828187, "grad_norm": 0.7250796556472778, "learning_rate": 2.3931467878221733e-05, "loss": 0.0697, "step": 22153 }, { "epoch": 6.772852338734332, "grad_norm": 0.21932367980480194, "learning_rate": 2.393104326780179e-05, "loss": 0.0668, "step": 22154 }, { "epoch": 6.773158055640477, "grad_norm": 0.22569458186626434, "learning_rate": 2.3930618657381854e-05, "loss": 0.0761, "step": 22155 }, { "epoch": 6.773463772546622, "grad_norm": 0.23016543686389923, "learning_rate": 2.3930194046961912e-05, "loss": 0.0484, "step": 22156 }, { "epoch": 6.773769489452767, "grad_norm": 0.27496129274368286, "learning_rate": 2.3929769436541975e-05, "loss": 0.0526, "step": 22157 }, { "epoch": 6.774075206358912, "grad_norm": 0.3230811357498169, "learning_rate": 2.3929344826122033e-05, "loss": 0.0684, "step": 22158 }, { "epoch": 6.774380923265056, "grad_norm": 0.2153768241405487, "learning_rate": 2.3928920215702095e-05, "loss": 0.0524, "step": 22159 }, { "epoch": 6.774686640171201, "grad_norm": 0.4431750476360321, "learning_rate": 2.3928495605282154e-05, "loss": 0.0779, "step": 22160 }, { "epoch": 6.7749923570773465, "grad_norm": 0.20234981179237366, "learning_rate": 2.3928070994862216e-05, "loss": 0.0681, "step": 22161 }, { "epoch": 6.775298073983492, "grad_norm": 0.4514651596546173, "learning_rate": 2.3927646384442275e-05, "loss": 0.0567, "step": 22162 }, { "epoch": 6.775603790889636, "grad_norm": 0.17773284018039703, "learning_rate": 2.3927221774022337e-05, "loss": 0.0739, "step": 22163 }, { "epoch": 6.775909507795781, "grad_norm": 0.41178107261657715, "learning_rate": 2.3926797163602396e-05, "loss": 0.0798, "step": 22164 }, { "epoch": 6.776215224701926, "grad_norm": 0.46688684821128845, "learning_rate": 2.3926372553182458e-05, "loss": 0.1119, "step": 22165 }, { "epoch": 6.776520941608071, "grad_norm": 0.7621598839759827, "learning_rate": 2.3925947942762516e-05, "loss": 0.1083, "step": 22166 }, { "epoch": 6.776826658514215, "grad_norm": 0.5984489321708679, "learning_rate": 2.3925523332342575e-05, "loss": 0.1297, "step": 22167 }, { "epoch": 6.7771323754203605, "grad_norm": 0.5502066016197205, "learning_rate": 2.3925098721922637e-05, "loss": 0.1315, "step": 22168 }, { "epoch": 6.777438092326506, "grad_norm": 0.46843552589416504, "learning_rate": 2.3924674111502696e-05, "loss": 0.1575, "step": 22169 }, { "epoch": 6.777743809232651, "grad_norm": 0.6559891104698181, "learning_rate": 2.3924249501082758e-05, "loss": 0.1709, "step": 22170 }, { "epoch": 6.778049526138796, "grad_norm": 0.7983714938163757, "learning_rate": 2.3923824890662817e-05, "loss": 0.1549, "step": 22171 }, { "epoch": 6.77835524304494, "grad_norm": 0.6149054765701294, "learning_rate": 2.392340028024288e-05, "loss": 0.1865, "step": 22172 }, { "epoch": 6.778660959951085, "grad_norm": 1.4558019638061523, "learning_rate": 2.3922975669822937e-05, "loss": 0.167, "step": 22173 }, { "epoch": 6.77896667685723, "grad_norm": 0.9398530125617981, "learning_rate": 2.3922551059403e-05, "loss": 0.1754, "step": 22174 }, { "epoch": 6.779272393763375, "grad_norm": 0.6837993860244751, "learning_rate": 2.3922126448983058e-05, "loss": 0.1875, "step": 22175 }, { "epoch": 6.77957811066952, "grad_norm": 1.6146596670150757, "learning_rate": 2.392170183856312e-05, "loss": 0.2623, "step": 22176 }, { "epoch": 6.779883827575665, "grad_norm": 0.5056219696998596, "learning_rate": 2.392127722814318e-05, "loss": 0.1524, "step": 22177 }, { "epoch": 6.78018954448181, "grad_norm": 0.4124840497970581, "learning_rate": 2.392085261772324e-05, "loss": 0.0759, "step": 22178 }, { "epoch": 6.780495261387955, "grad_norm": 0.20068402588367462, "learning_rate": 2.39204280073033e-05, "loss": 0.0677, "step": 22179 }, { "epoch": 6.780800978294099, "grad_norm": 0.5228434801101685, "learning_rate": 2.392000339688336e-05, "loss": 0.0608, "step": 22180 }, { "epoch": 6.781106695200244, "grad_norm": 0.20591548085212708, "learning_rate": 2.391957878646342e-05, "loss": 0.0682, "step": 22181 }, { "epoch": 6.7814124121063895, "grad_norm": 0.34578174352645874, "learning_rate": 2.391915417604348e-05, "loss": 0.0565, "step": 22182 }, { "epoch": 6.781718129012535, "grad_norm": 0.32506030797958374, "learning_rate": 2.391872956562354e-05, "loss": 0.075, "step": 22183 }, { "epoch": 6.78202384591868, "grad_norm": 0.6338350772857666, "learning_rate": 2.39183049552036e-05, "loss": 0.0761, "step": 22184 }, { "epoch": 6.782329562824824, "grad_norm": 0.19940441846847534, "learning_rate": 2.3917880344783662e-05, "loss": 0.0584, "step": 22185 }, { "epoch": 6.782635279730969, "grad_norm": 0.7235556244850159, "learning_rate": 2.391745573436372e-05, "loss": 0.0674, "step": 22186 }, { "epoch": 6.782940996637114, "grad_norm": 0.42762333154678345, "learning_rate": 2.3917031123943783e-05, "loss": 0.08, "step": 22187 }, { "epoch": 6.783246713543259, "grad_norm": 0.43909409642219543, "learning_rate": 2.391660651352384e-05, "loss": 0.0619, "step": 22188 }, { "epoch": 6.7835524304494035, "grad_norm": 0.352649062871933, "learning_rate": 2.3916181903103904e-05, "loss": 0.1176, "step": 22189 }, { "epoch": 6.783858147355549, "grad_norm": 0.3729504346847534, "learning_rate": 2.3915757292683962e-05, "loss": 0.0941, "step": 22190 }, { "epoch": 6.784163864261694, "grad_norm": 0.3159545361995697, "learning_rate": 2.3915332682264025e-05, "loss": 0.0855, "step": 22191 }, { "epoch": 6.784469581167839, "grad_norm": 0.9963465929031372, "learning_rate": 2.3914908071844083e-05, "loss": 0.1194, "step": 22192 }, { "epoch": 6.784775298073983, "grad_norm": 1.7939835786819458, "learning_rate": 2.3914483461424142e-05, "loss": 0.163, "step": 22193 }, { "epoch": 6.785081014980128, "grad_norm": 0.42396435141563416, "learning_rate": 2.3914058851004204e-05, "loss": 0.1409, "step": 22194 }, { "epoch": 6.785386731886273, "grad_norm": 1.583531379699707, "learning_rate": 2.3913634240584263e-05, "loss": 0.1897, "step": 22195 }, { "epoch": 6.785692448792418, "grad_norm": 1.040088176727295, "learning_rate": 2.3913209630164325e-05, "loss": 0.2091, "step": 22196 }, { "epoch": 6.7859981656985635, "grad_norm": 1.5164368152618408, "learning_rate": 2.3912785019744384e-05, "loss": 0.1702, "step": 22197 }, { "epoch": 6.786303882604708, "grad_norm": 3.5167009830474854, "learning_rate": 2.3912360409324446e-05, "loss": 0.171, "step": 22198 }, { "epoch": 6.786609599510853, "grad_norm": 1.2108067274093628, "learning_rate": 2.3911935798904504e-05, "loss": 0.1597, "step": 22199 }, { "epoch": 6.786915316416998, "grad_norm": 0.6947643160820007, "learning_rate": 2.3911511188484566e-05, "loss": 0.2027, "step": 22200 }, { "epoch": 6.787221033323143, "grad_norm": 1.865363597869873, "learning_rate": 2.3911086578064625e-05, "loss": 0.2091, "step": 22201 }, { "epoch": 6.787526750229287, "grad_norm": 0.47872036695480347, "learning_rate": 2.3910661967644687e-05, "loss": 0.1633, "step": 22202 }, { "epoch": 6.787832467135432, "grad_norm": 0.2978817820549011, "learning_rate": 2.3910237357224746e-05, "loss": 0.0699, "step": 22203 }, { "epoch": 6.7881381840415775, "grad_norm": 0.23445060849189758, "learning_rate": 2.3909812746804805e-05, "loss": 0.0751, "step": 22204 }, { "epoch": 6.788443900947723, "grad_norm": 0.5497908592224121, "learning_rate": 2.3909388136384867e-05, "loss": 0.0567, "step": 22205 }, { "epoch": 6.788749617853867, "grad_norm": 0.29509487748146057, "learning_rate": 2.3908963525964925e-05, "loss": 0.0512, "step": 22206 }, { "epoch": 6.789055334760012, "grad_norm": 0.2853676378726959, "learning_rate": 2.3908538915544987e-05, "loss": 0.0467, "step": 22207 }, { "epoch": 6.789361051666157, "grad_norm": 0.6122973561286926, "learning_rate": 2.3908114305125046e-05, "loss": 0.0494, "step": 22208 }, { "epoch": 6.789666768572302, "grad_norm": 0.2943824231624603, "learning_rate": 2.3907689694705108e-05, "loss": 0.0623, "step": 22209 }, { "epoch": 6.789972485478447, "grad_norm": 0.30279427766799927, "learning_rate": 2.3907265084285167e-05, "loss": 0.0787, "step": 22210 }, { "epoch": 6.790278202384592, "grad_norm": 0.2856222987174988, "learning_rate": 2.390684047386523e-05, "loss": 0.06, "step": 22211 }, { "epoch": 6.790583919290737, "grad_norm": 0.36618608236312866, "learning_rate": 2.3906415863445288e-05, "loss": 0.0749, "step": 22212 }, { "epoch": 6.790889636196882, "grad_norm": 0.4182315170764923, "learning_rate": 2.390599125302535e-05, "loss": 0.0556, "step": 22213 }, { "epoch": 6.791195353103027, "grad_norm": 1.1459132432937622, "learning_rate": 2.390556664260541e-05, "loss": 0.0938, "step": 22214 }, { "epoch": 6.791501070009171, "grad_norm": 0.5693000555038452, "learning_rate": 2.390514203218547e-05, "loss": 0.1323, "step": 22215 }, { "epoch": 6.791806786915316, "grad_norm": 0.37699878215789795, "learning_rate": 2.390471742176553e-05, "loss": 0.1118, "step": 22216 }, { "epoch": 6.792112503821461, "grad_norm": 0.5214504599571228, "learning_rate": 2.3904292811345588e-05, "loss": 0.1485, "step": 22217 }, { "epoch": 6.7924182207276065, "grad_norm": 0.4707179665565491, "learning_rate": 2.390386820092565e-05, "loss": 0.173, "step": 22218 }, { "epoch": 6.792723937633751, "grad_norm": 0.8319391012191772, "learning_rate": 2.390344359050571e-05, "loss": 0.1742, "step": 22219 }, { "epoch": 6.793029654539896, "grad_norm": 0.35827377438545227, "learning_rate": 2.390301898008577e-05, "loss": 0.1321, "step": 22220 }, { "epoch": 6.793335371446041, "grad_norm": 1.083058476448059, "learning_rate": 2.390259436966583e-05, "loss": 0.1447, "step": 22221 }, { "epoch": 6.793641088352186, "grad_norm": 0.5932117104530334, "learning_rate": 2.390216975924589e-05, "loss": 0.1939, "step": 22222 }, { "epoch": 6.793946805258331, "grad_norm": 0.5245637893676758, "learning_rate": 2.390174514882595e-05, "loss": 0.1725, "step": 22223 }, { "epoch": 6.794252522164475, "grad_norm": 0.938563883304596, "learning_rate": 2.3901320538406012e-05, "loss": 0.1946, "step": 22224 }, { "epoch": 6.7945582390706205, "grad_norm": 1.2025971412658691, "learning_rate": 2.390089592798607e-05, "loss": 0.2168, "step": 22225 }, { "epoch": 6.794863955976766, "grad_norm": 0.8193597197532654, "learning_rate": 2.3900471317566133e-05, "loss": 0.2303, "step": 22226 }, { "epoch": 6.795169672882911, "grad_norm": 0.40389880537986755, "learning_rate": 2.3900046707146195e-05, "loss": 0.1627, "step": 22227 }, { "epoch": 6.795475389789055, "grad_norm": 0.23100876808166504, "learning_rate": 2.3899622096726257e-05, "loss": 0.0851, "step": 22228 }, { "epoch": 6.7957811066952, "grad_norm": 0.2252345085144043, "learning_rate": 2.3899197486306316e-05, "loss": 0.0798, "step": 22229 }, { "epoch": 6.796086823601345, "grad_norm": 0.3326300084590912, "learning_rate": 2.3898772875886375e-05, "loss": 0.0464, "step": 22230 }, { "epoch": 6.79639254050749, "grad_norm": 0.3776150345802307, "learning_rate": 2.3898348265466437e-05, "loss": 0.0559, "step": 22231 }, { "epoch": 6.7966982574136345, "grad_norm": 0.17162874341011047, "learning_rate": 2.3897923655046496e-05, "loss": 0.0533, "step": 22232 }, { "epoch": 6.79700397431978, "grad_norm": 0.25307756662368774, "learning_rate": 2.3897499044626558e-05, "loss": 0.0912, "step": 22233 }, { "epoch": 6.797309691225925, "grad_norm": 0.2965870797634125, "learning_rate": 2.3897074434206616e-05, "loss": 0.0683, "step": 22234 }, { "epoch": 6.79761540813207, "grad_norm": 1.2381579875946045, "learning_rate": 2.389664982378668e-05, "loss": 0.0752, "step": 22235 }, { "epoch": 6.797921125038215, "grad_norm": 0.2739071547985077, "learning_rate": 2.3896225213366737e-05, "loss": 0.0702, "step": 22236 }, { "epoch": 6.798226841944359, "grad_norm": 0.33016160130500793, "learning_rate": 2.38958006029468e-05, "loss": 0.1217, "step": 22237 }, { "epoch": 6.798532558850504, "grad_norm": 0.40179532766342163, "learning_rate": 2.3895375992526858e-05, "loss": 0.0765, "step": 22238 }, { "epoch": 6.7988382757566495, "grad_norm": 0.4674365818500519, "learning_rate": 2.389495138210692e-05, "loss": 0.0925, "step": 22239 }, { "epoch": 6.799143992662795, "grad_norm": 0.7207281589508057, "learning_rate": 2.389452677168698e-05, "loss": 0.1295, "step": 22240 }, { "epoch": 6.799449709568939, "grad_norm": 0.9160220623016357, "learning_rate": 2.389410216126704e-05, "loss": 0.1191, "step": 22241 }, { "epoch": 6.799755426475084, "grad_norm": 0.3883257210254669, "learning_rate": 2.38936775508471e-05, "loss": 0.1183, "step": 22242 }, { "epoch": 6.800061143381229, "grad_norm": 0.6025193333625793, "learning_rate": 2.3893252940427158e-05, "loss": 0.1353, "step": 22243 }, { "epoch": 6.800366860287374, "grad_norm": 0.7415311336517334, "learning_rate": 2.389282833000722e-05, "loss": 0.1614, "step": 22244 }, { "epoch": 6.800672577193518, "grad_norm": 0.6887041330337524, "learning_rate": 2.389240371958728e-05, "loss": 0.1472, "step": 22245 }, { "epoch": 6.8009782940996635, "grad_norm": 9.630769729614258, "learning_rate": 2.389197910916734e-05, "loss": 0.1365, "step": 22246 }, { "epoch": 6.801284011005809, "grad_norm": 0.7485224008560181, "learning_rate": 2.38915544987474e-05, "loss": 0.1759, "step": 22247 }, { "epoch": 6.801589727911954, "grad_norm": 1.5059969425201416, "learning_rate": 2.3891129888327462e-05, "loss": 0.1494, "step": 22248 }, { "epoch": 6.801895444818099, "grad_norm": 0.8876821994781494, "learning_rate": 2.389070527790752e-05, "loss": 0.1312, "step": 22249 }, { "epoch": 6.802201161724243, "grad_norm": 1.2234715223312378, "learning_rate": 2.3890280667487583e-05, "loss": 0.1967, "step": 22250 }, { "epoch": 6.802506878630388, "grad_norm": 0.8511974215507507, "learning_rate": 2.388985605706764e-05, "loss": 0.171, "step": 22251 }, { "epoch": 6.802812595536533, "grad_norm": 0.3093087673187256, "learning_rate": 2.3889431446647704e-05, "loss": 0.1363, "step": 22252 }, { "epoch": 6.803118312442678, "grad_norm": 0.4378778636455536, "learning_rate": 2.3889006836227762e-05, "loss": 0.0976, "step": 22253 }, { "epoch": 6.803424029348823, "grad_norm": 0.7406368851661682, "learning_rate": 2.3888582225807824e-05, "loss": 0.0479, "step": 22254 }, { "epoch": 6.803729746254968, "grad_norm": 0.2807108163833618, "learning_rate": 2.3888157615387883e-05, "loss": 0.0587, "step": 22255 }, { "epoch": 6.804035463161113, "grad_norm": 0.35277315974235535, "learning_rate": 2.388773300496794e-05, "loss": 0.0484, "step": 22256 }, { "epoch": 6.804341180067258, "grad_norm": 0.35565659403800964, "learning_rate": 2.3887308394548004e-05, "loss": 0.0397, "step": 22257 }, { "epoch": 6.804646896973402, "grad_norm": 0.42720919847488403, "learning_rate": 2.3886883784128062e-05, "loss": 0.07, "step": 22258 }, { "epoch": 6.804952613879547, "grad_norm": 0.18344227969646454, "learning_rate": 2.3886459173708125e-05, "loss": 0.0308, "step": 22259 }, { "epoch": 6.805258330785692, "grad_norm": 0.5292580723762512, "learning_rate": 2.3886034563288183e-05, "loss": 0.0853, "step": 22260 }, { "epoch": 6.8055640476918375, "grad_norm": 0.24441619217395782, "learning_rate": 2.3885609952868245e-05, "loss": 0.0509, "step": 22261 }, { "epoch": 6.805869764597983, "grad_norm": 0.3594871759414673, "learning_rate": 2.3885185342448304e-05, "loss": 0.0651, "step": 22262 }, { "epoch": 6.806175481504127, "grad_norm": 0.2109840214252472, "learning_rate": 2.3884760732028366e-05, "loss": 0.0808, "step": 22263 }, { "epoch": 6.806481198410272, "grad_norm": 0.43864843249320984, "learning_rate": 2.3884336121608425e-05, "loss": 0.0918, "step": 22264 }, { "epoch": 6.806786915316417, "grad_norm": 0.27524930238723755, "learning_rate": 2.3883911511188487e-05, "loss": 0.1083, "step": 22265 }, { "epoch": 6.807092632222562, "grad_norm": 0.3923599123954773, "learning_rate": 2.3883486900768546e-05, "loss": 0.1351, "step": 22266 }, { "epoch": 6.8073983491287064, "grad_norm": 0.37515026330947876, "learning_rate": 2.3883062290348608e-05, "loss": 0.1279, "step": 22267 }, { "epoch": 6.807704066034852, "grad_norm": 0.6024891138076782, "learning_rate": 2.3882637679928666e-05, "loss": 0.1883, "step": 22268 }, { "epoch": 6.808009782940997, "grad_norm": 0.6994851231575012, "learning_rate": 2.3882213069508725e-05, "loss": 0.1481, "step": 22269 }, { "epoch": 6.808315499847142, "grad_norm": 0.36238595843315125, "learning_rate": 2.3881788459088787e-05, "loss": 0.1317, "step": 22270 }, { "epoch": 6.808621216753286, "grad_norm": 1.1226582527160645, "learning_rate": 2.3881363848668846e-05, "loss": 0.1475, "step": 22271 }, { "epoch": 6.808926933659431, "grad_norm": 0.5313805937767029, "learning_rate": 2.3880939238248908e-05, "loss": 0.1601, "step": 22272 }, { "epoch": 6.809232650565576, "grad_norm": 0.6029691100120544, "learning_rate": 2.3880514627828967e-05, "loss": 0.1584, "step": 22273 }, { "epoch": 6.809538367471721, "grad_norm": 2.118182897567749, "learning_rate": 2.388009001740903e-05, "loss": 0.1455, "step": 22274 }, { "epoch": 6.8098440843778665, "grad_norm": 1.2081999778747559, "learning_rate": 2.3879665406989087e-05, "loss": 0.2296, "step": 22275 }, { "epoch": 6.810149801284011, "grad_norm": 1.301877737045288, "learning_rate": 2.387924079656915e-05, "loss": 0.1901, "step": 22276 }, { "epoch": 6.810455518190156, "grad_norm": 0.37158435583114624, "learning_rate": 2.3878816186149208e-05, "loss": 0.1048, "step": 22277 }, { "epoch": 6.810761235096301, "grad_norm": 0.23383858799934387, "learning_rate": 2.387839157572927e-05, "loss": 0.0646, "step": 22278 }, { "epoch": 6.811066952002446, "grad_norm": 0.2999046742916107, "learning_rate": 2.387796696530933e-05, "loss": 0.0693, "step": 22279 }, { "epoch": 6.81137266890859, "grad_norm": 0.2125459760427475, "learning_rate": 2.387754235488939e-05, "loss": 0.063, "step": 22280 }, { "epoch": 6.811678385814735, "grad_norm": 0.29325780272483826, "learning_rate": 2.387711774446945e-05, "loss": 0.0408, "step": 22281 }, { "epoch": 6.8119841027208805, "grad_norm": 1.3246369361877441, "learning_rate": 2.387669313404951e-05, "loss": 0.0313, "step": 22282 }, { "epoch": 6.812289819627026, "grad_norm": 0.16311880946159363, "learning_rate": 2.387626852362957e-05, "loss": 0.0553, "step": 22283 }, { "epoch": 6.81259553653317, "grad_norm": 0.31733056902885437, "learning_rate": 2.387584391320963e-05, "loss": 0.0518, "step": 22284 }, { "epoch": 6.812901253439315, "grad_norm": 0.39444684982299805, "learning_rate": 2.387541930278969e-05, "loss": 0.0562, "step": 22285 }, { "epoch": 6.81320697034546, "grad_norm": 0.2435513138771057, "learning_rate": 2.387499469236975e-05, "loss": 0.0577, "step": 22286 }, { "epoch": 6.813512687251605, "grad_norm": 0.7381508946418762, "learning_rate": 2.3874570081949812e-05, "loss": 0.0793, "step": 22287 }, { "epoch": 6.81381840415775, "grad_norm": 0.49527159333229065, "learning_rate": 2.387414547152987e-05, "loss": 0.0981, "step": 22288 }, { "epoch": 6.8141241210638945, "grad_norm": 0.3283661901950836, "learning_rate": 2.3873720861109933e-05, "loss": 0.1121, "step": 22289 }, { "epoch": 6.81442983797004, "grad_norm": 0.921956479549408, "learning_rate": 2.3873296250689992e-05, "loss": 0.1144, "step": 22290 }, { "epoch": 6.814735554876185, "grad_norm": 0.2888866662979126, "learning_rate": 2.3872871640270054e-05, "loss": 0.0866, "step": 22291 }, { "epoch": 6.81504127178233, "grad_norm": 0.602588415145874, "learning_rate": 2.3872447029850112e-05, "loss": 0.1199, "step": 22292 }, { "epoch": 6.815346988688474, "grad_norm": 0.49659785628318787, "learning_rate": 2.3872022419430175e-05, "loss": 0.1257, "step": 22293 }, { "epoch": 6.815652705594619, "grad_norm": 0.5637485980987549, "learning_rate": 2.3871597809010233e-05, "loss": 0.1469, "step": 22294 }, { "epoch": 6.815958422500764, "grad_norm": 0.42343756556510925, "learning_rate": 2.3871173198590292e-05, "loss": 0.151, "step": 22295 }, { "epoch": 6.8162641394069095, "grad_norm": 0.6060395836830139, "learning_rate": 2.3870748588170354e-05, "loss": 0.1285, "step": 22296 }, { "epoch": 6.816569856313054, "grad_norm": 1.6074163913726807, "learning_rate": 2.3870323977750413e-05, "loss": 0.1706, "step": 22297 }, { "epoch": 6.816875573219199, "grad_norm": 0.9426450729370117, "learning_rate": 2.3869899367330475e-05, "loss": 0.167, "step": 22298 }, { "epoch": 6.817181290125344, "grad_norm": 1.430928111076355, "learning_rate": 2.3869474756910534e-05, "loss": 0.1889, "step": 22299 }, { "epoch": 6.817487007031489, "grad_norm": 0.7953181862831116, "learning_rate": 2.3869050146490596e-05, "loss": 0.2217, "step": 22300 }, { "epoch": 6.817792723937634, "grad_norm": 1.6825902462005615, "learning_rate": 2.3868625536070654e-05, "loss": 0.2591, "step": 22301 }, { "epoch": 6.818098440843778, "grad_norm": 0.3292333781719208, "learning_rate": 2.3868200925650716e-05, "loss": 0.1191, "step": 22302 }, { "epoch": 6.8184041577499235, "grad_norm": 0.29457181692123413, "learning_rate": 2.3867776315230775e-05, "loss": 0.0776, "step": 22303 }, { "epoch": 6.818709874656069, "grad_norm": 0.20391470193862915, "learning_rate": 2.3867351704810837e-05, "loss": 0.0686, "step": 22304 }, { "epoch": 6.819015591562214, "grad_norm": 0.26942455768585205, "learning_rate": 2.3866927094390896e-05, "loss": 0.0473, "step": 22305 }, { "epoch": 6.819321308468358, "grad_norm": 0.1322024166584015, "learning_rate": 2.3866502483970958e-05, "loss": 0.0378, "step": 22306 }, { "epoch": 6.819627025374503, "grad_norm": 0.1740330308675766, "learning_rate": 2.3866077873551017e-05, "loss": 0.0548, "step": 22307 }, { "epoch": 6.819932742280648, "grad_norm": 0.1517321765422821, "learning_rate": 2.3865653263131075e-05, "loss": 0.0519, "step": 22308 }, { "epoch": 6.820238459186793, "grad_norm": 0.3998226523399353, "learning_rate": 2.3865228652711137e-05, "loss": 0.0548, "step": 22309 }, { "epoch": 6.8205441760929375, "grad_norm": 0.3079882264137268, "learning_rate": 2.3864804042291196e-05, "loss": 0.0787, "step": 22310 }, { "epoch": 6.820849892999083, "grad_norm": 0.15746863186359406, "learning_rate": 2.3864379431871258e-05, "loss": 0.0466, "step": 22311 }, { "epoch": 6.821155609905228, "grad_norm": 0.5265077948570251, "learning_rate": 2.3863954821451317e-05, "loss": 0.1182, "step": 22312 }, { "epoch": 6.821461326811373, "grad_norm": 0.2338431179523468, "learning_rate": 2.386353021103138e-05, "loss": 0.0788, "step": 22313 }, { "epoch": 6.821767043717518, "grad_norm": 0.28886738419532776, "learning_rate": 2.3863105600611438e-05, "loss": 0.108, "step": 22314 }, { "epoch": 6.822072760623662, "grad_norm": 0.27899304032325745, "learning_rate": 2.38626809901915e-05, "loss": 0.0966, "step": 22315 }, { "epoch": 6.822378477529807, "grad_norm": 0.4284723699092865, "learning_rate": 2.386225637977156e-05, "loss": 0.1399, "step": 22316 }, { "epoch": 6.822684194435952, "grad_norm": 0.5414632558822632, "learning_rate": 2.386183176935162e-05, "loss": 0.1414, "step": 22317 }, { "epoch": 6.8229899113420975, "grad_norm": 0.42373132705688477, "learning_rate": 2.386140715893168e-05, "loss": 0.1623, "step": 22318 }, { "epoch": 6.823295628248242, "grad_norm": 1.4267562627792358, "learning_rate": 2.3860982548511738e-05, "loss": 0.1809, "step": 22319 }, { "epoch": 6.823601345154387, "grad_norm": 0.446542352437973, "learning_rate": 2.38605579380918e-05, "loss": 0.1468, "step": 22320 }, { "epoch": 6.823907062060532, "grad_norm": 0.5228882431983948, "learning_rate": 2.386013332767186e-05, "loss": 0.1561, "step": 22321 }, { "epoch": 6.824212778966677, "grad_norm": 0.8322956562042236, "learning_rate": 2.385970871725192e-05, "loss": 0.1791, "step": 22322 }, { "epoch": 6.824518495872821, "grad_norm": 0.788004994392395, "learning_rate": 2.385928410683198e-05, "loss": 0.1557, "step": 22323 }, { "epoch": 6.8248242127789664, "grad_norm": 1.0149943828582764, "learning_rate": 2.3858859496412042e-05, "loss": 0.1632, "step": 22324 }, { "epoch": 6.825129929685112, "grad_norm": 1.0473145246505737, "learning_rate": 2.38584348859921e-05, "loss": 0.198, "step": 22325 }, { "epoch": 6.825435646591257, "grad_norm": 0.7847385406494141, "learning_rate": 2.3858010275572162e-05, "loss": 0.2184, "step": 22326 }, { "epoch": 6.825741363497402, "grad_norm": 0.34721270203590393, "learning_rate": 2.385758566515222e-05, "loss": 0.1252, "step": 22327 }, { "epoch": 6.826047080403546, "grad_norm": 0.4276309311389923, "learning_rate": 2.3857161054732283e-05, "loss": 0.0834, "step": 22328 }, { "epoch": 6.826352797309691, "grad_norm": 0.35148489475250244, "learning_rate": 2.3856736444312345e-05, "loss": 0.0613, "step": 22329 }, { "epoch": 6.826658514215836, "grad_norm": 0.37985220551490784, "learning_rate": 2.3856311833892407e-05, "loss": 0.0685, "step": 22330 }, { "epoch": 6.826964231121981, "grad_norm": 0.16245731711387634, "learning_rate": 2.3855887223472466e-05, "loss": 0.0562, "step": 22331 }, { "epoch": 6.827269948028126, "grad_norm": 0.22101956605911255, "learning_rate": 2.3855462613052525e-05, "loss": 0.0499, "step": 22332 }, { "epoch": 6.827575664934271, "grad_norm": 0.7816270589828491, "learning_rate": 2.3855038002632587e-05, "loss": 0.1023, "step": 22333 }, { "epoch": 6.827881381840416, "grad_norm": 2.763969659805298, "learning_rate": 2.3854613392212646e-05, "loss": 0.0743, "step": 22334 }, { "epoch": 6.828187098746561, "grad_norm": 0.7522075772285461, "learning_rate": 2.3854188781792708e-05, "loss": 0.1078, "step": 22335 }, { "epoch": 6.828492815652705, "grad_norm": 0.3898114860057831, "learning_rate": 2.3853764171372766e-05, "loss": 0.058, "step": 22336 }, { "epoch": 6.82879853255885, "grad_norm": 0.27964890003204346, "learning_rate": 2.385333956095283e-05, "loss": 0.063, "step": 22337 }, { "epoch": 6.829104249464995, "grad_norm": 0.8356011509895325, "learning_rate": 2.3852914950532887e-05, "loss": 0.0956, "step": 22338 }, { "epoch": 6.8294099663711405, "grad_norm": 0.47743484377861023, "learning_rate": 2.385249034011295e-05, "loss": 0.0897, "step": 22339 }, { "epoch": 6.829715683277286, "grad_norm": 0.5767241716384888, "learning_rate": 2.3852065729693008e-05, "loss": 0.1192, "step": 22340 }, { "epoch": 6.83002140018343, "grad_norm": 0.4301300644874573, "learning_rate": 2.385164111927307e-05, "loss": 0.0847, "step": 22341 }, { "epoch": 6.830327117089575, "grad_norm": 0.38905683159828186, "learning_rate": 2.385121650885313e-05, "loss": 0.1466, "step": 22342 }, { "epoch": 6.83063283399572, "grad_norm": 0.8930121660232544, "learning_rate": 2.385079189843319e-05, "loss": 0.1608, "step": 22343 }, { "epoch": 6.830938550901865, "grad_norm": 0.8390859365463257, "learning_rate": 2.385036728801325e-05, "loss": 0.1716, "step": 22344 }, { "epoch": 6.831244267808009, "grad_norm": 0.6030247807502747, "learning_rate": 2.3849942677593308e-05, "loss": 0.1652, "step": 22345 }, { "epoch": 6.8315499847141545, "grad_norm": 0.6017872095108032, "learning_rate": 2.384951806717337e-05, "loss": 0.1564, "step": 22346 }, { "epoch": 6.8318557016203, "grad_norm": 0.6515194177627563, "learning_rate": 2.384909345675343e-05, "loss": 0.1646, "step": 22347 }, { "epoch": 6.832161418526445, "grad_norm": 0.8180335164070129, "learning_rate": 2.384866884633349e-05, "loss": 0.2048, "step": 22348 }, { "epoch": 6.832467135432589, "grad_norm": 0.8358579874038696, "learning_rate": 2.384824423591355e-05, "loss": 0.1794, "step": 22349 }, { "epoch": 6.832772852338734, "grad_norm": 1.2907735109329224, "learning_rate": 2.3847819625493612e-05, "loss": 0.1838, "step": 22350 }, { "epoch": 6.833078569244879, "grad_norm": 0.9007869958877563, "learning_rate": 2.384739501507367e-05, "loss": 0.2137, "step": 22351 }, { "epoch": 6.833384286151024, "grad_norm": 0.7069757580757141, "learning_rate": 2.3846970404653733e-05, "loss": 0.1271, "step": 22352 }, { "epoch": 6.8336900030571694, "grad_norm": 0.27936166524887085, "learning_rate": 2.384654579423379e-05, "loss": 0.0855, "step": 22353 }, { "epoch": 6.833995719963314, "grad_norm": 0.3181346356868744, "learning_rate": 2.3846121183813854e-05, "loss": 0.0686, "step": 22354 }, { "epoch": 6.834301436869459, "grad_norm": 0.33047786355018616, "learning_rate": 2.3845696573393912e-05, "loss": 0.0618, "step": 22355 }, { "epoch": 6.834607153775604, "grad_norm": 0.2483089566230774, "learning_rate": 2.3845271962973974e-05, "loss": 0.0782, "step": 22356 }, { "epoch": 6.834912870681749, "grad_norm": 0.17022211849689484, "learning_rate": 2.3844847352554033e-05, "loss": 0.0355, "step": 22357 }, { "epoch": 6.835218587587893, "grad_norm": 0.35550928115844727, "learning_rate": 2.3844422742134092e-05, "loss": 0.0819, "step": 22358 }, { "epoch": 6.835524304494038, "grad_norm": 0.22201965749263763, "learning_rate": 2.3843998131714154e-05, "loss": 0.0553, "step": 22359 }, { "epoch": 6.8358300214001835, "grad_norm": 0.21167229115962982, "learning_rate": 2.3843573521294213e-05, "loss": 0.0619, "step": 22360 }, { "epoch": 6.836135738306329, "grad_norm": 0.3899299204349518, "learning_rate": 2.3843148910874275e-05, "loss": 0.0581, "step": 22361 }, { "epoch": 6.836441455212473, "grad_norm": 0.4590700566768646, "learning_rate": 2.3842724300454333e-05, "loss": 0.0672, "step": 22362 }, { "epoch": 6.836747172118618, "grad_norm": 1.452531337738037, "learning_rate": 2.3842299690034395e-05, "loss": 0.0771, "step": 22363 }, { "epoch": 6.837052889024763, "grad_norm": 0.5252473950386047, "learning_rate": 2.3841875079614454e-05, "loss": 0.0817, "step": 22364 }, { "epoch": 6.837358605930908, "grad_norm": 1.4552029371261597, "learning_rate": 2.3841450469194516e-05, "loss": 0.0793, "step": 22365 }, { "epoch": 6.837664322837053, "grad_norm": 0.6874142289161682, "learning_rate": 2.3841025858774575e-05, "loss": 0.1684, "step": 22366 }, { "epoch": 6.8379700397431975, "grad_norm": 0.35960695147514343, "learning_rate": 2.3840601248354637e-05, "loss": 0.0912, "step": 22367 }, { "epoch": 6.838275756649343, "grad_norm": 0.6635904908180237, "learning_rate": 2.3840176637934696e-05, "loss": 0.1513, "step": 22368 }, { "epoch": 6.838581473555488, "grad_norm": 0.5988870859146118, "learning_rate": 2.3839752027514758e-05, "loss": 0.1544, "step": 22369 }, { "epoch": 6.838887190461633, "grad_norm": 0.42767006158828735, "learning_rate": 2.3839327417094816e-05, "loss": 0.1473, "step": 22370 }, { "epoch": 6.839192907367777, "grad_norm": 1.4294376373291016, "learning_rate": 2.3838902806674875e-05, "loss": 0.1633, "step": 22371 }, { "epoch": 6.839498624273922, "grad_norm": 0.8882268667221069, "learning_rate": 2.3838478196254937e-05, "loss": 0.1541, "step": 22372 }, { "epoch": 6.839804341180067, "grad_norm": 0.5552926063537598, "learning_rate": 2.3838053585834996e-05, "loss": 0.1554, "step": 22373 }, { "epoch": 6.840110058086212, "grad_norm": 0.6019986867904663, "learning_rate": 2.3837628975415058e-05, "loss": 0.1761, "step": 22374 }, { "epoch": 6.840415774992357, "grad_norm": 1.95649254322052, "learning_rate": 2.3837204364995117e-05, "loss": 0.1561, "step": 22375 }, { "epoch": 6.840721491898502, "grad_norm": 0.8736004829406738, "learning_rate": 2.383677975457518e-05, "loss": 0.1747, "step": 22376 }, { "epoch": 6.841027208804647, "grad_norm": 0.3077309727668762, "learning_rate": 2.3836355144155238e-05, "loss": 0.1507, "step": 22377 }, { "epoch": 6.841332925710792, "grad_norm": 0.8482815623283386, "learning_rate": 2.38359305337353e-05, "loss": 0.0715, "step": 22378 }, { "epoch": 6.841638642616937, "grad_norm": 0.2933030128479004, "learning_rate": 2.3835505923315358e-05, "loss": 0.0513, "step": 22379 }, { "epoch": 6.841944359523081, "grad_norm": 0.42257097363471985, "learning_rate": 2.383508131289542e-05, "loss": 0.0577, "step": 22380 }, { "epoch": 6.8422500764292264, "grad_norm": 0.23001410067081451, "learning_rate": 2.383465670247548e-05, "loss": 0.0681, "step": 22381 }, { "epoch": 6.842555793335372, "grad_norm": 0.7140171527862549, "learning_rate": 2.383423209205554e-05, "loss": 0.0476, "step": 22382 }, { "epoch": 6.842861510241517, "grad_norm": 0.21782122552394867, "learning_rate": 2.38338074816356e-05, "loss": 0.056, "step": 22383 }, { "epoch": 6.843167227147661, "grad_norm": 1.6905404329299927, "learning_rate": 2.383338287121566e-05, "loss": 0.0773, "step": 22384 }, { "epoch": 6.843472944053806, "grad_norm": 0.1883181780576706, "learning_rate": 2.383295826079572e-05, "loss": 0.0579, "step": 22385 }, { "epoch": 6.843778660959951, "grad_norm": 1.6961027383804321, "learning_rate": 2.383253365037578e-05, "loss": 0.0506, "step": 22386 }, { "epoch": 6.844084377866096, "grad_norm": 0.22902952134609222, "learning_rate": 2.383210903995584e-05, "loss": 0.1069, "step": 22387 }, { "epoch": 6.8443900947722405, "grad_norm": 0.30022406578063965, "learning_rate": 2.38316844295359e-05, "loss": 0.0824, "step": 22388 }, { "epoch": 6.844695811678386, "grad_norm": 0.31300923228263855, "learning_rate": 2.3831259819115962e-05, "loss": 0.0654, "step": 22389 }, { "epoch": 6.845001528584531, "grad_norm": 0.3174699544906616, "learning_rate": 2.383083520869602e-05, "loss": 0.1002, "step": 22390 }, { "epoch": 6.845307245490676, "grad_norm": 0.7430943250656128, "learning_rate": 2.3830410598276083e-05, "loss": 0.1084, "step": 22391 }, { "epoch": 6.845612962396821, "grad_norm": 1.0303384065628052, "learning_rate": 2.3829985987856142e-05, "loss": 0.1263, "step": 22392 }, { "epoch": 6.845918679302965, "grad_norm": 0.47635823488235474, "learning_rate": 2.3829561377436204e-05, "loss": 0.1247, "step": 22393 }, { "epoch": 6.84622439620911, "grad_norm": 0.8174107074737549, "learning_rate": 2.3829136767016263e-05, "loss": 0.1722, "step": 22394 }, { "epoch": 6.846530113115255, "grad_norm": 1.4807417392730713, "learning_rate": 2.3828712156596325e-05, "loss": 0.1844, "step": 22395 }, { "epoch": 6.8468358300214005, "grad_norm": 0.5184621810913086, "learning_rate": 2.3828287546176383e-05, "loss": 0.1582, "step": 22396 }, { "epoch": 6.847141546927545, "grad_norm": 0.8270666003227234, "learning_rate": 2.3827862935756442e-05, "loss": 0.1482, "step": 22397 }, { "epoch": 6.84744726383369, "grad_norm": 1.234525203704834, "learning_rate": 2.3827438325336504e-05, "loss": 0.1661, "step": 22398 }, { "epoch": 6.847752980739835, "grad_norm": 1.1281359195709229, "learning_rate": 2.3827013714916563e-05, "loss": 0.1601, "step": 22399 }, { "epoch": 6.84805869764598, "grad_norm": 1.4937925338745117, "learning_rate": 2.3826589104496625e-05, "loss": 0.2018, "step": 22400 }, { "epoch": 6.848364414552124, "grad_norm": 0.7383785247802734, "learning_rate": 2.3826164494076684e-05, "loss": 0.2091, "step": 22401 }, { "epoch": 6.848670131458269, "grad_norm": 0.4593085050582886, "learning_rate": 2.3825739883656746e-05, "loss": 0.1714, "step": 22402 }, { "epoch": 6.8489758483644145, "grad_norm": 1.7809301614761353, "learning_rate": 2.3825315273236804e-05, "loss": 0.0809, "step": 22403 }, { "epoch": 6.84928156527056, "grad_norm": 0.19063790142536163, "learning_rate": 2.3824890662816866e-05, "loss": 0.0599, "step": 22404 }, { "epoch": 6.849587282176705, "grad_norm": 0.1611335575580597, "learning_rate": 2.3824466052396925e-05, "loss": 0.0632, "step": 22405 }, { "epoch": 6.849892999082849, "grad_norm": 0.3448420464992523, "learning_rate": 2.3824041441976987e-05, "loss": 0.0651, "step": 22406 }, { "epoch": 6.850198715988994, "grad_norm": 0.17749431729316711, "learning_rate": 2.3823616831557046e-05, "loss": 0.0356, "step": 22407 }, { "epoch": 6.850504432895139, "grad_norm": 0.7797873616218567, "learning_rate": 2.3823192221137108e-05, "loss": 0.0599, "step": 22408 }, { "epoch": 6.850810149801284, "grad_norm": 0.564904510974884, "learning_rate": 2.3822767610717167e-05, "loss": 0.0625, "step": 22409 }, { "epoch": 6.851115866707429, "grad_norm": 0.6766606569290161, "learning_rate": 2.3822343000297225e-05, "loss": 0.0602, "step": 22410 }, { "epoch": 6.851421583613574, "grad_norm": 0.2826811373233795, "learning_rate": 2.3821918389877288e-05, "loss": 0.0818, "step": 22411 }, { "epoch": 6.851727300519719, "grad_norm": 0.35159045457839966, "learning_rate": 2.3821493779457346e-05, "loss": 0.0804, "step": 22412 }, { "epoch": 6.852033017425864, "grad_norm": 0.26725882291793823, "learning_rate": 2.382106916903741e-05, "loss": 0.0574, "step": 22413 }, { "epoch": 6.852338734332008, "grad_norm": 1.1716907024383545, "learning_rate": 2.3820644558617467e-05, "loss": 0.0776, "step": 22414 }, { "epoch": 6.852644451238153, "grad_norm": 0.5274061560630798, "learning_rate": 2.382021994819753e-05, "loss": 0.116, "step": 22415 }, { "epoch": 6.852950168144298, "grad_norm": 0.6589791178703308, "learning_rate": 2.3819795337777588e-05, "loss": 0.1301, "step": 22416 }, { "epoch": 6.8532558850504435, "grad_norm": 1.2311761379241943, "learning_rate": 2.381937072735765e-05, "loss": 0.1737, "step": 22417 }, { "epoch": 6.853561601956589, "grad_norm": 0.7798994183540344, "learning_rate": 2.381894611693771e-05, "loss": 0.1853, "step": 22418 }, { "epoch": 6.853867318862733, "grad_norm": 0.7822340130805969, "learning_rate": 2.381852150651777e-05, "loss": 0.1602, "step": 22419 }, { "epoch": 6.854173035768878, "grad_norm": 1.235572099685669, "learning_rate": 2.381809689609783e-05, "loss": 0.1472, "step": 22420 }, { "epoch": 6.854478752675023, "grad_norm": 0.9130182266235352, "learning_rate": 2.381767228567789e-05, "loss": 0.1969, "step": 22421 }, { "epoch": 6.854784469581168, "grad_norm": 0.7824402451515198, "learning_rate": 2.381724767525795e-05, "loss": 0.1741, "step": 22422 }, { "epoch": 6.855090186487312, "grad_norm": 0.626114010810852, "learning_rate": 2.381682306483801e-05, "loss": 0.1936, "step": 22423 }, { "epoch": 6.8553959033934575, "grad_norm": 2.981736183166504, "learning_rate": 2.381639845441807e-05, "loss": 0.2138, "step": 22424 }, { "epoch": 6.855701620299603, "grad_norm": 1.698155403137207, "learning_rate": 2.381597384399813e-05, "loss": 0.1474, "step": 22425 }, { "epoch": 6.856007337205748, "grad_norm": 2.1086411476135254, "learning_rate": 2.3815549233578192e-05, "loss": 0.2095, "step": 22426 }, { "epoch": 6.856313054111892, "grad_norm": 0.4076407253742218, "learning_rate": 2.381512462315825e-05, "loss": 0.1238, "step": 22427 }, { "epoch": 6.856618771018037, "grad_norm": 0.30770814418792725, "learning_rate": 2.3814700012738313e-05, "loss": 0.0596, "step": 22428 }, { "epoch": 6.856924487924182, "grad_norm": 0.6125208735466003, "learning_rate": 2.381427540231837e-05, "loss": 0.11, "step": 22429 }, { "epoch": 6.857230204830327, "grad_norm": 0.38023990392684937, "learning_rate": 2.3813850791898433e-05, "loss": 0.0524, "step": 22430 }, { "epoch": 6.857535921736472, "grad_norm": 0.40399032831192017, "learning_rate": 2.3813426181478492e-05, "loss": 0.054, "step": 22431 }, { "epoch": 6.857841638642617, "grad_norm": 0.34468740224838257, "learning_rate": 2.3813001571058554e-05, "loss": 0.0382, "step": 22432 }, { "epoch": 6.858147355548762, "grad_norm": 0.2135709673166275, "learning_rate": 2.3812576960638616e-05, "loss": 0.056, "step": 22433 }, { "epoch": 6.858453072454907, "grad_norm": 0.30318090319633484, "learning_rate": 2.3812152350218675e-05, "loss": 0.0906, "step": 22434 }, { "epoch": 6.858758789361052, "grad_norm": 0.2850782871246338, "learning_rate": 2.3811727739798737e-05, "loss": 0.0604, "step": 22435 }, { "epoch": 6.859064506267196, "grad_norm": 0.33744779229164124, "learning_rate": 2.3811303129378796e-05, "loss": 0.0521, "step": 22436 }, { "epoch": 6.859370223173341, "grad_norm": 0.291166216135025, "learning_rate": 2.3810878518958858e-05, "loss": 0.0905, "step": 22437 }, { "epoch": 6.8596759400794864, "grad_norm": 0.3834993839263916, "learning_rate": 2.3810453908538916e-05, "loss": 0.1117, "step": 22438 }, { "epoch": 6.859981656985632, "grad_norm": 0.3276855945587158, "learning_rate": 2.381002929811898e-05, "loss": 0.0843, "step": 22439 }, { "epoch": 6.860287373891776, "grad_norm": 0.5039359331130981, "learning_rate": 2.3809604687699037e-05, "loss": 0.1199, "step": 22440 }, { "epoch": 6.860593090797921, "grad_norm": 0.4211486876010895, "learning_rate": 2.38091800772791e-05, "loss": 0.1029, "step": 22441 }, { "epoch": 6.860898807704066, "grad_norm": 0.5638134479522705, "learning_rate": 2.3808755466859158e-05, "loss": 0.1383, "step": 22442 }, { "epoch": 6.861204524610211, "grad_norm": 0.692765474319458, "learning_rate": 2.380833085643922e-05, "loss": 0.1172, "step": 22443 }, { "epoch": 6.861510241516356, "grad_norm": 0.5972147583961487, "learning_rate": 2.380790624601928e-05, "loss": 0.1546, "step": 22444 }, { "epoch": 6.8618159584225005, "grad_norm": 6.161992073059082, "learning_rate": 2.380748163559934e-05, "loss": 0.1478, "step": 22445 }, { "epoch": 6.862121675328646, "grad_norm": 0.7271046042442322, "learning_rate": 2.38070570251794e-05, "loss": 0.1875, "step": 22446 }, { "epoch": 6.862427392234791, "grad_norm": 0.629711389541626, "learning_rate": 2.380663241475946e-05, "loss": 0.1992, "step": 22447 }, { "epoch": 6.862733109140936, "grad_norm": 0.9253043532371521, "learning_rate": 2.380620780433952e-05, "loss": 0.1468, "step": 22448 }, { "epoch": 6.86303882604708, "grad_norm": 1.1892457008361816, "learning_rate": 2.380578319391958e-05, "loss": 0.166, "step": 22449 }, { "epoch": 6.863344542953225, "grad_norm": 2.069930076599121, "learning_rate": 2.380535858349964e-05, "loss": 0.2047, "step": 22450 }, { "epoch": 6.86365025985937, "grad_norm": 0.8994724154472351, "learning_rate": 2.38049339730797e-05, "loss": 0.2563, "step": 22451 }, { "epoch": 6.863955976765515, "grad_norm": 0.6100030541419983, "learning_rate": 2.3804509362659762e-05, "loss": 0.14, "step": 22452 }, { "epoch": 6.86426169367166, "grad_norm": 0.2679370641708374, "learning_rate": 2.380408475223982e-05, "loss": 0.0641, "step": 22453 }, { "epoch": 6.864567410577805, "grad_norm": 1.4566612243652344, "learning_rate": 2.3803660141819883e-05, "loss": 0.0707, "step": 22454 }, { "epoch": 6.86487312748395, "grad_norm": 0.2551634609699249, "learning_rate": 2.380323553139994e-05, "loss": 0.0507, "step": 22455 }, { "epoch": 6.865178844390095, "grad_norm": 0.19016771018505096, "learning_rate": 2.3802810920980004e-05, "loss": 0.0512, "step": 22456 }, { "epoch": 6.86548456129624, "grad_norm": 0.2064092755317688, "learning_rate": 2.3802386310560062e-05, "loss": 0.0494, "step": 22457 }, { "epoch": 6.865790278202384, "grad_norm": 0.2864954471588135, "learning_rate": 2.3801961700140124e-05, "loss": 0.0829, "step": 22458 }, { "epoch": 6.866095995108529, "grad_norm": 0.24127881228923798, "learning_rate": 2.3801537089720183e-05, "loss": 0.0678, "step": 22459 }, { "epoch": 6.8664017120146745, "grad_norm": 0.18969851732254028, "learning_rate": 2.3801112479300242e-05, "loss": 0.0692, "step": 22460 }, { "epoch": 6.86670742892082, "grad_norm": 0.27773746848106384, "learning_rate": 2.3800687868880304e-05, "loss": 0.0576, "step": 22461 }, { "epoch": 6.867013145826964, "grad_norm": 0.34305599331855774, "learning_rate": 2.3800263258460363e-05, "loss": 0.0936, "step": 22462 }, { "epoch": 6.867318862733109, "grad_norm": 1.2229843139648438, "learning_rate": 2.3799838648040425e-05, "loss": 0.0788, "step": 22463 }, { "epoch": 6.867624579639254, "grad_norm": 0.5093756318092346, "learning_rate": 2.3799414037620483e-05, "loss": 0.083, "step": 22464 }, { "epoch": 6.867930296545399, "grad_norm": 0.41168853640556335, "learning_rate": 2.3798989427200545e-05, "loss": 0.155, "step": 22465 }, { "epoch": 6.868236013451543, "grad_norm": 0.33538323640823364, "learning_rate": 2.3798564816780604e-05, "loss": 0.1008, "step": 22466 }, { "epoch": 6.868541730357689, "grad_norm": 0.4228763282299042, "learning_rate": 2.3798140206360666e-05, "loss": 0.1493, "step": 22467 }, { "epoch": 6.868847447263834, "grad_norm": 0.6095553636550903, "learning_rate": 2.3797715595940725e-05, "loss": 0.1432, "step": 22468 }, { "epoch": 6.869153164169979, "grad_norm": 1.2453163862228394, "learning_rate": 2.3797290985520787e-05, "loss": 0.1581, "step": 22469 }, { "epoch": 6.869458881076124, "grad_norm": 2.1323904991149902, "learning_rate": 2.3796866375100846e-05, "loss": 0.1855, "step": 22470 }, { "epoch": 6.869764597982268, "grad_norm": 0.5364910364151001, "learning_rate": 2.3796441764680908e-05, "loss": 0.1711, "step": 22471 }, { "epoch": 6.870070314888413, "grad_norm": 0.4539217948913574, "learning_rate": 2.3796017154260966e-05, "loss": 0.1523, "step": 22472 }, { "epoch": 6.870376031794558, "grad_norm": 1.0702760219573975, "learning_rate": 2.3795592543841025e-05, "loss": 0.1741, "step": 22473 }, { "epoch": 6.8706817487007035, "grad_norm": 0.9296409487724304, "learning_rate": 2.3795167933421087e-05, "loss": 0.2018, "step": 22474 }, { "epoch": 6.870987465606848, "grad_norm": 0.8303443193435669, "learning_rate": 2.3794743323001146e-05, "loss": 0.2044, "step": 22475 }, { "epoch": 6.871293182512993, "grad_norm": 0.8761958479881287, "learning_rate": 2.3794318712581208e-05, "loss": 0.1908, "step": 22476 }, { "epoch": 6.871598899419138, "grad_norm": 0.2816883623600006, "learning_rate": 2.3793894102161267e-05, "loss": 0.1321, "step": 22477 }, { "epoch": 6.871904616325283, "grad_norm": 0.5306848883628845, "learning_rate": 2.379346949174133e-05, "loss": 0.0805, "step": 22478 }, { "epoch": 6.872210333231427, "grad_norm": 0.28753188252449036, "learning_rate": 2.3793044881321388e-05, "loss": 0.0695, "step": 22479 }, { "epoch": 6.872516050137572, "grad_norm": 1.1063988208770752, "learning_rate": 2.379262027090145e-05, "loss": 0.0539, "step": 22480 }, { "epoch": 6.8728217670437175, "grad_norm": 0.2295762449502945, "learning_rate": 2.379219566048151e-05, "loss": 0.0532, "step": 22481 }, { "epoch": 6.873127483949863, "grad_norm": 0.1430887132883072, "learning_rate": 2.379177105006157e-05, "loss": 0.0509, "step": 22482 }, { "epoch": 6.873433200856008, "grad_norm": 0.3696850538253784, "learning_rate": 2.379134643964163e-05, "loss": 0.0487, "step": 22483 }, { "epoch": 6.873738917762152, "grad_norm": 0.5989334583282471, "learning_rate": 2.379092182922169e-05, "loss": 0.072, "step": 22484 }, { "epoch": 6.874044634668297, "grad_norm": 0.3733104467391968, "learning_rate": 2.379049721880175e-05, "loss": 0.0534, "step": 22485 }, { "epoch": 6.874350351574442, "grad_norm": 0.2972203493118286, "learning_rate": 2.379007260838181e-05, "loss": 0.0519, "step": 22486 }, { "epoch": 6.874656068480587, "grad_norm": 0.45200371742248535, "learning_rate": 2.378964799796187e-05, "loss": 0.0914, "step": 22487 }, { "epoch": 6.8749617853867315, "grad_norm": 0.33194416761398315, "learning_rate": 2.378922338754193e-05, "loss": 0.0841, "step": 22488 }, { "epoch": 6.875267502292877, "grad_norm": 0.30475053191185, "learning_rate": 2.378879877712199e-05, "loss": 0.0982, "step": 22489 }, { "epoch": 6.875573219199022, "grad_norm": 0.363726407289505, "learning_rate": 2.378837416670205e-05, "loss": 0.1032, "step": 22490 }, { "epoch": 6.875878936105167, "grad_norm": 0.4654591977596283, "learning_rate": 2.3787949556282112e-05, "loss": 0.1305, "step": 22491 }, { "epoch": 6.876184653011311, "grad_norm": 1.1507880687713623, "learning_rate": 2.378752494586217e-05, "loss": 0.1283, "step": 22492 }, { "epoch": 6.876490369917456, "grad_norm": 0.355670690536499, "learning_rate": 2.3787100335442233e-05, "loss": 0.1611, "step": 22493 }, { "epoch": 6.876796086823601, "grad_norm": 2.3853609561920166, "learning_rate": 2.3786675725022292e-05, "loss": 0.1479, "step": 22494 }, { "epoch": 6.877101803729746, "grad_norm": 0.4873766303062439, "learning_rate": 2.3786251114602354e-05, "loss": 0.1637, "step": 22495 }, { "epoch": 6.877407520635892, "grad_norm": 1.3250422477722168, "learning_rate": 2.3785826504182413e-05, "loss": 0.1799, "step": 22496 }, { "epoch": 6.877713237542036, "grad_norm": 0.5419408082962036, "learning_rate": 2.3785401893762475e-05, "loss": 0.1806, "step": 22497 }, { "epoch": 6.878018954448181, "grad_norm": 0.7370045781135559, "learning_rate": 2.3784977283342533e-05, "loss": 0.1891, "step": 22498 }, { "epoch": 6.878324671354326, "grad_norm": 0.4853605329990387, "learning_rate": 2.3784552672922592e-05, "loss": 0.1642, "step": 22499 }, { "epoch": 6.878630388260471, "grad_norm": 9.414645195007324, "learning_rate": 2.3784128062502654e-05, "loss": 0.1883, "step": 22500 }, { "epoch": 6.878936105166615, "grad_norm": 1.605327844619751, "learning_rate": 2.3783703452082713e-05, "loss": 0.2381, "step": 22501 }, { "epoch": 6.8792418220727605, "grad_norm": 0.31280824542045593, "learning_rate": 2.3783278841662775e-05, "loss": 0.1353, "step": 22502 }, { "epoch": 6.879547538978906, "grad_norm": 0.185944601893425, "learning_rate": 2.3782854231242834e-05, "loss": 0.0676, "step": 22503 }, { "epoch": 6.879853255885051, "grad_norm": 0.5469896793365479, "learning_rate": 2.3782429620822896e-05, "loss": 0.0841, "step": 22504 }, { "epoch": 6.880158972791195, "grad_norm": 0.5013695955276489, "learning_rate": 2.3782005010402954e-05, "loss": 0.067, "step": 22505 }, { "epoch": 6.88046468969734, "grad_norm": 0.5244770646095276, "learning_rate": 2.3781580399983016e-05, "loss": 0.0585, "step": 22506 }, { "epoch": 6.880770406603485, "grad_norm": 1.5908228158950806, "learning_rate": 2.3781155789563075e-05, "loss": 0.0533, "step": 22507 }, { "epoch": 6.88107612350963, "grad_norm": 0.15811282396316528, "learning_rate": 2.3780731179143137e-05, "loss": 0.042, "step": 22508 }, { "epoch": 6.881381840415775, "grad_norm": 0.2711082100868225, "learning_rate": 2.3780306568723196e-05, "loss": 0.0573, "step": 22509 }, { "epoch": 6.88168755732192, "grad_norm": 0.4467836618423462, "learning_rate": 2.3779881958303258e-05, "loss": 0.0615, "step": 22510 }, { "epoch": 6.881993274228065, "grad_norm": 0.5955237150192261, "learning_rate": 2.3779457347883317e-05, "loss": 0.0911, "step": 22511 }, { "epoch": 6.88229899113421, "grad_norm": 0.8860634565353394, "learning_rate": 2.3779032737463375e-05, "loss": 0.0606, "step": 22512 }, { "epoch": 6.882604708040355, "grad_norm": 0.16118602454662323, "learning_rate": 2.3778608127043438e-05, "loss": 0.0656, "step": 22513 }, { "epoch": 6.882910424946499, "grad_norm": 0.2717818021774292, "learning_rate": 2.3778183516623496e-05, "loss": 0.0861, "step": 22514 }, { "epoch": 6.883216141852644, "grad_norm": 1.8584729433059692, "learning_rate": 2.377775890620356e-05, "loss": 0.0926, "step": 22515 }, { "epoch": 6.883521858758789, "grad_norm": 0.4432554543018341, "learning_rate": 2.3777334295783617e-05, "loss": 0.0923, "step": 22516 }, { "epoch": 6.8838275756649345, "grad_norm": 0.4725094139575958, "learning_rate": 2.377690968536368e-05, "loss": 0.1105, "step": 22517 }, { "epoch": 6.884133292571079, "grad_norm": 0.4678305685520172, "learning_rate": 2.3776485074943738e-05, "loss": 0.1361, "step": 22518 }, { "epoch": 6.884439009477224, "grad_norm": 1.4999066591262817, "learning_rate": 2.37760604645238e-05, "loss": 0.1625, "step": 22519 }, { "epoch": 6.884744726383369, "grad_norm": 0.6129382252693176, "learning_rate": 2.377563585410386e-05, "loss": 0.1691, "step": 22520 }, { "epoch": 6.885050443289514, "grad_norm": 0.5972217917442322, "learning_rate": 2.377521124368392e-05, "loss": 0.1816, "step": 22521 }, { "epoch": 6.885356160195659, "grad_norm": 1.2338683605194092, "learning_rate": 2.377478663326398e-05, "loss": 0.1678, "step": 22522 }, { "epoch": 6.885661877101803, "grad_norm": 0.7071772217750549, "learning_rate": 2.377436202284404e-05, "loss": 0.1727, "step": 22523 }, { "epoch": 6.8859675940079486, "grad_norm": 1.2227590084075928, "learning_rate": 2.37739374124241e-05, "loss": 0.1855, "step": 22524 }, { "epoch": 6.886273310914094, "grad_norm": 0.9020646810531616, "learning_rate": 2.377351280200416e-05, "loss": 0.1909, "step": 22525 }, { "epoch": 6.886579027820239, "grad_norm": 0.9646186232566833, "learning_rate": 2.377308819158422e-05, "loss": 0.2191, "step": 22526 }, { "epoch": 6.886884744726383, "grad_norm": 0.40804824233055115, "learning_rate": 2.377266358116428e-05, "loss": 0.1502, "step": 22527 }, { "epoch": 6.887190461632528, "grad_norm": 0.48700031638145447, "learning_rate": 2.3772238970744342e-05, "loss": 0.1041, "step": 22528 }, { "epoch": 6.887496178538673, "grad_norm": 0.17669783532619476, "learning_rate": 2.37718143603244e-05, "loss": 0.0671, "step": 22529 }, { "epoch": 6.887801895444818, "grad_norm": 0.27535417675971985, "learning_rate": 2.3771389749904463e-05, "loss": 0.0684, "step": 22530 }, { "epoch": 6.888107612350963, "grad_norm": 0.2278408706188202, "learning_rate": 2.377096513948452e-05, "loss": 0.0617, "step": 22531 }, { "epoch": 6.888413329257108, "grad_norm": 0.1921481192111969, "learning_rate": 2.3770540529064583e-05, "loss": 0.0434, "step": 22532 }, { "epoch": 6.888719046163253, "grad_norm": 0.49550849199295044, "learning_rate": 2.3770115918644642e-05, "loss": 0.0789, "step": 22533 }, { "epoch": 6.889024763069398, "grad_norm": 0.17305819690227509, "learning_rate": 2.3769691308224704e-05, "loss": 0.0433, "step": 22534 }, { "epoch": 6.889330479975543, "grad_norm": 0.5391182899475098, "learning_rate": 2.3769266697804766e-05, "loss": 0.0466, "step": 22535 }, { "epoch": 6.889636196881687, "grad_norm": 1.2700117826461792, "learning_rate": 2.376884208738483e-05, "loss": 0.0726, "step": 22536 }, { "epoch": 6.889941913787832, "grad_norm": 0.5603554844856262, "learning_rate": 2.3768417476964887e-05, "loss": 0.1016, "step": 22537 }, { "epoch": 6.8902476306939775, "grad_norm": 0.5981147289276123, "learning_rate": 2.3767992866544946e-05, "loss": 0.1108, "step": 22538 }, { "epoch": 6.890553347600123, "grad_norm": 0.33327898383140564, "learning_rate": 2.3767568256125008e-05, "loss": 0.0879, "step": 22539 }, { "epoch": 6.890859064506267, "grad_norm": 0.6216566562652588, "learning_rate": 2.3767143645705066e-05, "loss": 0.1228, "step": 22540 }, { "epoch": 6.891164781412412, "grad_norm": 0.3698928952217102, "learning_rate": 2.376671903528513e-05, "loss": 0.1228, "step": 22541 }, { "epoch": 6.891470498318557, "grad_norm": 0.46906691789627075, "learning_rate": 2.3766294424865187e-05, "loss": 0.1082, "step": 22542 }, { "epoch": 6.891776215224702, "grad_norm": 0.714888870716095, "learning_rate": 2.376586981444525e-05, "loss": 0.1544, "step": 22543 }, { "epoch": 6.892081932130846, "grad_norm": 1.1243422031402588, "learning_rate": 2.3765445204025308e-05, "loss": 0.1623, "step": 22544 }, { "epoch": 6.8923876490369915, "grad_norm": 0.5721116065979004, "learning_rate": 2.376502059360537e-05, "loss": 0.1574, "step": 22545 }, { "epoch": 6.892693365943137, "grad_norm": 0.9244924187660217, "learning_rate": 2.376459598318543e-05, "loss": 0.1783, "step": 22546 }, { "epoch": 6.892999082849282, "grad_norm": 0.8539825677871704, "learning_rate": 2.376417137276549e-05, "loss": 0.1807, "step": 22547 }, { "epoch": 6.893304799755427, "grad_norm": 1.4635220766067505, "learning_rate": 2.376374676234555e-05, "loss": 0.1759, "step": 22548 }, { "epoch": 6.893610516661571, "grad_norm": 1.1262274980545044, "learning_rate": 2.376332215192561e-05, "loss": 0.1639, "step": 22549 }, { "epoch": 6.893916233567716, "grad_norm": 1.9844189882278442, "learning_rate": 2.376289754150567e-05, "loss": 0.1835, "step": 22550 }, { "epoch": 6.894221950473861, "grad_norm": 6.693962097167969, "learning_rate": 2.376247293108573e-05, "loss": 0.231, "step": 22551 }, { "epoch": 6.894527667380006, "grad_norm": 0.5910589694976807, "learning_rate": 2.376204832066579e-05, "loss": 0.1421, "step": 22552 }, { "epoch": 6.894833384286151, "grad_norm": 0.36346954107284546, "learning_rate": 2.376162371024585e-05, "loss": 0.066, "step": 22553 }, { "epoch": 6.895139101192296, "grad_norm": 0.4627760946750641, "learning_rate": 2.3761199099825912e-05, "loss": 0.0647, "step": 22554 }, { "epoch": 6.895444818098441, "grad_norm": 0.26037177443504333, "learning_rate": 2.376077448940597e-05, "loss": 0.0447, "step": 22555 }, { "epoch": 6.895750535004586, "grad_norm": 0.250072717666626, "learning_rate": 2.3760349878986033e-05, "loss": 0.0644, "step": 22556 }, { "epoch": 6.89605625191073, "grad_norm": 0.17184945940971375, "learning_rate": 2.375992526856609e-05, "loss": 0.0373, "step": 22557 }, { "epoch": 6.896361968816875, "grad_norm": 0.7264448404312134, "learning_rate": 2.3759500658146154e-05, "loss": 0.0589, "step": 22558 }, { "epoch": 6.8966676857230205, "grad_norm": 0.6735402345657349, "learning_rate": 2.3759076047726212e-05, "loss": 0.0603, "step": 22559 }, { "epoch": 6.896973402629166, "grad_norm": 0.445060133934021, "learning_rate": 2.3758651437306274e-05, "loss": 0.0552, "step": 22560 }, { "epoch": 6.897279119535311, "grad_norm": 0.43027880787849426, "learning_rate": 2.3758226826886333e-05, "loss": 0.0898, "step": 22561 }, { "epoch": 6.897584836441455, "grad_norm": 0.5785424113273621, "learning_rate": 2.3757802216466392e-05, "loss": 0.0952, "step": 22562 }, { "epoch": 6.8978905533476, "grad_norm": 0.5989310145378113, "learning_rate": 2.3757377606046454e-05, "loss": 0.0692, "step": 22563 }, { "epoch": 6.898196270253745, "grad_norm": 1.2901062965393066, "learning_rate": 2.3756952995626513e-05, "loss": 0.1183, "step": 22564 }, { "epoch": 6.89850198715989, "grad_norm": 0.49213260412216187, "learning_rate": 2.3756528385206575e-05, "loss": 0.103, "step": 22565 }, { "epoch": 6.8988077040660345, "grad_norm": 0.5573272705078125, "learning_rate": 2.3756103774786633e-05, "loss": 0.1202, "step": 22566 }, { "epoch": 6.89911342097218, "grad_norm": 0.5031291842460632, "learning_rate": 2.3755679164366695e-05, "loss": 0.1384, "step": 22567 }, { "epoch": 6.899419137878325, "grad_norm": 0.716643214225769, "learning_rate": 2.3755254553946754e-05, "loss": 0.1416, "step": 22568 }, { "epoch": 6.89972485478447, "grad_norm": 1.2044909000396729, "learning_rate": 2.3754829943526816e-05, "loss": 0.1742, "step": 22569 }, { "epoch": 6.900030571690614, "grad_norm": 0.8381408452987671, "learning_rate": 2.3754405333106875e-05, "loss": 0.1769, "step": 22570 }, { "epoch": 6.900336288596759, "grad_norm": 0.5044066309928894, "learning_rate": 2.3753980722686937e-05, "loss": 0.1673, "step": 22571 }, { "epoch": 6.900642005502904, "grad_norm": 1.4942057132720947, "learning_rate": 2.3753556112266996e-05, "loss": 0.2386, "step": 22572 }, { "epoch": 6.900947722409049, "grad_norm": 1.0769857168197632, "learning_rate": 2.3753131501847058e-05, "loss": 0.1662, "step": 22573 }, { "epoch": 6.9012534393151945, "grad_norm": 0.9724130630493164, "learning_rate": 2.3752706891427117e-05, "loss": 0.1926, "step": 22574 }, { "epoch": 6.901559156221339, "grad_norm": 1.0532432794570923, "learning_rate": 2.3752282281007175e-05, "loss": 0.1566, "step": 22575 }, { "epoch": 6.901864873127484, "grad_norm": 5.3436503410339355, "learning_rate": 2.3751857670587237e-05, "loss": 0.2031, "step": 22576 }, { "epoch": 6.902170590033629, "grad_norm": 1.4151209592819214, "learning_rate": 2.3751433060167296e-05, "loss": 0.1422, "step": 22577 }, { "epoch": 6.902476306939774, "grad_norm": 0.3707098662853241, "learning_rate": 2.3751008449747358e-05, "loss": 0.0562, "step": 22578 }, { "epoch": 6.902782023845918, "grad_norm": 0.3168052136898041, "learning_rate": 2.3750583839327417e-05, "loss": 0.0625, "step": 22579 }, { "epoch": 6.903087740752063, "grad_norm": 0.2158774435520172, "learning_rate": 2.375015922890748e-05, "loss": 0.0591, "step": 22580 }, { "epoch": 6.9033934576582086, "grad_norm": 0.233789324760437, "learning_rate": 2.3749734618487538e-05, "loss": 0.0637, "step": 22581 }, { "epoch": 6.903699174564354, "grad_norm": 0.4738948345184326, "learning_rate": 2.37493100080676e-05, "loss": 0.0484, "step": 22582 }, { "epoch": 6.904004891470498, "grad_norm": 0.31731516122817993, "learning_rate": 2.374888539764766e-05, "loss": 0.1092, "step": 22583 }, { "epoch": 6.904310608376643, "grad_norm": 0.44266802072525024, "learning_rate": 2.374846078722772e-05, "loss": 0.0543, "step": 22584 }, { "epoch": 6.904616325282788, "grad_norm": 0.5089300870895386, "learning_rate": 2.374803617680778e-05, "loss": 0.0695, "step": 22585 }, { "epoch": 6.904922042188933, "grad_norm": 0.2857292592525482, "learning_rate": 2.374761156638784e-05, "loss": 0.0673, "step": 22586 }, { "epoch": 6.905227759095078, "grad_norm": 0.70187908411026, "learning_rate": 2.37471869559679e-05, "loss": 0.1183, "step": 22587 }, { "epoch": 6.905533476001223, "grad_norm": 0.38967156410217285, "learning_rate": 2.374676234554796e-05, "loss": 0.064, "step": 22588 }, { "epoch": 6.905839192907368, "grad_norm": 0.25438937544822693, "learning_rate": 2.374633773512802e-05, "loss": 0.0781, "step": 22589 }, { "epoch": 6.906144909813513, "grad_norm": 0.3315916061401367, "learning_rate": 2.374591312470808e-05, "loss": 0.1086, "step": 22590 }, { "epoch": 6.906450626719658, "grad_norm": 0.6631964445114136, "learning_rate": 2.374548851428814e-05, "loss": 0.0939, "step": 22591 }, { "epoch": 6.906756343625802, "grad_norm": 0.8047707080841064, "learning_rate": 2.37450639038682e-05, "loss": 0.1187, "step": 22592 }, { "epoch": 6.907062060531947, "grad_norm": 0.9331210255622864, "learning_rate": 2.3744639293448262e-05, "loss": 0.1625, "step": 22593 }, { "epoch": 6.907367777438092, "grad_norm": 0.7266026735305786, "learning_rate": 2.374421468302832e-05, "loss": 0.1389, "step": 22594 }, { "epoch": 6.9076734943442375, "grad_norm": 0.7291355133056641, "learning_rate": 2.3743790072608383e-05, "loss": 0.1785, "step": 22595 }, { "epoch": 6.907979211250382, "grad_norm": 0.5561609864234924, "learning_rate": 2.3743365462188442e-05, "loss": 0.1911, "step": 22596 }, { "epoch": 6.908284928156527, "grad_norm": 0.687288224697113, "learning_rate": 2.3742940851768504e-05, "loss": 0.1738, "step": 22597 }, { "epoch": 6.908590645062672, "grad_norm": 0.6648432612419128, "learning_rate": 2.3742516241348563e-05, "loss": 0.1992, "step": 22598 }, { "epoch": 6.908896361968817, "grad_norm": 0.4987461566925049, "learning_rate": 2.3742091630928625e-05, "loss": 0.1646, "step": 22599 }, { "epoch": 6.909202078874962, "grad_norm": 0.9492217302322388, "learning_rate": 2.3741667020508683e-05, "loss": 0.1642, "step": 22600 }, { "epoch": 6.909507795781106, "grad_norm": 2.3738620281219482, "learning_rate": 2.3741242410088742e-05, "loss": 0.2614, "step": 22601 }, { "epoch": 6.9098135126872515, "grad_norm": 0.37642189860343933, "learning_rate": 2.3740817799668804e-05, "loss": 0.1382, "step": 22602 }, { "epoch": 6.910119229593397, "grad_norm": 0.39443451166152954, "learning_rate": 2.3740393189248863e-05, "loss": 0.0824, "step": 22603 }, { "epoch": 6.910424946499542, "grad_norm": 1.6476812362670898, "learning_rate": 2.3739968578828925e-05, "loss": 0.062, "step": 22604 }, { "epoch": 6.910730663405686, "grad_norm": 0.3512537479400635, "learning_rate": 2.3739543968408984e-05, "loss": 0.0517, "step": 22605 }, { "epoch": 6.911036380311831, "grad_norm": 0.20622578263282776, "learning_rate": 2.3739119357989046e-05, "loss": 0.0595, "step": 22606 }, { "epoch": 6.911342097217976, "grad_norm": 0.1753045618534088, "learning_rate": 2.3738694747569104e-05, "loss": 0.0554, "step": 22607 }, { "epoch": 6.911647814124121, "grad_norm": 0.24711595475673676, "learning_rate": 2.3738270137149167e-05, "loss": 0.0795, "step": 22608 }, { "epoch": 6.9119535310302656, "grad_norm": 0.22346530854701996, "learning_rate": 2.3737845526729225e-05, "loss": 0.0783, "step": 22609 }, { "epoch": 6.912259247936411, "grad_norm": 0.26136642694473267, "learning_rate": 2.3737420916309287e-05, "loss": 0.0861, "step": 22610 }, { "epoch": 6.912564964842556, "grad_norm": 0.14708682894706726, "learning_rate": 2.3736996305889346e-05, "loss": 0.0467, "step": 22611 }, { "epoch": 6.912870681748701, "grad_norm": 0.9601826071739197, "learning_rate": 2.3736571695469408e-05, "loss": 0.0752, "step": 22612 }, { "epoch": 6.913176398654846, "grad_norm": 0.18720673024654388, "learning_rate": 2.3736147085049467e-05, "loss": 0.0752, "step": 22613 }, { "epoch": 6.91348211556099, "grad_norm": 0.24517996609210968, "learning_rate": 2.3735722474629525e-05, "loss": 0.0855, "step": 22614 }, { "epoch": 6.913787832467135, "grad_norm": 0.48063525557518005, "learning_rate": 2.3735297864209588e-05, "loss": 0.1297, "step": 22615 }, { "epoch": 6.9140935493732805, "grad_norm": 0.4972539246082306, "learning_rate": 2.3734873253789646e-05, "loss": 0.1259, "step": 22616 }, { "epoch": 6.914399266279426, "grad_norm": 0.9870432019233704, "learning_rate": 2.373444864336971e-05, "loss": 0.1227, "step": 22617 }, { "epoch": 6.91470498318557, "grad_norm": 1.4497265815734863, "learning_rate": 2.3734024032949767e-05, "loss": 0.1598, "step": 22618 }, { "epoch": 6.915010700091715, "grad_norm": 0.8474774956703186, "learning_rate": 2.373359942252983e-05, "loss": 0.1585, "step": 22619 }, { "epoch": 6.91531641699786, "grad_norm": 0.39484187960624695, "learning_rate": 2.3733174812109888e-05, "loss": 0.123, "step": 22620 }, { "epoch": 6.915622133904005, "grad_norm": 0.4002956449985504, "learning_rate": 2.373275020168995e-05, "loss": 0.1395, "step": 22621 }, { "epoch": 6.915927850810149, "grad_norm": 1.5644874572753906, "learning_rate": 2.373232559127001e-05, "loss": 0.1767, "step": 22622 }, { "epoch": 6.9162335677162945, "grad_norm": 0.8875448703765869, "learning_rate": 2.373190098085007e-05, "loss": 0.1393, "step": 22623 }, { "epoch": 6.91653928462244, "grad_norm": 2.981659412384033, "learning_rate": 2.373147637043013e-05, "loss": 0.1661, "step": 22624 }, { "epoch": 6.916845001528585, "grad_norm": 0.5171760320663452, "learning_rate": 2.373105176001019e-05, "loss": 0.1637, "step": 22625 }, { "epoch": 6.91715071843473, "grad_norm": 1.6391313076019287, "learning_rate": 2.373062714959025e-05, "loss": 0.244, "step": 22626 }, { "epoch": 6.917456435340874, "grad_norm": 0.40290263295173645, "learning_rate": 2.373020253917031e-05, "loss": 0.1484, "step": 22627 }, { "epoch": 6.917762152247019, "grad_norm": 0.21609829366207123, "learning_rate": 2.372977792875037e-05, "loss": 0.0784, "step": 22628 }, { "epoch": 6.918067869153164, "grad_norm": 0.3173176944255829, "learning_rate": 2.372935331833043e-05, "loss": 0.0654, "step": 22629 }, { "epoch": 6.918373586059309, "grad_norm": 0.3256617784500122, "learning_rate": 2.3728928707910492e-05, "loss": 0.0812, "step": 22630 }, { "epoch": 6.918679302965454, "grad_norm": 0.8494409918785095, "learning_rate": 2.372850409749055e-05, "loss": 0.0734, "step": 22631 }, { "epoch": 6.918985019871599, "grad_norm": 0.8213494420051575, "learning_rate": 2.3728079487070613e-05, "loss": 0.0376, "step": 22632 }, { "epoch": 6.919290736777744, "grad_norm": 0.4093777537345886, "learning_rate": 2.372765487665067e-05, "loss": 0.0479, "step": 22633 }, { "epoch": 6.919596453683889, "grad_norm": 0.27773383259773254, "learning_rate": 2.3727230266230733e-05, "loss": 0.0672, "step": 22634 }, { "epoch": 6.919902170590033, "grad_norm": 0.8947184085845947, "learning_rate": 2.3726805655810792e-05, "loss": 0.0569, "step": 22635 }, { "epoch": 6.920207887496178, "grad_norm": 0.43350592255592346, "learning_rate": 2.3726381045390854e-05, "loss": 0.0781, "step": 22636 }, { "epoch": 6.920513604402323, "grad_norm": 0.5934906601905823, "learning_rate": 2.3725956434970916e-05, "loss": 0.0906, "step": 22637 }, { "epoch": 6.9208193213084686, "grad_norm": 1.0687286853790283, "learning_rate": 2.372553182455098e-05, "loss": 0.0725, "step": 22638 }, { "epoch": 6.921125038214614, "grad_norm": 0.27101054787635803, "learning_rate": 2.3725107214131037e-05, "loss": 0.0794, "step": 22639 }, { "epoch": 6.921430755120758, "grad_norm": 0.32957723736763, "learning_rate": 2.3724682603711096e-05, "loss": 0.1108, "step": 22640 }, { "epoch": 6.921736472026903, "grad_norm": 0.5860902667045593, "learning_rate": 2.3724257993291158e-05, "loss": 0.1171, "step": 22641 }, { "epoch": 6.922042188933048, "grad_norm": 0.49152857065200806, "learning_rate": 2.3723833382871217e-05, "loss": 0.1037, "step": 22642 }, { "epoch": 6.922347905839193, "grad_norm": 1.118747591972351, "learning_rate": 2.372340877245128e-05, "loss": 0.1315, "step": 22643 }, { "epoch": 6.9226536227453375, "grad_norm": 0.6588165760040283, "learning_rate": 2.3722984162031337e-05, "loss": 0.1505, "step": 22644 }, { "epoch": 6.922959339651483, "grad_norm": 0.6942024827003479, "learning_rate": 2.37225595516114e-05, "loss": 0.1815, "step": 22645 }, { "epoch": 6.923265056557628, "grad_norm": 0.7258799076080322, "learning_rate": 2.3722134941191458e-05, "loss": 0.166, "step": 22646 }, { "epoch": 6.923570773463773, "grad_norm": 0.6945796012878418, "learning_rate": 2.372171033077152e-05, "loss": 0.1599, "step": 22647 }, { "epoch": 6.923876490369917, "grad_norm": 2.0119223594665527, "learning_rate": 2.372128572035158e-05, "loss": 0.1747, "step": 22648 }, { "epoch": 6.924182207276062, "grad_norm": 0.7908098101615906, "learning_rate": 2.372086110993164e-05, "loss": 0.1763, "step": 22649 }, { "epoch": 6.924487924182207, "grad_norm": 2.3245460987091064, "learning_rate": 2.37204364995117e-05, "loss": 0.1702, "step": 22650 }, { "epoch": 6.924793641088352, "grad_norm": 1.5678610801696777, "learning_rate": 2.3720011889091762e-05, "loss": 0.2152, "step": 22651 }, { "epoch": 6.9250993579944975, "grad_norm": 0.49044033885002136, "learning_rate": 2.371958727867182e-05, "loss": 0.1592, "step": 22652 }, { "epoch": 6.925405074900642, "grad_norm": 0.19757263362407684, "learning_rate": 2.371916266825188e-05, "loss": 0.0731, "step": 22653 }, { "epoch": 6.925710791806787, "grad_norm": 0.6761715412139893, "learning_rate": 2.371873805783194e-05, "loss": 0.0421, "step": 22654 }, { "epoch": 6.926016508712932, "grad_norm": 0.35299867391586304, "learning_rate": 2.3718313447412e-05, "loss": 0.0606, "step": 22655 }, { "epoch": 6.926322225619077, "grad_norm": 0.28878331184387207, "learning_rate": 2.3717888836992062e-05, "loss": 0.0573, "step": 22656 }, { "epoch": 6.926627942525221, "grad_norm": 0.1575728803873062, "learning_rate": 2.371746422657212e-05, "loss": 0.0619, "step": 22657 }, { "epoch": 6.926933659431366, "grad_norm": 0.2778177261352539, "learning_rate": 2.3717039616152183e-05, "loss": 0.0623, "step": 22658 }, { "epoch": 6.9272393763375115, "grad_norm": 0.40211084485054016, "learning_rate": 2.371661500573224e-05, "loss": 0.0904, "step": 22659 }, { "epoch": 6.927545093243657, "grad_norm": 0.2466539740562439, "learning_rate": 2.3716190395312304e-05, "loss": 0.0443, "step": 22660 }, { "epoch": 6.927850810149801, "grad_norm": 0.3161744177341461, "learning_rate": 2.3715765784892362e-05, "loss": 0.0574, "step": 22661 }, { "epoch": 6.928156527055946, "grad_norm": 1.1666855812072754, "learning_rate": 2.3715341174472424e-05, "loss": 0.0936, "step": 22662 }, { "epoch": 6.928462243962091, "grad_norm": 0.3097958266735077, "learning_rate": 2.3714916564052483e-05, "loss": 0.079, "step": 22663 }, { "epoch": 6.928767960868236, "grad_norm": 0.5827215909957886, "learning_rate": 2.3714491953632542e-05, "loss": 0.0901, "step": 22664 }, { "epoch": 6.929073677774381, "grad_norm": 0.4553370177745819, "learning_rate": 2.3714067343212604e-05, "loss": 0.1034, "step": 22665 }, { "epoch": 6.9293793946805256, "grad_norm": 0.3003826439380646, "learning_rate": 2.3713642732792663e-05, "loss": 0.1091, "step": 22666 }, { "epoch": 6.929685111586671, "grad_norm": 0.5445914268493652, "learning_rate": 2.3713218122372725e-05, "loss": 0.1373, "step": 22667 }, { "epoch": 6.929990828492816, "grad_norm": 0.7441391944885254, "learning_rate": 2.3712793511952783e-05, "loss": 0.1416, "step": 22668 }, { "epoch": 6.930296545398961, "grad_norm": 0.5533941984176636, "learning_rate": 2.3712368901532845e-05, "loss": 0.1586, "step": 22669 }, { "epoch": 6.930602262305105, "grad_norm": 0.5785590410232544, "learning_rate": 2.3711944291112904e-05, "loss": 0.1609, "step": 22670 }, { "epoch": 6.93090797921125, "grad_norm": 0.5103518962860107, "learning_rate": 2.3711519680692966e-05, "loss": 0.1545, "step": 22671 }, { "epoch": 6.931213696117395, "grad_norm": 1.085023045539856, "learning_rate": 2.3711095070273025e-05, "loss": 0.1456, "step": 22672 }, { "epoch": 6.9315194130235405, "grad_norm": 0.6616905331611633, "learning_rate": 2.3710670459853087e-05, "loss": 0.151, "step": 22673 }, { "epoch": 6.931825129929685, "grad_norm": 2.0857083797454834, "learning_rate": 2.3710245849433146e-05, "loss": 0.1914, "step": 22674 }, { "epoch": 6.93213084683583, "grad_norm": 0.7617282271385193, "learning_rate": 2.3709821239013208e-05, "loss": 0.174, "step": 22675 }, { "epoch": 6.932436563741975, "grad_norm": 1.8711512088775635, "learning_rate": 2.3709396628593267e-05, "loss": 0.2355, "step": 22676 }, { "epoch": 6.93274228064812, "grad_norm": 0.5490959882736206, "learning_rate": 2.3708972018173325e-05, "loss": 0.1599, "step": 22677 }, { "epoch": 6.933047997554265, "grad_norm": 0.4150683879852295, "learning_rate": 2.3708547407753387e-05, "loss": 0.0931, "step": 22678 }, { "epoch": 6.933353714460409, "grad_norm": 0.29750165343284607, "learning_rate": 2.3708122797333446e-05, "loss": 0.0529, "step": 22679 }, { "epoch": 6.9336594313665545, "grad_norm": 0.3923814296722412, "learning_rate": 2.3707698186913508e-05, "loss": 0.0524, "step": 22680 }, { "epoch": 6.9339651482727, "grad_norm": 0.4173097014427185, "learning_rate": 2.3707273576493567e-05, "loss": 0.067, "step": 22681 }, { "epoch": 6.934270865178845, "grad_norm": 0.22155193984508514, "learning_rate": 2.370684896607363e-05, "loss": 0.0384, "step": 22682 }, { "epoch": 6.934576582084989, "grad_norm": 0.16342668235301971, "learning_rate": 2.3706424355653688e-05, "loss": 0.0503, "step": 22683 }, { "epoch": 6.934882298991134, "grad_norm": 0.38117098808288574, "learning_rate": 2.370599974523375e-05, "loss": 0.0446, "step": 22684 }, { "epoch": 6.935188015897279, "grad_norm": 0.4006608724594116, "learning_rate": 2.370557513481381e-05, "loss": 0.0898, "step": 22685 }, { "epoch": 6.935493732803424, "grad_norm": 0.13897745311260223, "learning_rate": 2.370515052439387e-05, "loss": 0.0442, "step": 22686 }, { "epoch": 6.9357994497095685, "grad_norm": 1.071020483970642, "learning_rate": 2.370472591397393e-05, "loss": 0.0831, "step": 22687 }, { "epoch": 6.936105166615714, "grad_norm": 0.3506534993648529, "learning_rate": 2.370430130355399e-05, "loss": 0.0923, "step": 22688 }, { "epoch": 6.936410883521859, "grad_norm": 1.2607029676437378, "learning_rate": 2.370387669313405e-05, "loss": 0.075, "step": 22689 }, { "epoch": 6.936716600428004, "grad_norm": 0.47029969096183777, "learning_rate": 2.370345208271411e-05, "loss": 0.1158, "step": 22690 }, { "epoch": 6.937022317334149, "grad_norm": 0.4811001420021057, "learning_rate": 2.370302747229417e-05, "loss": 0.1111, "step": 22691 }, { "epoch": 6.937328034240293, "grad_norm": 0.5011916756629944, "learning_rate": 2.370260286187423e-05, "loss": 0.1386, "step": 22692 }, { "epoch": 6.937633751146438, "grad_norm": 0.6912839412689209, "learning_rate": 2.370217825145429e-05, "loss": 0.1387, "step": 22693 }, { "epoch": 6.937939468052583, "grad_norm": 1.2947263717651367, "learning_rate": 2.370175364103435e-05, "loss": 0.159, "step": 22694 }, { "epoch": 6.9382451849587286, "grad_norm": 0.6133198142051697, "learning_rate": 2.3701329030614412e-05, "loss": 0.164, "step": 22695 }, { "epoch": 6.938550901864873, "grad_norm": 0.6335939168930054, "learning_rate": 2.370090442019447e-05, "loss": 0.157, "step": 22696 }, { "epoch": 6.938856618771018, "grad_norm": 0.5172121524810791, "learning_rate": 2.3700479809774533e-05, "loss": 0.142, "step": 22697 }, { "epoch": 6.939162335677163, "grad_norm": 0.7485842704772949, "learning_rate": 2.3700055199354592e-05, "loss": 0.1536, "step": 22698 }, { "epoch": 6.939468052583308, "grad_norm": 0.531968355178833, "learning_rate": 2.3699630588934654e-05, "loss": 0.1386, "step": 22699 }, { "epoch": 6.939773769489452, "grad_norm": 1.2494871616363525, "learning_rate": 2.3699205978514713e-05, "loss": 0.2111, "step": 22700 }, { "epoch": 6.9400794863955975, "grad_norm": 1.5122971534729004, "learning_rate": 2.3698781368094775e-05, "loss": 0.2115, "step": 22701 }, { "epoch": 6.940385203301743, "grad_norm": 0.3540289103984833, "learning_rate": 2.3698356757674833e-05, "loss": 0.1514, "step": 22702 }, { "epoch": 6.940690920207888, "grad_norm": 0.4179474413394928, "learning_rate": 2.3697932147254892e-05, "loss": 0.1214, "step": 22703 }, { "epoch": 6.940996637114033, "grad_norm": 0.2182471752166748, "learning_rate": 2.3697507536834954e-05, "loss": 0.0561, "step": 22704 }, { "epoch": 6.941302354020177, "grad_norm": 0.19383253157138824, "learning_rate": 2.3697082926415013e-05, "loss": 0.0667, "step": 22705 }, { "epoch": 6.941608070926322, "grad_norm": 0.46570053696632385, "learning_rate": 2.3696658315995075e-05, "loss": 0.0732, "step": 22706 }, { "epoch": 6.941913787832467, "grad_norm": 0.2177038937807083, "learning_rate": 2.3696233705575134e-05, "loss": 0.0471, "step": 22707 }, { "epoch": 6.942219504738612, "grad_norm": 0.3678787648677826, "learning_rate": 2.3695809095155196e-05, "loss": 0.0481, "step": 22708 }, { "epoch": 6.942525221644757, "grad_norm": 0.397093266248703, "learning_rate": 2.3695384484735254e-05, "loss": 0.0714, "step": 22709 }, { "epoch": 6.942830938550902, "grad_norm": 0.17607295513153076, "learning_rate": 2.3694959874315317e-05, "loss": 0.0566, "step": 22710 }, { "epoch": 6.943136655457047, "grad_norm": 0.1785864233970642, "learning_rate": 2.3694535263895375e-05, "loss": 0.0601, "step": 22711 }, { "epoch": 6.943442372363192, "grad_norm": 0.3333708643913269, "learning_rate": 2.3694110653475437e-05, "loss": 0.0852, "step": 22712 }, { "epoch": 6.943748089269336, "grad_norm": 1.7048436403274536, "learning_rate": 2.3693686043055496e-05, "loss": 0.0956, "step": 22713 }, { "epoch": 6.944053806175481, "grad_norm": 0.217410609126091, "learning_rate": 2.3693261432635558e-05, "loss": 0.0785, "step": 22714 }, { "epoch": 6.944359523081626, "grad_norm": 0.31973856687545776, "learning_rate": 2.3692836822215617e-05, "loss": 0.1128, "step": 22715 }, { "epoch": 6.9446652399877715, "grad_norm": 0.5596300363540649, "learning_rate": 2.3692412211795676e-05, "loss": 0.1311, "step": 22716 }, { "epoch": 6.944970956893917, "grad_norm": 0.42848023772239685, "learning_rate": 2.3691987601375738e-05, "loss": 0.1228, "step": 22717 }, { "epoch": 6.945276673800061, "grad_norm": 1.222222924232483, "learning_rate": 2.3691562990955796e-05, "loss": 0.1356, "step": 22718 }, { "epoch": 6.945582390706206, "grad_norm": 0.9667991399765015, "learning_rate": 2.369113838053586e-05, "loss": 0.149, "step": 22719 }, { "epoch": 6.945888107612351, "grad_norm": 0.7485057711601257, "learning_rate": 2.3690713770115917e-05, "loss": 0.1648, "step": 22720 }, { "epoch": 6.946193824518496, "grad_norm": 1.350364327430725, "learning_rate": 2.369028915969598e-05, "loss": 0.1773, "step": 22721 }, { "epoch": 6.94649954142464, "grad_norm": 0.5465497970581055, "learning_rate": 2.3689864549276038e-05, "loss": 0.1357, "step": 22722 }, { "epoch": 6.9468052583307855, "grad_norm": 1.3470306396484375, "learning_rate": 2.36894399388561e-05, "loss": 0.1608, "step": 22723 }, { "epoch": 6.947110975236931, "grad_norm": 0.8312376737594604, "learning_rate": 2.368901532843616e-05, "loss": 0.1646, "step": 22724 }, { "epoch": 6.947416692143076, "grad_norm": 0.6723052263259888, "learning_rate": 2.368859071801622e-05, "loss": 0.1866, "step": 22725 }, { "epoch": 6.94772240904922, "grad_norm": 2.6637864112854004, "learning_rate": 2.368816610759628e-05, "loss": 0.235, "step": 22726 }, { "epoch": 6.948028125955365, "grad_norm": 0.33309873938560486, "learning_rate": 2.368774149717634e-05, "loss": 0.1286, "step": 22727 }, { "epoch": 6.94833384286151, "grad_norm": 0.42310062050819397, "learning_rate": 2.36873168867564e-05, "loss": 0.0817, "step": 22728 }, { "epoch": 6.948639559767655, "grad_norm": 0.1777726709842682, "learning_rate": 2.368689227633646e-05, "loss": 0.0599, "step": 22729 }, { "epoch": 6.9489452766738005, "grad_norm": 0.2586727440357208, "learning_rate": 2.368646766591652e-05, "loss": 0.0538, "step": 22730 }, { "epoch": 6.949250993579945, "grad_norm": 0.47954505681991577, "learning_rate": 2.368604305549658e-05, "loss": 0.0483, "step": 22731 }, { "epoch": 6.94955671048609, "grad_norm": 0.20960652828216553, "learning_rate": 2.3685618445076642e-05, "loss": 0.0394, "step": 22732 }, { "epoch": 6.949862427392235, "grad_norm": 0.18601678311824799, "learning_rate": 2.36851938346567e-05, "loss": 0.0586, "step": 22733 }, { "epoch": 6.95016814429838, "grad_norm": 0.2382211685180664, "learning_rate": 2.3684769224236763e-05, "loss": 0.0591, "step": 22734 }, { "epoch": 6.950473861204524, "grad_norm": 0.287670761346817, "learning_rate": 2.368434461381682e-05, "loss": 0.0522, "step": 22735 }, { "epoch": 6.950779578110669, "grad_norm": 0.4036199748516083, "learning_rate": 2.3683920003396883e-05, "loss": 0.1079, "step": 22736 }, { "epoch": 6.9510852950168145, "grad_norm": 1.3602614402770996, "learning_rate": 2.3683495392976942e-05, "loss": 0.0812, "step": 22737 }, { "epoch": 6.95139101192296, "grad_norm": 0.49519675970077515, "learning_rate": 2.3683070782557004e-05, "loss": 0.0816, "step": 22738 }, { "epoch": 6.951696728829104, "grad_norm": 0.3589365482330322, "learning_rate": 2.3682646172137066e-05, "loss": 0.0861, "step": 22739 }, { "epoch": 6.952002445735249, "grad_norm": 0.3597607910633087, "learning_rate": 2.368222156171713e-05, "loss": 0.1033, "step": 22740 }, { "epoch": 6.952308162641394, "grad_norm": 0.22271136939525604, "learning_rate": 2.3681796951297187e-05, "loss": 0.0974, "step": 22741 }, { "epoch": 6.952613879547539, "grad_norm": 0.8225452899932861, "learning_rate": 2.3681372340877246e-05, "loss": 0.1436, "step": 22742 }, { "epoch": 6.952919596453684, "grad_norm": 0.6468713283538818, "learning_rate": 2.3680947730457308e-05, "loss": 0.113, "step": 22743 }, { "epoch": 6.9532253133598285, "grad_norm": 0.5128264427185059, "learning_rate": 2.3680523120037367e-05, "loss": 0.1882, "step": 22744 }, { "epoch": 6.953531030265974, "grad_norm": 0.8362371921539307, "learning_rate": 2.368009850961743e-05, "loss": 0.1679, "step": 22745 }, { "epoch": 6.953836747172119, "grad_norm": 0.5748916268348694, "learning_rate": 2.3679673899197487e-05, "loss": 0.144, "step": 22746 }, { "epoch": 6.954142464078264, "grad_norm": 1.712778925895691, "learning_rate": 2.367924928877755e-05, "loss": 0.1558, "step": 22747 }, { "epoch": 6.954448180984408, "grad_norm": 1.4510594606399536, "learning_rate": 2.3678824678357608e-05, "loss": 0.1644, "step": 22748 }, { "epoch": 6.954753897890553, "grad_norm": 0.8364124894142151, "learning_rate": 2.367840006793767e-05, "loss": 0.1549, "step": 22749 }, { "epoch": 6.955059614796698, "grad_norm": 1.1020835638046265, "learning_rate": 2.367797545751773e-05, "loss": 0.1805, "step": 22750 }, { "epoch": 6.955365331702843, "grad_norm": 1.0688968896865845, "learning_rate": 2.367755084709779e-05, "loss": 0.2076, "step": 22751 }, { "epoch": 6.955671048608988, "grad_norm": 0.7089506983757019, "learning_rate": 2.367712623667785e-05, "loss": 0.1178, "step": 22752 }, { "epoch": 6.955976765515133, "grad_norm": 0.17270934581756592, "learning_rate": 2.3676701626257912e-05, "loss": 0.0728, "step": 22753 }, { "epoch": 6.956282482421278, "grad_norm": 0.23667119443416595, "learning_rate": 2.367627701583797e-05, "loss": 0.0729, "step": 22754 }, { "epoch": 6.956588199327423, "grad_norm": 0.2547309100627899, "learning_rate": 2.367585240541803e-05, "loss": 0.0586, "step": 22755 }, { "epoch": 6.956893916233568, "grad_norm": 0.17152997851371765, "learning_rate": 2.367542779499809e-05, "loss": 0.0746, "step": 22756 }, { "epoch": 6.957199633139712, "grad_norm": 0.1938847303390503, "learning_rate": 2.367500318457815e-05, "loss": 0.0472, "step": 22757 }, { "epoch": 6.9575053500458575, "grad_norm": 0.1568722128868103, "learning_rate": 2.3674578574158212e-05, "loss": 0.0341, "step": 22758 }, { "epoch": 6.957811066952003, "grad_norm": 0.494558185338974, "learning_rate": 2.367415396373827e-05, "loss": 0.0544, "step": 22759 }, { "epoch": 6.958116783858148, "grad_norm": 0.4054427444934845, "learning_rate": 2.3673729353318333e-05, "loss": 0.0721, "step": 22760 }, { "epoch": 6.958422500764292, "grad_norm": 0.20933018624782562, "learning_rate": 2.367330474289839e-05, "loss": 0.0642, "step": 22761 }, { "epoch": 6.958728217670437, "grad_norm": 0.3697074353694916, "learning_rate": 2.3672880132478454e-05, "loss": 0.0879, "step": 22762 }, { "epoch": 6.959033934576582, "grad_norm": 1.0342168807983398, "learning_rate": 2.3672455522058512e-05, "loss": 0.0703, "step": 22763 }, { "epoch": 6.959339651482727, "grad_norm": 0.30041614174842834, "learning_rate": 2.3672030911638574e-05, "loss": 0.0714, "step": 22764 }, { "epoch": 6.9596453683888715, "grad_norm": 0.2954327166080475, "learning_rate": 2.3671606301218633e-05, "loss": 0.0918, "step": 22765 }, { "epoch": 6.959951085295017, "grad_norm": 0.48731184005737305, "learning_rate": 2.3671181690798695e-05, "loss": 0.0973, "step": 22766 }, { "epoch": 6.960256802201162, "grad_norm": 0.6367764472961426, "learning_rate": 2.3670757080378754e-05, "loss": 0.1298, "step": 22767 }, { "epoch": 6.960562519107307, "grad_norm": 0.7440504431724548, "learning_rate": 2.3670332469958813e-05, "loss": 0.124, "step": 22768 }, { "epoch": 6.960868236013452, "grad_norm": 0.49006208777427673, "learning_rate": 2.3669907859538875e-05, "loss": 0.1403, "step": 22769 }, { "epoch": 6.961173952919596, "grad_norm": 0.6516388654708862, "learning_rate": 2.3669483249118933e-05, "loss": 0.1482, "step": 22770 }, { "epoch": 6.961479669825741, "grad_norm": 0.3771679103374481, "learning_rate": 2.3669058638698996e-05, "loss": 0.1475, "step": 22771 }, { "epoch": 6.961785386731886, "grad_norm": 0.7052801251411438, "learning_rate": 2.3668634028279054e-05, "loss": 0.1931, "step": 22772 }, { "epoch": 6.9620911036380315, "grad_norm": 1.289109468460083, "learning_rate": 2.3668209417859116e-05, "loss": 0.177, "step": 22773 }, { "epoch": 6.962396820544176, "grad_norm": 1.2940908670425415, "learning_rate": 2.3667784807439175e-05, "loss": 0.142, "step": 22774 }, { "epoch": 6.962702537450321, "grad_norm": 1.9974557161331177, "learning_rate": 2.3667360197019237e-05, "loss": 0.1818, "step": 22775 }, { "epoch": 6.963008254356466, "grad_norm": 1.04794442653656, "learning_rate": 2.3666935586599296e-05, "loss": 0.2321, "step": 22776 }, { "epoch": 6.963313971262611, "grad_norm": 0.3515241742134094, "learning_rate": 2.3666510976179358e-05, "loss": 0.1549, "step": 22777 }, { "epoch": 6.963619688168755, "grad_norm": 0.20329797267913818, "learning_rate": 2.3666086365759417e-05, "loss": 0.0664, "step": 22778 }, { "epoch": 6.9639254050749, "grad_norm": 0.15116246044635773, "learning_rate": 2.366566175533948e-05, "loss": 0.0524, "step": 22779 }, { "epoch": 6.9642311219810455, "grad_norm": 0.33754608035087585, "learning_rate": 2.3665237144919537e-05, "loss": 0.0468, "step": 22780 }, { "epoch": 6.964536838887191, "grad_norm": 0.3199833333492279, "learning_rate": 2.3664812534499596e-05, "loss": 0.0574, "step": 22781 }, { "epoch": 6.964842555793336, "grad_norm": 0.519221842288971, "learning_rate": 2.3664387924079658e-05, "loss": 0.0561, "step": 22782 }, { "epoch": 6.96514827269948, "grad_norm": 0.28369611501693726, "learning_rate": 2.3663963313659717e-05, "loss": 0.035, "step": 22783 }, { "epoch": 6.965453989605625, "grad_norm": 0.21543613076210022, "learning_rate": 2.366353870323978e-05, "loss": 0.0592, "step": 22784 }, { "epoch": 6.96575970651177, "grad_norm": 0.4792042672634125, "learning_rate": 2.3663114092819838e-05, "loss": 0.0829, "step": 22785 }, { "epoch": 6.966065423417915, "grad_norm": 1.487350344657898, "learning_rate": 2.36626894823999e-05, "loss": 0.0779, "step": 22786 }, { "epoch": 6.96637114032406, "grad_norm": 0.42980635166168213, "learning_rate": 2.366226487197996e-05, "loss": 0.0829, "step": 22787 }, { "epoch": 6.966676857230205, "grad_norm": 0.2977074682712555, "learning_rate": 2.366184026156002e-05, "loss": 0.0856, "step": 22788 }, { "epoch": 6.96698257413635, "grad_norm": 0.42701253294944763, "learning_rate": 2.366141565114008e-05, "loss": 0.1044, "step": 22789 }, { "epoch": 6.967288291042495, "grad_norm": 0.5040174126625061, "learning_rate": 2.366099104072014e-05, "loss": 0.1528, "step": 22790 }, { "epoch": 6.967594007948639, "grad_norm": 0.4882960915565491, "learning_rate": 2.36605664303002e-05, "loss": 0.1234, "step": 22791 }, { "epoch": 6.967899724854784, "grad_norm": 0.376095712184906, "learning_rate": 2.366014181988026e-05, "loss": 0.119, "step": 22792 }, { "epoch": 6.968205441760929, "grad_norm": 0.6597958207130432, "learning_rate": 2.365971720946032e-05, "loss": 0.144, "step": 22793 }, { "epoch": 6.9685111586670745, "grad_norm": 0.4720153510570526, "learning_rate": 2.365929259904038e-05, "loss": 0.1672, "step": 22794 }, { "epoch": 6.96881687557322, "grad_norm": 0.7126634120941162, "learning_rate": 2.365886798862044e-05, "loss": 0.1607, "step": 22795 }, { "epoch": 6.969122592479364, "grad_norm": 1.1425154209136963, "learning_rate": 2.36584433782005e-05, "loss": 0.1637, "step": 22796 }, { "epoch": 6.969428309385509, "grad_norm": 0.5606197118759155, "learning_rate": 2.3658018767780562e-05, "loss": 0.1712, "step": 22797 }, { "epoch": 6.969734026291654, "grad_norm": 3.8203303813934326, "learning_rate": 2.365759415736062e-05, "loss": 0.2035, "step": 22798 }, { "epoch": 6.970039743197799, "grad_norm": 1.3765380382537842, "learning_rate": 2.3657169546940683e-05, "loss": 0.148, "step": 22799 }, { "epoch": 6.970345460103943, "grad_norm": 0.5928225517272949, "learning_rate": 2.3656744936520742e-05, "loss": 0.1808, "step": 22800 }, { "epoch": 6.9706511770100885, "grad_norm": 1.1134337186813354, "learning_rate": 2.3656320326100804e-05, "loss": 0.2036, "step": 22801 }, { "epoch": 6.970956893916234, "grad_norm": 0.9155967235565186, "learning_rate": 2.3655895715680863e-05, "loss": 0.126, "step": 22802 }, { "epoch": 6.971262610822379, "grad_norm": 0.9138651490211487, "learning_rate": 2.3655471105260925e-05, "loss": 0.0937, "step": 22803 }, { "epoch": 6.971568327728523, "grad_norm": 0.29244935512542725, "learning_rate": 2.3655046494840983e-05, "loss": 0.072, "step": 22804 }, { "epoch": 6.971874044634668, "grad_norm": 0.5632997751235962, "learning_rate": 2.3654621884421042e-05, "loss": 0.0971, "step": 22805 }, { "epoch": 6.972179761540813, "grad_norm": 0.6164126992225647, "learning_rate": 2.3654197274001104e-05, "loss": 0.0587, "step": 22806 }, { "epoch": 6.972485478446958, "grad_norm": 0.8074196577072144, "learning_rate": 2.3653772663581163e-05, "loss": 0.0727, "step": 22807 }, { "epoch": 6.972791195353103, "grad_norm": 0.2926614582538605, "learning_rate": 2.3653348053161225e-05, "loss": 0.0454, "step": 22808 }, { "epoch": 6.973096912259248, "grad_norm": 0.46631908416748047, "learning_rate": 2.3652923442741284e-05, "loss": 0.0654, "step": 22809 }, { "epoch": 6.973402629165393, "grad_norm": 0.3448212146759033, "learning_rate": 2.3652498832321346e-05, "loss": 0.0526, "step": 22810 }, { "epoch": 6.973708346071538, "grad_norm": 0.3633638620376587, "learning_rate": 2.3652074221901404e-05, "loss": 0.0589, "step": 22811 }, { "epoch": 6.974014062977683, "grad_norm": 0.9289235472679138, "learning_rate": 2.3651649611481467e-05, "loss": 0.0631, "step": 22812 }, { "epoch": 6.974319779883827, "grad_norm": 0.4140314757823944, "learning_rate": 2.3651225001061525e-05, "loss": 0.0974, "step": 22813 }, { "epoch": 6.974625496789972, "grad_norm": 0.31114906072616577, "learning_rate": 2.3650800390641587e-05, "loss": 0.1067, "step": 22814 }, { "epoch": 6.9749312136961175, "grad_norm": 0.6833174824714661, "learning_rate": 2.3650375780221646e-05, "loss": 0.0789, "step": 22815 }, { "epoch": 6.975236930602263, "grad_norm": 0.37194493412971497, "learning_rate": 2.3649951169801708e-05, "loss": 0.0855, "step": 22816 }, { "epoch": 6.975542647508407, "grad_norm": 0.44852200150489807, "learning_rate": 2.3649526559381767e-05, "loss": 0.1657, "step": 22817 }, { "epoch": 6.975848364414552, "grad_norm": 0.6106465458869934, "learning_rate": 2.3649101948961826e-05, "loss": 0.1386, "step": 22818 }, { "epoch": 6.976154081320697, "grad_norm": 0.6883792877197266, "learning_rate": 2.3648677338541888e-05, "loss": 0.1223, "step": 22819 }, { "epoch": 6.976459798226842, "grad_norm": 0.6935672163963318, "learning_rate": 2.3648252728121946e-05, "loss": 0.1372, "step": 22820 }, { "epoch": 6.976765515132987, "grad_norm": 0.5622730851173401, "learning_rate": 2.364782811770201e-05, "loss": 0.1609, "step": 22821 }, { "epoch": 6.9770712320391315, "grad_norm": 0.5770902633666992, "learning_rate": 2.3647403507282067e-05, "loss": 0.1552, "step": 22822 }, { "epoch": 6.977376948945277, "grad_norm": 0.6939988136291504, "learning_rate": 2.364697889686213e-05, "loss": 0.1913, "step": 22823 }, { "epoch": 6.977682665851422, "grad_norm": 1.4859950542449951, "learning_rate": 2.3646554286442188e-05, "loss": 0.2043, "step": 22824 }, { "epoch": 6.977988382757567, "grad_norm": 0.5373274087905884, "learning_rate": 2.364612967602225e-05, "loss": 0.1764, "step": 22825 }, { "epoch": 6.978294099663711, "grad_norm": 1.6163774728775024, "learning_rate": 2.364570506560231e-05, "loss": 0.2206, "step": 22826 }, { "epoch": 6.978599816569856, "grad_norm": 0.4380199909210205, "learning_rate": 2.364528045518237e-05, "loss": 0.1733, "step": 22827 }, { "epoch": 6.978905533476001, "grad_norm": 0.2924002707004547, "learning_rate": 2.364485584476243e-05, "loss": 0.0696, "step": 22828 }, { "epoch": 6.979211250382146, "grad_norm": 0.25473177433013916, "learning_rate": 2.364443123434249e-05, "loss": 0.0912, "step": 22829 }, { "epoch": 6.979516967288291, "grad_norm": 0.2705880403518677, "learning_rate": 2.364400662392255e-05, "loss": 0.0597, "step": 22830 }, { "epoch": 6.979822684194436, "grad_norm": 0.26872438192367554, "learning_rate": 2.364358201350261e-05, "loss": 0.0431, "step": 22831 }, { "epoch": 6.980128401100581, "grad_norm": 0.34162867069244385, "learning_rate": 2.364315740308267e-05, "loss": 0.051, "step": 22832 }, { "epoch": 6.980434118006726, "grad_norm": 0.1312520056962967, "learning_rate": 2.364273279266273e-05, "loss": 0.0352, "step": 22833 }, { "epoch": 6.980739834912871, "grad_norm": 0.23705172538757324, "learning_rate": 2.3642308182242792e-05, "loss": 0.0562, "step": 22834 }, { "epoch": 6.981045551819015, "grad_norm": 0.6826582551002502, "learning_rate": 2.364188357182285e-05, "loss": 0.094, "step": 22835 }, { "epoch": 6.98135126872516, "grad_norm": 0.3590957522392273, "learning_rate": 2.3641458961402913e-05, "loss": 0.0688, "step": 22836 }, { "epoch": 6.9816569856313055, "grad_norm": 0.34586888551712036, "learning_rate": 2.364103435098297e-05, "loss": 0.0908, "step": 22837 }, { "epoch": 6.981962702537451, "grad_norm": 0.2434527426958084, "learning_rate": 2.3640609740563033e-05, "loss": 0.0766, "step": 22838 }, { "epoch": 6.982268419443595, "grad_norm": 0.2772751748561859, "learning_rate": 2.3640185130143092e-05, "loss": 0.0796, "step": 22839 }, { "epoch": 6.98257413634974, "grad_norm": 0.28967779874801636, "learning_rate": 2.3639760519723154e-05, "loss": 0.1018, "step": 22840 }, { "epoch": 6.982879853255885, "grad_norm": 0.40911078453063965, "learning_rate": 2.3639335909303216e-05, "loss": 0.1103, "step": 22841 }, { "epoch": 6.98318557016203, "grad_norm": 0.3880925178527832, "learning_rate": 2.363891129888328e-05, "loss": 0.1122, "step": 22842 }, { "epoch": 6.9834912870681745, "grad_norm": 0.4874424338340759, "learning_rate": 2.3638486688463337e-05, "loss": 0.1578, "step": 22843 }, { "epoch": 6.98379700397432, "grad_norm": 0.6619102358818054, "learning_rate": 2.3638062078043396e-05, "loss": 0.1728, "step": 22844 }, { "epoch": 6.984102720880465, "grad_norm": 0.9005120396614075, "learning_rate": 2.3637637467623458e-05, "loss": 0.1515, "step": 22845 }, { "epoch": 6.98440843778661, "grad_norm": 0.4511738121509552, "learning_rate": 2.3637212857203517e-05, "loss": 0.1509, "step": 22846 }, { "epoch": 6.984714154692755, "grad_norm": 2.949410915374756, "learning_rate": 2.363678824678358e-05, "loss": 0.162, "step": 22847 }, { "epoch": 6.985019871598899, "grad_norm": 1.0827052593231201, "learning_rate": 2.3636363636363637e-05, "loss": 0.1594, "step": 22848 }, { "epoch": 6.985325588505044, "grad_norm": 0.9268751740455627, "learning_rate": 2.36359390259437e-05, "loss": 0.1908, "step": 22849 }, { "epoch": 6.985631305411189, "grad_norm": 0.8042384386062622, "learning_rate": 2.3635514415523758e-05, "loss": 0.1976, "step": 22850 }, { "epoch": 6.9859370223173345, "grad_norm": 0.8655027151107788, "learning_rate": 2.363508980510382e-05, "loss": 0.208, "step": 22851 }, { "epoch": 6.986242739223479, "grad_norm": 0.3895508348941803, "learning_rate": 2.363466519468388e-05, "loss": 0.1387, "step": 22852 }, { "epoch": 6.986548456129624, "grad_norm": 0.1741533875465393, "learning_rate": 2.363424058426394e-05, "loss": 0.058, "step": 22853 }, { "epoch": 6.986854173035769, "grad_norm": 0.19442026317119598, "learning_rate": 2.3633815973844e-05, "loss": 0.064, "step": 22854 }, { "epoch": 6.987159889941914, "grad_norm": 0.24382686614990234, "learning_rate": 2.3633391363424062e-05, "loss": 0.0609, "step": 22855 }, { "epoch": 6.987465606848058, "grad_norm": 0.14869308471679688, "learning_rate": 2.363296675300412e-05, "loss": 0.0458, "step": 22856 }, { "epoch": 6.987771323754203, "grad_norm": 0.41581565141677856, "learning_rate": 2.363254214258418e-05, "loss": 0.0547, "step": 22857 }, { "epoch": 6.9880770406603485, "grad_norm": 0.15573346614837646, "learning_rate": 2.363211753216424e-05, "loss": 0.0474, "step": 22858 }, { "epoch": 6.988382757566494, "grad_norm": 0.76036536693573, "learning_rate": 2.36316929217443e-05, "loss": 0.061, "step": 22859 }, { "epoch": 6.988688474472639, "grad_norm": 0.7291110754013062, "learning_rate": 2.3631268311324362e-05, "loss": 0.0869, "step": 22860 }, { "epoch": 6.988994191378783, "grad_norm": 0.2318873256444931, "learning_rate": 2.363084370090442e-05, "loss": 0.0653, "step": 22861 }, { "epoch": 6.989299908284928, "grad_norm": 0.6932134628295898, "learning_rate": 2.3630419090484483e-05, "loss": 0.1083, "step": 22862 }, { "epoch": 6.989605625191073, "grad_norm": 0.27563413977622986, "learning_rate": 2.362999448006454e-05, "loss": 0.061, "step": 22863 }, { "epoch": 6.989911342097218, "grad_norm": 0.2601296305656433, "learning_rate": 2.3629569869644604e-05, "loss": 0.0904, "step": 22864 }, { "epoch": 6.9902170590033625, "grad_norm": 0.34056392312049866, "learning_rate": 2.3629145259224662e-05, "loss": 0.1076, "step": 22865 }, { "epoch": 6.990522775909508, "grad_norm": 1.0377604961395264, "learning_rate": 2.3628720648804724e-05, "loss": 0.1143, "step": 22866 }, { "epoch": 6.990828492815653, "grad_norm": 0.3627599775791168, "learning_rate": 2.3628296038384783e-05, "loss": 0.1232, "step": 22867 }, { "epoch": 6.991134209721798, "grad_norm": 0.36388108134269714, "learning_rate": 2.3627871427964845e-05, "loss": 0.1428, "step": 22868 }, { "epoch": 6.991439926627942, "grad_norm": 1.276749849319458, "learning_rate": 2.3627446817544904e-05, "loss": 0.1688, "step": 22869 }, { "epoch": 6.991745643534087, "grad_norm": 0.6278138160705566, "learning_rate": 2.3627022207124963e-05, "loss": 0.1987, "step": 22870 }, { "epoch": 6.992051360440232, "grad_norm": 1.145729660987854, "learning_rate": 2.3626597596705025e-05, "loss": 0.1495, "step": 22871 }, { "epoch": 6.9923570773463775, "grad_norm": 0.49384286999702454, "learning_rate": 2.3626172986285083e-05, "loss": 0.1627, "step": 22872 }, { "epoch": 6.992662794252523, "grad_norm": 2.167536973953247, "learning_rate": 2.3625748375865146e-05, "loss": 0.1803, "step": 22873 }, { "epoch": 6.992968511158667, "grad_norm": 0.7159838676452637, "learning_rate": 2.3625323765445204e-05, "loss": 0.1507, "step": 22874 }, { "epoch": 6.993274228064812, "grad_norm": 0.6158289313316345, "learning_rate": 2.3624899155025266e-05, "loss": 0.1481, "step": 22875 }, { "epoch": 6.993579944970957, "grad_norm": 0.7105059027671814, "learning_rate": 2.3624474544605325e-05, "loss": 0.1727, "step": 22876 }, { "epoch": 6.993885661877102, "grad_norm": 0.5068464279174805, "learning_rate": 2.3624049934185387e-05, "loss": 0.1268, "step": 22877 }, { "epoch": 6.994191378783246, "grad_norm": 0.28329646587371826, "learning_rate": 2.3623625323765446e-05, "loss": 0.0818, "step": 22878 }, { "epoch": 6.9944970956893915, "grad_norm": 0.36776968836784363, "learning_rate": 2.3623200713345508e-05, "loss": 0.1014, "step": 22879 }, { "epoch": 6.994802812595537, "grad_norm": 0.20123514533042908, "learning_rate": 2.3622776102925567e-05, "loss": 0.0746, "step": 22880 }, { "epoch": 6.995108529501682, "grad_norm": 0.1444971263408661, "learning_rate": 2.362235149250563e-05, "loss": 0.0409, "step": 22881 }, { "epoch": 6.995414246407826, "grad_norm": 0.13313539326190948, "learning_rate": 2.3621926882085687e-05, "loss": 0.0438, "step": 22882 }, { "epoch": 6.995719963313971, "grad_norm": 0.3009246587753296, "learning_rate": 2.3621502271665746e-05, "loss": 0.0627, "step": 22883 }, { "epoch": 6.996025680220116, "grad_norm": 0.5636846423149109, "learning_rate": 2.3621077661245808e-05, "loss": 0.0639, "step": 22884 }, { "epoch": 6.996331397126261, "grad_norm": 0.2633838355541229, "learning_rate": 2.3620653050825867e-05, "loss": 0.0818, "step": 22885 }, { "epoch": 6.996637114032406, "grad_norm": 0.3296327590942383, "learning_rate": 2.362022844040593e-05, "loss": 0.0911, "step": 22886 }, { "epoch": 6.996942830938551, "grad_norm": 0.9385263323783875, "learning_rate": 2.3619803829985988e-05, "loss": 0.089, "step": 22887 }, { "epoch": 6.997248547844696, "grad_norm": 0.20990873873233795, "learning_rate": 2.361937921956605e-05, "loss": 0.0897, "step": 22888 }, { "epoch": 6.997554264750841, "grad_norm": 0.5828734636306763, "learning_rate": 2.361895460914611e-05, "loss": 0.0951, "step": 22889 }, { "epoch": 6.997859981656986, "grad_norm": 0.6873428225517273, "learning_rate": 2.361852999872617e-05, "loss": 0.158, "step": 22890 }, { "epoch": 6.99816569856313, "grad_norm": 0.6882810592651367, "learning_rate": 2.361810538830623e-05, "loss": 0.1359, "step": 22891 }, { "epoch": 6.998471415469275, "grad_norm": 0.5994560718536377, "learning_rate": 2.361768077788629e-05, "loss": 0.132, "step": 22892 }, { "epoch": 6.99877713237542, "grad_norm": 0.6067599058151245, "learning_rate": 2.361725616746635e-05, "loss": 0.1772, "step": 22893 }, { "epoch": 6.9990828492815655, "grad_norm": 0.8229199051856995, "learning_rate": 2.3616831557046412e-05, "loss": 0.1454, "step": 22894 }, { "epoch": 6.99938856618771, "grad_norm": 2.7359886169433594, "learning_rate": 2.361640694662647e-05, "loss": 0.1682, "step": 22895 }, { "epoch": 6.999694283093855, "grad_norm": 0.7411574125289917, "learning_rate": 2.361598233620653e-05, "loss": 0.1694, "step": 22896 }, { "epoch": 7.0, "grad_norm": 1.2227338552474976, "learning_rate": 2.361555772578659e-05, "loss": 0.1897, "step": 22897 }, { "epoch": 7.000305716906145, "grad_norm": 0.3951197564601898, "learning_rate": 2.361513311536665e-05, "loss": 0.1266, "step": 22898 }, { "epoch": 7.00061143381229, "grad_norm": 0.22313864529132843, "learning_rate": 2.3614708504946712e-05, "loss": 0.0843, "step": 22899 }, { "epoch": 7.0009171507184345, "grad_norm": 0.1862332820892334, "learning_rate": 2.361428389452677e-05, "loss": 0.0618, "step": 22900 }, { "epoch": 7.00122286762458, "grad_norm": 0.16383109986782074, "learning_rate": 2.3613859284106833e-05, "loss": 0.0417, "step": 22901 }, { "epoch": 7.001528584530725, "grad_norm": 0.321507066488266, "learning_rate": 2.3613434673686892e-05, "loss": 0.0566, "step": 22902 }, { "epoch": 7.00183430143687, "grad_norm": 0.2524586021900177, "learning_rate": 2.3613010063266954e-05, "loss": 0.0471, "step": 22903 }, { "epoch": 7.002140018343014, "grad_norm": 0.31328168511390686, "learning_rate": 2.3612585452847013e-05, "loss": 0.0469, "step": 22904 }, { "epoch": 7.002445735249159, "grad_norm": 0.2403487116098404, "learning_rate": 2.3612160842427075e-05, "loss": 0.0503, "step": 22905 }, { "epoch": 7.002751452155304, "grad_norm": 0.5594202876091003, "learning_rate": 2.3611736232007133e-05, "loss": 0.0863, "step": 22906 }, { "epoch": 7.003057169061449, "grad_norm": 0.16357745230197906, "learning_rate": 2.3611311621587192e-05, "loss": 0.039, "step": 22907 }, { "epoch": 7.003362885967594, "grad_norm": 0.229640394449234, "learning_rate": 2.3610887011167254e-05, "loss": 0.097, "step": 22908 }, { "epoch": 7.003668602873739, "grad_norm": 0.30827173590660095, "learning_rate": 2.3610462400747313e-05, "loss": 0.0663, "step": 22909 }, { "epoch": 7.003974319779884, "grad_norm": 0.3678353428840637, "learning_rate": 2.3610037790327375e-05, "loss": 0.0748, "step": 22910 }, { "epoch": 7.004280036686029, "grad_norm": 0.2916758358478546, "learning_rate": 2.3609613179907434e-05, "loss": 0.0801, "step": 22911 }, { "epoch": 7.004585753592174, "grad_norm": 0.46016713976860046, "learning_rate": 2.3609188569487496e-05, "loss": 0.1202, "step": 22912 }, { "epoch": 7.004891470498318, "grad_norm": 0.4556479752063751, "learning_rate": 2.3608763959067555e-05, "loss": 0.1212, "step": 22913 }, { "epoch": 7.005197187404463, "grad_norm": 1.5422600507736206, "learning_rate": 2.3608339348647617e-05, "loss": 0.1607, "step": 22914 }, { "epoch": 7.0055029043106085, "grad_norm": 1.5988305807113647, "learning_rate": 2.3607914738227675e-05, "loss": 0.1233, "step": 22915 }, { "epoch": 7.005808621216754, "grad_norm": 0.6867781281471252, "learning_rate": 2.3607490127807737e-05, "loss": 0.1545, "step": 22916 }, { "epoch": 7.006114338122898, "grad_norm": 0.9174875617027283, "learning_rate": 2.3607065517387796e-05, "loss": 0.1636, "step": 22917 }, { "epoch": 7.006420055029043, "grad_norm": 0.5921070575714111, "learning_rate": 2.3606640906967858e-05, "loss": 0.1868, "step": 22918 }, { "epoch": 7.006725771935188, "grad_norm": 1.0348973274230957, "learning_rate": 2.3606216296547917e-05, "loss": 0.1827, "step": 22919 }, { "epoch": 7.007031488841333, "grad_norm": 0.6219981908798218, "learning_rate": 2.3605791686127976e-05, "loss": 0.1543, "step": 22920 }, { "epoch": 7.007337205747477, "grad_norm": 0.9681984186172485, "learning_rate": 2.3605367075708038e-05, "loss": 0.1743, "step": 22921 }, { "epoch": 7.0076429226536225, "grad_norm": 1.7100658416748047, "learning_rate": 2.3604942465288096e-05, "loss": 0.1964, "step": 22922 }, { "epoch": 7.007948639559768, "grad_norm": 0.4819667637348175, "learning_rate": 2.360451785486816e-05, "loss": 0.1373, "step": 22923 }, { "epoch": 7.008254356465913, "grad_norm": 0.2131066918373108, "learning_rate": 2.3604093244448217e-05, "loss": 0.0635, "step": 22924 }, { "epoch": 7.008560073372058, "grad_norm": 0.382104754447937, "learning_rate": 2.360366863402828e-05, "loss": 0.0675, "step": 22925 }, { "epoch": 7.008865790278202, "grad_norm": 0.2456456571817398, "learning_rate": 2.3603244023608338e-05, "loss": 0.0564, "step": 22926 }, { "epoch": 7.009171507184347, "grad_norm": 0.25014692544937134, "learning_rate": 2.36028194131884e-05, "loss": 0.0571, "step": 22927 }, { "epoch": 7.009477224090492, "grad_norm": 0.343538761138916, "learning_rate": 2.360239480276846e-05, "loss": 0.0595, "step": 22928 }, { "epoch": 7.0097829409966375, "grad_norm": 0.15054461359977722, "learning_rate": 2.360197019234852e-05, "loss": 0.0517, "step": 22929 }, { "epoch": 7.010088657902782, "grad_norm": 0.2599731385707855, "learning_rate": 2.360154558192858e-05, "loss": 0.0432, "step": 22930 }, { "epoch": 7.010394374808927, "grad_norm": 0.27766990661621094, "learning_rate": 2.360112097150864e-05, "loss": 0.0939, "step": 22931 }, { "epoch": 7.010700091715072, "grad_norm": 0.2832690179347992, "learning_rate": 2.36006963610887e-05, "loss": 0.1073, "step": 22932 }, { "epoch": 7.011005808621217, "grad_norm": 0.2963535785675049, "learning_rate": 2.360027175066876e-05, "loss": 0.0548, "step": 22933 }, { "epoch": 7.011311525527361, "grad_norm": 0.42599737644195557, "learning_rate": 2.359984714024882e-05, "loss": 0.0873, "step": 22934 }, { "epoch": 7.011617242433506, "grad_norm": 0.8870165944099426, "learning_rate": 2.359942252982888e-05, "loss": 0.1062, "step": 22935 }, { "epoch": 7.0119229593396515, "grad_norm": 0.4376316964626312, "learning_rate": 2.3598997919408942e-05, "loss": 0.1201, "step": 22936 }, { "epoch": 7.012228676245797, "grad_norm": 0.404685378074646, "learning_rate": 2.3598573308989e-05, "loss": 0.0787, "step": 22937 }, { "epoch": 7.012534393151942, "grad_norm": 0.34117311239242554, "learning_rate": 2.3598148698569063e-05, "loss": 0.123, "step": 22938 }, { "epoch": 7.012840110058086, "grad_norm": 0.467063844203949, "learning_rate": 2.359772408814912e-05, "loss": 0.1116, "step": 22939 }, { "epoch": 7.013145826964231, "grad_norm": 0.5630607604980469, "learning_rate": 2.3597299477729183e-05, "loss": 0.1381, "step": 22940 }, { "epoch": 7.013451543870376, "grad_norm": 1.084837555885315, "learning_rate": 2.3596874867309242e-05, "loss": 0.1501, "step": 22941 }, { "epoch": 7.013757260776521, "grad_norm": 0.8243706226348877, "learning_rate": 2.3596450256889304e-05, "loss": 0.174, "step": 22942 }, { "epoch": 7.0140629776826655, "grad_norm": 0.9123761653900146, "learning_rate": 2.3596025646469366e-05, "loss": 0.1796, "step": 22943 }, { "epoch": 7.014368694588811, "grad_norm": 0.7617657780647278, "learning_rate": 2.359560103604943e-05, "loss": 0.1631, "step": 22944 }, { "epoch": 7.014674411494956, "grad_norm": 0.545815646648407, "learning_rate": 2.3595176425629487e-05, "loss": 0.1813, "step": 22945 }, { "epoch": 7.014980128401101, "grad_norm": 0.869849443435669, "learning_rate": 2.3594751815209546e-05, "loss": 0.1727, "step": 22946 }, { "epoch": 7.015285845307245, "grad_norm": 6.14178466796875, "learning_rate": 2.3594327204789608e-05, "loss": 0.2399, "step": 22947 }, { "epoch": 7.01559156221339, "grad_norm": 0.2450004667043686, "learning_rate": 2.3593902594369667e-05, "loss": 0.1119, "step": 22948 }, { "epoch": 7.015897279119535, "grad_norm": 0.2192986011505127, "learning_rate": 2.359347798394973e-05, "loss": 0.059, "step": 22949 }, { "epoch": 7.01620299602568, "grad_norm": 0.4039638936519623, "learning_rate": 2.3593053373529787e-05, "loss": 0.0923, "step": 22950 }, { "epoch": 7.0165087129318255, "grad_norm": 0.18914003670215607, "learning_rate": 2.359262876310985e-05, "loss": 0.0514, "step": 22951 }, { "epoch": 7.01681442983797, "grad_norm": 0.6245805025100708, "learning_rate": 2.3592204152689908e-05, "loss": 0.056, "step": 22952 }, { "epoch": 7.017120146744115, "grad_norm": 0.22064103186130524, "learning_rate": 2.359177954226997e-05, "loss": 0.0429, "step": 22953 }, { "epoch": 7.01742586365026, "grad_norm": 0.5471866130828857, "learning_rate": 2.359135493185003e-05, "loss": 0.0681, "step": 22954 }, { "epoch": 7.017731580556405, "grad_norm": 0.3806839883327484, "learning_rate": 2.359093032143009e-05, "loss": 0.0506, "step": 22955 }, { "epoch": 7.018037297462549, "grad_norm": 0.28646859526634216, "learning_rate": 2.359050571101015e-05, "loss": 0.0661, "step": 22956 }, { "epoch": 7.0183430143686945, "grad_norm": 0.4405265748500824, "learning_rate": 2.3590081100590212e-05, "loss": 0.0581, "step": 22957 }, { "epoch": 7.01864873127484, "grad_norm": 0.5853974223136902, "learning_rate": 2.358965649017027e-05, "loss": 0.066, "step": 22958 }, { "epoch": 7.018954448180985, "grad_norm": 0.2838756740093231, "learning_rate": 2.358923187975033e-05, "loss": 0.0722, "step": 22959 }, { "epoch": 7.019260165087129, "grad_norm": 0.4849720895290375, "learning_rate": 2.358880726933039e-05, "loss": 0.1106, "step": 22960 }, { "epoch": 7.019565881993274, "grad_norm": 0.33323919773101807, "learning_rate": 2.358838265891045e-05, "loss": 0.098, "step": 22961 }, { "epoch": 7.019871598899419, "grad_norm": 0.576349139213562, "learning_rate": 2.3587958048490512e-05, "loss": 0.133, "step": 22962 }, { "epoch": 7.020177315805564, "grad_norm": 0.6910362243652344, "learning_rate": 2.358753343807057e-05, "loss": 0.1302, "step": 22963 }, { "epoch": 7.020483032711709, "grad_norm": 0.6705617308616638, "learning_rate": 2.3587108827650633e-05, "loss": 0.1372, "step": 22964 }, { "epoch": 7.020788749617854, "grad_norm": 1.1364390850067139, "learning_rate": 2.358668421723069e-05, "loss": 0.1377, "step": 22965 }, { "epoch": 7.021094466523999, "grad_norm": 0.5660175681114197, "learning_rate": 2.3586259606810754e-05, "loss": 0.1421, "step": 22966 }, { "epoch": 7.021400183430144, "grad_norm": 0.4738544821739197, "learning_rate": 2.3585834996390812e-05, "loss": 0.1833, "step": 22967 }, { "epoch": 7.021705900336289, "grad_norm": 1.220414161682129, "learning_rate": 2.3585410385970874e-05, "loss": 0.1528, "step": 22968 }, { "epoch": 7.022011617242433, "grad_norm": 1.4155099391937256, "learning_rate": 2.3584985775550933e-05, "loss": 0.1694, "step": 22969 }, { "epoch": 7.022317334148578, "grad_norm": 1.165522813796997, "learning_rate": 2.3584561165130995e-05, "loss": 0.1801, "step": 22970 }, { "epoch": 7.022623051054723, "grad_norm": 0.7029817700386047, "learning_rate": 2.3584136554711054e-05, "loss": 0.1755, "step": 22971 }, { "epoch": 7.0229287679608685, "grad_norm": 1.4755101203918457, "learning_rate": 2.3583711944291113e-05, "loss": 0.217, "step": 22972 }, { "epoch": 7.023234484867013, "grad_norm": 0.8701930642127991, "learning_rate": 2.3583287333871175e-05, "loss": 0.1533, "step": 22973 }, { "epoch": 7.023540201773158, "grad_norm": 0.8239535093307495, "learning_rate": 2.3582862723451233e-05, "loss": 0.0881, "step": 22974 }, { "epoch": 7.023845918679303, "grad_norm": 0.15785042941570282, "learning_rate": 2.3582438113031296e-05, "loss": 0.0503, "step": 22975 }, { "epoch": 7.024151635585448, "grad_norm": 0.3407614231109619, "learning_rate": 2.3582013502611354e-05, "loss": 0.0726, "step": 22976 }, { "epoch": 7.024457352491593, "grad_norm": 0.390807569026947, "learning_rate": 2.3581588892191416e-05, "loss": 0.0585, "step": 22977 }, { "epoch": 7.024763069397737, "grad_norm": 0.4356328845024109, "learning_rate": 2.3581164281771475e-05, "loss": 0.0342, "step": 22978 }, { "epoch": 7.0250687863038825, "grad_norm": 0.40243276953697205, "learning_rate": 2.3580739671351537e-05, "loss": 0.0569, "step": 22979 }, { "epoch": 7.025374503210028, "grad_norm": 0.12430088222026825, "learning_rate": 2.3580315060931596e-05, "loss": 0.0553, "step": 22980 }, { "epoch": 7.025680220116173, "grad_norm": 0.15645408630371094, "learning_rate": 2.3579890450511658e-05, "loss": 0.0569, "step": 22981 }, { "epoch": 7.025985937022317, "grad_norm": 0.38586366176605225, "learning_rate": 2.3579465840091717e-05, "loss": 0.0456, "step": 22982 }, { "epoch": 7.026291653928462, "grad_norm": 0.3850826323032379, "learning_rate": 2.357904122967178e-05, "loss": 0.0696, "step": 22983 }, { "epoch": 7.026597370834607, "grad_norm": 0.23824061453342438, "learning_rate": 2.3578616619251837e-05, "loss": 0.0686, "step": 22984 }, { "epoch": 7.026903087740752, "grad_norm": 0.5902974009513855, "learning_rate": 2.3578192008831896e-05, "loss": 0.0887, "step": 22985 }, { "epoch": 7.027208804646897, "grad_norm": 0.28664422035217285, "learning_rate": 2.3577767398411958e-05, "loss": 0.1041, "step": 22986 }, { "epoch": 7.027514521553042, "grad_norm": 0.27089160680770874, "learning_rate": 2.3577342787992017e-05, "loss": 0.0826, "step": 22987 }, { "epoch": 7.027820238459187, "grad_norm": 0.42786481976509094, "learning_rate": 2.357691817757208e-05, "loss": 0.1416, "step": 22988 }, { "epoch": 7.028125955365332, "grad_norm": 0.360006719827652, "learning_rate": 2.3576493567152138e-05, "loss": 0.14, "step": 22989 }, { "epoch": 7.028431672271477, "grad_norm": 0.4376336932182312, "learning_rate": 2.35760689567322e-05, "loss": 0.1361, "step": 22990 }, { "epoch": 7.028737389177621, "grad_norm": 0.5026724338531494, "learning_rate": 2.357564434631226e-05, "loss": 0.1594, "step": 22991 }, { "epoch": 7.029043106083766, "grad_norm": 0.6688891649246216, "learning_rate": 2.357521973589232e-05, "loss": 0.1557, "step": 22992 }, { "epoch": 7.0293488229899115, "grad_norm": 0.6015721559524536, "learning_rate": 2.357479512547238e-05, "loss": 0.1723, "step": 22993 }, { "epoch": 7.029654539896057, "grad_norm": 2.1737306118011475, "learning_rate": 2.357437051505244e-05, "loss": 0.1903, "step": 22994 }, { "epoch": 7.029960256802201, "grad_norm": 0.7003132700920105, "learning_rate": 2.35739459046325e-05, "loss": 0.1908, "step": 22995 }, { "epoch": 7.030265973708346, "grad_norm": 0.9651060104370117, "learning_rate": 2.3573521294212562e-05, "loss": 0.1577, "step": 22996 }, { "epoch": 7.030571690614491, "grad_norm": 1.2609949111938477, "learning_rate": 2.357309668379262e-05, "loss": 0.1951, "step": 22997 }, { "epoch": 7.030877407520636, "grad_norm": 0.37281590700149536, "learning_rate": 2.357267207337268e-05, "loss": 0.1555, "step": 22998 }, { "epoch": 7.03118312442678, "grad_norm": 0.6528095602989197, "learning_rate": 2.357224746295274e-05, "loss": 0.0903, "step": 22999 }, { "epoch": 7.0314888413329255, "grad_norm": 0.14077843725681305, "learning_rate": 2.35718228525328e-05, "loss": 0.0633, "step": 23000 }, { "epoch": 7.0314888413329255, "eval_cer": 0.18807894995253838, "eval_loss": 0.23124389350414276, "eval_runtime": 18.8587, "eval_samples_per_second": 240.632, "eval_steps_per_second": 0.795, "eval_wer": 0.3255216334739253, "step": 23000 }, { "epoch": 7.031794558239071, "grad_norm": 0.22321027517318726, "learning_rate": 2.3571398242112862e-05, "loss": 0.0477, "step": 23001 }, { "epoch": 7.032100275145216, "grad_norm": 0.13361532986164093, "learning_rate": 2.357097363169292e-05, "loss": 0.0468, "step": 23002 }, { "epoch": 7.032405992051361, "grad_norm": 0.11988197267055511, "learning_rate": 2.3570549021272983e-05, "loss": 0.04, "step": 23003 }, { "epoch": 7.032711708957505, "grad_norm": 0.18857257068157196, "learning_rate": 2.3570124410853042e-05, "loss": 0.0632, "step": 23004 }, { "epoch": 7.03301742586365, "grad_norm": 0.1922644078731537, "learning_rate": 2.3569699800433104e-05, "loss": 0.0441, "step": 23005 }, { "epoch": 7.033323142769795, "grad_norm": 0.22718702256679535, "learning_rate": 2.3569275190013163e-05, "loss": 0.0798, "step": 23006 }, { "epoch": 7.03362885967594, "grad_norm": 0.1703043282032013, "learning_rate": 2.3568850579593225e-05, "loss": 0.0485, "step": 23007 }, { "epoch": 7.033934576582085, "grad_norm": 0.2529040277004242, "learning_rate": 2.3568425969173283e-05, "loss": 0.0849, "step": 23008 }, { "epoch": 7.03424029348823, "grad_norm": 1.0341243743896484, "learning_rate": 2.3568001358753346e-05, "loss": 0.0602, "step": 23009 }, { "epoch": 7.034546010394375, "grad_norm": 0.4714428186416626, "learning_rate": 2.3567576748333404e-05, "loss": 0.095, "step": 23010 }, { "epoch": 7.03485172730052, "grad_norm": 0.2954902648925781, "learning_rate": 2.3567152137913463e-05, "loss": 0.1097, "step": 23011 }, { "epoch": 7.035157444206664, "grad_norm": 0.5031193494796753, "learning_rate": 2.3566727527493525e-05, "loss": 0.1087, "step": 23012 }, { "epoch": 7.035463161112809, "grad_norm": 0.2885802984237671, "learning_rate": 2.3566302917073584e-05, "loss": 0.1393, "step": 23013 }, { "epoch": 7.0357688780189545, "grad_norm": 0.6095214486122131, "learning_rate": 2.3565878306653646e-05, "loss": 0.1194, "step": 23014 }, { "epoch": 7.0360745949251, "grad_norm": 1.1421079635620117, "learning_rate": 2.3565453696233705e-05, "loss": 0.1599, "step": 23015 }, { "epoch": 7.036380311831245, "grad_norm": 0.3120345175266266, "learning_rate": 2.3565029085813767e-05, "loss": 0.1189, "step": 23016 }, { "epoch": 7.036686028737389, "grad_norm": 0.8611211180686951, "learning_rate": 2.3564604475393825e-05, "loss": 0.1616, "step": 23017 }, { "epoch": 7.036991745643534, "grad_norm": 0.6864190697669983, "learning_rate": 2.3564179864973887e-05, "loss": 0.1494, "step": 23018 }, { "epoch": 7.037297462549679, "grad_norm": 0.6302158236503601, "learning_rate": 2.3563755254553946e-05, "loss": 0.2023, "step": 23019 }, { "epoch": 7.037603179455824, "grad_norm": 0.682948112487793, "learning_rate": 2.3563330644134008e-05, "loss": 0.1879, "step": 23020 }, { "epoch": 7.0379088963619685, "grad_norm": 3.1621463298797607, "learning_rate": 2.3562906033714067e-05, "loss": 0.1881, "step": 23021 }, { "epoch": 7.038214613268114, "grad_norm": 3.7798335552215576, "learning_rate": 2.3562481423294126e-05, "loss": 0.2576, "step": 23022 }, { "epoch": 7.038520330174259, "grad_norm": 0.4671382009983063, "learning_rate": 2.3562056812874188e-05, "loss": 0.1276, "step": 23023 }, { "epoch": 7.038826047080404, "grad_norm": 0.2046245038509369, "learning_rate": 2.3561632202454246e-05, "loss": 0.0674, "step": 23024 }, { "epoch": 7.039131763986548, "grad_norm": 0.12936127185821533, "learning_rate": 2.356120759203431e-05, "loss": 0.0532, "step": 23025 }, { "epoch": 7.039437480892693, "grad_norm": 0.3159558176994324, "learning_rate": 2.3560782981614367e-05, "loss": 0.0368, "step": 23026 }, { "epoch": 7.039743197798838, "grad_norm": 0.15055547654628754, "learning_rate": 2.356035837119443e-05, "loss": 0.0592, "step": 23027 }, { "epoch": 7.040048914704983, "grad_norm": 0.2151995599269867, "learning_rate": 2.3559933760774488e-05, "loss": 0.0589, "step": 23028 }, { "epoch": 7.0403546316111285, "grad_norm": 2.0954763889312744, "learning_rate": 2.355950915035455e-05, "loss": 0.0512, "step": 23029 }, { "epoch": 7.040660348517273, "grad_norm": 0.35449811816215515, "learning_rate": 2.355908453993461e-05, "loss": 0.0393, "step": 23030 }, { "epoch": 7.040966065423418, "grad_norm": 0.2806222140789032, "learning_rate": 2.355865992951467e-05, "loss": 0.0532, "step": 23031 }, { "epoch": 7.041271782329563, "grad_norm": 0.6565325260162354, "learning_rate": 2.355823531909473e-05, "loss": 0.0672, "step": 23032 }, { "epoch": 7.041577499235708, "grad_norm": 0.16555491089820862, "learning_rate": 2.355781070867479e-05, "loss": 0.0814, "step": 23033 }, { "epoch": 7.041883216141852, "grad_norm": 0.4567343294620514, "learning_rate": 2.355738609825485e-05, "loss": 0.0735, "step": 23034 }, { "epoch": 7.042188933047997, "grad_norm": 0.5667614936828613, "learning_rate": 2.355696148783491e-05, "loss": 0.0908, "step": 23035 }, { "epoch": 7.0424946499541425, "grad_norm": 0.24580048024654388, "learning_rate": 2.355653687741497e-05, "loss": 0.135, "step": 23036 }, { "epoch": 7.042800366860288, "grad_norm": 1.1802091598510742, "learning_rate": 2.355611226699503e-05, "loss": 0.0992, "step": 23037 }, { "epoch": 7.043106083766432, "grad_norm": 0.7576487064361572, "learning_rate": 2.3555687656575092e-05, "loss": 0.111, "step": 23038 }, { "epoch": 7.043411800672577, "grad_norm": 0.4628933072090149, "learning_rate": 2.355526304615515e-05, "loss": 0.1705, "step": 23039 }, { "epoch": 7.043717517578722, "grad_norm": 0.8730005621910095, "learning_rate": 2.3554838435735213e-05, "loss": 0.1407, "step": 23040 }, { "epoch": 7.044023234484867, "grad_norm": 0.793716549873352, "learning_rate": 2.355441382531527e-05, "loss": 0.1558, "step": 23041 }, { "epoch": 7.044328951391012, "grad_norm": 0.7890384793281555, "learning_rate": 2.3553989214895333e-05, "loss": 0.1426, "step": 23042 }, { "epoch": 7.044634668297157, "grad_norm": 0.7030298113822937, "learning_rate": 2.3553564604475392e-05, "loss": 0.1565, "step": 23043 }, { "epoch": 7.044940385203302, "grad_norm": 0.905916154384613, "learning_rate": 2.3553139994055454e-05, "loss": 0.2039, "step": 23044 }, { "epoch": 7.045246102109447, "grad_norm": 0.8281291723251343, "learning_rate": 2.3552715383635516e-05, "loss": 0.1648, "step": 23045 }, { "epoch": 7.045551819015592, "grad_norm": 0.9595474004745483, "learning_rate": 2.355229077321558e-05, "loss": 0.1903, "step": 23046 }, { "epoch": 7.045857535921736, "grad_norm": 0.902929425239563, "learning_rate": 2.3551866162795637e-05, "loss": 0.1893, "step": 23047 }, { "epoch": 7.046163252827881, "grad_norm": 0.800517201423645, "learning_rate": 2.3551441552375696e-05, "loss": 0.1199, "step": 23048 }, { "epoch": 7.046468969734026, "grad_norm": 0.20749452710151672, "learning_rate": 2.3551016941955758e-05, "loss": 0.0694, "step": 23049 }, { "epoch": 7.0467746866401715, "grad_norm": 0.5287102460861206, "learning_rate": 2.3550592331535817e-05, "loss": 0.0511, "step": 23050 }, { "epoch": 7.047080403546316, "grad_norm": 0.19458162784576416, "learning_rate": 2.355016772111588e-05, "loss": 0.0608, "step": 23051 }, { "epoch": 7.047386120452461, "grad_norm": 0.29510194063186646, "learning_rate": 2.3549743110695937e-05, "loss": 0.0476, "step": 23052 }, { "epoch": 7.047691837358606, "grad_norm": 0.22179734706878662, "learning_rate": 2.3549318500276e-05, "loss": 0.0446, "step": 23053 }, { "epoch": 7.047997554264751, "grad_norm": 0.3464416563510895, "learning_rate": 2.3548893889856058e-05, "loss": 0.0401, "step": 23054 }, { "epoch": 7.048303271170896, "grad_norm": 0.4119056761264801, "learning_rate": 2.354846927943612e-05, "loss": 0.0534, "step": 23055 }, { "epoch": 7.04860898807704, "grad_norm": 0.2662809491157532, "learning_rate": 2.354804466901618e-05, "loss": 0.0509, "step": 23056 }, { "epoch": 7.0489147049831855, "grad_norm": 0.20131246745586395, "learning_rate": 2.354762005859624e-05, "loss": 0.0481, "step": 23057 }, { "epoch": 7.049220421889331, "grad_norm": 0.20050197839736938, "learning_rate": 2.35471954481763e-05, "loss": 0.0644, "step": 23058 }, { "epoch": 7.049526138795476, "grad_norm": 0.4005392789840698, "learning_rate": 2.3546770837756362e-05, "loss": 0.0661, "step": 23059 }, { "epoch": 7.04983185570162, "grad_norm": 0.2959364950656891, "learning_rate": 2.354634622733642e-05, "loss": 0.0865, "step": 23060 }, { "epoch": 7.050137572607765, "grad_norm": 0.4186752140522003, "learning_rate": 2.354592161691648e-05, "loss": 0.1156, "step": 23061 }, { "epoch": 7.05044328951391, "grad_norm": 0.38444751501083374, "learning_rate": 2.354549700649654e-05, "loss": 0.1094, "step": 23062 }, { "epoch": 7.050749006420055, "grad_norm": 0.3217024803161621, "learning_rate": 2.35450723960766e-05, "loss": 0.1159, "step": 23063 }, { "epoch": 7.0510547233261995, "grad_norm": 0.499554306268692, "learning_rate": 2.3544647785656662e-05, "loss": 0.1246, "step": 23064 }, { "epoch": 7.051360440232345, "grad_norm": 0.6507577896118164, "learning_rate": 2.354422317523672e-05, "loss": 0.18, "step": 23065 }, { "epoch": 7.05166615713849, "grad_norm": 0.5668417811393738, "learning_rate": 2.3543798564816783e-05, "loss": 0.1418, "step": 23066 }, { "epoch": 7.051971874044635, "grad_norm": 0.4309762120246887, "learning_rate": 2.354337395439684e-05, "loss": 0.1653, "step": 23067 }, { "epoch": 7.05227759095078, "grad_norm": 0.6050726175308228, "learning_rate": 2.3542949343976904e-05, "loss": 0.2234, "step": 23068 }, { "epoch": 7.052583307856924, "grad_norm": 0.5145904421806335, "learning_rate": 2.3542524733556962e-05, "loss": 0.1638, "step": 23069 }, { "epoch": 7.052889024763069, "grad_norm": 1.7062066793441772, "learning_rate": 2.3542100123137025e-05, "loss": 0.1806, "step": 23070 }, { "epoch": 7.0531947416692145, "grad_norm": 1.556714653968811, "learning_rate": 2.3541675512717083e-05, "loss": 0.1599, "step": 23071 }, { "epoch": 7.05350045857536, "grad_norm": 3.0059614181518555, "learning_rate": 2.3541250902297145e-05, "loss": 0.1902, "step": 23072 }, { "epoch": 7.053806175481504, "grad_norm": 0.22182072699069977, "learning_rate": 2.3540826291877204e-05, "loss": 0.1199, "step": 23073 }, { "epoch": 7.054111892387649, "grad_norm": 0.21748262643814087, "learning_rate": 2.3540401681457263e-05, "loss": 0.0634, "step": 23074 }, { "epoch": 7.054417609293794, "grad_norm": 0.31169593334198, "learning_rate": 2.3539977071037325e-05, "loss": 0.05, "step": 23075 }, { "epoch": 7.054723326199939, "grad_norm": 0.3086962103843689, "learning_rate": 2.3539552460617384e-05, "loss": 0.0624, "step": 23076 }, { "epoch": 7.055029043106083, "grad_norm": 0.22701075673103333, "learning_rate": 2.3539127850197446e-05, "loss": 0.0392, "step": 23077 }, { "epoch": 7.0553347600122285, "grad_norm": 0.20541912317276, "learning_rate": 2.3538703239777504e-05, "loss": 0.0609, "step": 23078 }, { "epoch": 7.055640476918374, "grad_norm": 0.21641120314598083, "learning_rate": 2.3538278629357566e-05, "loss": 0.0986, "step": 23079 }, { "epoch": 7.055946193824519, "grad_norm": 0.1490655243396759, "learning_rate": 2.3537854018937625e-05, "loss": 0.0397, "step": 23080 }, { "epoch": 7.056251910730664, "grad_norm": 0.23941253125667572, "learning_rate": 2.3537429408517687e-05, "loss": 0.0616, "step": 23081 }, { "epoch": 7.056557627636808, "grad_norm": 0.4817259907722473, "learning_rate": 2.3537004798097746e-05, "loss": 0.0574, "step": 23082 }, { "epoch": 7.056863344542953, "grad_norm": 0.5066254734992981, "learning_rate": 2.3536580187677808e-05, "loss": 0.0987, "step": 23083 }, { "epoch": 7.057169061449098, "grad_norm": 0.4132371246814728, "learning_rate": 2.3536155577257867e-05, "loss": 0.0717, "step": 23084 }, { "epoch": 7.057474778355243, "grad_norm": 0.30716583132743835, "learning_rate": 2.353573096683793e-05, "loss": 0.1402, "step": 23085 }, { "epoch": 7.057780495261388, "grad_norm": 0.27340102195739746, "learning_rate": 2.3535306356417987e-05, "loss": 0.0915, "step": 23086 }, { "epoch": 7.058086212167533, "grad_norm": 0.6028252840042114, "learning_rate": 2.3534881745998046e-05, "loss": 0.1168, "step": 23087 }, { "epoch": 7.058391929073678, "grad_norm": 0.34063252806663513, "learning_rate": 2.3534457135578108e-05, "loss": 0.1332, "step": 23088 }, { "epoch": 7.058697645979823, "grad_norm": 0.4325239062309265, "learning_rate": 2.3534032525158167e-05, "loss": 0.1234, "step": 23089 }, { "epoch": 7.059003362885967, "grad_norm": 0.6093205213546753, "learning_rate": 2.353360791473823e-05, "loss": 0.1476, "step": 23090 }, { "epoch": 7.059309079792112, "grad_norm": 0.7242899537086487, "learning_rate": 2.3533183304318288e-05, "loss": 0.1393, "step": 23091 }, { "epoch": 7.059614796698257, "grad_norm": 1.9230091571807861, "learning_rate": 2.353275869389835e-05, "loss": 0.1778, "step": 23092 }, { "epoch": 7.0599205136044025, "grad_norm": 1.6750439405441284, "learning_rate": 2.353233408347841e-05, "loss": 0.1508, "step": 23093 }, { "epoch": 7.060226230510548, "grad_norm": 0.9492354393005371, "learning_rate": 2.353190947305847e-05, "loss": 0.1656, "step": 23094 }, { "epoch": 7.060531947416692, "grad_norm": 1.0055149793624878, "learning_rate": 2.353148486263853e-05, "loss": 0.2097, "step": 23095 }, { "epoch": 7.060837664322837, "grad_norm": 1.3645358085632324, "learning_rate": 2.353106025221859e-05, "loss": 0.1768, "step": 23096 }, { "epoch": 7.061143381228982, "grad_norm": 3.289898157119751, "learning_rate": 2.353063564179865e-05, "loss": 0.2206, "step": 23097 }, { "epoch": 7.061449098135127, "grad_norm": 0.4701058268547058, "learning_rate": 2.3530211031378712e-05, "loss": 0.1375, "step": 23098 }, { "epoch": 7.0617548150412714, "grad_norm": 0.27997738122940063, "learning_rate": 2.352978642095877e-05, "loss": 0.08, "step": 23099 }, { "epoch": 7.062060531947417, "grad_norm": 0.2502732276916504, "learning_rate": 2.352936181053883e-05, "loss": 0.0727, "step": 23100 }, { "epoch": 7.062366248853562, "grad_norm": 0.4085312485694885, "learning_rate": 2.352893720011889e-05, "loss": 0.0626, "step": 23101 }, { "epoch": 7.062671965759707, "grad_norm": 0.7580281496047974, "learning_rate": 2.352851258969895e-05, "loss": 0.0408, "step": 23102 }, { "epoch": 7.062977682665851, "grad_norm": 0.2777668535709381, "learning_rate": 2.3528087979279012e-05, "loss": 0.0692, "step": 23103 }, { "epoch": 7.063283399571996, "grad_norm": 0.8763957023620605, "learning_rate": 2.352766336885907e-05, "loss": 0.0646, "step": 23104 }, { "epoch": 7.063589116478141, "grad_norm": 1.0374348163604736, "learning_rate": 2.3527238758439133e-05, "loss": 0.0683, "step": 23105 }, { "epoch": 7.063894833384286, "grad_norm": 0.30577370524406433, "learning_rate": 2.3526814148019192e-05, "loss": 0.0498, "step": 23106 }, { "epoch": 7.0642005502904315, "grad_norm": 0.3269423544406891, "learning_rate": 2.3526389537599254e-05, "loss": 0.0483, "step": 23107 }, { "epoch": 7.064506267196576, "grad_norm": 1.9794955253601074, "learning_rate": 2.3525964927179313e-05, "loss": 0.0731, "step": 23108 }, { "epoch": 7.064811984102721, "grad_norm": 0.20068757236003876, "learning_rate": 2.3525540316759375e-05, "loss": 0.0831, "step": 23109 }, { "epoch": 7.065117701008866, "grad_norm": 0.3787343204021454, "learning_rate": 2.3525115706339434e-05, "loss": 0.0757, "step": 23110 }, { "epoch": 7.065423417915011, "grad_norm": 0.41309916973114014, "learning_rate": 2.3524691095919496e-05, "loss": 0.1014, "step": 23111 }, { "epoch": 7.065729134821155, "grad_norm": 0.7397098541259766, "learning_rate": 2.3524266485499554e-05, "loss": 0.1246, "step": 23112 }, { "epoch": 7.0660348517273, "grad_norm": 1.7555065155029297, "learning_rate": 2.3523841875079613e-05, "loss": 0.1059, "step": 23113 }, { "epoch": 7.0663405686334455, "grad_norm": 0.8859031796455383, "learning_rate": 2.3523417264659675e-05, "loss": 0.1526, "step": 23114 }, { "epoch": 7.066646285539591, "grad_norm": 0.7118905186653137, "learning_rate": 2.3522992654239734e-05, "loss": 0.1658, "step": 23115 }, { "epoch": 7.066952002445735, "grad_norm": 1.1122994422912598, "learning_rate": 2.3522568043819796e-05, "loss": 0.152, "step": 23116 }, { "epoch": 7.06725771935188, "grad_norm": 0.4613122344017029, "learning_rate": 2.3522143433399855e-05, "loss": 0.1472, "step": 23117 }, { "epoch": 7.067563436258025, "grad_norm": 2.8981571197509766, "learning_rate": 2.3521718822979917e-05, "loss": 0.1779, "step": 23118 }, { "epoch": 7.06786915316417, "grad_norm": 0.44092896580696106, "learning_rate": 2.3521294212559975e-05, "loss": 0.1497, "step": 23119 }, { "epoch": 7.068174870070315, "grad_norm": 0.9227309226989746, "learning_rate": 2.3520869602140037e-05, "loss": 0.1611, "step": 23120 }, { "epoch": 7.0684805869764595, "grad_norm": 0.9797049164772034, "learning_rate": 2.3520444991720096e-05, "loss": 0.1536, "step": 23121 }, { "epoch": 7.068786303882605, "grad_norm": 20.825313568115234, "learning_rate": 2.3520020381300158e-05, "loss": 0.2481, "step": 23122 }, { "epoch": 7.06909202078875, "grad_norm": 0.4506680965423584, "learning_rate": 2.3519595770880217e-05, "loss": 0.1426, "step": 23123 }, { "epoch": 7.069397737694895, "grad_norm": 0.5829150676727295, "learning_rate": 2.351917116046028e-05, "loss": 0.0926, "step": 23124 }, { "epoch": 7.069703454601039, "grad_norm": 0.2977692782878876, "learning_rate": 2.3518746550040338e-05, "loss": 0.0758, "step": 23125 }, { "epoch": 7.070009171507184, "grad_norm": 0.29186201095581055, "learning_rate": 2.3518321939620396e-05, "loss": 0.0503, "step": 23126 }, { "epoch": 7.070314888413329, "grad_norm": 0.2875964641571045, "learning_rate": 2.351789732920046e-05, "loss": 0.0418, "step": 23127 }, { "epoch": 7.0706206053194744, "grad_norm": 0.49768173694610596, "learning_rate": 2.3517472718780517e-05, "loss": 0.0523, "step": 23128 }, { "epoch": 7.070926322225619, "grad_norm": 0.25267162919044495, "learning_rate": 2.351704810836058e-05, "loss": 0.0221, "step": 23129 }, { "epoch": 7.071232039131764, "grad_norm": 0.25332626700401306, "learning_rate": 2.3516623497940638e-05, "loss": 0.0502, "step": 23130 }, { "epoch": 7.071537756037909, "grad_norm": 0.5564579963684082, "learning_rate": 2.35161988875207e-05, "loss": 0.0788, "step": 23131 }, { "epoch": 7.071843472944054, "grad_norm": 0.3297048807144165, "learning_rate": 2.351577427710076e-05, "loss": 0.0563, "step": 23132 }, { "epoch": 7.072149189850199, "grad_norm": 0.8514174818992615, "learning_rate": 2.351534966668082e-05, "loss": 0.0702, "step": 23133 }, { "epoch": 7.072454906756343, "grad_norm": 0.6351322531700134, "learning_rate": 2.351492505626088e-05, "loss": 0.0674, "step": 23134 }, { "epoch": 7.0727606236624885, "grad_norm": 0.2990107834339142, "learning_rate": 2.351450044584094e-05, "loss": 0.0622, "step": 23135 }, { "epoch": 7.073066340568634, "grad_norm": 0.9247722029685974, "learning_rate": 2.3514075835421e-05, "loss": 0.1104, "step": 23136 }, { "epoch": 7.073372057474779, "grad_norm": 0.33063942193984985, "learning_rate": 2.351365122500106e-05, "loss": 0.0891, "step": 23137 }, { "epoch": 7.073677774380923, "grad_norm": 0.4335883855819702, "learning_rate": 2.351322661458112e-05, "loss": 0.1389, "step": 23138 }, { "epoch": 7.073983491287068, "grad_norm": 0.35456037521362305, "learning_rate": 2.351280200416118e-05, "loss": 0.1329, "step": 23139 }, { "epoch": 7.074289208193213, "grad_norm": 0.4244078993797302, "learning_rate": 2.3512377393741242e-05, "loss": 0.1462, "step": 23140 }, { "epoch": 7.074594925099358, "grad_norm": 0.4530734419822693, "learning_rate": 2.35119527833213e-05, "loss": 0.1376, "step": 23141 }, { "epoch": 7.0749006420055025, "grad_norm": 0.6042066812515259, "learning_rate": 2.3511528172901363e-05, "loss": 0.152, "step": 23142 }, { "epoch": 7.075206358911648, "grad_norm": 0.8997057676315308, "learning_rate": 2.351110356248142e-05, "loss": 0.1605, "step": 23143 }, { "epoch": 7.075512075817793, "grad_norm": 0.768470048904419, "learning_rate": 2.3510678952061484e-05, "loss": 0.1885, "step": 23144 }, { "epoch": 7.075817792723938, "grad_norm": 0.8784999251365662, "learning_rate": 2.3510254341641542e-05, "loss": 0.1843, "step": 23145 }, { "epoch": 7.076123509630083, "grad_norm": 0.856619656085968, "learning_rate": 2.3509829731221604e-05, "loss": 0.1701, "step": 23146 }, { "epoch": 7.076429226536227, "grad_norm": 1.752780795097351, "learning_rate": 2.3509405120801666e-05, "loss": 0.2044, "step": 23147 }, { "epoch": 7.076734943442372, "grad_norm": 0.3434811234474182, "learning_rate": 2.350898051038173e-05, "loss": 0.1541, "step": 23148 }, { "epoch": 7.077040660348517, "grad_norm": 0.33617424964904785, "learning_rate": 2.3508555899961787e-05, "loss": 0.0614, "step": 23149 }, { "epoch": 7.0773463772546625, "grad_norm": 0.2451661080121994, "learning_rate": 2.3508131289541846e-05, "loss": 0.0605, "step": 23150 }, { "epoch": 7.077652094160807, "grad_norm": 0.14013095200061798, "learning_rate": 2.3507706679121908e-05, "loss": 0.0602, "step": 23151 }, { "epoch": 7.077957811066952, "grad_norm": 0.25993281602859497, "learning_rate": 2.3507282068701967e-05, "loss": 0.0543, "step": 23152 }, { "epoch": 7.078263527973097, "grad_norm": 0.14222384989261627, "learning_rate": 2.350685745828203e-05, "loss": 0.0496, "step": 23153 }, { "epoch": 7.078569244879242, "grad_norm": 0.2208828330039978, "learning_rate": 2.3506432847862087e-05, "loss": 0.0511, "step": 23154 }, { "epoch": 7.078874961785386, "grad_norm": 0.17467521131038666, "learning_rate": 2.350600823744215e-05, "loss": 0.0379, "step": 23155 }, { "epoch": 7.0791806786915314, "grad_norm": 0.7574658989906311, "learning_rate": 2.3505583627022208e-05, "loss": 0.0649, "step": 23156 }, { "epoch": 7.079486395597677, "grad_norm": 0.22038564085960388, "learning_rate": 2.350515901660227e-05, "loss": 0.07, "step": 23157 }, { "epoch": 7.079792112503822, "grad_norm": 0.4359354078769684, "learning_rate": 2.350473440618233e-05, "loss": 0.0955, "step": 23158 }, { "epoch": 7.080097829409967, "grad_norm": 0.3346080780029297, "learning_rate": 2.350430979576239e-05, "loss": 0.0621, "step": 23159 }, { "epoch": 7.080403546316111, "grad_norm": 0.28539273142814636, "learning_rate": 2.350388518534245e-05, "loss": 0.077, "step": 23160 }, { "epoch": 7.080709263222256, "grad_norm": 0.6596119999885559, "learning_rate": 2.3503460574922512e-05, "loss": 0.0755, "step": 23161 }, { "epoch": 7.081014980128401, "grad_norm": 0.40208700299263, "learning_rate": 2.350303596450257e-05, "loss": 0.1066, "step": 23162 }, { "epoch": 7.081320697034546, "grad_norm": 0.42543265223503113, "learning_rate": 2.350261135408263e-05, "loss": 0.1505, "step": 23163 }, { "epoch": 7.081626413940691, "grad_norm": 0.5013754367828369, "learning_rate": 2.350218674366269e-05, "loss": 0.1567, "step": 23164 }, { "epoch": 7.081932130846836, "grad_norm": 0.38387247920036316, "learning_rate": 2.350176213324275e-05, "loss": 0.1428, "step": 23165 }, { "epoch": 7.082237847752981, "grad_norm": 0.6493145823478699, "learning_rate": 2.3501337522822812e-05, "loss": 0.1434, "step": 23166 }, { "epoch": 7.082543564659126, "grad_norm": 0.7275039553642273, "learning_rate": 2.350091291240287e-05, "loss": 0.1617, "step": 23167 }, { "epoch": 7.08284928156527, "grad_norm": 1.3782641887664795, "learning_rate": 2.3500488301982933e-05, "loss": 0.1203, "step": 23168 }, { "epoch": 7.083154998471415, "grad_norm": 0.7177689671516418, "learning_rate": 2.350006369156299e-05, "loss": 0.1668, "step": 23169 }, { "epoch": 7.08346071537756, "grad_norm": 0.568946897983551, "learning_rate": 2.3499639081143054e-05, "loss": 0.1513, "step": 23170 }, { "epoch": 7.0837664322837055, "grad_norm": 0.877193033695221, "learning_rate": 2.3499214470723112e-05, "loss": 0.1904, "step": 23171 }, { "epoch": 7.084072149189851, "grad_norm": 1.4159197807312012, "learning_rate": 2.3498789860303175e-05, "loss": 0.2159, "step": 23172 }, { "epoch": 7.084377866095995, "grad_norm": 0.3803510367870331, "learning_rate": 2.3498365249883233e-05, "loss": 0.1301, "step": 23173 }, { "epoch": 7.08468358300214, "grad_norm": 0.19504883885383606, "learning_rate": 2.3497940639463295e-05, "loss": 0.091, "step": 23174 }, { "epoch": 7.084989299908285, "grad_norm": 0.25152283906936646, "learning_rate": 2.3497516029043354e-05, "loss": 0.0665, "step": 23175 }, { "epoch": 7.08529501681443, "grad_norm": 0.38346248865127563, "learning_rate": 2.3497091418623413e-05, "loss": 0.0626, "step": 23176 }, { "epoch": 7.085600733720574, "grad_norm": 0.37824633717536926, "learning_rate": 2.3496666808203475e-05, "loss": 0.038, "step": 23177 }, { "epoch": 7.0859064506267195, "grad_norm": 0.21232527494430542, "learning_rate": 2.3496242197783534e-05, "loss": 0.0317, "step": 23178 }, { "epoch": 7.086212167532865, "grad_norm": 0.23250696063041687, "learning_rate": 2.3495817587363596e-05, "loss": 0.0343, "step": 23179 }, { "epoch": 7.08651788443901, "grad_norm": 0.12669609487056732, "learning_rate": 2.3495392976943654e-05, "loss": 0.0348, "step": 23180 }, { "epoch": 7.086823601345154, "grad_norm": 0.24477210640907288, "learning_rate": 2.3494968366523716e-05, "loss": 0.0846, "step": 23181 }, { "epoch": 7.087129318251299, "grad_norm": 0.1957135945558548, "learning_rate": 2.3494543756103775e-05, "loss": 0.0756, "step": 23182 }, { "epoch": 7.087435035157444, "grad_norm": 0.43109387159347534, "learning_rate": 2.3494119145683837e-05, "loss": 0.1027, "step": 23183 }, { "epoch": 7.087740752063589, "grad_norm": 0.29967695474624634, "learning_rate": 2.3493694535263896e-05, "loss": 0.0684, "step": 23184 }, { "epoch": 7.0880464689697344, "grad_norm": 0.8084019422531128, "learning_rate": 2.3493269924843958e-05, "loss": 0.0795, "step": 23185 }, { "epoch": 7.088352185875879, "grad_norm": 0.4485774338245392, "learning_rate": 2.3492845314424017e-05, "loss": 0.0895, "step": 23186 }, { "epoch": 7.088657902782024, "grad_norm": 0.9338194131851196, "learning_rate": 2.349242070400408e-05, "loss": 0.1319, "step": 23187 }, { "epoch": 7.088963619688169, "grad_norm": 0.7412888407707214, "learning_rate": 2.3491996093584137e-05, "loss": 0.1361, "step": 23188 }, { "epoch": 7.089269336594314, "grad_norm": 2.190263271331787, "learning_rate": 2.3491571483164196e-05, "loss": 0.138, "step": 23189 }, { "epoch": 7.089575053500458, "grad_norm": 0.9835168123245239, "learning_rate": 2.3491146872744258e-05, "loss": 0.1612, "step": 23190 }, { "epoch": 7.089880770406603, "grad_norm": 0.3705076575279236, "learning_rate": 2.3490722262324317e-05, "loss": 0.1285, "step": 23191 }, { "epoch": 7.0901864873127485, "grad_norm": 0.6457189321517944, "learning_rate": 2.349029765190438e-05, "loss": 0.1634, "step": 23192 }, { "epoch": 7.090492204218894, "grad_norm": 1.1886448860168457, "learning_rate": 2.3489873041484438e-05, "loss": 0.1469, "step": 23193 }, { "epoch": 7.090797921125038, "grad_norm": 0.6179994344711304, "learning_rate": 2.34894484310645e-05, "loss": 0.1935, "step": 23194 }, { "epoch": 7.091103638031183, "grad_norm": 0.878574788570404, "learning_rate": 2.348902382064456e-05, "loss": 0.1772, "step": 23195 }, { "epoch": 7.091409354937328, "grad_norm": 2.6988584995269775, "learning_rate": 2.348859921022462e-05, "loss": 0.1356, "step": 23196 }, { "epoch": 7.091715071843473, "grad_norm": 0.7270660400390625, "learning_rate": 2.348817459980468e-05, "loss": 0.1976, "step": 23197 }, { "epoch": 7.092020788749618, "grad_norm": 0.5275213122367859, "learning_rate": 2.348774998938474e-05, "loss": 0.127, "step": 23198 }, { "epoch": 7.0923265056557625, "grad_norm": 0.4020727276802063, "learning_rate": 2.34873253789648e-05, "loss": 0.0754, "step": 23199 }, { "epoch": 7.092632222561908, "grad_norm": 0.20843623578548431, "learning_rate": 2.3486900768544862e-05, "loss": 0.0629, "step": 23200 }, { "epoch": 7.092937939468053, "grad_norm": 0.19206170737743378, "learning_rate": 2.348647615812492e-05, "loss": 0.0438, "step": 23201 }, { "epoch": 7.093243656374198, "grad_norm": 0.1918283849954605, "learning_rate": 2.348605154770498e-05, "loss": 0.0555, "step": 23202 }, { "epoch": 7.093549373280342, "grad_norm": 0.3102133870124817, "learning_rate": 2.348562693728504e-05, "loss": 0.0615, "step": 23203 }, { "epoch": 7.093855090186487, "grad_norm": 0.9370847940444946, "learning_rate": 2.34852023268651e-05, "loss": 0.0625, "step": 23204 }, { "epoch": 7.094160807092632, "grad_norm": 0.20315684378147125, "learning_rate": 2.3484777716445162e-05, "loss": 0.071, "step": 23205 }, { "epoch": 7.094466523998777, "grad_norm": 0.42986002564430237, "learning_rate": 2.348435310602522e-05, "loss": 0.09, "step": 23206 }, { "epoch": 7.094772240904922, "grad_norm": 0.26499149203300476, "learning_rate": 2.3483928495605283e-05, "loss": 0.0622, "step": 23207 }, { "epoch": 7.095077957811067, "grad_norm": 0.30209100246429443, "learning_rate": 2.3483503885185342e-05, "loss": 0.0746, "step": 23208 }, { "epoch": 7.095383674717212, "grad_norm": 0.24987003207206726, "learning_rate": 2.3483079274765404e-05, "loss": 0.0719, "step": 23209 }, { "epoch": 7.095689391623357, "grad_norm": 0.85332852602005, "learning_rate": 2.3482654664345463e-05, "loss": 0.106, "step": 23210 }, { "epoch": 7.095995108529502, "grad_norm": 0.3335622251033783, "learning_rate": 2.3482230053925525e-05, "loss": 0.119, "step": 23211 }, { "epoch": 7.096300825435646, "grad_norm": 0.5399472117424011, "learning_rate": 2.3481805443505584e-05, "loss": 0.0949, "step": 23212 }, { "epoch": 7.0966065423417914, "grad_norm": 1.0555137395858765, "learning_rate": 2.3481380833085646e-05, "loss": 0.1114, "step": 23213 }, { "epoch": 7.096912259247937, "grad_norm": 0.7058786749839783, "learning_rate": 2.3480956222665704e-05, "loss": 0.1442, "step": 23214 }, { "epoch": 7.097217976154082, "grad_norm": 0.38554880023002625, "learning_rate": 2.3480531612245763e-05, "loss": 0.1452, "step": 23215 }, { "epoch": 7.097523693060226, "grad_norm": 0.8751000165939331, "learning_rate": 2.3480107001825825e-05, "loss": 0.1306, "step": 23216 }, { "epoch": 7.097829409966371, "grad_norm": 1.2228784561157227, "learning_rate": 2.3479682391405884e-05, "loss": 0.1513, "step": 23217 }, { "epoch": 7.098135126872516, "grad_norm": 0.615655779838562, "learning_rate": 2.3479257780985946e-05, "loss": 0.1357, "step": 23218 }, { "epoch": 7.098440843778661, "grad_norm": 1.0195738077163696, "learning_rate": 2.3478833170566005e-05, "loss": 0.1516, "step": 23219 }, { "epoch": 7.0987465606848055, "grad_norm": 2.1614935398101807, "learning_rate": 2.3478408560146067e-05, "loss": 0.1656, "step": 23220 }, { "epoch": 7.099052277590951, "grad_norm": 1.212651252746582, "learning_rate": 2.3477983949726125e-05, "loss": 0.1668, "step": 23221 }, { "epoch": 7.099357994497096, "grad_norm": 4.744568347930908, "learning_rate": 2.3477559339306187e-05, "loss": 0.2192, "step": 23222 }, { "epoch": 7.099663711403241, "grad_norm": 0.47857484221458435, "learning_rate": 2.3477134728886246e-05, "loss": 0.1156, "step": 23223 }, { "epoch": 7.099969428309386, "grad_norm": 0.1866818070411682, "learning_rate": 2.3476710118466308e-05, "loss": 0.062, "step": 23224 }, { "epoch": 7.10027514521553, "grad_norm": 0.2377590835094452, "learning_rate": 2.3476285508046367e-05, "loss": 0.076, "step": 23225 }, { "epoch": 7.100580862121675, "grad_norm": 0.33748939633369446, "learning_rate": 2.347586089762643e-05, "loss": 0.0527, "step": 23226 }, { "epoch": 7.10088657902782, "grad_norm": 0.4292900562286377, "learning_rate": 2.3475436287206488e-05, "loss": 0.0697, "step": 23227 }, { "epoch": 7.1011922959339655, "grad_norm": 0.2613641619682312, "learning_rate": 2.3475011676786546e-05, "loss": 0.0441, "step": 23228 }, { "epoch": 7.10149801284011, "grad_norm": 0.24398554861545563, "learning_rate": 2.347458706636661e-05, "loss": 0.0574, "step": 23229 }, { "epoch": 7.101803729746255, "grad_norm": 0.20581726729869843, "learning_rate": 2.3474162455946667e-05, "loss": 0.0589, "step": 23230 }, { "epoch": 7.1021094466524, "grad_norm": 0.16307806968688965, "learning_rate": 2.347373784552673e-05, "loss": 0.0474, "step": 23231 }, { "epoch": 7.102415163558545, "grad_norm": 0.27937138080596924, "learning_rate": 2.3473313235106788e-05, "loss": 0.0654, "step": 23232 }, { "epoch": 7.102720880464689, "grad_norm": 0.4660007357597351, "learning_rate": 2.347288862468685e-05, "loss": 0.0823, "step": 23233 }, { "epoch": 7.103026597370834, "grad_norm": 0.8654910326004028, "learning_rate": 2.347246401426691e-05, "loss": 0.0932, "step": 23234 }, { "epoch": 7.1033323142769795, "grad_norm": 0.3001592457294464, "learning_rate": 2.347203940384697e-05, "loss": 0.0892, "step": 23235 }, { "epoch": 7.103638031183125, "grad_norm": 0.41176676750183105, "learning_rate": 2.347161479342703e-05, "loss": 0.1116, "step": 23236 }, { "epoch": 7.10394374808927, "grad_norm": 0.5350258946418762, "learning_rate": 2.3471190183007092e-05, "loss": 0.1053, "step": 23237 }, { "epoch": 7.104249464995414, "grad_norm": 0.6013351082801819, "learning_rate": 2.347076557258715e-05, "loss": 0.1256, "step": 23238 }, { "epoch": 7.104555181901559, "grad_norm": 0.5061735510826111, "learning_rate": 2.3470340962167212e-05, "loss": 0.1503, "step": 23239 }, { "epoch": 7.104860898807704, "grad_norm": 1.0558717250823975, "learning_rate": 2.346991635174727e-05, "loss": 0.1612, "step": 23240 }, { "epoch": 7.105166615713849, "grad_norm": 0.7931846976280212, "learning_rate": 2.346949174132733e-05, "loss": 0.1555, "step": 23241 }, { "epoch": 7.105472332619994, "grad_norm": 1.6443175077438354, "learning_rate": 2.3469067130907392e-05, "loss": 0.1641, "step": 23242 }, { "epoch": 7.105778049526139, "grad_norm": 0.783971905708313, "learning_rate": 2.346864252048745e-05, "loss": 0.1922, "step": 23243 }, { "epoch": 7.106083766432284, "grad_norm": 1.5486844778060913, "learning_rate": 2.3468217910067513e-05, "loss": 0.1511, "step": 23244 }, { "epoch": 7.106389483338429, "grad_norm": 1.071707010269165, "learning_rate": 2.346779329964757e-05, "loss": 0.1617, "step": 23245 }, { "epoch": 7.106695200244573, "grad_norm": 0.7558953166007996, "learning_rate": 2.3467368689227634e-05, "loss": 0.1651, "step": 23246 }, { "epoch": 7.107000917150718, "grad_norm": 2.0210189819335938, "learning_rate": 2.3466944078807692e-05, "loss": 0.2014, "step": 23247 }, { "epoch": 7.107306634056863, "grad_norm": 0.24796245992183685, "learning_rate": 2.3466519468387754e-05, "loss": 0.134, "step": 23248 }, { "epoch": 7.1076123509630085, "grad_norm": 0.26347917318344116, "learning_rate": 2.3466094857967813e-05, "loss": 0.0567, "step": 23249 }, { "epoch": 7.107918067869154, "grad_norm": 0.18501833081245422, "learning_rate": 2.346567024754788e-05, "loss": 0.0467, "step": 23250 }, { "epoch": 7.108223784775298, "grad_norm": 2.914743661880493, "learning_rate": 2.3465245637127937e-05, "loss": 0.0595, "step": 23251 }, { "epoch": 7.108529501681443, "grad_norm": 0.3124062716960907, "learning_rate": 2.3464821026708e-05, "loss": 0.0411, "step": 23252 }, { "epoch": 7.108835218587588, "grad_norm": 0.15845079720020294, "learning_rate": 2.3464396416288058e-05, "loss": 0.0635, "step": 23253 }, { "epoch": 7.109140935493733, "grad_norm": 0.23818059265613556, "learning_rate": 2.3463971805868117e-05, "loss": 0.0284, "step": 23254 }, { "epoch": 7.109446652399877, "grad_norm": 0.3222566246986389, "learning_rate": 2.346354719544818e-05, "loss": 0.0694, "step": 23255 }, { "epoch": 7.1097523693060225, "grad_norm": 0.301583856344223, "learning_rate": 2.3463122585028237e-05, "loss": 0.0761, "step": 23256 }, { "epoch": 7.110058086212168, "grad_norm": 0.6367588639259338, "learning_rate": 2.34626979746083e-05, "loss": 0.0658, "step": 23257 }, { "epoch": 7.110363803118313, "grad_norm": 0.9397388696670532, "learning_rate": 2.3462273364188358e-05, "loss": 0.1026, "step": 23258 }, { "epoch": 7.110669520024457, "grad_norm": 0.29180431365966797, "learning_rate": 2.346184875376842e-05, "loss": 0.0693, "step": 23259 }, { "epoch": 7.110975236930602, "grad_norm": 0.5141578316688538, "learning_rate": 2.346142414334848e-05, "loss": 0.0791, "step": 23260 }, { "epoch": 7.111280953836747, "grad_norm": 0.5688697695732117, "learning_rate": 2.346099953292854e-05, "loss": 0.103, "step": 23261 }, { "epoch": 7.111586670742892, "grad_norm": 1.7278022766113281, "learning_rate": 2.34605749225086e-05, "loss": 0.1252, "step": 23262 }, { "epoch": 7.111892387649037, "grad_norm": 0.8525035381317139, "learning_rate": 2.3460150312088662e-05, "loss": 0.158, "step": 23263 }, { "epoch": 7.112198104555182, "grad_norm": 0.4219549000263214, "learning_rate": 2.345972570166872e-05, "loss": 0.1401, "step": 23264 }, { "epoch": 7.112503821461327, "grad_norm": 0.3462356626987457, "learning_rate": 2.345930109124878e-05, "loss": 0.1819, "step": 23265 }, { "epoch": 7.112809538367472, "grad_norm": 0.6716098785400391, "learning_rate": 2.345887648082884e-05, "loss": 0.1363, "step": 23266 }, { "epoch": 7.113115255273617, "grad_norm": 0.534412145614624, "learning_rate": 2.34584518704089e-05, "loss": 0.1466, "step": 23267 }, { "epoch": 7.113420972179761, "grad_norm": 0.850408673286438, "learning_rate": 2.3458027259988962e-05, "loss": 0.1822, "step": 23268 }, { "epoch": 7.113726689085906, "grad_norm": 0.6433576345443726, "learning_rate": 2.345760264956902e-05, "loss": 0.1502, "step": 23269 }, { "epoch": 7.1140324059920514, "grad_norm": 1.306827425956726, "learning_rate": 2.3457178039149083e-05, "loss": 0.1807, "step": 23270 }, { "epoch": 7.114338122898197, "grad_norm": 0.8719184398651123, "learning_rate": 2.3456753428729142e-05, "loss": 0.1954, "step": 23271 }, { "epoch": 7.114643839804341, "grad_norm": 1.2249009609222412, "learning_rate": 2.3456328818309204e-05, "loss": 0.1771, "step": 23272 }, { "epoch": 7.114949556710486, "grad_norm": 0.32961422204971313, "learning_rate": 2.3455904207889262e-05, "loss": 0.1331, "step": 23273 }, { "epoch": 7.115255273616631, "grad_norm": 0.46041417121887207, "learning_rate": 2.3455479597469325e-05, "loss": 0.09, "step": 23274 }, { "epoch": 7.115560990522776, "grad_norm": 0.19633100926876068, "learning_rate": 2.3455054987049383e-05, "loss": 0.0586, "step": 23275 }, { "epoch": 7.115866707428921, "grad_norm": 0.19990794360637665, "learning_rate": 2.3454630376629445e-05, "loss": 0.0403, "step": 23276 }, { "epoch": 7.1161724243350655, "grad_norm": 0.2706088423728943, "learning_rate": 2.3454205766209504e-05, "loss": 0.0816, "step": 23277 }, { "epoch": 7.116478141241211, "grad_norm": 0.4874607026576996, "learning_rate": 2.3453781155789563e-05, "loss": 0.0481, "step": 23278 }, { "epoch": 7.116783858147356, "grad_norm": 0.15299417078495026, "learning_rate": 2.3453356545369625e-05, "loss": 0.0443, "step": 23279 }, { "epoch": 7.117089575053501, "grad_norm": 0.5438836812973022, "learning_rate": 2.3452931934949684e-05, "loss": 0.054, "step": 23280 }, { "epoch": 7.117395291959645, "grad_norm": 0.25566768646240234, "learning_rate": 2.3452507324529746e-05, "loss": 0.0714, "step": 23281 }, { "epoch": 7.11770100886579, "grad_norm": 0.6724231243133545, "learning_rate": 2.3452082714109804e-05, "loss": 0.0609, "step": 23282 }, { "epoch": 7.118006725771935, "grad_norm": 0.27029168605804443, "learning_rate": 2.3451658103689866e-05, "loss": 0.081, "step": 23283 }, { "epoch": 7.11831244267808, "grad_norm": 0.5579847097396851, "learning_rate": 2.3451233493269925e-05, "loss": 0.0663, "step": 23284 }, { "epoch": 7.118618159584225, "grad_norm": 0.3169487416744232, "learning_rate": 2.3450808882849987e-05, "loss": 0.0779, "step": 23285 }, { "epoch": 7.11892387649037, "grad_norm": 0.301389217376709, "learning_rate": 2.3450384272430046e-05, "loss": 0.0964, "step": 23286 }, { "epoch": 7.119229593396515, "grad_norm": 0.5158193111419678, "learning_rate": 2.3449959662010108e-05, "loss": 0.1303, "step": 23287 }, { "epoch": 7.11953531030266, "grad_norm": 1.035073161125183, "learning_rate": 2.3449535051590167e-05, "loss": 0.1622, "step": 23288 }, { "epoch": 7.119841027208805, "grad_norm": 0.5991262197494507, "learning_rate": 2.344911044117023e-05, "loss": 0.1392, "step": 23289 }, { "epoch": 7.120146744114949, "grad_norm": 0.5372682809829712, "learning_rate": 2.3448685830750287e-05, "loss": 0.1511, "step": 23290 }, { "epoch": 7.120452461021094, "grad_norm": 0.5702025294303894, "learning_rate": 2.3448261220330346e-05, "loss": 0.1362, "step": 23291 }, { "epoch": 7.1207581779272395, "grad_norm": 0.6494907736778259, "learning_rate": 2.3447836609910408e-05, "loss": 0.1676, "step": 23292 }, { "epoch": 7.121063894833385, "grad_norm": 0.528054416179657, "learning_rate": 2.3447411999490467e-05, "loss": 0.1786, "step": 23293 }, { "epoch": 7.121369611739529, "grad_norm": 0.9698000550270081, "learning_rate": 2.344698738907053e-05, "loss": 0.1792, "step": 23294 }, { "epoch": 7.121675328645674, "grad_norm": 0.964857816696167, "learning_rate": 2.3446562778650588e-05, "loss": 0.1815, "step": 23295 }, { "epoch": 7.121981045551819, "grad_norm": 0.6978760361671448, "learning_rate": 2.344613816823065e-05, "loss": 0.1786, "step": 23296 }, { "epoch": 7.122286762457964, "grad_norm": 1.5525306463241577, "learning_rate": 2.344571355781071e-05, "loss": 0.2084, "step": 23297 }, { "epoch": 7.122592479364108, "grad_norm": 0.401061087846756, "learning_rate": 2.344528894739077e-05, "loss": 0.1491, "step": 23298 }, { "epoch": 7.122898196270254, "grad_norm": 0.32206660509109497, "learning_rate": 2.344486433697083e-05, "loss": 0.0729, "step": 23299 }, { "epoch": 7.123203913176399, "grad_norm": 0.2587725818157196, "learning_rate": 2.344443972655089e-05, "loss": 0.0599, "step": 23300 }, { "epoch": 7.123509630082544, "grad_norm": 0.23485207557678223, "learning_rate": 2.344401511613095e-05, "loss": 0.0557, "step": 23301 }, { "epoch": 7.123815346988689, "grad_norm": 0.2931899428367615, "learning_rate": 2.3443590505711012e-05, "loss": 0.0458, "step": 23302 }, { "epoch": 7.124121063894833, "grad_norm": 0.15703964233398438, "learning_rate": 2.344316589529107e-05, "loss": 0.039, "step": 23303 }, { "epoch": 7.124426780800978, "grad_norm": 0.4481066167354584, "learning_rate": 2.344274128487113e-05, "loss": 0.0444, "step": 23304 }, { "epoch": 7.124732497707123, "grad_norm": 0.35288771986961365, "learning_rate": 2.3442316674451192e-05, "loss": 0.046, "step": 23305 }, { "epoch": 7.1250382146132685, "grad_norm": 0.389151394367218, "learning_rate": 2.344189206403125e-05, "loss": 0.0711, "step": 23306 }, { "epoch": 7.125343931519413, "grad_norm": 0.4071882963180542, "learning_rate": 2.3441467453611313e-05, "loss": 0.0778, "step": 23307 }, { "epoch": 7.125649648425558, "grad_norm": 0.38952168822288513, "learning_rate": 2.344104284319137e-05, "loss": 0.0784, "step": 23308 }, { "epoch": 7.125955365331703, "grad_norm": 0.3217843770980835, "learning_rate": 2.3440618232771433e-05, "loss": 0.0666, "step": 23309 }, { "epoch": 7.126261082237848, "grad_norm": 0.1982223093509674, "learning_rate": 2.3440193622351492e-05, "loss": 0.0598, "step": 23310 }, { "epoch": 7.126566799143992, "grad_norm": 4.542474746704102, "learning_rate": 2.3439769011931554e-05, "loss": 0.1238, "step": 23311 }, { "epoch": 7.126872516050137, "grad_norm": 0.3470214307308197, "learning_rate": 2.3439344401511613e-05, "loss": 0.1124, "step": 23312 }, { "epoch": 7.1271782329562825, "grad_norm": 1.007779598236084, "learning_rate": 2.3438919791091675e-05, "loss": 0.1378, "step": 23313 }, { "epoch": 7.127483949862428, "grad_norm": 0.8054673671722412, "learning_rate": 2.3438495180671734e-05, "loss": 0.1789, "step": 23314 }, { "epoch": 7.127789666768573, "grad_norm": 0.42368441820144653, "learning_rate": 2.3438070570251796e-05, "loss": 0.1343, "step": 23315 }, { "epoch": 7.128095383674717, "grad_norm": 0.6591746211051941, "learning_rate": 2.3437645959831854e-05, "loss": 0.1423, "step": 23316 }, { "epoch": 7.128401100580862, "grad_norm": 1.4755971431732178, "learning_rate": 2.3437221349411913e-05, "loss": 0.1482, "step": 23317 }, { "epoch": 7.128706817487007, "grad_norm": 0.6436469554901123, "learning_rate": 2.3436796738991975e-05, "loss": 0.1604, "step": 23318 }, { "epoch": 7.129012534393152, "grad_norm": 2.196922540664673, "learning_rate": 2.3436372128572034e-05, "loss": 0.1666, "step": 23319 }, { "epoch": 7.1293182512992965, "grad_norm": 0.8222170472145081, "learning_rate": 2.3435947518152096e-05, "loss": 0.1839, "step": 23320 }, { "epoch": 7.129623968205442, "grad_norm": 0.5456311106681824, "learning_rate": 2.3435522907732155e-05, "loss": 0.1935, "step": 23321 }, { "epoch": 7.129929685111587, "grad_norm": 2.4205269813537598, "learning_rate": 2.3435098297312217e-05, "loss": 0.2116, "step": 23322 }, { "epoch": 7.130235402017732, "grad_norm": 0.5035847425460815, "learning_rate": 2.3434673686892275e-05, "loss": 0.1458, "step": 23323 }, { "epoch": 7.130541118923876, "grad_norm": 0.23885196447372437, "learning_rate": 2.3434249076472338e-05, "loss": 0.0604, "step": 23324 }, { "epoch": 7.130846835830021, "grad_norm": 0.3970066010951996, "learning_rate": 2.3433824466052396e-05, "loss": 0.07, "step": 23325 }, { "epoch": 7.131152552736166, "grad_norm": 0.2151309847831726, "learning_rate": 2.3433399855632458e-05, "loss": 0.047, "step": 23326 }, { "epoch": 7.131458269642311, "grad_norm": 0.27758437395095825, "learning_rate": 2.3432975245212517e-05, "loss": 0.0528, "step": 23327 }, { "epoch": 7.131763986548457, "grad_norm": 0.26287642121315, "learning_rate": 2.343255063479258e-05, "loss": 0.0797, "step": 23328 }, { "epoch": 7.132069703454601, "grad_norm": 0.8775317072868347, "learning_rate": 2.3432126024372638e-05, "loss": 0.0642, "step": 23329 }, { "epoch": 7.132375420360746, "grad_norm": 0.16924011707305908, "learning_rate": 2.3431701413952696e-05, "loss": 0.04, "step": 23330 }, { "epoch": 7.132681137266891, "grad_norm": 0.45852023363113403, "learning_rate": 2.343127680353276e-05, "loss": 0.088, "step": 23331 }, { "epoch": 7.132986854173036, "grad_norm": 0.4707750082015991, "learning_rate": 2.3430852193112817e-05, "loss": 0.0568, "step": 23332 }, { "epoch": 7.13329257107918, "grad_norm": 0.5994769930839539, "learning_rate": 2.343042758269288e-05, "loss": 0.0701, "step": 23333 }, { "epoch": 7.1335982879853255, "grad_norm": 0.5505498647689819, "learning_rate": 2.3430002972272938e-05, "loss": 0.0697, "step": 23334 }, { "epoch": 7.133904004891471, "grad_norm": 0.4228430390357971, "learning_rate": 2.3429578361853e-05, "loss": 0.1114, "step": 23335 }, { "epoch": 7.134209721797616, "grad_norm": 0.7700561285018921, "learning_rate": 2.342915375143306e-05, "loss": 0.0855, "step": 23336 }, { "epoch": 7.13451543870376, "grad_norm": 0.4580347537994385, "learning_rate": 2.342872914101312e-05, "loss": 0.0906, "step": 23337 }, { "epoch": 7.134821155609905, "grad_norm": 1.4098620414733887, "learning_rate": 2.342830453059318e-05, "loss": 0.1819, "step": 23338 }, { "epoch": 7.13512687251605, "grad_norm": 0.7304680943489075, "learning_rate": 2.3427879920173242e-05, "loss": 0.139, "step": 23339 }, { "epoch": 7.135432589422195, "grad_norm": 0.593011736869812, "learning_rate": 2.34274553097533e-05, "loss": 0.1334, "step": 23340 }, { "epoch": 7.13573830632834, "grad_norm": 0.6764066815376282, "learning_rate": 2.3427030699333363e-05, "loss": 0.1393, "step": 23341 }, { "epoch": 7.136044023234485, "grad_norm": 0.696047842502594, "learning_rate": 2.342660608891342e-05, "loss": 0.158, "step": 23342 }, { "epoch": 7.13634974014063, "grad_norm": 0.8870816826820374, "learning_rate": 2.342618147849348e-05, "loss": 0.1414, "step": 23343 }, { "epoch": 7.136655457046775, "grad_norm": 1.2082443237304688, "learning_rate": 2.3425756868073542e-05, "loss": 0.1779, "step": 23344 }, { "epoch": 7.13696117395292, "grad_norm": 1.5463789701461792, "learning_rate": 2.34253322576536e-05, "loss": 0.1696, "step": 23345 }, { "epoch": 7.137266890859064, "grad_norm": 1.6664338111877441, "learning_rate": 2.3424907647233663e-05, "loss": 0.1878, "step": 23346 }, { "epoch": 7.137572607765209, "grad_norm": 1.1175576448440552, "learning_rate": 2.342448303681372e-05, "loss": 0.2259, "step": 23347 }, { "epoch": 7.137878324671354, "grad_norm": 0.8715552687644958, "learning_rate": 2.3424058426393784e-05, "loss": 0.1311, "step": 23348 }, { "epoch": 7.1381840415774995, "grad_norm": 0.32900434732437134, "learning_rate": 2.3423633815973842e-05, "loss": 0.0828, "step": 23349 }, { "epoch": 7.138489758483644, "grad_norm": 0.5015802979469299, "learning_rate": 2.3423209205553904e-05, "loss": 0.0557, "step": 23350 }, { "epoch": 7.138795475389789, "grad_norm": 0.5751197934150696, "learning_rate": 2.3422784595133963e-05, "loss": 0.0751, "step": 23351 }, { "epoch": 7.139101192295934, "grad_norm": 0.26570963859558105, "learning_rate": 2.3422359984714025e-05, "loss": 0.0535, "step": 23352 }, { "epoch": 7.139406909202079, "grad_norm": 0.2834629416465759, "learning_rate": 2.3421935374294087e-05, "loss": 0.0463, "step": 23353 }, { "epoch": 7.139712626108224, "grad_norm": 0.22699296474456787, "learning_rate": 2.342151076387415e-05, "loss": 0.0775, "step": 23354 }, { "epoch": 7.140018343014368, "grad_norm": 0.2192831039428711, "learning_rate": 2.3421086153454208e-05, "loss": 0.0677, "step": 23355 }, { "epoch": 7.1403240599205136, "grad_norm": 0.20734979212284088, "learning_rate": 2.3420661543034267e-05, "loss": 0.0643, "step": 23356 }, { "epoch": 7.140629776826659, "grad_norm": 0.21715103089809418, "learning_rate": 2.342023693261433e-05, "loss": 0.0705, "step": 23357 }, { "epoch": 7.140935493732804, "grad_norm": 0.24755936861038208, "learning_rate": 2.3419812322194388e-05, "loss": 0.0763, "step": 23358 }, { "epoch": 7.141241210638948, "grad_norm": 0.48555636405944824, "learning_rate": 2.341938771177445e-05, "loss": 0.0732, "step": 23359 }, { "epoch": 7.141546927545093, "grad_norm": 0.6414110660552979, "learning_rate": 2.341896310135451e-05, "loss": 0.0874, "step": 23360 }, { "epoch": 7.141852644451238, "grad_norm": 0.352631539106369, "learning_rate": 2.341853849093457e-05, "loss": 0.129, "step": 23361 }, { "epoch": 7.142158361357383, "grad_norm": 0.42833444476127625, "learning_rate": 2.341811388051463e-05, "loss": 0.1003, "step": 23362 }, { "epoch": 7.142464078263528, "grad_norm": 0.44595053791999817, "learning_rate": 2.341768927009469e-05, "loss": 0.1116, "step": 23363 }, { "epoch": 7.142769795169673, "grad_norm": 0.3936826288700104, "learning_rate": 2.341726465967475e-05, "loss": 0.13, "step": 23364 }, { "epoch": 7.143075512075818, "grad_norm": 1.44236421585083, "learning_rate": 2.3416840049254812e-05, "loss": 0.1525, "step": 23365 }, { "epoch": 7.143381228981963, "grad_norm": 0.7849860191345215, "learning_rate": 2.341641543883487e-05, "loss": 0.1352, "step": 23366 }, { "epoch": 7.143686945888108, "grad_norm": 0.804653525352478, "learning_rate": 2.3415990828414933e-05, "loss": 0.169, "step": 23367 }, { "epoch": 7.143992662794252, "grad_norm": 0.9190242886543274, "learning_rate": 2.341556621799499e-05, "loss": 0.1659, "step": 23368 }, { "epoch": 7.144298379700397, "grad_norm": 1.6475063562393188, "learning_rate": 2.341514160757505e-05, "loss": 0.1859, "step": 23369 }, { "epoch": 7.1446040966065425, "grad_norm": 0.8101791739463806, "learning_rate": 2.3414716997155112e-05, "loss": 0.21, "step": 23370 }, { "epoch": 7.144909813512688, "grad_norm": 0.9632042646408081, "learning_rate": 2.341429238673517e-05, "loss": 0.1972, "step": 23371 }, { "epoch": 7.145215530418832, "grad_norm": 1.0264912843704224, "learning_rate": 2.3413867776315233e-05, "loss": 0.2515, "step": 23372 }, { "epoch": 7.145521247324977, "grad_norm": 0.327314168214798, "learning_rate": 2.3413443165895292e-05, "loss": 0.1481, "step": 23373 }, { "epoch": 7.145826964231122, "grad_norm": 0.23926763236522675, "learning_rate": 2.3413018555475354e-05, "loss": 0.0868, "step": 23374 }, { "epoch": 7.146132681137267, "grad_norm": 0.3098345994949341, "learning_rate": 2.3412593945055413e-05, "loss": 0.0531, "step": 23375 }, { "epoch": 7.146438398043411, "grad_norm": 0.30871322751045227, "learning_rate": 2.3412169334635475e-05, "loss": 0.0468, "step": 23376 }, { "epoch": 7.1467441149495565, "grad_norm": 4.928985595703125, "learning_rate": 2.3411744724215533e-05, "loss": 0.0399, "step": 23377 }, { "epoch": 7.147049831855702, "grad_norm": 0.153474360704422, "learning_rate": 2.3411320113795595e-05, "loss": 0.0445, "step": 23378 }, { "epoch": 7.147355548761847, "grad_norm": 0.5192882418632507, "learning_rate": 2.3410895503375654e-05, "loss": 0.0425, "step": 23379 }, { "epoch": 7.147661265667992, "grad_norm": 0.4853235185146332, "learning_rate": 2.3410470892955713e-05, "loss": 0.0729, "step": 23380 }, { "epoch": 7.147966982574136, "grad_norm": 0.3733968734741211, "learning_rate": 2.3410046282535775e-05, "loss": 0.0753, "step": 23381 }, { "epoch": 7.148272699480281, "grad_norm": 1.8031589984893799, "learning_rate": 2.3409621672115834e-05, "loss": 0.0446, "step": 23382 }, { "epoch": 7.148578416386426, "grad_norm": 0.370435893535614, "learning_rate": 2.3409197061695896e-05, "loss": 0.0689, "step": 23383 }, { "epoch": 7.148884133292571, "grad_norm": 0.463544100522995, "learning_rate": 2.3408772451275954e-05, "loss": 0.074, "step": 23384 }, { "epoch": 7.149189850198716, "grad_norm": 0.3242402672767639, "learning_rate": 2.3408347840856016e-05, "loss": 0.0744, "step": 23385 }, { "epoch": 7.149495567104861, "grad_norm": 0.28500935435295105, "learning_rate": 2.3407923230436075e-05, "loss": 0.0956, "step": 23386 }, { "epoch": 7.149801284011006, "grad_norm": 0.502831757068634, "learning_rate": 2.3407498620016137e-05, "loss": 0.1426, "step": 23387 }, { "epoch": 7.150107000917151, "grad_norm": 0.5672838091850281, "learning_rate": 2.3407074009596196e-05, "loss": 0.1144, "step": 23388 }, { "epoch": 7.150412717823295, "grad_norm": 0.9320820569992065, "learning_rate": 2.3406649399176258e-05, "loss": 0.1559, "step": 23389 }, { "epoch": 7.15071843472944, "grad_norm": 0.902898371219635, "learning_rate": 2.3406224788756317e-05, "loss": 0.1622, "step": 23390 }, { "epoch": 7.1510241516355855, "grad_norm": 0.6473668217658997, "learning_rate": 2.340580017833638e-05, "loss": 0.1597, "step": 23391 }, { "epoch": 7.151329868541731, "grad_norm": 0.5314350724220276, "learning_rate": 2.3405375567916438e-05, "loss": 0.1704, "step": 23392 }, { "epoch": 7.151635585447876, "grad_norm": 0.5236432552337646, "learning_rate": 2.3404950957496496e-05, "loss": 0.1464, "step": 23393 }, { "epoch": 7.15194130235402, "grad_norm": 0.5150747299194336, "learning_rate": 2.340452634707656e-05, "loss": 0.1527, "step": 23394 }, { "epoch": 7.152247019260165, "grad_norm": 0.9196963906288147, "learning_rate": 2.3404101736656617e-05, "loss": 0.1863, "step": 23395 }, { "epoch": 7.15255273616631, "grad_norm": 1.5598653554916382, "learning_rate": 2.340367712623668e-05, "loss": 0.2091, "step": 23396 }, { "epoch": 7.152858453072455, "grad_norm": 1.2005016803741455, "learning_rate": 2.3403252515816738e-05, "loss": 0.2546, "step": 23397 }, { "epoch": 7.1531641699785995, "grad_norm": 0.3426981568336487, "learning_rate": 2.34028279053968e-05, "loss": 0.1384, "step": 23398 }, { "epoch": 7.153469886884745, "grad_norm": 0.16068808734416962, "learning_rate": 2.340240329497686e-05, "loss": 0.0855, "step": 23399 }, { "epoch": 7.15377560379089, "grad_norm": 0.22008459270000458, "learning_rate": 2.340197868455692e-05, "loss": 0.0605, "step": 23400 }, { "epoch": 7.154081320697035, "grad_norm": 0.18149153888225555, "learning_rate": 2.340155407413698e-05, "loss": 0.052, "step": 23401 }, { "epoch": 7.154387037603179, "grad_norm": 0.26743149757385254, "learning_rate": 2.340112946371704e-05, "loss": 0.0496, "step": 23402 }, { "epoch": 7.154692754509324, "grad_norm": 0.49076616764068604, "learning_rate": 2.34007048532971e-05, "loss": 0.0587, "step": 23403 }, { "epoch": 7.154998471415469, "grad_norm": 0.21376174688339233, "learning_rate": 2.3400280242877162e-05, "loss": 0.0642, "step": 23404 }, { "epoch": 7.155304188321614, "grad_norm": 0.2696560323238373, "learning_rate": 2.339985563245722e-05, "loss": 0.0591, "step": 23405 }, { "epoch": 7.1556099052277595, "grad_norm": 0.42446136474609375, "learning_rate": 2.339943102203728e-05, "loss": 0.0942, "step": 23406 }, { "epoch": 7.155915622133904, "grad_norm": 0.8028692603111267, "learning_rate": 2.3399006411617342e-05, "loss": 0.0741, "step": 23407 }, { "epoch": 7.156221339040049, "grad_norm": 0.7726205587387085, "learning_rate": 2.33985818011974e-05, "loss": 0.0671, "step": 23408 }, { "epoch": 7.156527055946194, "grad_norm": 0.32394957542419434, "learning_rate": 2.3398157190777463e-05, "loss": 0.0963, "step": 23409 }, { "epoch": 7.156832772852339, "grad_norm": 0.3274622857570648, "learning_rate": 2.339773258035752e-05, "loss": 0.1007, "step": 23410 }, { "epoch": 7.157138489758483, "grad_norm": 0.61168372631073, "learning_rate": 2.3397307969937583e-05, "loss": 0.1148, "step": 23411 }, { "epoch": 7.157444206664628, "grad_norm": 1.3473035097122192, "learning_rate": 2.3396883359517642e-05, "loss": 0.1144, "step": 23412 }, { "epoch": 7.1577499235707736, "grad_norm": 0.5908524394035339, "learning_rate": 2.3396458749097704e-05, "loss": 0.1488, "step": 23413 }, { "epoch": 7.158055640476919, "grad_norm": 1.1037670373916626, "learning_rate": 2.3396034138677763e-05, "loss": 0.138, "step": 23414 }, { "epoch": 7.158361357383063, "grad_norm": 0.399033784866333, "learning_rate": 2.3395609528257825e-05, "loss": 0.1363, "step": 23415 }, { "epoch": 7.158667074289208, "grad_norm": 0.5441644787788391, "learning_rate": 2.3395184917837884e-05, "loss": 0.1833, "step": 23416 }, { "epoch": 7.158972791195353, "grad_norm": 0.7664006948471069, "learning_rate": 2.3394760307417946e-05, "loss": 0.1519, "step": 23417 }, { "epoch": 7.159278508101498, "grad_norm": 0.9772031307220459, "learning_rate": 2.3394335696998004e-05, "loss": 0.1818, "step": 23418 }, { "epoch": 7.159584225007643, "grad_norm": 0.4942556619644165, "learning_rate": 2.3393911086578063e-05, "loss": 0.1608, "step": 23419 }, { "epoch": 7.159889941913788, "grad_norm": 1.0102193355560303, "learning_rate": 2.3393486476158125e-05, "loss": 0.1534, "step": 23420 }, { "epoch": 7.160195658819933, "grad_norm": 0.6506147980690002, "learning_rate": 2.3393061865738184e-05, "loss": 0.2015, "step": 23421 }, { "epoch": 7.160501375726078, "grad_norm": 1.9979287385940552, "learning_rate": 2.3392637255318246e-05, "loss": 0.1976, "step": 23422 }, { "epoch": 7.160807092632223, "grad_norm": 0.2930675446987152, "learning_rate": 2.3392212644898305e-05, "loss": 0.1216, "step": 23423 }, { "epoch": 7.161112809538367, "grad_norm": 0.260078489780426, "learning_rate": 2.3391788034478367e-05, "loss": 0.0666, "step": 23424 }, { "epoch": 7.161418526444512, "grad_norm": 0.25905802845954895, "learning_rate": 2.3391363424058425e-05, "loss": 0.0846, "step": 23425 }, { "epoch": 7.161724243350657, "grad_norm": 0.5802618265151978, "learning_rate": 2.3390938813638488e-05, "loss": 0.0622, "step": 23426 }, { "epoch": 7.1620299602568025, "grad_norm": 0.44580426812171936, "learning_rate": 2.3390514203218546e-05, "loss": 0.0573, "step": 23427 }, { "epoch": 7.162335677162947, "grad_norm": 0.7273748517036438, "learning_rate": 2.339008959279861e-05, "loss": 0.0488, "step": 23428 }, { "epoch": 7.162641394069092, "grad_norm": 0.28334876894950867, "learning_rate": 2.3389664982378667e-05, "loss": 0.0528, "step": 23429 }, { "epoch": 7.162947110975237, "grad_norm": 0.22055955231189728, "learning_rate": 2.338924037195873e-05, "loss": 0.0684, "step": 23430 }, { "epoch": 7.163252827881382, "grad_norm": 0.30757221579551697, "learning_rate": 2.3388815761538788e-05, "loss": 0.072, "step": 23431 }, { "epoch": 7.163558544787527, "grad_norm": 0.2521008253097534, "learning_rate": 2.3388391151118847e-05, "loss": 0.0526, "step": 23432 }, { "epoch": 7.163864261693671, "grad_norm": 0.3559456765651703, "learning_rate": 2.338796654069891e-05, "loss": 0.1057, "step": 23433 }, { "epoch": 7.1641699785998165, "grad_norm": 0.6859438419342041, "learning_rate": 2.3387541930278967e-05, "loss": 0.0777, "step": 23434 }, { "epoch": 7.164475695505962, "grad_norm": 3.493546485900879, "learning_rate": 2.338711731985903e-05, "loss": 0.0915, "step": 23435 }, { "epoch": 7.164781412412107, "grad_norm": 0.3550647795200348, "learning_rate": 2.3386692709439088e-05, "loss": 0.1138, "step": 23436 }, { "epoch": 7.165087129318251, "grad_norm": 0.46938690543174744, "learning_rate": 2.338626809901915e-05, "loss": 0.1023, "step": 23437 }, { "epoch": 7.165392846224396, "grad_norm": 0.906051754951477, "learning_rate": 2.338584348859921e-05, "loss": 0.1213, "step": 23438 }, { "epoch": 7.165698563130541, "grad_norm": 0.5548027753829956, "learning_rate": 2.338541887817927e-05, "loss": 0.1284, "step": 23439 }, { "epoch": 7.166004280036686, "grad_norm": 0.5707918405532837, "learning_rate": 2.338499426775933e-05, "loss": 0.137, "step": 23440 }, { "epoch": 7.1663099969428306, "grad_norm": 1.069769263267517, "learning_rate": 2.3384569657339392e-05, "loss": 0.1351, "step": 23441 }, { "epoch": 7.166615713848976, "grad_norm": 2.984057664871216, "learning_rate": 2.338414504691945e-05, "loss": 0.1545, "step": 23442 }, { "epoch": 7.166921430755121, "grad_norm": 0.5194962024688721, "learning_rate": 2.3383720436499513e-05, "loss": 0.1546, "step": 23443 }, { "epoch": 7.167227147661266, "grad_norm": 0.6017947793006897, "learning_rate": 2.338329582607957e-05, "loss": 0.1598, "step": 23444 }, { "epoch": 7.167532864567411, "grad_norm": 0.5786232352256775, "learning_rate": 2.338287121565963e-05, "loss": 0.1636, "step": 23445 }, { "epoch": 7.167838581473555, "grad_norm": 0.5544465780258179, "learning_rate": 2.3382446605239692e-05, "loss": 0.1758, "step": 23446 }, { "epoch": 7.1681442983797, "grad_norm": 1.7820110321044922, "learning_rate": 2.338202199481975e-05, "loss": 0.2125, "step": 23447 }, { "epoch": 7.1684500152858455, "grad_norm": 0.45044657588005066, "learning_rate": 2.3381597384399813e-05, "loss": 0.1336, "step": 23448 }, { "epoch": 7.168755732191991, "grad_norm": 0.1926298290491104, "learning_rate": 2.338117277397987e-05, "loss": 0.0784, "step": 23449 }, { "epoch": 7.169061449098135, "grad_norm": 0.38860198855400085, "learning_rate": 2.3380748163559934e-05, "loss": 0.0887, "step": 23450 }, { "epoch": 7.16936716600428, "grad_norm": 0.21382705867290497, "learning_rate": 2.3380323553139992e-05, "loss": 0.067, "step": 23451 }, { "epoch": 7.169672882910425, "grad_norm": 0.39091649651527405, "learning_rate": 2.3379898942720054e-05, "loss": 0.0506, "step": 23452 }, { "epoch": 7.16997859981657, "grad_norm": 0.36572951078414917, "learning_rate": 2.3379474332300113e-05, "loss": 0.0958, "step": 23453 }, { "epoch": 7.170284316722714, "grad_norm": 0.22710393369197845, "learning_rate": 2.3379049721880175e-05, "loss": 0.0754, "step": 23454 }, { "epoch": 7.1705900336288595, "grad_norm": 0.36190563440322876, "learning_rate": 2.3378625111460237e-05, "loss": 0.0441, "step": 23455 }, { "epoch": 7.170895750535005, "grad_norm": 0.2798950970172882, "learning_rate": 2.33782005010403e-05, "loss": 0.0929, "step": 23456 }, { "epoch": 7.17120146744115, "grad_norm": 0.9765605330467224, "learning_rate": 2.3377775890620358e-05, "loss": 0.061, "step": 23457 }, { "epoch": 7.171507184347295, "grad_norm": 0.48029184341430664, "learning_rate": 2.3377351280200417e-05, "loss": 0.0575, "step": 23458 }, { "epoch": 7.171812901253439, "grad_norm": 0.3164255917072296, "learning_rate": 2.337692666978048e-05, "loss": 0.1163, "step": 23459 }, { "epoch": 7.172118618159584, "grad_norm": 0.24859283864498138, "learning_rate": 2.3376502059360538e-05, "loss": 0.0711, "step": 23460 }, { "epoch": 7.172424335065729, "grad_norm": 0.3582904040813446, "learning_rate": 2.33760774489406e-05, "loss": 0.1225, "step": 23461 }, { "epoch": 7.172730051971874, "grad_norm": 0.6101423501968384, "learning_rate": 2.337565283852066e-05, "loss": 0.0953, "step": 23462 }, { "epoch": 7.173035768878019, "grad_norm": 0.836677610874176, "learning_rate": 2.337522822810072e-05, "loss": 0.1325, "step": 23463 }, { "epoch": 7.173341485784164, "grad_norm": 0.9694917798042297, "learning_rate": 2.337480361768078e-05, "loss": 0.1321, "step": 23464 }, { "epoch": 7.173647202690309, "grad_norm": 0.4746935963630676, "learning_rate": 2.337437900726084e-05, "loss": 0.1536, "step": 23465 }, { "epoch": 7.173952919596454, "grad_norm": 1.8000982999801636, "learning_rate": 2.33739543968409e-05, "loss": 0.163, "step": 23466 }, { "epoch": 7.174258636502598, "grad_norm": 0.9712826013565063, "learning_rate": 2.3373529786420962e-05, "loss": 0.1689, "step": 23467 }, { "epoch": 7.174564353408743, "grad_norm": 0.7167025208473206, "learning_rate": 2.337310517600102e-05, "loss": 0.1616, "step": 23468 }, { "epoch": 7.174870070314888, "grad_norm": 0.7523459792137146, "learning_rate": 2.3372680565581083e-05, "loss": 0.1505, "step": 23469 }, { "epoch": 7.1751757872210336, "grad_norm": 0.7821638584136963, "learning_rate": 2.337225595516114e-05, "loss": 0.1636, "step": 23470 }, { "epoch": 7.175481504127179, "grad_norm": 0.7917117476463318, "learning_rate": 2.33718313447412e-05, "loss": 0.1606, "step": 23471 }, { "epoch": 7.175787221033323, "grad_norm": 3.2409090995788574, "learning_rate": 2.3371406734321262e-05, "loss": 0.1971, "step": 23472 }, { "epoch": 7.176092937939468, "grad_norm": 1.1813331842422485, "learning_rate": 2.337098212390132e-05, "loss": 0.1296, "step": 23473 }, { "epoch": 7.176398654845613, "grad_norm": 0.4547446072101593, "learning_rate": 2.3370557513481383e-05, "loss": 0.0662, "step": 23474 }, { "epoch": 7.176704371751758, "grad_norm": 0.34142613410949707, "learning_rate": 2.3370132903061442e-05, "loss": 0.0856, "step": 23475 }, { "epoch": 7.1770100886579025, "grad_norm": 0.2916230261325836, "learning_rate": 2.3369708292641504e-05, "loss": 0.0536, "step": 23476 }, { "epoch": 7.177315805564048, "grad_norm": 0.6702030301094055, "learning_rate": 2.3369283682221563e-05, "loss": 0.0562, "step": 23477 }, { "epoch": 7.177621522470193, "grad_norm": 0.37804776430130005, "learning_rate": 2.3368859071801625e-05, "loss": 0.0609, "step": 23478 }, { "epoch": 7.177927239376338, "grad_norm": 0.3770039677619934, "learning_rate": 2.3368434461381683e-05, "loss": 0.0623, "step": 23479 }, { "epoch": 7.178232956282482, "grad_norm": 0.7993656396865845, "learning_rate": 2.3368009850961745e-05, "loss": 0.0654, "step": 23480 }, { "epoch": 7.178538673188627, "grad_norm": 0.8595466613769531, "learning_rate": 2.3367585240541804e-05, "loss": 0.0656, "step": 23481 }, { "epoch": 7.178844390094772, "grad_norm": 0.462790846824646, "learning_rate": 2.3367160630121866e-05, "loss": 0.0979, "step": 23482 }, { "epoch": 7.179150107000917, "grad_norm": 0.6698777675628662, "learning_rate": 2.3366736019701925e-05, "loss": 0.0824, "step": 23483 }, { "epoch": 7.1794558239070625, "grad_norm": 0.18763791024684906, "learning_rate": 2.3366311409281984e-05, "loss": 0.0609, "step": 23484 }, { "epoch": 7.179761540813207, "grad_norm": 0.564385712146759, "learning_rate": 2.3365886798862046e-05, "loss": 0.0879, "step": 23485 }, { "epoch": 7.180067257719352, "grad_norm": 0.3195793628692627, "learning_rate": 2.3365462188442104e-05, "loss": 0.1013, "step": 23486 }, { "epoch": 7.180372974625497, "grad_norm": 0.4421844780445099, "learning_rate": 2.3365037578022166e-05, "loss": 0.1283, "step": 23487 }, { "epoch": 7.180678691531642, "grad_norm": 0.5311647057533264, "learning_rate": 2.3364612967602225e-05, "loss": 0.141, "step": 23488 }, { "epoch": 7.180984408437786, "grad_norm": 0.5695652365684509, "learning_rate": 2.3364188357182287e-05, "loss": 0.1431, "step": 23489 }, { "epoch": 7.181290125343931, "grad_norm": 0.45085665583610535, "learning_rate": 2.3363763746762346e-05, "loss": 0.1528, "step": 23490 }, { "epoch": 7.1815958422500765, "grad_norm": 1.0345439910888672, "learning_rate": 2.3363339136342408e-05, "loss": 0.1742, "step": 23491 }, { "epoch": 7.181901559156222, "grad_norm": 1.1505788564682007, "learning_rate": 2.3362914525922467e-05, "loss": 0.1485, "step": 23492 }, { "epoch": 7.182207276062366, "grad_norm": 1.0843051671981812, "learning_rate": 2.336248991550253e-05, "loss": 0.1749, "step": 23493 }, { "epoch": 7.182512992968511, "grad_norm": 0.6726582646369934, "learning_rate": 2.3362065305082588e-05, "loss": 0.1787, "step": 23494 }, { "epoch": 7.182818709874656, "grad_norm": 0.5421465635299683, "learning_rate": 2.3361640694662646e-05, "loss": 0.1498, "step": 23495 }, { "epoch": 7.183124426780801, "grad_norm": 0.8908745646476746, "learning_rate": 2.336121608424271e-05, "loss": 0.1544, "step": 23496 }, { "epoch": 7.183430143686946, "grad_norm": 1.3951467275619507, "learning_rate": 2.3360791473822767e-05, "loss": 0.2075, "step": 23497 }, { "epoch": 7.1837358605930905, "grad_norm": 0.29271095991134644, "learning_rate": 2.336036686340283e-05, "loss": 0.1205, "step": 23498 }, { "epoch": 7.184041577499236, "grad_norm": 0.22516010701656342, "learning_rate": 2.3359942252982888e-05, "loss": 0.0851, "step": 23499 }, { "epoch": 7.184347294405381, "grad_norm": 0.25433430075645447, "learning_rate": 2.335951764256295e-05, "loss": 0.0571, "step": 23500 }, { "epoch": 7.184653011311526, "grad_norm": 0.26677805185317993, "learning_rate": 2.335909303214301e-05, "loss": 0.0696, "step": 23501 }, { "epoch": 7.18495872821767, "grad_norm": 0.2703262269496918, "learning_rate": 2.335866842172307e-05, "loss": 0.0519, "step": 23502 }, { "epoch": 7.185264445123815, "grad_norm": 0.2840028405189514, "learning_rate": 2.335824381130313e-05, "loss": 0.0442, "step": 23503 }, { "epoch": 7.18557016202996, "grad_norm": 0.3368328809738159, "learning_rate": 2.335781920088319e-05, "loss": 0.081, "step": 23504 }, { "epoch": 7.1858758789361055, "grad_norm": 0.37254858016967773, "learning_rate": 2.335739459046325e-05, "loss": 0.0588, "step": 23505 }, { "epoch": 7.18618159584225, "grad_norm": 0.5935482978820801, "learning_rate": 2.3356969980043312e-05, "loss": 0.0561, "step": 23506 }, { "epoch": 7.186487312748395, "grad_norm": 0.19721931219100952, "learning_rate": 2.335654536962337e-05, "loss": 0.0593, "step": 23507 }, { "epoch": 7.18679302965454, "grad_norm": 0.26686426997184753, "learning_rate": 2.335612075920343e-05, "loss": 0.0807, "step": 23508 }, { "epoch": 7.187098746560685, "grad_norm": 0.240223690867424, "learning_rate": 2.3355696148783492e-05, "loss": 0.0933, "step": 23509 }, { "epoch": 7.18740446346683, "grad_norm": 0.8069018721580505, "learning_rate": 2.335527153836355e-05, "loss": 0.0964, "step": 23510 }, { "epoch": 7.187710180372974, "grad_norm": 0.24680493772029877, "learning_rate": 2.3354846927943613e-05, "loss": 0.0996, "step": 23511 }, { "epoch": 7.1880158972791195, "grad_norm": 0.4726537764072418, "learning_rate": 2.335442231752367e-05, "loss": 0.1214, "step": 23512 }, { "epoch": 7.188321614185265, "grad_norm": 0.5991426110267639, "learning_rate": 2.3353997707103733e-05, "loss": 0.1307, "step": 23513 }, { "epoch": 7.18862733109141, "grad_norm": 0.6080889105796814, "learning_rate": 2.3353573096683792e-05, "loss": 0.1622, "step": 23514 }, { "epoch": 7.188933047997554, "grad_norm": 0.4634912312030792, "learning_rate": 2.3353148486263854e-05, "loss": 0.1452, "step": 23515 }, { "epoch": 7.189238764903699, "grad_norm": 0.5445618629455566, "learning_rate": 2.3352723875843913e-05, "loss": 0.1605, "step": 23516 }, { "epoch": 7.189544481809844, "grad_norm": 0.7616962790489197, "learning_rate": 2.3352299265423975e-05, "loss": 0.1681, "step": 23517 }, { "epoch": 7.189850198715989, "grad_norm": 0.4954245686531067, "learning_rate": 2.3351874655004034e-05, "loss": 0.1558, "step": 23518 }, { "epoch": 7.1901559156221335, "grad_norm": 0.787028968334198, "learning_rate": 2.3351450044584096e-05, "loss": 0.1483, "step": 23519 }, { "epoch": 7.190461632528279, "grad_norm": 0.8455654382705688, "learning_rate": 2.3351025434164154e-05, "loss": 0.1683, "step": 23520 }, { "epoch": 7.190767349434424, "grad_norm": 1.6702255010604858, "learning_rate": 2.3350600823744213e-05, "loss": 0.1653, "step": 23521 }, { "epoch": 7.191073066340569, "grad_norm": 1.619771957397461, "learning_rate": 2.3350176213324275e-05, "loss": 0.2292, "step": 23522 }, { "epoch": 7.191378783246714, "grad_norm": 0.7575592994689941, "learning_rate": 2.3349751602904334e-05, "loss": 0.1248, "step": 23523 }, { "epoch": 7.191684500152858, "grad_norm": 0.2846376597881317, "learning_rate": 2.3349326992484396e-05, "loss": 0.0967, "step": 23524 }, { "epoch": 7.191990217059003, "grad_norm": 0.39211925864219666, "learning_rate": 2.3348902382064455e-05, "loss": 0.0575, "step": 23525 }, { "epoch": 7.192295933965148, "grad_norm": 0.1727498173713684, "learning_rate": 2.3348477771644517e-05, "loss": 0.0393, "step": 23526 }, { "epoch": 7.1926016508712936, "grad_norm": 0.18861855566501617, "learning_rate": 2.3348053161224575e-05, "loss": 0.043, "step": 23527 }, { "epoch": 7.192907367777438, "grad_norm": 0.3570173382759094, "learning_rate": 2.3347628550804638e-05, "loss": 0.0614, "step": 23528 }, { "epoch": 7.193213084683583, "grad_norm": 0.12790638208389282, "learning_rate": 2.3347203940384696e-05, "loss": 0.0376, "step": 23529 }, { "epoch": 7.193518801589728, "grad_norm": 0.24693183600902557, "learning_rate": 2.334677932996476e-05, "loss": 0.0607, "step": 23530 }, { "epoch": 7.193824518495873, "grad_norm": 0.37179189920425415, "learning_rate": 2.3346354719544817e-05, "loss": 0.0748, "step": 23531 }, { "epoch": 7.194130235402017, "grad_norm": 0.18821083009243011, "learning_rate": 2.334593010912488e-05, "loss": 0.0435, "step": 23532 }, { "epoch": 7.1944359523081625, "grad_norm": 0.3019830286502838, "learning_rate": 2.3345505498704938e-05, "loss": 0.0607, "step": 23533 }, { "epoch": 7.194741669214308, "grad_norm": 0.22361727058887482, "learning_rate": 2.3345080888284997e-05, "loss": 0.0659, "step": 23534 }, { "epoch": 7.195047386120453, "grad_norm": 1.2703369855880737, "learning_rate": 2.334465627786506e-05, "loss": 0.1078, "step": 23535 }, { "epoch": 7.195353103026598, "grad_norm": 0.6473332047462463, "learning_rate": 2.3344231667445117e-05, "loss": 0.1137, "step": 23536 }, { "epoch": 7.195658819932742, "grad_norm": 0.877281904220581, "learning_rate": 2.334380705702518e-05, "loss": 0.1191, "step": 23537 }, { "epoch": 7.195964536838887, "grad_norm": 0.37966033816337585, "learning_rate": 2.3343382446605238e-05, "loss": 0.1077, "step": 23538 }, { "epoch": 7.196270253745032, "grad_norm": 0.49361035227775574, "learning_rate": 2.33429578361853e-05, "loss": 0.1519, "step": 23539 }, { "epoch": 7.196575970651177, "grad_norm": 0.6423585414886475, "learning_rate": 2.334253322576536e-05, "loss": 0.1663, "step": 23540 }, { "epoch": 7.196881687557322, "grad_norm": 0.7748610377311707, "learning_rate": 2.334210861534542e-05, "loss": 0.1357, "step": 23541 }, { "epoch": 7.197187404463467, "grad_norm": 0.8166571855545044, "learning_rate": 2.334168400492548e-05, "loss": 0.1899, "step": 23542 }, { "epoch": 7.197493121369612, "grad_norm": 0.7871077656745911, "learning_rate": 2.3341259394505542e-05, "loss": 0.1433, "step": 23543 }, { "epoch": 7.197798838275757, "grad_norm": 0.89417564868927, "learning_rate": 2.33408347840856e-05, "loss": 0.1768, "step": 23544 }, { "epoch": 7.198104555181901, "grad_norm": 1.675459623336792, "learning_rate": 2.3340410173665663e-05, "loss": 0.1693, "step": 23545 }, { "epoch": 7.198410272088046, "grad_norm": 0.5127987861633301, "learning_rate": 2.333998556324572e-05, "loss": 0.1604, "step": 23546 }, { "epoch": 7.198715988994191, "grad_norm": 1.2080742120742798, "learning_rate": 2.333956095282578e-05, "loss": 0.1893, "step": 23547 }, { "epoch": 7.1990217059003365, "grad_norm": 0.44982555508613586, "learning_rate": 2.3339136342405842e-05, "loss": 0.1368, "step": 23548 }, { "epoch": 7.199327422806482, "grad_norm": 0.26386913657188416, "learning_rate": 2.33387117319859e-05, "loss": 0.0939, "step": 23549 }, { "epoch": 7.199633139712626, "grad_norm": 0.5266050100326538, "learning_rate": 2.3338287121565963e-05, "loss": 0.0832, "step": 23550 }, { "epoch": 7.199938856618771, "grad_norm": 0.24232913553714752, "learning_rate": 2.333786251114602e-05, "loss": 0.0517, "step": 23551 }, { "epoch": 7.200244573524916, "grad_norm": 0.25076162815093994, "learning_rate": 2.3337437900726084e-05, "loss": 0.0596, "step": 23552 }, { "epoch": 7.200550290431061, "grad_norm": 0.1217290461063385, "learning_rate": 2.3337013290306142e-05, "loss": 0.0412, "step": 23553 }, { "epoch": 7.200856007337205, "grad_norm": 0.20172807574272156, "learning_rate": 2.3336588679886204e-05, "loss": 0.0524, "step": 23554 }, { "epoch": 7.2011617242433505, "grad_norm": 0.5973860621452332, "learning_rate": 2.3336164069466263e-05, "loss": 0.0615, "step": 23555 }, { "epoch": 7.201467441149496, "grad_norm": 0.1759631633758545, "learning_rate": 2.3335739459046325e-05, "loss": 0.0454, "step": 23556 }, { "epoch": 7.201773158055641, "grad_norm": 0.28343310952186584, "learning_rate": 2.3335314848626387e-05, "loss": 0.0772, "step": 23557 }, { "epoch": 7.202078874961785, "grad_norm": 0.4938347339630127, "learning_rate": 2.333489023820645e-05, "loss": 0.0857, "step": 23558 }, { "epoch": 7.20238459186793, "grad_norm": 0.2072669118642807, "learning_rate": 2.3334465627786508e-05, "loss": 0.0631, "step": 23559 }, { "epoch": 7.202690308774075, "grad_norm": 0.7502927184104919, "learning_rate": 2.3334041017366567e-05, "loss": 0.0816, "step": 23560 }, { "epoch": 7.20299602568022, "grad_norm": 0.6558435559272766, "learning_rate": 2.333361640694663e-05, "loss": 0.1055, "step": 23561 }, { "epoch": 7.2033017425863655, "grad_norm": 2.159581422805786, "learning_rate": 2.3333191796526688e-05, "loss": 0.0961, "step": 23562 }, { "epoch": 7.20360745949251, "grad_norm": 0.7611197233200073, "learning_rate": 2.333276718610675e-05, "loss": 0.1258, "step": 23563 }, { "epoch": 7.203913176398655, "grad_norm": 0.46294793486595154, "learning_rate": 2.333234257568681e-05, "loss": 0.1131, "step": 23564 }, { "epoch": 7.2042188933048, "grad_norm": 0.7712613344192505, "learning_rate": 2.333191796526687e-05, "loss": 0.1667, "step": 23565 }, { "epoch": 7.204524610210945, "grad_norm": 1.085827112197876, "learning_rate": 2.333149335484693e-05, "loss": 0.1608, "step": 23566 }, { "epoch": 7.204830327117089, "grad_norm": 0.43270784616470337, "learning_rate": 2.333106874442699e-05, "loss": 0.166, "step": 23567 }, { "epoch": 7.205136044023234, "grad_norm": 0.6703324317932129, "learning_rate": 2.333064413400705e-05, "loss": 0.1551, "step": 23568 }, { "epoch": 7.2054417609293795, "grad_norm": 0.8114317059516907, "learning_rate": 2.3330219523587112e-05, "loss": 0.15, "step": 23569 }, { "epoch": 7.205747477835525, "grad_norm": 0.8681427240371704, "learning_rate": 2.332979491316717e-05, "loss": 0.1563, "step": 23570 }, { "epoch": 7.206053194741669, "grad_norm": 1.8585891723632812, "learning_rate": 2.3329370302747233e-05, "loss": 0.1431, "step": 23571 }, { "epoch": 7.206358911647814, "grad_norm": 1.4074339866638184, "learning_rate": 2.332894569232729e-05, "loss": 0.1935, "step": 23572 }, { "epoch": 7.206664628553959, "grad_norm": 0.8556516170501709, "learning_rate": 2.332852108190735e-05, "loss": 0.1557, "step": 23573 }, { "epoch": 7.206970345460104, "grad_norm": 1.3702579736709595, "learning_rate": 2.3328096471487412e-05, "loss": 0.0959, "step": 23574 }, { "epoch": 7.207276062366249, "grad_norm": 0.3689086139202118, "learning_rate": 2.332767186106747e-05, "loss": 0.0648, "step": 23575 }, { "epoch": 7.2075817792723935, "grad_norm": 0.26556479930877686, "learning_rate": 2.3327247250647533e-05, "loss": 0.05, "step": 23576 }, { "epoch": 7.207887496178539, "grad_norm": 0.24363449215888977, "learning_rate": 2.3326822640227592e-05, "loss": 0.0463, "step": 23577 }, { "epoch": 7.208193213084684, "grad_norm": 0.15647709369659424, "learning_rate": 2.3326398029807654e-05, "loss": 0.0384, "step": 23578 }, { "epoch": 7.208498929990829, "grad_norm": 0.20803984999656677, "learning_rate": 2.3325973419387713e-05, "loss": 0.0487, "step": 23579 }, { "epoch": 7.208804646896973, "grad_norm": 0.5423885583877563, "learning_rate": 2.3325548808967775e-05, "loss": 0.0735, "step": 23580 }, { "epoch": 7.209110363803118, "grad_norm": 0.37016427516937256, "learning_rate": 2.3325124198547833e-05, "loss": 0.0424, "step": 23581 }, { "epoch": 7.209416080709263, "grad_norm": 0.22967946529388428, "learning_rate": 2.3324699588127895e-05, "loss": 0.0544, "step": 23582 }, { "epoch": 7.209721797615408, "grad_norm": 0.5105550289154053, "learning_rate": 2.3324274977707954e-05, "loss": 0.1145, "step": 23583 }, { "epoch": 7.210027514521553, "grad_norm": 0.4131813645362854, "learning_rate": 2.3323850367288016e-05, "loss": 0.0711, "step": 23584 }, { "epoch": 7.210333231427698, "grad_norm": 0.2530306279659271, "learning_rate": 2.3323425756868075e-05, "loss": 0.0934, "step": 23585 }, { "epoch": 7.210638948333843, "grad_norm": 0.27912285923957825, "learning_rate": 2.3323001146448134e-05, "loss": 0.1102, "step": 23586 }, { "epoch": 7.210944665239988, "grad_norm": 0.4756210744380951, "learning_rate": 2.3322576536028196e-05, "loss": 0.0899, "step": 23587 }, { "epoch": 7.211250382146133, "grad_norm": 0.4314800202846527, "learning_rate": 2.3322151925608254e-05, "loss": 0.1291, "step": 23588 }, { "epoch": 7.211556099052277, "grad_norm": 0.5992181301116943, "learning_rate": 2.3321727315188317e-05, "loss": 0.1441, "step": 23589 }, { "epoch": 7.2118618159584225, "grad_norm": 0.5746921896934509, "learning_rate": 2.3321302704768375e-05, "loss": 0.1463, "step": 23590 }, { "epoch": 7.212167532864568, "grad_norm": 1.5133709907531738, "learning_rate": 2.3320878094348437e-05, "loss": 0.1388, "step": 23591 }, { "epoch": 7.212473249770713, "grad_norm": 0.4399428069591522, "learning_rate": 2.3320453483928496e-05, "loss": 0.1513, "step": 23592 }, { "epoch": 7.212778966676857, "grad_norm": 0.9891401529312134, "learning_rate": 2.3320028873508558e-05, "loss": 0.1579, "step": 23593 }, { "epoch": 7.213084683583002, "grad_norm": 0.533439576625824, "learning_rate": 2.3319604263088617e-05, "loss": 0.1537, "step": 23594 }, { "epoch": 7.213390400489147, "grad_norm": 0.9828598499298096, "learning_rate": 2.331917965266868e-05, "loss": 0.204, "step": 23595 }, { "epoch": 7.213696117395292, "grad_norm": 0.707913875579834, "learning_rate": 2.3318755042248738e-05, "loss": 0.1739, "step": 23596 }, { "epoch": 7.2140018343014365, "grad_norm": 1.485708236694336, "learning_rate": 2.33183304318288e-05, "loss": 0.217, "step": 23597 }, { "epoch": 7.214307551207582, "grad_norm": 0.370450884103775, "learning_rate": 2.331790582140886e-05, "loss": 0.1255, "step": 23598 }, { "epoch": 7.214613268113727, "grad_norm": 0.6747261881828308, "learning_rate": 2.3317481210988917e-05, "loss": 0.0797, "step": 23599 }, { "epoch": 7.214918985019872, "grad_norm": 2.817863941192627, "learning_rate": 2.331705660056898e-05, "loss": 0.0739, "step": 23600 }, { "epoch": 7.215224701926017, "grad_norm": 0.5097647309303284, "learning_rate": 2.3316631990149038e-05, "loss": 0.054, "step": 23601 }, { "epoch": 7.215530418832161, "grad_norm": 0.21368591487407684, "learning_rate": 2.33162073797291e-05, "loss": 0.0456, "step": 23602 }, { "epoch": 7.215836135738306, "grad_norm": 0.32262060046195984, "learning_rate": 2.331578276930916e-05, "loss": 0.0591, "step": 23603 }, { "epoch": 7.216141852644451, "grad_norm": 0.23970291018486023, "learning_rate": 2.331535815888922e-05, "loss": 0.0455, "step": 23604 }, { "epoch": 7.2164475695505965, "grad_norm": 0.22784215211868286, "learning_rate": 2.331493354846928e-05, "loss": 0.0729, "step": 23605 }, { "epoch": 7.216753286456741, "grad_norm": 0.25229325890541077, "learning_rate": 2.331450893804934e-05, "loss": 0.058, "step": 23606 }, { "epoch": 7.217059003362886, "grad_norm": 0.23661941289901733, "learning_rate": 2.33140843276294e-05, "loss": 0.0529, "step": 23607 }, { "epoch": 7.217364720269031, "grad_norm": 0.3268624246120453, "learning_rate": 2.3313659717209462e-05, "loss": 0.0995, "step": 23608 }, { "epoch": 7.217670437175176, "grad_norm": 0.5911290645599365, "learning_rate": 2.331323510678952e-05, "loss": 0.0787, "step": 23609 }, { "epoch": 7.21797615408132, "grad_norm": 0.41106441617012024, "learning_rate": 2.331281049636958e-05, "loss": 0.1286, "step": 23610 }, { "epoch": 7.218281870987465, "grad_norm": 0.3164248764514923, "learning_rate": 2.3312385885949642e-05, "loss": 0.0782, "step": 23611 }, { "epoch": 7.2185875878936105, "grad_norm": 0.4245373010635376, "learning_rate": 2.33119612755297e-05, "loss": 0.1196, "step": 23612 }, { "epoch": 7.218893304799756, "grad_norm": 0.5350608229637146, "learning_rate": 2.3311536665109763e-05, "loss": 0.1295, "step": 23613 }, { "epoch": 7.219199021705901, "grad_norm": 0.5108667016029358, "learning_rate": 2.331111205468982e-05, "loss": 0.1267, "step": 23614 }, { "epoch": 7.219504738612045, "grad_norm": 1.2656409740447998, "learning_rate": 2.3310687444269883e-05, "loss": 0.1315, "step": 23615 }, { "epoch": 7.21981045551819, "grad_norm": 0.7121157050132751, "learning_rate": 2.3310262833849942e-05, "loss": 0.1571, "step": 23616 }, { "epoch": 7.220116172424335, "grad_norm": 1.5707825422286987, "learning_rate": 2.3309838223430004e-05, "loss": 0.1551, "step": 23617 }, { "epoch": 7.22042188933048, "grad_norm": 0.8015005588531494, "learning_rate": 2.3309413613010063e-05, "loss": 0.1741, "step": 23618 }, { "epoch": 7.220727606236625, "grad_norm": 1.2085391283035278, "learning_rate": 2.3308989002590125e-05, "loss": 0.1533, "step": 23619 }, { "epoch": 7.22103332314277, "grad_norm": 0.992590606212616, "learning_rate": 2.3308564392170184e-05, "loss": 0.1785, "step": 23620 }, { "epoch": 7.221339040048915, "grad_norm": 1.1484583616256714, "learning_rate": 2.3308139781750246e-05, "loss": 0.2035, "step": 23621 }, { "epoch": 7.22164475695506, "grad_norm": 3.7460622787475586, "learning_rate": 2.3307715171330304e-05, "loss": 0.2135, "step": 23622 }, { "epoch": 7.221950473861204, "grad_norm": 0.7972167730331421, "learning_rate": 2.3307290560910363e-05, "loss": 0.1234, "step": 23623 }, { "epoch": 7.222256190767349, "grad_norm": 0.23236145079135895, "learning_rate": 2.3306865950490425e-05, "loss": 0.0676, "step": 23624 }, { "epoch": 7.222561907673494, "grad_norm": 0.352235347032547, "learning_rate": 2.3306441340070484e-05, "loss": 0.0632, "step": 23625 }, { "epoch": 7.2228676245796395, "grad_norm": 0.5784857869148254, "learning_rate": 2.3306016729650546e-05, "loss": 0.0713, "step": 23626 }, { "epoch": 7.223173341485785, "grad_norm": 0.24430659413337708, "learning_rate": 2.3305592119230605e-05, "loss": 0.0486, "step": 23627 }, { "epoch": 7.223479058391929, "grad_norm": 0.1576656997203827, "learning_rate": 2.3305167508810667e-05, "loss": 0.0371, "step": 23628 }, { "epoch": 7.223784775298074, "grad_norm": 0.21073509752750397, "learning_rate": 2.3304742898390726e-05, "loss": 0.056, "step": 23629 }, { "epoch": 7.224090492204219, "grad_norm": 0.4672001600265503, "learning_rate": 2.3304318287970788e-05, "loss": 0.0804, "step": 23630 }, { "epoch": 7.224396209110364, "grad_norm": 0.39200523495674133, "learning_rate": 2.3303893677550846e-05, "loss": 0.0668, "step": 23631 }, { "epoch": 7.224701926016508, "grad_norm": 0.3021319508552551, "learning_rate": 2.330346906713091e-05, "loss": 0.0589, "step": 23632 }, { "epoch": 7.2250076429226535, "grad_norm": 0.39645272493362427, "learning_rate": 2.3303044456710967e-05, "loss": 0.0731, "step": 23633 }, { "epoch": 7.225313359828799, "grad_norm": 0.5940852165222168, "learning_rate": 2.330261984629103e-05, "loss": 0.0833, "step": 23634 }, { "epoch": 7.225619076734944, "grad_norm": 0.28872933983802795, "learning_rate": 2.3302195235871088e-05, "loss": 0.0809, "step": 23635 }, { "epoch": 7.225924793641088, "grad_norm": 0.6125587821006775, "learning_rate": 2.3301770625451147e-05, "loss": 0.1293, "step": 23636 }, { "epoch": 7.226230510547233, "grad_norm": 0.6461135745048523, "learning_rate": 2.330134601503121e-05, "loss": 0.1214, "step": 23637 }, { "epoch": 7.226536227453378, "grad_norm": 0.6580381989479065, "learning_rate": 2.3300921404611267e-05, "loss": 0.1366, "step": 23638 }, { "epoch": 7.226841944359523, "grad_norm": 0.6256481409072876, "learning_rate": 2.330049679419133e-05, "loss": 0.1551, "step": 23639 }, { "epoch": 7.227147661265668, "grad_norm": 3.8569271564483643, "learning_rate": 2.3300072183771388e-05, "loss": 0.1328, "step": 23640 }, { "epoch": 7.227453378171813, "grad_norm": 0.7169557809829712, "learning_rate": 2.329964757335145e-05, "loss": 0.1348, "step": 23641 }, { "epoch": 7.227759095077958, "grad_norm": 0.9319458603858948, "learning_rate": 2.329922296293151e-05, "loss": 0.1546, "step": 23642 }, { "epoch": 7.228064811984103, "grad_norm": 1.1066652536392212, "learning_rate": 2.329879835251157e-05, "loss": 0.1494, "step": 23643 }, { "epoch": 7.228370528890248, "grad_norm": 0.6153594851493835, "learning_rate": 2.329837374209163e-05, "loss": 0.1744, "step": 23644 }, { "epoch": 7.228676245796392, "grad_norm": 1.0638165473937988, "learning_rate": 2.3297949131671692e-05, "loss": 0.19, "step": 23645 }, { "epoch": 7.228981962702537, "grad_norm": 0.5165768265724182, "learning_rate": 2.329752452125175e-05, "loss": 0.1512, "step": 23646 }, { "epoch": 7.2292876796086825, "grad_norm": 0.9042559862136841, "learning_rate": 2.3297099910831813e-05, "loss": 0.1781, "step": 23647 }, { "epoch": 7.229593396514828, "grad_norm": 0.44124728441238403, "learning_rate": 2.329667530041187e-05, "loss": 0.1302, "step": 23648 }, { "epoch": 7.229899113420972, "grad_norm": 0.3165130615234375, "learning_rate": 2.329625068999193e-05, "loss": 0.0678, "step": 23649 }, { "epoch": 7.230204830327117, "grad_norm": 0.2216803878545761, "learning_rate": 2.3295826079571992e-05, "loss": 0.0574, "step": 23650 }, { "epoch": 7.230510547233262, "grad_norm": 0.2841487526893616, "learning_rate": 2.329540146915205e-05, "loss": 0.0654, "step": 23651 }, { "epoch": 7.230816264139407, "grad_norm": 0.28842881321907043, "learning_rate": 2.3294976858732113e-05, "loss": 0.0464, "step": 23652 }, { "epoch": 7.231121981045552, "grad_norm": 0.22798144817352295, "learning_rate": 2.329455224831217e-05, "loss": 0.0518, "step": 23653 }, { "epoch": 7.2314276979516965, "grad_norm": 0.24835602939128876, "learning_rate": 2.3294127637892234e-05, "loss": 0.0591, "step": 23654 }, { "epoch": 7.231733414857842, "grad_norm": 0.2671664357185364, "learning_rate": 2.3293703027472292e-05, "loss": 0.0541, "step": 23655 }, { "epoch": 7.232039131763987, "grad_norm": 0.35901138186454773, "learning_rate": 2.3293278417052354e-05, "loss": 0.0818, "step": 23656 }, { "epoch": 7.232344848670132, "grad_norm": 0.25071877241134644, "learning_rate": 2.3292853806632413e-05, "loss": 0.0547, "step": 23657 }, { "epoch": 7.232650565576276, "grad_norm": 0.3938788175582886, "learning_rate": 2.3292429196212475e-05, "loss": 0.0751, "step": 23658 }, { "epoch": 7.232956282482421, "grad_norm": 0.3274136483669281, "learning_rate": 2.3292004585792537e-05, "loss": 0.067, "step": 23659 }, { "epoch": 7.233261999388566, "grad_norm": 0.5091983079910278, "learning_rate": 2.32915799753726e-05, "loss": 0.0821, "step": 23660 }, { "epoch": 7.233567716294711, "grad_norm": 0.40909692645072937, "learning_rate": 2.3291155364952658e-05, "loss": 0.1217, "step": 23661 }, { "epoch": 7.233873433200856, "grad_norm": 1.1989398002624512, "learning_rate": 2.3290730754532717e-05, "loss": 0.1558, "step": 23662 }, { "epoch": 7.234179150107001, "grad_norm": 0.6090399026870728, "learning_rate": 2.329030614411278e-05, "loss": 0.1266, "step": 23663 }, { "epoch": 7.234484867013146, "grad_norm": 1.8391979932785034, "learning_rate": 2.3289881533692838e-05, "loss": 0.1361, "step": 23664 }, { "epoch": 7.234790583919291, "grad_norm": 1.3347141742706299, "learning_rate": 2.32894569232729e-05, "loss": 0.1587, "step": 23665 }, { "epoch": 7.235096300825436, "grad_norm": 0.673775315284729, "learning_rate": 2.328903231285296e-05, "loss": 0.178, "step": 23666 }, { "epoch": 7.23540201773158, "grad_norm": 3.1830873489379883, "learning_rate": 2.328860770243302e-05, "loss": 0.1819, "step": 23667 }, { "epoch": 7.235707734637725, "grad_norm": 0.6160210371017456, "learning_rate": 2.328818309201308e-05, "loss": 0.1738, "step": 23668 }, { "epoch": 7.2360134515438705, "grad_norm": 0.6021559834480286, "learning_rate": 2.328775848159314e-05, "loss": 0.1531, "step": 23669 }, { "epoch": 7.236319168450016, "grad_norm": 0.9647721648216248, "learning_rate": 2.32873338711732e-05, "loss": 0.1921, "step": 23670 }, { "epoch": 7.23662488535616, "grad_norm": 1.008527159690857, "learning_rate": 2.3286909260753262e-05, "loss": 0.1914, "step": 23671 }, { "epoch": 7.236930602262305, "grad_norm": 1.267146110534668, "learning_rate": 2.328648465033332e-05, "loss": 0.2387, "step": 23672 }, { "epoch": 7.23723631916845, "grad_norm": 0.5648046731948853, "learning_rate": 2.3286060039913383e-05, "loss": 0.1297, "step": 23673 }, { "epoch": 7.237542036074595, "grad_norm": 0.3024613559246063, "learning_rate": 2.328563542949344e-05, "loss": 0.1021, "step": 23674 }, { "epoch": 7.2378477529807395, "grad_norm": 1.2299309968948364, "learning_rate": 2.32852108190735e-05, "loss": 0.0823, "step": 23675 }, { "epoch": 7.238153469886885, "grad_norm": 0.6651812791824341, "learning_rate": 2.3284786208653562e-05, "loss": 0.064, "step": 23676 }, { "epoch": 7.23845918679303, "grad_norm": 0.6982271075248718, "learning_rate": 2.328436159823362e-05, "loss": 0.0598, "step": 23677 }, { "epoch": 7.238764903699175, "grad_norm": 0.35625869035720825, "learning_rate": 2.3283936987813683e-05, "loss": 0.0683, "step": 23678 }, { "epoch": 7.23907062060532, "grad_norm": 0.6218798756599426, "learning_rate": 2.3283512377393742e-05, "loss": 0.0551, "step": 23679 }, { "epoch": 7.239376337511464, "grad_norm": 0.21001151204109192, "learning_rate": 2.3283087766973804e-05, "loss": 0.0479, "step": 23680 }, { "epoch": 7.239682054417609, "grad_norm": 0.38112980127334595, "learning_rate": 2.3282663156553863e-05, "loss": 0.0762, "step": 23681 }, { "epoch": 7.239987771323754, "grad_norm": 0.24884800612926483, "learning_rate": 2.3282238546133925e-05, "loss": 0.0633, "step": 23682 }, { "epoch": 7.2402934882298995, "grad_norm": 0.3490852415561676, "learning_rate": 2.3281813935713983e-05, "loss": 0.1309, "step": 23683 }, { "epoch": 7.240599205136044, "grad_norm": 0.7626113891601562, "learning_rate": 2.3281389325294045e-05, "loss": 0.0936, "step": 23684 }, { "epoch": 7.240904922042189, "grad_norm": 0.7534968256950378, "learning_rate": 2.3280964714874104e-05, "loss": 0.0698, "step": 23685 }, { "epoch": 7.241210638948334, "grad_norm": 0.8477213382720947, "learning_rate": 2.3280540104454166e-05, "loss": 0.1254, "step": 23686 }, { "epoch": 7.241516355854479, "grad_norm": 0.47908759117126465, "learning_rate": 2.3280115494034225e-05, "loss": 0.1049, "step": 23687 }, { "epoch": 7.241822072760623, "grad_norm": 0.9345760941505432, "learning_rate": 2.3279690883614284e-05, "loss": 0.1218, "step": 23688 }, { "epoch": 7.242127789666768, "grad_norm": 0.3655157685279846, "learning_rate": 2.3279266273194346e-05, "loss": 0.1179, "step": 23689 }, { "epoch": 7.2424335065729135, "grad_norm": 1.412316083908081, "learning_rate": 2.3278841662774404e-05, "loss": 0.1648, "step": 23690 }, { "epoch": 7.242739223479059, "grad_norm": 1.1097694635391235, "learning_rate": 2.3278417052354467e-05, "loss": 0.1754, "step": 23691 }, { "epoch": 7.243044940385204, "grad_norm": 0.7811036109924316, "learning_rate": 2.3277992441934525e-05, "loss": 0.1768, "step": 23692 }, { "epoch": 7.243350657291348, "grad_norm": 0.7950069308280945, "learning_rate": 2.3277567831514587e-05, "loss": 0.164, "step": 23693 }, { "epoch": 7.243656374197493, "grad_norm": 4.122433185577393, "learning_rate": 2.3277143221094646e-05, "loss": 0.1618, "step": 23694 }, { "epoch": 7.243962091103638, "grad_norm": 1.286889910697937, "learning_rate": 2.3276718610674708e-05, "loss": 0.1482, "step": 23695 }, { "epoch": 7.244267808009783, "grad_norm": 1.3329142332077026, "learning_rate": 2.3276294000254767e-05, "loss": 0.1718, "step": 23696 }, { "epoch": 7.2445735249159275, "grad_norm": 7.729081153869629, "learning_rate": 2.327586938983483e-05, "loss": 0.2891, "step": 23697 }, { "epoch": 7.244879241822073, "grad_norm": 0.4139579236507416, "learning_rate": 2.3275444779414888e-05, "loss": 0.1616, "step": 23698 }, { "epoch": 7.245184958728218, "grad_norm": 0.3883295953273773, "learning_rate": 2.327502016899495e-05, "loss": 0.0798, "step": 23699 }, { "epoch": 7.245490675634363, "grad_norm": 0.22595424950122833, "learning_rate": 2.327459555857501e-05, "loss": 0.0512, "step": 23700 }, { "epoch": 7.245796392540507, "grad_norm": 0.28479132056236267, "learning_rate": 2.3274170948155067e-05, "loss": 0.051, "step": 23701 }, { "epoch": 7.246102109446652, "grad_norm": 1.7204155921936035, "learning_rate": 2.327374633773513e-05, "loss": 0.0548, "step": 23702 }, { "epoch": 7.246407826352797, "grad_norm": 0.2080840766429901, "learning_rate": 2.3273321727315188e-05, "loss": 0.0437, "step": 23703 }, { "epoch": 7.2467135432589425, "grad_norm": 0.39411064982414246, "learning_rate": 2.327289711689525e-05, "loss": 0.0505, "step": 23704 }, { "epoch": 7.247019260165088, "grad_norm": 0.21762950718402863, "learning_rate": 2.327247250647531e-05, "loss": 0.0774, "step": 23705 }, { "epoch": 7.247324977071232, "grad_norm": 0.766501784324646, "learning_rate": 2.327204789605537e-05, "loss": 0.0428, "step": 23706 }, { "epoch": 7.247630693977377, "grad_norm": 0.3347494900226593, "learning_rate": 2.327162328563543e-05, "loss": 0.0594, "step": 23707 }, { "epoch": 7.247936410883522, "grad_norm": 0.28427550196647644, "learning_rate": 2.327119867521549e-05, "loss": 0.1055, "step": 23708 }, { "epoch": 7.248242127789667, "grad_norm": 0.21418242156505585, "learning_rate": 2.327077406479555e-05, "loss": 0.0659, "step": 23709 }, { "epoch": 7.248547844695811, "grad_norm": 0.19381678104400635, "learning_rate": 2.3270349454375612e-05, "loss": 0.0553, "step": 23710 }, { "epoch": 7.2488535616019565, "grad_norm": 0.596067488193512, "learning_rate": 2.326992484395567e-05, "loss": 0.101, "step": 23711 }, { "epoch": 7.249159278508102, "grad_norm": 0.8410853743553162, "learning_rate": 2.3269500233535733e-05, "loss": 0.1171, "step": 23712 }, { "epoch": 7.249464995414247, "grad_norm": 0.7543322443962097, "learning_rate": 2.3269075623115792e-05, "loss": 0.1399, "step": 23713 }, { "epoch": 7.249770712320391, "grad_norm": 0.41287365555763245, "learning_rate": 2.326865101269585e-05, "loss": 0.1111, "step": 23714 }, { "epoch": 7.250076429226536, "grad_norm": 0.8246889710426331, "learning_rate": 2.3268226402275913e-05, "loss": 0.1575, "step": 23715 }, { "epoch": 7.250382146132681, "grad_norm": 0.6634794473648071, "learning_rate": 2.326780179185597e-05, "loss": 0.1561, "step": 23716 }, { "epoch": 7.250687863038826, "grad_norm": 0.46853360533714294, "learning_rate": 2.3267377181436033e-05, "loss": 0.1298, "step": 23717 }, { "epoch": 7.250993579944971, "grad_norm": 0.6043888926506042, "learning_rate": 2.3266952571016092e-05, "loss": 0.1971, "step": 23718 }, { "epoch": 7.251299296851116, "grad_norm": 0.7716415524482727, "learning_rate": 2.3266527960596154e-05, "loss": 0.1737, "step": 23719 }, { "epoch": 7.251605013757261, "grad_norm": 1.0044721364974976, "learning_rate": 2.3266103350176213e-05, "loss": 0.1936, "step": 23720 }, { "epoch": 7.251910730663406, "grad_norm": 0.9743267893791199, "learning_rate": 2.3265678739756275e-05, "loss": 0.1917, "step": 23721 }, { "epoch": 7.252216447569551, "grad_norm": 13.365703582763672, "learning_rate": 2.3265254129336334e-05, "loss": 0.2003, "step": 23722 }, { "epoch": 7.252522164475695, "grad_norm": 0.3112136721611023, "learning_rate": 2.3264829518916396e-05, "loss": 0.1397, "step": 23723 }, { "epoch": 7.25282788138184, "grad_norm": 0.5048288702964783, "learning_rate": 2.3264404908496454e-05, "loss": 0.0607, "step": 23724 }, { "epoch": 7.253133598287985, "grad_norm": 0.20759430527687073, "learning_rate": 2.3263980298076517e-05, "loss": 0.0657, "step": 23725 }, { "epoch": 7.2534393151941305, "grad_norm": 0.31609711050987244, "learning_rate": 2.3263555687656575e-05, "loss": 0.0493, "step": 23726 }, { "epoch": 7.253745032100275, "grad_norm": 0.19368991255760193, "learning_rate": 2.3263131077236634e-05, "loss": 0.0613, "step": 23727 }, { "epoch": 7.25405074900642, "grad_norm": 0.4456719160079956, "learning_rate": 2.3262706466816696e-05, "loss": 0.0364, "step": 23728 }, { "epoch": 7.254356465912565, "grad_norm": 1.9473007917404175, "learning_rate": 2.3262281856396755e-05, "loss": 0.0651, "step": 23729 }, { "epoch": 7.25466218281871, "grad_norm": 0.45218995213508606, "learning_rate": 2.3261857245976817e-05, "loss": 0.0406, "step": 23730 }, { "epoch": 7.254967899724855, "grad_norm": 0.44046494364738464, "learning_rate": 2.3261432635556876e-05, "loss": 0.0451, "step": 23731 }, { "epoch": 7.2552736166309995, "grad_norm": 0.9501919150352478, "learning_rate": 2.3261008025136938e-05, "loss": 0.0678, "step": 23732 }, { "epoch": 7.255579333537145, "grad_norm": 1.0934481620788574, "learning_rate": 2.3260583414716996e-05, "loss": 0.0841, "step": 23733 }, { "epoch": 7.25588505044329, "grad_norm": 0.2504015266895294, "learning_rate": 2.326015880429706e-05, "loss": 0.0776, "step": 23734 }, { "epoch": 7.256190767349435, "grad_norm": 0.40258678793907166, "learning_rate": 2.3259734193877117e-05, "loss": 0.0794, "step": 23735 }, { "epoch": 7.256496484255579, "grad_norm": 0.7731305360794067, "learning_rate": 2.325930958345718e-05, "loss": 0.1105, "step": 23736 }, { "epoch": 7.256802201161724, "grad_norm": 0.37572747468948364, "learning_rate": 2.3258884973037238e-05, "loss": 0.0904, "step": 23737 }, { "epoch": 7.257107918067869, "grad_norm": 0.2474011927843094, "learning_rate": 2.3258460362617297e-05, "loss": 0.1106, "step": 23738 }, { "epoch": 7.257413634974014, "grad_norm": 0.2931773364543915, "learning_rate": 2.325803575219736e-05, "loss": 0.114, "step": 23739 }, { "epoch": 7.257719351880159, "grad_norm": 1.2501306533813477, "learning_rate": 2.3257611141777417e-05, "loss": 0.1314, "step": 23740 }, { "epoch": 7.258025068786304, "grad_norm": 1.3506578207015991, "learning_rate": 2.325718653135748e-05, "loss": 0.1571, "step": 23741 }, { "epoch": 7.258330785692449, "grad_norm": 4.447240352630615, "learning_rate": 2.3256761920937538e-05, "loss": 0.1932, "step": 23742 }, { "epoch": 7.258636502598594, "grad_norm": 1.2878581285476685, "learning_rate": 2.32563373105176e-05, "loss": 0.164, "step": 23743 }, { "epoch": 7.258942219504739, "grad_norm": 2.503309965133667, "learning_rate": 2.325591270009766e-05, "loss": 0.1712, "step": 23744 }, { "epoch": 7.259247936410883, "grad_norm": 1.3379359245300293, "learning_rate": 2.325548808967772e-05, "loss": 0.1452, "step": 23745 }, { "epoch": 7.259553653317028, "grad_norm": 1.1568659543991089, "learning_rate": 2.325506347925778e-05, "loss": 0.1822, "step": 23746 }, { "epoch": 7.2598593702231735, "grad_norm": 1.2100625038146973, "learning_rate": 2.3254638868837842e-05, "loss": 0.1776, "step": 23747 }, { "epoch": 7.260165087129319, "grad_norm": 0.6170368194580078, "learning_rate": 2.32542142584179e-05, "loss": 0.1386, "step": 23748 }, { "epoch": 7.260470804035463, "grad_norm": 0.612281858921051, "learning_rate": 2.3253789647997963e-05, "loss": 0.086, "step": 23749 }, { "epoch": 7.260776520941608, "grad_norm": 0.28503143787384033, "learning_rate": 2.325336503757802e-05, "loss": 0.0598, "step": 23750 }, { "epoch": 7.261082237847753, "grad_norm": 0.20142380893230438, "learning_rate": 2.325294042715808e-05, "loss": 0.0557, "step": 23751 }, { "epoch": 7.261387954753898, "grad_norm": 0.2998499274253845, "learning_rate": 2.3252515816738142e-05, "loss": 0.0645, "step": 23752 }, { "epoch": 7.261693671660042, "grad_norm": 0.29830828309059143, "learning_rate": 2.32520912063182e-05, "loss": 0.0496, "step": 23753 }, { "epoch": 7.2619993885661875, "grad_norm": 0.4287185072898865, "learning_rate": 2.3251666595898263e-05, "loss": 0.0356, "step": 23754 }, { "epoch": 7.262305105472333, "grad_norm": 0.3068135380744934, "learning_rate": 2.325124198547832e-05, "loss": 0.084, "step": 23755 }, { "epoch": 7.262610822378478, "grad_norm": 0.2981056272983551, "learning_rate": 2.3250817375058384e-05, "loss": 0.0673, "step": 23756 }, { "epoch": 7.262916539284623, "grad_norm": 0.372040331363678, "learning_rate": 2.3250392764638442e-05, "loss": 0.0527, "step": 23757 }, { "epoch": 7.263222256190767, "grad_norm": 0.3581133782863617, "learning_rate": 2.3249968154218504e-05, "loss": 0.0828, "step": 23758 }, { "epoch": 7.263527973096912, "grad_norm": 0.32489287853240967, "learning_rate": 2.3249543543798563e-05, "loss": 0.0615, "step": 23759 }, { "epoch": 7.263833690003057, "grad_norm": 0.932296633720398, "learning_rate": 2.3249118933378625e-05, "loss": 0.0873, "step": 23760 }, { "epoch": 7.2641394069092025, "grad_norm": 0.48477983474731445, "learning_rate": 2.3248694322958687e-05, "loss": 0.0938, "step": 23761 }, { "epoch": 7.264445123815347, "grad_norm": 0.4643875062465668, "learning_rate": 2.324826971253875e-05, "loss": 0.1348, "step": 23762 }, { "epoch": 7.264750840721492, "grad_norm": 0.29377666115760803, "learning_rate": 2.3247845102118808e-05, "loss": 0.0902, "step": 23763 }, { "epoch": 7.265056557627637, "grad_norm": 1.107051968574524, "learning_rate": 2.3247420491698867e-05, "loss": 0.1295, "step": 23764 }, { "epoch": 7.265362274533782, "grad_norm": 0.47522103786468506, "learning_rate": 2.324699588127893e-05, "loss": 0.1437, "step": 23765 }, { "epoch": 7.265667991439926, "grad_norm": 0.435418963432312, "learning_rate": 2.3246571270858988e-05, "loss": 0.1604, "step": 23766 }, { "epoch": 7.265973708346071, "grad_norm": 0.8480985164642334, "learning_rate": 2.324614666043905e-05, "loss": 0.1515, "step": 23767 }, { "epoch": 7.2662794252522165, "grad_norm": 0.6124703884124756, "learning_rate": 2.324572205001911e-05, "loss": 0.1527, "step": 23768 }, { "epoch": 7.266585142158362, "grad_norm": 3.0419983863830566, "learning_rate": 2.324529743959917e-05, "loss": 0.1507, "step": 23769 }, { "epoch": 7.266890859064507, "grad_norm": 0.6224229335784912, "learning_rate": 2.324487282917923e-05, "loss": 0.1688, "step": 23770 }, { "epoch": 7.267196575970651, "grad_norm": 2.234553575515747, "learning_rate": 2.324444821875929e-05, "loss": 0.2141, "step": 23771 }, { "epoch": 7.267502292876796, "grad_norm": 0.8151829242706299, "learning_rate": 2.324402360833935e-05, "loss": 0.224, "step": 23772 }, { "epoch": 7.267808009782941, "grad_norm": 0.5124159455299377, "learning_rate": 2.3243598997919412e-05, "loss": 0.1294, "step": 23773 }, { "epoch": 7.268113726689086, "grad_norm": 0.2508696913719177, "learning_rate": 2.324317438749947e-05, "loss": 0.0582, "step": 23774 }, { "epoch": 7.2684194435952305, "grad_norm": 0.9982229471206665, "learning_rate": 2.3242749777079533e-05, "loss": 0.0781, "step": 23775 }, { "epoch": 7.268725160501376, "grad_norm": 0.1696411818265915, "learning_rate": 2.324232516665959e-05, "loss": 0.0519, "step": 23776 }, { "epoch": 7.269030877407521, "grad_norm": 0.19217020273208618, "learning_rate": 2.324190055623965e-05, "loss": 0.0558, "step": 23777 }, { "epoch": 7.269336594313666, "grad_norm": 0.3116132318973541, "learning_rate": 2.3241475945819712e-05, "loss": 0.0622, "step": 23778 }, { "epoch": 7.26964231121981, "grad_norm": 0.25774896144866943, "learning_rate": 2.324105133539977e-05, "loss": 0.058, "step": 23779 }, { "epoch": 7.269948028125955, "grad_norm": 0.3285776972770691, "learning_rate": 2.3240626724979833e-05, "loss": 0.0421, "step": 23780 }, { "epoch": 7.2702537450321, "grad_norm": 0.6755558252334595, "learning_rate": 2.3240202114559892e-05, "loss": 0.0902, "step": 23781 }, { "epoch": 7.270559461938245, "grad_norm": 1.2839343547821045, "learning_rate": 2.3239777504139954e-05, "loss": 0.0767, "step": 23782 }, { "epoch": 7.2708651788443905, "grad_norm": 0.8022035360336304, "learning_rate": 2.3239352893720013e-05, "loss": 0.0946, "step": 23783 }, { "epoch": 7.271170895750535, "grad_norm": 0.28710445761680603, "learning_rate": 2.3238928283300075e-05, "loss": 0.0731, "step": 23784 }, { "epoch": 7.27147661265668, "grad_norm": 1.2620956897735596, "learning_rate": 2.3238503672880133e-05, "loss": 0.0909, "step": 23785 }, { "epoch": 7.271782329562825, "grad_norm": 0.7456585764884949, "learning_rate": 2.3238079062460196e-05, "loss": 0.1114, "step": 23786 }, { "epoch": 7.27208804646897, "grad_norm": 0.28476881980895996, "learning_rate": 2.3237654452040254e-05, "loss": 0.0952, "step": 23787 }, { "epoch": 7.272393763375114, "grad_norm": 0.680684506893158, "learning_rate": 2.3237229841620316e-05, "loss": 0.1547, "step": 23788 }, { "epoch": 7.2726994802812595, "grad_norm": 0.6757149696350098, "learning_rate": 2.3236805231200375e-05, "loss": 0.1009, "step": 23789 }, { "epoch": 7.273005197187405, "grad_norm": 0.6952542066574097, "learning_rate": 2.3236380620780434e-05, "loss": 0.181, "step": 23790 }, { "epoch": 7.27331091409355, "grad_norm": 0.9310382604598999, "learning_rate": 2.3235956010360496e-05, "loss": 0.1388, "step": 23791 }, { "epoch": 7.273616630999694, "grad_norm": 2.1101839542388916, "learning_rate": 2.3235531399940554e-05, "loss": 0.1569, "step": 23792 }, { "epoch": 7.273922347905839, "grad_norm": 1.1486607789993286, "learning_rate": 2.3235106789520617e-05, "loss": 0.1561, "step": 23793 }, { "epoch": 7.274228064811984, "grad_norm": 1.4883836507797241, "learning_rate": 2.3234682179100675e-05, "loss": 0.1572, "step": 23794 }, { "epoch": 7.274533781718129, "grad_norm": 3.606260061264038, "learning_rate": 2.3234257568680737e-05, "loss": 0.1597, "step": 23795 }, { "epoch": 7.274839498624274, "grad_norm": 0.784544050693512, "learning_rate": 2.3233832958260796e-05, "loss": 0.1836, "step": 23796 }, { "epoch": 7.275145215530419, "grad_norm": 1.2223987579345703, "learning_rate": 2.3233408347840858e-05, "loss": 0.2561, "step": 23797 }, { "epoch": 7.275450932436564, "grad_norm": 0.3682945668697357, "learning_rate": 2.3232983737420917e-05, "loss": 0.1135, "step": 23798 }, { "epoch": 7.275756649342709, "grad_norm": 0.3608521819114685, "learning_rate": 2.323255912700098e-05, "loss": 0.0674, "step": 23799 }, { "epoch": 7.276062366248854, "grad_norm": 0.32096946239471436, "learning_rate": 2.3232134516581038e-05, "loss": 0.0778, "step": 23800 }, { "epoch": 7.276368083154998, "grad_norm": 0.2516865134239197, "learning_rate": 2.32317099061611e-05, "loss": 0.0675, "step": 23801 }, { "epoch": 7.276673800061143, "grad_norm": 0.21718594431877136, "learning_rate": 2.323128529574116e-05, "loss": 0.0388, "step": 23802 }, { "epoch": 7.276979516967288, "grad_norm": 0.25808700919151306, "learning_rate": 2.3230860685321217e-05, "loss": 0.0556, "step": 23803 }, { "epoch": 7.2772852338734335, "grad_norm": 0.14768345654010773, "learning_rate": 2.323043607490128e-05, "loss": 0.0353, "step": 23804 }, { "epoch": 7.277590950779578, "grad_norm": 0.26673921942710876, "learning_rate": 2.3230011464481338e-05, "loss": 0.0611, "step": 23805 }, { "epoch": 7.277896667685723, "grad_norm": 0.27328744530677795, "learning_rate": 2.32295868540614e-05, "loss": 0.0557, "step": 23806 }, { "epoch": 7.278202384591868, "grad_norm": 0.5546548962593079, "learning_rate": 2.322916224364146e-05, "loss": 0.0566, "step": 23807 }, { "epoch": 7.278508101498013, "grad_norm": 1.2917226552963257, "learning_rate": 2.322873763322152e-05, "loss": 0.1123, "step": 23808 }, { "epoch": 7.278813818404158, "grad_norm": 0.6455583572387695, "learning_rate": 2.322831302280158e-05, "loss": 0.0936, "step": 23809 }, { "epoch": 7.279119535310302, "grad_norm": 0.4846774935722351, "learning_rate": 2.322788841238164e-05, "loss": 0.0654, "step": 23810 }, { "epoch": 7.2794252522164475, "grad_norm": 0.573660671710968, "learning_rate": 2.32274638019617e-05, "loss": 0.1289, "step": 23811 }, { "epoch": 7.279730969122593, "grad_norm": 0.5390806198120117, "learning_rate": 2.3227039191541762e-05, "loss": 0.1189, "step": 23812 }, { "epoch": 7.280036686028738, "grad_norm": 0.40413886308670044, "learning_rate": 2.322661458112182e-05, "loss": 0.1335, "step": 23813 }, { "epoch": 7.280342402934882, "grad_norm": 0.8877401351928711, "learning_rate": 2.3226189970701883e-05, "loss": 0.1444, "step": 23814 }, { "epoch": 7.280648119841027, "grad_norm": 0.8697389960289001, "learning_rate": 2.3225765360281942e-05, "loss": 0.1601, "step": 23815 }, { "epoch": 7.280953836747172, "grad_norm": 1.0678263902664185, "learning_rate": 2.3225340749862e-05, "loss": 0.1547, "step": 23816 }, { "epoch": 7.281259553653317, "grad_norm": 1.5194889307022095, "learning_rate": 2.3224916139442063e-05, "loss": 0.1934, "step": 23817 }, { "epoch": 7.281565270559462, "grad_norm": 0.8860782980918884, "learning_rate": 2.322449152902212e-05, "loss": 0.1664, "step": 23818 }, { "epoch": 7.281870987465607, "grad_norm": 1.2239643335342407, "learning_rate": 2.3224066918602183e-05, "loss": 0.1703, "step": 23819 }, { "epoch": 7.282176704371752, "grad_norm": 1.2048522233963013, "learning_rate": 2.3223642308182242e-05, "loss": 0.1775, "step": 23820 }, { "epoch": 7.282482421277897, "grad_norm": 1.4706958532333374, "learning_rate": 2.3223217697762304e-05, "loss": 0.215, "step": 23821 }, { "epoch": 7.282788138184042, "grad_norm": 3.5317749977111816, "learning_rate": 2.3222793087342363e-05, "loss": 0.2086, "step": 23822 }, { "epoch": 7.283093855090186, "grad_norm": 0.34920844435691833, "learning_rate": 2.3222368476922425e-05, "loss": 0.1248, "step": 23823 }, { "epoch": 7.283399571996331, "grad_norm": 0.8708038330078125, "learning_rate": 2.3221943866502484e-05, "loss": 0.0744, "step": 23824 }, { "epoch": 7.2837052889024765, "grad_norm": 0.38900867104530334, "learning_rate": 2.3221519256082546e-05, "loss": 0.0651, "step": 23825 }, { "epoch": 7.284011005808622, "grad_norm": 0.5158032178878784, "learning_rate": 2.3221094645662605e-05, "loss": 0.0597, "step": 23826 }, { "epoch": 7.284316722714766, "grad_norm": 0.47443437576293945, "learning_rate": 2.3220670035242667e-05, "loss": 0.0684, "step": 23827 }, { "epoch": 7.284622439620911, "grad_norm": 0.47119998931884766, "learning_rate": 2.3220245424822725e-05, "loss": 0.0451, "step": 23828 }, { "epoch": 7.284928156527056, "grad_norm": 0.2534792125225067, "learning_rate": 2.3219820814402784e-05, "loss": 0.0734, "step": 23829 }, { "epoch": 7.285233873433201, "grad_norm": 3.705641031265259, "learning_rate": 2.3219396203982846e-05, "loss": 0.0525, "step": 23830 }, { "epoch": 7.285539590339345, "grad_norm": 0.2713260352611542, "learning_rate": 2.3218971593562905e-05, "loss": 0.0833, "step": 23831 }, { "epoch": 7.2858453072454905, "grad_norm": 0.20460495352745056, "learning_rate": 2.3218546983142967e-05, "loss": 0.0533, "step": 23832 }, { "epoch": 7.286151024151636, "grad_norm": 1.04416823387146, "learning_rate": 2.3218122372723026e-05, "loss": 0.0896, "step": 23833 }, { "epoch": 7.286456741057781, "grad_norm": 0.26077309250831604, "learning_rate": 2.3217697762303088e-05, "loss": 0.078, "step": 23834 }, { "epoch": 7.286762457963926, "grad_norm": 1.1468485593795776, "learning_rate": 2.3217273151883146e-05, "loss": 0.0881, "step": 23835 }, { "epoch": 7.28706817487007, "grad_norm": 0.2716189920902252, "learning_rate": 2.321684854146321e-05, "loss": 0.0872, "step": 23836 }, { "epoch": 7.287373891776215, "grad_norm": 0.28022170066833496, "learning_rate": 2.3216423931043267e-05, "loss": 0.1229, "step": 23837 }, { "epoch": 7.28767960868236, "grad_norm": 3.7536308765411377, "learning_rate": 2.321599932062333e-05, "loss": 0.1201, "step": 23838 }, { "epoch": 7.287985325588505, "grad_norm": 1.5540374517440796, "learning_rate": 2.3215574710203388e-05, "loss": 0.1475, "step": 23839 }, { "epoch": 7.28829104249465, "grad_norm": 2.320582151412964, "learning_rate": 2.321515009978345e-05, "loss": 0.1527, "step": 23840 }, { "epoch": 7.288596759400795, "grad_norm": 0.6754719018936157, "learning_rate": 2.321472548936351e-05, "loss": 0.1757, "step": 23841 }, { "epoch": 7.28890247630694, "grad_norm": 0.8668972849845886, "learning_rate": 2.3214300878943567e-05, "loss": 0.1742, "step": 23842 }, { "epoch": 7.289208193213085, "grad_norm": 0.5367320775985718, "learning_rate": 2.321387626852363e-05, "loss": 0.1679, "step": 23843 }, { "epoch": 7.289513910119229, "grad_norm": 0.5318883657455444, "learning_rate": 2.3213451658103688e-05, "loss": 0.185, "step": 23844 }, { "epoch": 7.289819627025374, "grad_norm": 0.6772513389587402, "learning_rate": 2.321302704768375e-05, "loss": 0.1836, "step": 23845 }, { "epoch": 7.2901253439315195, "grad_norm": 6.775463581085205, "learning_rate": 2.321260243726381e-05, "loss": 0.2044, "step": 23846 }, { "epoch": 7.290431060837665, "grad_norm": 1.4887217283248901, "learning_rate": 2.321217782684387e-05, "loss": 0.2219, "step": 23847 }, { "epoch": 7.29073677774381, "grad_norm": 0.7553520202636719, "learning_rate": 2.321175321642393e-05, "loss": 0.1675, "step": 23848 }, { "epoch": 7.291042494649954, "grad_norm": 0.47593313455581665, "learning_rate": 2.3211328606003992e-05, "loss": 0.0796, "step": 23849 }, { "epoch": 7.291348211556099, "grad_norm": 0.2691708207130432, "learning_rate": 2.321090399558405e-05, "loss": 0.0725, "step": 23850 }, { "epoch": 7.291653928462244, "grad_norm": 0.2981351912021637, "learning_rate": 2.3210479385164113e-05, "loss": 0.0657, "step": 23851 }, { "epoch": 7.291959645368389, "grad_norm": 0.22296224534511566, "learning_rate": 2.321005477474417e-05, "loss": 0.0493, "step": 23852 }, { "epoch": 7.2922653622745335, "grad_norm": 0.2979072034358978, "learning_rate": 2.320963016432423e-05, "loss": 0.06, "step": 23853 }, { "epoch": 7.292571079180679, "grad_norm": 0.23715132474899292, "learning_rate": 2.3209205553904292e-05, "loss": 0.0584, "step": 23854 }, { "epoch": 7.292876796086824, "grad_norm": 0.5764185190200806, "learning_rate": 2.320878094348435e-05, "loss": 0.0747, "step": 23855 }, { "epoch": 7.293182512992969, "grad_norm": 0.42383432388305664, "learning_rate": 2.3208356333064413e-05, "loss": 0.04, "step": 23856 }, { "epoch": 7.293488229899113, "grad_norm": 0.6391013860702515, "learning_rate": 2.320793172264447e-05, "loss": 0.0664, "step": 23857 }, { "epoch": 7.293793946805258, "grad_norm": 0.4598758816719055, "learning_rate": 2.3207507112224534e-05, "loss": 0.0696, "step": 23858 }, { "epoch": 7.294099663711403, "grad_norm": 0.526427149772644, "learning_rate": 2.3207082501804592e-05, "loss": 0.0621, "step": 23859 }, { "epoch": 7.294405380617548, "grad_norm": 0.466989129781723, "learning_rate": 2.3206657891384655e-05, "loss": 0.0992, "step": 23860 }, { "epoch": 7.2947110975236935, "grad_norm": 0.48408326506614685, "learning_rate": 2.3206233280964713e-05, "loss": 0.0922, "step": 23861 }, { "epoch": 7.295016814429838, "grad_norm": 0.39424270391464233, "learning_rate": 2.3205808670544775e-05, "loss": 0.0893, "step": 23862 }, { "epoch": 7.295322531335983, "grad_norm": 0.3795454502105713, "learning_rate": 2.3205384060124837e-05, "loss": 0.1066, "step": 23863 }, { "epoch": 7.295628248242128, "grad_norm": 0.6522400975227356, "learning_rate": 2.32049594497049e-05, "loss": 0.1285, "step": 23864 }, { "epoch": 7.295933965148273, "grad_norm": 0.5838062167167664, "learning_rate": 2.3204534839284958e-05, "loss": 0.1174, "step": 23865 }, { "epoch": 7.296239682054417, "grad_norm": 0.4402773976325989, "learning_rate": 2.3204110228865017e-05, "loss": 0.1331, "step": 23866 }, { "epoch": 7.296545398960562, "grad_norm": 0.7123770713806152, "learning_rate": 2.320368561844508e-05, "loss": 0.1442, "step": 23867 }, { "epoch": 7.2968511158667075, "grad_norm": 0.571160614490509, "learning_rate": 2.3203261008025138e-05, "loss": 0.1667, "step": 23868 }, { "epoch": 7.297156832772853, "grad_norm": 0.6983046531677246, "learning_rate": 2.32028363976052e-05, "loss": 0.1765, "step": 23869 }, { "epoch": 7.297462549678997, "grad_norm": 0.633408784866333, "learning_rate": 2.320241178718526e-05, "loss": 0.2147, "step": 23870 }, { "epoch": 7.297768266585142, "grad_norm": 0.9154729247093201, "learning_rate": 2.320198717676532e-05, "loss": 0.1552, "step": 23871 }, { "epoch": 7.298073983491287, "grad_norm": 2.1865878105163574, "learning_rate": 2.320156256634538e-05, "loss": 0.2118, "step": 23872 }, { "epoch": 7.298379700397432, "grad_norm": 1.2641278505325317, "learning_rate": 2.320113795592544e-05, "loss": 0.1419, "step": 23873 }, { "epoch": 7.298685417303577, "grad_norm": 0.2361089140176773, "learning_rate": 2.32007133455055e-05, "loss": 0.0656, "step": 23874 }, { "epoch": 7.298991134209722, "grad_norm": 0.3454267382621765, "learning_rate": 2.3200288735085562e-05, "loss": 0.0919, "step": 23875 }, { "epoch": 7.299296851115867, "grad_norm": 0.3168964087963104, "learning_rate": 2.319986412466562e-05, "loss": 0.0623, "step": 23876 }, { "epoch": 7.299602568022012, "grad_norm": 1.473423957824707, "learning_rate": 2.3199439514245683e-05, "loss": 0.0645, "step": 23877 }, { "epoch": 7.299908284928157, "grad_norm": 0.15473470091819763, "learning_rate": 2.319901490382574e-05, "loss": 0.0432, "step": 23878 }, { "epoch": 7.300214001834301, "grad_norm": 0.3272840976715088, "learning_rate": 2.31985902934058e-05, "loss": 0.0497, "step": 23879 }, { "epoch": 7.300519718740446, "grad_norm": 0.2878062129020691, "learning_rate": 2.3198165682985862e-05, "loss": 0.0647, "step": 23880 }, { "epoch": 7.300825435646591, "grad_norm": 0.4353795051574707, "learning_rate": 2.319774107256592e-05, "loss": 0.0914, "step": 23881 }, { "epoch": 7.3011311525527365, "grad_norm": 0.46554169058799744, "learning_rate": 2.3197316462145983e-05, "loss": 0.0765, "step": 23882 }, { "epoch": 7.301436869458881, "grad_norm": 0.3869338631629944, "learning_rate": 2.3196891851726042e-05, "loss": 0.0678, "step": 23883 }, { "epoch": 7.301742586365026, "grad_norm": 0.3823343515396118, "learning_rate": 2.3196467241306104e-05, "loss": 0.0728, "step": 23884 }, { "epoch": 7.302048303271171, "grad_norm": 0.3608725965023041, "learning_rate": 2.3196042630886163e-05, "loss": 0.0824, "step": 23885 }, { "epoch": 7.302354020177316, "grad_norm": 0.5952056050300598, "learning_rate": 2.3195618020466225e-05, "loss": 0.112, "step": 23886 }, { "epoch": 7.302659737083461, "grad_norm": 0.36357682943344116, "learning_rate": 2.3195193410046283e-05, "loss": 0.1203, "step": 23887 }, { "epoch": 7.302965453989605, "grad_norm": 0.4984031617641449, "learning_rate": 2.3194768799626346e-05, "loss": 0.1261, "step": 23888 }, { "epoch": 7.3032711708957505, "grad_norm": 0.9246265292167664, "learning_rate": 2.3194344189206404e-05, "loss": 0.154, "step": 23889 }, { "epoch": 7.303576887801896, "grad_norm": 1.2141603231430054, "learning_rate": 2.3193919578786466e-05, "loss": 0.1318, "step": 23890 }, { "epoch": 7.303882604708041, "grad_norm": 2.175302743911743, "learning_rate": 2.3193494968366525e-05, "loss": 0.1407, "step": 23891 }, { "epoch": 7.304188321614185, "grad_norm": 2.3923046588897705, "learning_rate": 2.3193070357946584e-05, "loss": 0.1585, "step": 23892 }, { "epoch": 7.30449403852033, "grad_norm": 1.0512028932571411, "learning_rate": 2.3192645747526646e-05, "loss": 0.1627, "step": 23893 }, { "epoch": 7.304799755426475, "grad_norm": 0.7506199479103088, "learning_rate": 2.3192221137106705e-05, "loss": 0.1682, "step": 23894 }, { "epoch": 7.30510547233262, "grad_norm": 0.6418784856796265, "learning_rate": 2.3191796526686767e-05, "loss": 0.1767, "step": 23895 }, { "epoch": 7.3054111892387645, "grad_norm": 0.728244423866272, "learning_rate": 2.3191371916266825e-05, "loss": 0.1857, "step": 23896 }, { "epoch": 7.30571690614491, "grad_norm": 1.2794231176376343, "learning_rate": 2.3190947305846887e-05, "loss": 0.2217, "step": 23897 }, { "epoch": 7.306022623051055, "grad_norm": 0.6630212068557739, "learning_rate": 2.3190522695426946e-05, "loss": 0.1518, "step": 23898 }, { "epoch": 7.3063283399572, "grad_norm": 0.3408677279949188, "learning_rate": 2.3190098085007008e-05, "loss": 0.1114, "step": 23899 }, { "epoch": 7.306634056863345, "grad_norm": 0.24706241488456726, "learning_rate": 2.3189673474587067e-05, "loss": 0.0608, "step": 23900 }, { "epoch": 7.306939773769489, "grad_norm": 0.2821144163608551, "learning_rate": 2.318924886416713e-05, "loss": 0.0691, "step": 23901 }, { "epoch": 7.307245490675634, "grad_norm": 0.3791678845882416, "learning_rate": 2.3188824253747188e-05, "loss": 0.0495, "step": 23902 }, { "epoch": 7.3075512075817795, "grad_norm": 1.276155710220337, "learning_rate": 2.318839964332725e-05, "loss": 0.0358, "step": 23903 }, { "epoch": 7.307856924487925, "grad_norm": 0.26700568199157715, "learning_rate": 2.318797503290731e-05, "loss": 0.0597, "step": 23904 }, { "epoch": 7.308162641394069, "grad_norm": 0.18867693841457367, "learning_rate": 2.3187550422487367e-05, "loss": 0.0493, "step": 23905 }, { "epoch": 7.308468358300214, "grad_norm": 0.17558744549751282, "learning_rate": 2.318712581206743e-05, "loss": 0.0548, "step": 23906 }, { "epoch": 7.308774075206359, "grad_norm": 0.3345417082309723, "learning_rate": 2.3186701201647488e-05, "loss": 0.0573, "step": 23907 }, { "epoch": 7.309079792112504, "grad_norm": 0.7432676553726196, "learning_rate": 2.318627659122755e-05, "loss": 0.1061, "step": 23908 }, { "epoch": 7.309385509018648, "grad_norm": 0.4716528058052063, "learning_rate": 2.318585198080761e-05, "loss": 0.0617, "step": 23909 }, { "epoch": 7.3096912259247935, "grad_norm": 0.17835237085819244, "learning_rate": 2.318542737038767e-05, "loss": 0.0766, "step": 23910 }, { "epoch": 7.309996942830939, "grad_norm": 0.32393383979797363, "learning_rate": 2.318500275996773e-05, "loss": 0.1323, "step": 23911 }, { "epoch": 7.310302659737084, "grad_norm": 0.5987812876701355, "learning_rate": 2.318457814954779e-05, "loss": 0.1082, "step": 23912 }, { "epoch": 7.310608376643229, "grad_norm": 0.61748206615448, "learning_rate": 2.318415353912785e-05, "loss": 0.1246, "step": 23913 }, { "epoch": 7.310914093549373, "grad_norm": 0.40666183829307556, "learning_rate": 2.3183728928707912e-05, "loss": 0.1283, "step": 23914 }, { "epoch": 7.311219810455518, "grad_norm": 0.4495156705379486, "learning_rate": 2.318330431828797e-05, "loss": 0.1105, "step": 23915 }, { "epoch": 7.311525527361663, "grad_norm": 0.4638597071170807, "learning_rate": 2.3182879707868033e-05, "loss": 0.1535, "step": 23916 }, { "epoch": 7.311831244267808, "grad_norm": 0.46969127655029297, "learning_rate": 2.3182455097448092e-05, "loss": 0.1393, "step": 23917 }, { "epoch": 7.312136961173953, "grad_norm": 0.7166615724563599, "learning_rate": 2.318203048702815e-05, "loss": 0.1279, "step": 23918 }, { "epoch": 7.312442678080098, "grad_norm": 0.5580765604972839, "learning_rate": 2.3181605876608213e-05, "loss": 0.1357, "step": 23919 }, { "epoch": 7.312748394986243, "grad_norm": 1.1279016733169556, "learning_rate": 2.318118126618827e-05, "loss": 0.1435, "step": 23920 }, { "epoch": 7.313054111892388, "grad_norm": 1.2911899089813232, "learning_rate": 2.3180756655768333e-05, "loss": 0.2035, "step": 23921 }, { "epoch": 7.313359828798532, "grad_norm": 2.0224404335021973, "learning_rate": 2.3180332045348392e-05, "loss": 0.2107, "step": 23922 }, { "epoch": 7.313665545704677, "grad_norm": 0.5141564607620239, "learning_rate": 2.3179907434928454e-05, "loss": 0.1361, "step": 23923 }, { "epoch": 7.313971262610822, "grad_norm": 0.3064340651035309, "learning_rate": 2.3179482824508513e-05, "loss": 0.0709, "step": 23924 }, { "epoch": 7.3142769795169675, "grad_norm": 0.41244131326675415, "learning_rate": 2.3179058214088575e-05, "loss": 0.0822, "step": 23925 }, { "epoch": 7.314582696423113, "grad_norm": 0.751066267490387, "learning_rate": 2.3178633603668634e-05, "loss": 0.0615, "step": 23926 }, { "epoch": 7.314888413329257, "grad_norm": 0.5163701772689819, "learning_rate": 2.3178208993248696e-05, "loss": 0.0622, "step": 23927 }, { "epoch": 7.315194130235402, "grad_norm": 0.6134297847747803, "learning_rate": 2.3177784382828755e-05, "loss": 0.0586, "step": 23928 }, { "epoch": 7.315499847141547, "grad_norm": 0.3604957163333893, "learning_rate": 2.3177359772408817e-05, "loss": 0.0423, "step": 23929 }, { "epoch": 7.315805564047692, "grad_norm": 0.3128524422645569, "learning_rate": 2.3176935161988875e-05, "loss": 0.0823, "step": 23930 }, { "epoch": 7.3161112809538364, "grad_norm": 0.25266480445861816, "learning_rate": 2.3176510551568934e-05, "loss": 0.0558, "step": 23931 }, { "epoch": 7.316416997859982, "grad_norm": 0.23381051421165466, "learning_rate": 2.3176085941148996e-05, "loss": 0.0518, "step": 23932 }, { "epoch": 7.316722714766127, "grad_norm": 0.5354254841804504, "learning_rate": 2.3175661330729055e-05, "loss": 0.0986, "step": 23933 }, { "epoch": 7.317028431672272, "grad_norm": 0.4018115699291229, "learning_rate": 2.3175236720309117e-05, "loss": 0.0651, "step": 23934 }, { "epoch": 7.317334148578416, "grad_norm": 0.6323752403259277, "learning_rate": 2.3174812109889176e-05, "loss": 0.0923, "step": 23935 }, { "epoch": 7.317639865484561, "grad_norm": 1.1013585329055786, "learning_rate": 2.3174387499469238e-05, "loss": 0.1264, "step": 23936 }, { "epoch": 7.317945582390706, "grad_norm": 0.35086312890052795, "learning_rate": 2.3173962889049296e-05, "loss": 0.144, "step": 23937 }, { "epoch": 7.318251299296851, "grad_norm": 0.9254657626152039, "learning_rate": 2.317353827862936e-05, "loss": 0.1155, "step": 23938 }, { "epoch": 7.3185570162029965, "grad_norm": 2.834726333618164, "learning_rate": 2.3173113668209417e-05, "loss": 0.1365, "step": 23939 }, { "epoch": 7.318862733109141, "grad_norm": 1.0431766510009766, "learning_rate": 2.317268905778948e-05, "loss": 0.1528, "step": 23940 }, { "epoch": 7.319168450015286, "grad_norm": 0.5493155121803284, "learning_rate": 2.3172264447369538e-05, "loss": 0.1665, "step": 23941 }, { "epoch": 7.319474166921431, "grad_norm": 0.4090520143508911, "learning_rate": 2.31718398369496e-05, "loss": 0.1877, "step": 23942 }, { "epoch": 7.319779883827576, "grad_norm": 0.8909623026847839, "learning_rate": 2.317141522652966e-05, "loss": 0.1483, "step": 23943 }, { "epoch": 7.32008560073372, "grad_norm": 1.490296721458435, "learning_rate": 2.3170990616109717e-05, "loss": 0.1643, "step": 23944 }, { "epoch": 7.320391317639865, "grad_norm": 0.9645735621452332, "learning_rate": 2.317056600568978e-05, "loss": 0.1815, "step": 23945 }, { "epoch": 7.3206970345460105, "grad_norm": 3.149449586868286, "learning_rate": 2.3170141395269838e-05, "loss": 0.1871, "step": 23946 }, { "epoch": 7.321002751452156, "grad_norm": 9.631153106689453, "learning_rate": 2.31697167848499e-05, "loss": 0.2066, "step": 23947 }, { "epoch": 7.3213084683583, "grad_norm": 0.8233566284179688, "learning_rate": 2.316929217442996e-05, "loss": 0.1478, "step": 23948 }, { "epoch": 7.321614185264445, "grad_norm": 0.5068669319152832, "learning_rate": 2.316886756401002e-05, "loss": 0.1062, "step": 23949 }, { "epoch": 7.32191990217059, "grad_norm": 0.8042405247688293, "learning_rate": 2.316844295359008e-05, "loss": 0.0926, "step": 23950 }, { "epoch": 7.322225619076735, "grad_norm": 0.22273173928260803, "learning_rate": 2.3168018343170142e-05, "loss": 0.0541, "step": 23951 }, { "epoch": 7.32253133598288, "grad_norm": 0.6677346229553223, "learning_rate": 2.31675937327502e-05, "loss": 0.0382, "step": 23952 }, { "epoch": 7.3228370528890245, "grad_norm": 0.6307243704795837, "learning_rate": 2.3167169122330263e-05, "loss": 0.0483, "step": 23953 }, { "epoch": 7.32314276979517, "grad_norm": 0.23144373297691345, "learning_rate": 2.316674451191032e-05, "loss": 0.0683, "step": 23954 }, { "epoch": 7.323448486701315, "grad_norm": 1.002418041229248, "learning_rate": 2.3166319901490383e-05, "loss": 0.0489, "step": 23955 }, { "epoch": 7.32375420360746, "grad_norm": 0.597310483455658, "learning_rate": 2.3165895291070442e-05, "loss": 0.0732, "step": 23956 }, { "epoch": 7.324059920513604, "grad_norm": 0.2409968227148056, "learning_rate": 2.31654706806505e-05, "loss": 0.0457, "step": 23957 }, { "epoch": 7.324365637419749, "grad_norm": 0.21313738822937012, "learning_rate": 2.3165046070230563e-05, "loss": 0.0587, "step": 23958 }, { "epoch": 7.324671354325894, "grad_norm": 0.4379555881023407, "learning_rate": 2.316462145981062e-05, "loss": 0.076, "step": 23959 }, { "epoch": 7.3249770712320394, "grad_norm": 0.4547993242740631, "learning_rate": 2.3164196849390684e-05, "loss": 0.0885, "step": 23960 }, { "epoch": 7.325282788138184, "grad_norm": 0.8066165447235107, "learning_rate": 2.3163772238970742e-05, "loss": 0.0911, "step": 23961 }, { "epoch": 7.325588505044329, "grad_norm": 0.4970864951610565, "learning_rate": 2.3163347628550805e-05, "loss": 0.1025, "step": 23962 }, { "epoch": 7.325894221950474, "grad_norm": 0.45138174295425415, "learning_rate": 2.3162923018130863e-05, "loss": 0.1315, "step": 23963 }, { "epoch": 7.326199938856619, "grad_norm": 2.6973214149475098, "learning_rate": 2.3162498407710925e-05, "loss": 0.1248, "step": 23964 }, { "epoch": 7.326505655762764, "grad_norm": 1.8511420488357544, "learning_rate": 2.3162073797290987e-05, "loss": 0.1517, "step": 23965 }, { "epoch": 7.326811372668908, "grad_norm": 0.8997952342033386, "learning_rate": 2.316164918687105e-05, "loss": 0.1843, "step": 23966 }, { "epoch": 7.3271170895750535, "grad_norm": 1.524084210395813, "learning_rate": 2.3161224576451108e-05, "loss": 0.1442, "step": 23967 }, { "epoch": 7.327422806481199, "grad_norm": 0.7852483987808228, "learning_rate": 2.3160799966031167e-05, "loss": 0.1859, "step": 23968 }, { "epoch": 7.327728523387344, "grad_norm": 0.4738018214702606, "learning_rate": 2.316037535561123e-05, "loss": 0.1595, "step": 23969 }, { "epoch": 7.328034240293488, "grad_norm": 0.6250832676887512, "learning_rate": 2.3159950745191288e-05, "loss": 0.1668, "step": 23970 }, { "epoch": 7.328339957199633, "grad_norm": 0.5363277792930603, "learning_rate": 2.315952613477135e-05, "loss": 0.1528, "step": 23971 }, { "epoch": 7.328645674105778, "grad_norm": 0.916037380695343, "learning_rate": 2.315910152435141e-05, "loss": 0.2048, "step": 23972 }, { "epoch": 7.328951391011923, "grad_norm": 0.4278410077095032, "learning_rate": 2.315867691393147e-05, "loss": 0.1239, "step": 23973 }, { "epoch": 7.3292571079180675, "grad_norm": 0.3395937383174896, "learning_rate": 2.315825230351153e-05, "loss": 0.0901, "step": 23974 }, { "epoch": 7.329562824824213, "grad_norm": 0.2836955487728119, "learning_rate": 2.315782769309159e-05, "loss": 0.0534, "step": 23975 }, { "epoch": 7.329868541730358, "grad_norm": 1.010936975479126, "learning_rate": 2.315740308267165e-05, "loss": 0.0698, "step": 23976 }, { "epoch": 7.330174258636503, "grad_norm": 0.14916926622390747, "learning_rate": 2.3156978472251712e-05, "loss": 0.047, "step": 23977 }, { "epoch": 7.330479975542648, "grad_norm": 0.3316066861152649, "learning_rate": 2.315655386183177e-05, "loss": 0.0632, "step": 23978 }, { "epoch": 7.330785692448792, "grad_norm": 0.23658272624015808, "learning_rate": 2.3156129251411833e-05, "loss": 0.0792, "step": 23979 }, { "epoch": 7.331091409354937, "grad_norm": 0.27476832270622253, "learning_rate": 2.315570464099189e-05, "loss": 0.0658, "step": 23980 }, { "epoch": 7.331397126261082, "grad_norm": 0.28388094902038574, "learning_rate": 2.315528003057195e-05, "loss": 0.0636, "step": 23981 }, { "epoch": 7.3317028431672275, "grad_norm": 0.9236795902252197, "learning_rate": 2.3154855420152012e-05, "loss": 0.0742, "step": 23982 }, { "epoch": 7.332008560073372, "grad_norm": 0.2750842571258545, "learning_rate": 2.315443080973207e-05, "loss": 0.0548, "step": 23983 }, { "epoch": 7.332314276979517, "grad_norm": 0.37107059359550476, "learning_rate": 2.3154006199312133e-05, "loss": 0.0851, "step": 23984 }, { "epoch": 7.332619993885662, "grad_norm": 1.5662755966186523, "learning_rate": 2.3153581588892192e-05, "loss": 0.1093, "step": 23985 }, { "epoch": 7.332925710791807, "grad_norm": 0.3789205849170685, "learning_rate": 2.3153156978472254e-05, "loss": 0.1086, "step": 23986 }, { "epoch": 7.333231427697951, "grad_norm": 0.521248459815979, "learning_rate": 2.3152732368052313e-05, "loss": 0.1246, "step": 23987 }, { "epoch": 7.3335371446040964, "grad_norm": 0.3398914635181427, "learning_rate": 2.3152307757632375e-05, "loss": 0.1264, "step": 23988 }, { "epoch": 7.333842861510242, "grad_norm": 0.39047306776046753, "learning_rate": 2.3151883147212433e-05, "loss": 0.1477, "step": 23989 }, { "epoch": 7.334148578416387, "grad_norm": 0.5977687835693359, "learning_rate": 2.3151458536792496e-05, "loss": 0.1551, "step": 23990 }, { "epoch": 7.334454295322532, "grad_norm": 0.5765331387519836, "learning_rate": 2.3151033926372554e-05, "loss": 0.1709, "step": 23991 }, { "epoch": 7.334760012228676, "grad_norm": 0.4538646638393402, "learning_rate": 2.3150609315952616e-05, "loss": 0.1651, "step": 23992 }, { "epoch": 7.335065729134821, "grad_norm": 1.6461440324783325, "learning_rate": 2.3150184705532675e-05, "loss": 0.1494, "step": 23993 }, { "epoch": 7.335371446040966, "grad_norm": 2.5345730781555176, "learning_rate": 2.3149760095112734e-05, "loss": 0.1731, "step": 23994 }, { "epoch": 7.335677162947111, "grad_norm": 19.407346725463867, "learning_rate": 2.3149335484692796e-05, "loss": 0.1851, "step": 23995 }, { "epoch": 7.335982879853256, "grad_norm": 1.3814663887023926, "learning_rate": 2.3148910874272855e-05, "loss": 0.2377, "step": 23996 }, { "epoch": 7.336288596759401, "grad_norm": 1.6397733688354492, "learning_rate": 2.3148486263852917e-05, "loss": 0.198, "step": 23997 }, { "epoch": 7.336594313665546, "grad_norm": 0.28886592388153076, "learning_rate": 2.3148061653432975e-05, "loss": 0.126, "step": 23998 }, { "epoch": 7.336900030571691, "grad_norm": 0.3009711503982544, "learning_rate": 2.3147637043013037e-05, "loss": 0.08, "step": 23999 }, { "epoch": 7.337205747477835, "grad_norm": 0.2695312201976776, "learning_rate": 2.3147212432593096e-05, "loss": 0.0843, "step": 24000 }, { "epoch": 7.337205747477835, "eval_cer": 0.18858629832084056, "eval_loss": 0.22998958826065063, "eval_runtime": 19.1035, "eval_samples_per_second": 237.548, "eval_steps_per_second": 0.785, "eval_wer": 0.32870590206977457, "step": 24000 }, { "epoch": 7.337205747477835, "step": 24000, "total_flos": 8.234870246741828e+20, "train_loss": 0.2190107949541416, "train_runtime": 52573.3399, "train_samples_per_second": 477.739, "train_steps_per_second": 1.493 } ], "logging_steps": 1.0, "max_steps": 78504, "num_input_tokens_seen": 0, "num_train_epochs": 24, "save_steps": 1000, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 5 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.234870246741828e+20, "train_batch_size": 160, "trial_name": null, "trial_params": null }