{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 23940, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04177109440267335, "grad_norm": 3.0113365650177, "learning_rate": 1.9959896399030832e-05, "loss": 3.8241, "step": 50 }, { "epoch": 0.0835421888053467, "grad_norm": 83.7524642944336, "learning_rate": 1.9918121814687946e-05, "loss": 2.3334, "step": 100 }, { "epoch": 0.12531328320802004, "grad_norm": 3.9804179668426514, "learning_rate": 1.987634723034506e-05, "loss": 1.8564, "step": 150 }, { "epoch": 0.1670843776106934, "grad_norm": 4.718357563018799, "learning_rate": 1.9834572646002174e-05, "loss": 1.7395, "step": 200 }, { "epoch": 0.20885547201336674, "grad_norm": 7.3187336921691895, "learning_rate": 1.9792798061659288e-05, "loss": 1.7964, "step": 250 }, { "epoch": 0.2506265664160401, "grad_norm": 4.755648136138916, "learning_rate": 1.9751023477316402e-05, "loss": 1.8215, "step": 300 }, { "epoch": 0.29239766081871343, "grad_norm": 103.38441467285156, "learning_rate": 1.9709248892973516e-05, "loss": 1.8225, "step": 350 }, { "epoch": 0.3341687552213868, "grad_norm": 2.0915794372558594, "learning_rate": 1.966747430863063e-05, "loss": 1.8623, "step": 400 }, { "epoch": 0.37593984962406013, "grad_norm": 5.180814266204834, "learning_rate": 1.9625699724287744e-05, "loss": 1.7722, "step": 450 }, { "epoch": 0.4177109440267335, "grad_norm": 2.494590997695923, "learning_rate": 1.9583925139944858e-05, "loss": 1.8884, "step": 500 }, { "epoch": 0.4594820384294068, "grad_norm": 2.6546897888183594, "learning_rate": 1.9542150555601972e-05, "loss": 1.8798, "step": 550 }, { "epoch": 0.5012531328320802, "grad_norm": 3.439070224761963, "learning_rate": 1.9500375971259086e-05, "loss": 1.7612, "step": 600 }, { "epoch": 0.5430242272347535, "grad_norm": 2.7611918449401855, "learning_rate": 1.9458601386916203e-05, "loss": 1.808, "step": 650 }, { "epoch": 0.5847953216374269, "grad_norm": 3.4334182739257812, "learning_rate": 1.9416826802573317e-05, "loss": 1.6839, "step": 700 }, { "epoch": 0.6265664160401002, "grad_norm": 4.846071720123291, "learning_rate": 1.937505221823043e-05, "loss": 1.7979, "step": 750 }, { "epoch": 0.6683375104427736, "grad_norm": 1.9863933324813843, "learning_rate": 1.9333277633887545e-05, "loss": 1.8056, "step": 800 }, { "epoch": 0.7101086048454469, "grad_norm": 3.9223458766937256, "learning_rate": 1.9291503049544656e-05, "loss": 1.6927, "step": 850 }, { "epoch": 0.7518796992481203, "grad_norm": 2.9789743423461914, "learning_rate": 1.9249728465201773e-05, "loss": 1.8234, "step": 900 }, { "epoch": 0.7936507936507936, "grad_norm": 2.060720443725586, "learning_rate": 1.9207953880858887e-05, "loss": 1.6206, "step": 950 }, { "epoch": 0.835421888053467, "grad_norm": 2.2652785778045654, "learning_rate": 1.9166179296516e-05, "loss": 1.6918, "step": 1000 }, { "epoch": 0.8771929824561403, "grad_norm": 2.8497183322906494, "learning_rate": 1.9124404712173115e-05, "loss": 1.6869, "step": 1050 }, { "epoch": 0.9189640768588136, "grad_norm": 4.50927209854126, "learning_rate": 1.908263012783023e-05, "loss": 1.8064, "step": 1100 }, { "epoch": 0.960735171261487, "grad_norm": 3.4589695930480957, "learning_rate": 1.9040855543487343e-05, "loss": 1.6045, "step": 1150 }, { "epoch": 1.0025062656641603, "grad_norm": 2.7117397785186768, "learning_rate": 1.8999080959144457e-05, "loss": 1.6438, "step": 1200 }, { "epoch": 1.0442773600668338, "grad_norm": 2.4935483932495117, "learning_rate": 1.895730637480157e-05, "loss": 1.6739, "step": 1250 }, { "epoch": 1.086048454469507, "grad_norm": 2.9701192378997803, "learning_rate": 1.8915531790458685e-05, "loss": 1.733, "step": 1300 }, { "epoch": 1.1278195488721805, "grad_norm": 3.8167076110839844, "learning_rate": 1.8873757206115803e-05, "loss": 1.6363, "step": 1350 }, { "epoch": 1.1695906432748537, "grad_norm": 2.2734756469726562, "learning_rate": 1.8831982621772917e-05, "loss": 1.6808, "step": 1400 }, { "epoch": 1.2113617376775272, "grad_norm": 3.4276649951934814, "learning_rate": 1.879020803743003e-05, "loss": 1.7535, "step": 1450 }, { "epoch": 1.2531328320802004, "grad_norm": 4.668817043304443, "learning_rate": 1.874843345308714e-05, "loss": 1.6547, "step": 1500 }, { "epoch": 1.294903926482874, "grad_norm": 6.21998405456543, "learning_rate": 1.870665886874426e-05, "loss": 1.5982, "step": 1550 }, { "epoch": 1.3366750208855471, "grad_norm": 8.45278549194336, "learning_rate": 1.8664884284401372e-05, "loss": 1.6862, "step": 1600 }, { "epoch": 1.3784461152882206, "grad_norm": 3.1017799377441406, "learning_rate": 1.8623109700058486e-05, "loss": 1.5701, "step": 1650 }, { "epoch": 1.4202172096908938, "grad_norm": 5.954344272613525, "learning_rate": 1.85813351157156e-05, "loss": 1.5799, "step": 1700 }, { "epoch": 1.4619883040935673, "grad_norm": 4.534444808959961, "learning_rate": 1.8539560531372714e-05, "loss": 1.6148, "step": 1750 }, { "epoch": 1.5037593984962405, "grad_norm": 6.303493976593018, "learning_rate": 1.849778594702983e-05, "loss": 1.527, "step": 1800 }, { "epoch": 1.545530492898914, "grad_norm": 3.002648115158081, "learning_rate": 1.8456011362686942e-05, "loss": 1.6922, "step": 1850 }, { "epoch": 1.5873015873015874, "grad_norm": 10.854593276977539, "learning_rate": 1.8414236778344056e-05, "loss": 1.5546, "step": 1900 }, { "epoch": 1.6290726817042607, "grad_norm": 4.601009368896484, "learning_rate": 1.837246219400117e-05, "loss": 1.5693, "step": 1950 }, { "epoch": 1.670843776106934, "grad_norm": 3.6931846141815186, "learning_rate": 1.8330687609658288e-05, "loss": 1.6022, "step": 2000 }, { "epoch": 1.7126148705096074, "grad_norm": 11.248991012573242, "learning_rate": 1.82889130253154e-05, "loss": 1.4866, "step": 2050 }, { "epoch": 1.7543859649122808, "grad_norm": 2.7923028469085693, "learning_rate": 1.8247138440972512e-05, "loss": 1.5897, "step": 2100 }, { "epoch": 1.796157059314954, "grad_norm": 4.407562732696533, "learning_rate": 1.8205363856629626e-05, "loss": 1.4225, "step": 2150 }, { "epoch": 1.8379281537176273, "grad_norm": 7.397324562072754, "learning_rate": 1.816358927228674e-05, "loss": 1.5255, "step": 2200 }, { "epoch": 1.8796992481203008, "grad_norm": 12.785990715026855, "learning_rate": 1.8121814687943858e-05, "loss": 1.5248, "step": 2250 }, { "epoch": 1.9214703425229742, "grad_norm": 6.455284595489502, "learning_rate": 1.808004010360097e-05, "loss": 1.5326, "step": 2300 }, { "epoch": 1.9632414369256475, "grad_norm": 3.2796213626861572, "learning_rate": 1.8038265519258086e-05, "loss": 1.5007, "step": 2350 }, { "epoch": 2.0050125313283207, "grad_norm": 7.908225059509277, "learning_rate": 1.79964909349152e-05, "loss": 1.4949, "step": 2400 }, { "epoch": 2.046783625730994, "grad_norm": 12.498689651489258, "learning_rate": 1.7954716350572314e-05, "loss": 1.3854, "step": 2450 }, { "epoch": 2.0885547201336676, "grad_norm": 13.456962585449219, "learning_rate": 1.7912941766229427e-05, "loss": 1.3631, "step": 2500 }, { "epoch": 2.1303258145363406, "grad_norm": 12.199966430664062, "learning_rate": 1.787116718188654e-05, "loss": 1.3836, "step": 2550 }, { "epoch": 2.172096908939014, "grad_norm": 3.8431236743927, "learning_rate": 1.7829392597543655e-05, "loss": 1.34, "step": 2600 }, { "epoch": 2.2138680033416875, "grad_norm": 16.643613815307617, "learning_rate": 1.778761801320077e-05, "loss": 1.4281, "step": 2650 }, { "epoch": 2.255639097744361, "grad_norm": 14.22512435913086, "learning_rate": 1.7745843428857887e-05, "loss": 1.3201, "step": 2700 }, { "epoch": 2.2974101921470345, "grad_norm": 12.082077026367188, "learning_rate": 1.7704068844514997e-05, "loss": 1.4239, "step": 2750 }, { "epoch": 2.3391812865497075, "grad_norm": 11.998297691345215, "learning_rate": 1.766229426017211e-05, "loss": 1.4133, "step": 2800 }, { "epoch": 2.380952380952381, "grad_norm": 20.736862182617188, "learning_rate": 1.7620519675829225e-05, "loss": 1.2625, "step": 2850 }, { "epoch": 2.4227234753550544, "grad_norm": 8.49312973022461, "learning_rate": 1.7578745091486343e-05, "loss": 1.3471, "step": 2900 }, { "epoch": 2.4644945697577274, "grad_norm": 8.388376235961914, "learning_rate": 1.7536970507143457e-05, "loss": 1.2723, "step": 2950 }, { "epoch": 2.506265664160401, "grad_norm": 11.559739112854004, "learning_rate": 1.749519592280057e-05, "loss": 1.306, "step": 3000 }, { "epoch": 2.5480367585630743, "grad_norm": 10.767087936401367, "learning_rate": 1.7453421338457685e-05, "loss": 1.3359, "step": 3050 }, { "epoch": 2.589807852965748, "grad_norm": 14.302842140197754, "learning_rate": 1.74116467541148e-05, "loss": 1.3609, "step": 3100 }, { "epoch": 2.6315789473684212, "grad_norm": 26.425922393798828, "learning_rate": 1.7369872169771913e-05, "loss": 1.3165, "step": 3150 }, { "epoch": 2.6733500417710943, "grad_norm": 21.576194763183594, "learning_rate": 1.7328097585429027e-05, "loss": 1.4171, "step": 3200 }, { "epoch": 2.7151211361737677, "grad_norm": 13.986672401428223, "learning_rate": 1.728632300108614e-05, "loss": 1.2982, "step": 3250 }, { "epoch": 2.756892230576441, "grad_norm": 20.48072052001953, "learning_rate": 1.7244548416743255e-05, "loss": 1.3041, "step": 3300 }, { "epoch": 2.798663324979114, "grad_norm": 0.8754037618637085, "learning_rate": 1.720277383240037e-05, "loss": 1.2004, "step": 3350 }, { "epoch": 2.8404344193817876, "grad_norm": 32.664878845214844, "learning_rate": 1.7160999248057483e-05, "loss": 1.2166, "step": 3400 }, { "epoch": 2.882205513784461, "grad_norm": 13.420926094055176, "learning_rate": 1.7119224663714597e-05, "loss": 1.2593, "step": 3450 }, { "epoch": 2.9239766081871346, "grad_norm": 16.320348739624023, "learning_rate": 1.707745007937171e-05, "loss": 1.2575, "step": 3500 }, { "epoch": 2.965747702589808, "grad_norm": 51.816585540771484, "learning_rate": 1.7035675495028824e-05, "loss": 1.3123, "step": 3550 }, { "epoch": 3.007518796992481, "grad_norm": 7.434196472167969, "learning_rate": 1.6993900910685942e-05, "loss": 1.1879, "step": 3600 }, { "epoch": 3.0492898913951545, "grad_norm": 19.095539093017578, "learning_rate": 1.6952126326343056e-05, "loss": 1.0774, "step": 3650 }, { "epoch": 3.091060985797828, "grad_norm": 11.363150596618652, "learning_rate": 1.691035174200017e-05, "loss": 1.1259, "step": 3700 }, { "epoch": 3.1328320802005014, "grad_norm": 40.07558822631836, "learning_rate": 1.6868577157657284e-05, "loss": 1.058, "step": 3750 }, { "epoch": 3.1746031746031744, "grad_norm": 23.62975311279297, "learning_rate": 1.6826802573314394e-05, "loss": 1.0941, "step": 3800 }, { "epoch": 3.216374269005848, "grad_norm": 23.654922485351562, "learning_rate": 1.6785027988971512e-05, "loss": 0.9878, "step": 3850 }, { "epoch": 3.2581453634085213, "grad_norm": 15.095159530639648, "learning_rate": 1.6743253404628626e-05, "loss": 1.0991, "step": 3900 }, { "epoch": 3.299916457811195, "grad_norm": 24.225435256958008, "learning_rate": 1.670147882028574e-05, "loss": 1.0553, "step": 3950 }, { "epoch": 3.341687552213868, "grad_norm": 41.239261627197266, "learning_rate": 1.6659704235942854e-05, "loss": 1.015, "step": 4000 }, { "epoch": 3.3834586466165413, "grad_norm": 21.136133193969727, "learning_rate": 1.6617929651599968e-05, "loss": 1.0451, "step": 4050 }, { "epoch": 3.4252297410192147, "grad_norm": 14.230918884277344, "learning_rate": 1.657615506725708e-05, "loss": 1.0563, "step": 4100 }, { "epoch": 3.467000835421888, "grad_norm": 16.35744285583496, "learning_rate": 1.6534380482914196e-05, "loss": 0.9853, "step": 4150 }, { "epoch": 3.5087719298245617, "grad_norm": 19.833560943603516, "learning_rate": 1.649260589857131e-05, "loss": 1.0809, "step": 4200 }, { "epoch": 3.5505430242272347, "grad_norm": 19.3465518951416, "learning_rate": 1.6450831314228424e-05, "loss": 1.06, "step": 4250 }, { "epoch": 3.592314118629908, "grad_norm": 47.185245513916016, "learning_rate": 1.640905672988554e-05, "loss": 0.9934, "step": 4300 }, { "epoch": 3.6340852130325816, "grad_norm": 19.286062240600586, "learning_rate": 1.6367282145542655e-05, "loss": 0.9328, "step": 4350 }, { "epoch": 3.6758563074352546, "grad_norm": 27.635419845581055, "learning_rate": 1.6325507561199766e-05, "loss": 0.9286, "step": 4400 }, { "epoch": 3.717627401837928, "grad_norm": 28.93077850341797, "learning_rate": 1.628373297685688e-05, "loss": 0.8911, "step": 4450 }, { "epoch": 3.7593984962406015, "grad_norm": 6.602542877197266, "learning_rate": 1.6241958392513997e-05, "loss": 0.9376, "step": 4500 }, { "epoch": 3.801169590643275, "grad_norm": 13.683168411254883, "learning_rate": 1.620018380817111e-05, "loss": 0.9827, "step": 4550 }, { "epoch": 3.8429406850459484, "grad_norm": 8.653618812561035, "learning_rate": 1.6158409223828225e-05, "loss": 0.9686, "step": 4600 }, { "epoch": 3.8847117794486214, "grad_norm": 29.841218948364258, "learning_rate": 1.611663463948534e-05, "loss": 0.9835, "step": 4650 }, { "epoch": 3.926482873851295, "grad_norm": 16.030054092407227, "learning_rate": 1.6074860055142453e-05, "loss": 0.9497, "step": 4700 }, { "epoch": 3.9682539682539684, "grad_norm": 41.21371078491211, "learning_rate": 1.6033085470799567e-05, "loss": 0.9437, "step": 4750 }, { "epoch": 4.010025062656641, "grad_norm": 31.541624069213867, "learning_rate": 1.599131088645668e-05, "loss": 0.8401, "step": 4800 }, { "epoch": 4.051796157059315, "grad_norm": 28.929073333740234, "learning_rate": 1.5949536302113795e-05, "loss": 0.8333, "step": 4850 }, { "epoch": 4.093567251461988, "grad_norm": 12.41563606262207, "learning_rate": 1.590776171777091e-05, "loss": 0.7276, "step": 4900 }, { "epoch": 4.135338345864661, "grad_norm": 33.126747131347656, "learning_rate": 1.5865987133428026e-05, "loss": 0.7591, "step": 4950 }, { "epoch": 4.177109440267335, "grad_norm": 18.336153030395508, "learning_rate": 1.5824212549085137e-05, "loss": 0.8042, "step": 5000 }, { "epoch": 4.218880534670008, "grad_norm": 13.671285629272461, "learning_rate": 1.578243796474225e-05, "loss": 0.798, "step": 5050 }, { "epoch": 4.260651629072681, "grad_norm": 17.29936408996582, "learning_rate": 1.5740663380399365e-05, "loss": 0.7078, "step": 5100 }, { "epoch": 4.302422723475355, "grad_norm": 14.447928428649902, "learning_rate": 1.569888879605648e-05, "loss": 0.7174, "step": 5150 }, { "epoch": 4.344193817878028, "grad_norm": 57.8358268737793, "learning_rate": 1.5657114211713596e-05, "loss": 0.7757, "step": 5200 }, { "epoch": 4.385964912280702, "grad_norm": 25.56312370300293, "learning_rate": 1.561533962737071e-05, "loss": 0.8152, "step": 5250 }, { "epoch": 4.427736006683375, "grad_norm": 21.909448623657227, "learning_rate": 1.5573565043027824e-05, "loss": 0.6839, "step": 5300 }, { "epoch": 4.469507101086048, "grad_norm": 34.13754653930664, "learning_rate": 1.5531790458684938e-05, "loss": 0.7502, "step": 5350 }, { "epoch": 4.511278195488722, "grad_norm": 32.14202117919922, "learning_rate": 1.5490015874342052e-05, "loss": 0.786, "step": 5400 }, { "epoch": 4.553049289891395, "grad_norm": 22.156665802001953, "learning_rate": 1.5448241289999166e-05, "loss": 0.7236, "step": 5450 }, { "epoch": 4.594820384294069, "grad_norm": 25.08436393737793, "learning_rate": 1.540646670565628e-05, "loss": 0.7738, "step": 5500 }, { "epoch": 4.636591478696742, "grad_norm": 32.960784912109375, "learning_rate": 1.5364692121313394e-05, "loss": 0.7778, "step": 5550 }, { "epoch": 4.678362573099415, "grad_norm": 16.070016860961914, "learning_rate": 1.5322917536970508e-05, "loss": 0.792, "step": 5600 }, { "epoch": 4.720133667502089, "grad_norm": 12.606107711791992, "learning_rate": 1.5281142952627622e-05, "loss": 0.7382, "step": 5650 }, { "epoch": 4.761904761904762, "grad_norm": 22.041399002075195, "learning_rate": 1.5239368368284738e-05, "loss": 0.7351, "step": 5700 }, { "epoch": 4.803675856307435, "grad_norm": 22.424896240234375, "learning_rate": 1.5197593783941851e-05, "loss": 0.6698, "step": 5750 }, { "epoch": 4.845446950710109, "grad_norm": 18.415851593017578, "learning_rate": 1.5155819199598964e-05, "loss": 0.7545, "step": 5800 }, { "epoch": 4.887218045112782, "grad_norm": 52.402984619140625, "learning_rate": 1.5114044615256081e-05, "loss": 0.6534, "step": 5850 }, { "epoch": 4.928989139515455, "grad_norm": 28.06797218322754, "learning_rate": 1.5072270030913193e-05, "loss": 0.5699, "step": 5900 }, { "epoch": 4.970760233918129, "grad_norm": 11.340112686157227, "learning_rate": 1.5030495446570307e-05, "loss": 0.6786, "step": 5950 }, { "epoch": 5.012531328320802, "grad_norm": 18.82775115966797, "learning_rate": 1.4988720862227421e-05, "loss": 0.6637, "step": 6000 }, { "epoch": 5.054302422723476, "grad_norm": 66.8216781616211, "learning_rate": 1.4946946277884535e-05, "loss": 0.5432, "step": 6050 }, { "epoch": 5.096073517126149, "grad_norm": 8.892007827758789, "learning_rate": 1.4905171693541651e-05, "loss": 0.695, "step": 6100 }, { "epoch": 5.137844611528822, "grad_norm": 16.294639587402344, "learning_rate": 1.4863397109198765e-05, "loss": 0.5784, "step": 6150 }, { "epoch": 5.179615705931496, "grad_norm": 19.78766441345215, "learning_rate": 1.4821622524855879e-05, "loss": 0.503, "step": 6200 }, { "epoch": 5.221386800334169, "grad_norm": 13.428692817687988, "learning_rate": 1.4779847940512993e-05, "loss": 0.6292, "step": 6250 }, { "epoch": 5.2631578947368425, "grad_norm": 29.248811721801758, "learning_rate": 1.4738073356170109e-05, "loss": 0.5902, "step": 6300 }, { "epoch": 5.3049289891395155, "grad_norm": 7.9793877601623535, "learning_rate": 1.4696298771827223e-05, "loss": 0.5765, "step": 6350 }, { "epoch": 5.3467000835421885, "grad_norm": 10.095882415771484, "learning_rate": 1.4654524187484337e-05, "loss": 0.4703, "step": 6400 }, { "epoch": 5.388471177944862, "grad_norm": 25.082889556884766, "learning_rate": 1.4612749603141449e-05, "loss": 0.4934, "step": 6450 }, { "epoch": 5.430242272347535, "grad_norm": 31.18419075012207, "learning_rate": 1.4570975018798563e-05, "loss": 0.5474, "step": 6500 }, { "epoch": 5.472013366750208, "grad_norm": 18.715656280517578, "learning_rate": 1.4529200434455679e-05, "loss": 0.63, "step": 6550 }, { "epoch": 5.513784461152882, "grad_norm": 14.067596435546875, "learning_rate": 1.4487425850112793e-05, "loss": 0.621, "step": 6600 }, { "epoch": 5.555555555555555, "grad_norm": 15.184345245361328, "learning_rate": 1.4445651265769907e-05, "loss": 0.5204, "step": 6650 }, { "epoch": 5.597326649958229, "grad_norm": 15.532989501953125, "learning_rate": 1.440387668142702e-05, "loss": 0.5897, "step": 6700 }, { "epoch": 5.639097744360902, "grad_norm": 54.043617248535156, "learning_rate": 1.4362102097084136e-05, "loss": 0.4629, "step": 6750 }, { "epoch": 5.680868838763575, "grad_norm": 39.661746978759766, "learning_rate": 1.432032751274125e-05, "loss": 0.4543, "step": 6800 }, { "epoch": 5.722639933166249, "grad_norm": 20.885473251342773, "learning_rate": 1.4278552928398364e-05, "loss": 0.5593, "step": 6850 }, { "epoch": 5.764411027568922, "grad_norm": 17.765832901000977, "learning_rate": 1.4236778344055478e-05, "loss": 0.5543, "step": 6900 }, { "epoch": 5.806182121971595, "grad_norm": 32.63746643066406, "learning_rate": 1.419500375971259e-05, "loss": 0.3481, "step": 6950 }, { "epoch": 5.847953216374269, "grad_norm": 16.857807159423828, "learning_rate": 1.4153229175369708e-05, "loss": 0.4595, "step": 7000 }, { "epoch": 5.889724310776942, "grad_norm": 10.114936828613281, "learning_rate": 1.411145459102682e-05, "loss": 0.4995, "step": 7050 }, { "epoch": 5.931495405179616, "grad_norm": 16.946565628051758, "learning_rate": 1.4069680006683934e-05, "loss": 0.4598, "step": 7100 }, { "epoch": 5.973266499582289, "grad_norm": 15.710683822631836, "learning_rate": 1.4027905422341048e-05, "loss": 0.4452, "step": 7150 }, { "epoch": 6.015037593984962, "grad_norm": 25.516210556030273, "learning_rate": 1.3986130837998162e-05, "loss": 0.5015, "step": 7200 }, { "epoch": 6.056808688387636, "grad_norm": 14.8129243850708, "learning_rate": 1.3944356253655278e-05, "loss": 0.4292, "step": 7250 }, { "epoch": 6.098579782790309, "grad_norm": 3.4943878650665283, "learning_rate": 1.3902581669312392e-05, "loss": 0.3825, "step": 7300 }, { "epoch": 6.140350877192983, "grad_norm": 34.67987060546875, "learning_rate": 1.3860807084969506e-05, "loss": 0.368, "step": 7350 }, { "epoch": 6.182121971595656, "grad_norm": 13.267960548400879, "learning_rate": 1.381903250062662e-05, "loss": 0.3394, "step": 7400 }, { "epoch": 6.223893065998329, "grad_norm": 25.280872344970703, "learning_rate": 1.3777257916283735e-05, "loss": 0.4007, "step": 7450 }, { "epoch": 6.265664160401003, "grad_norm": 28.221872329711914, "learning_rate": 1.373548333194085e-05, "loss": 0.386, "step": 7500 }, { "epoch": 6.307435254803676, "grad_norm": 23.17299461364746, "learning_rate": 1.3693708747597963e-05, "loss": 0.445, "step": 7550 }, { "epoch": 6.349206349206349, "grad_norm": 4.611133575439453, "learning_rate": 1.3651934163255076e-05, "loss": 0.3661, "step": 7600 }, { "epoch": 6.390977443609023, "grad_norm": 35.275779724121094, "learning_rate": 1.361015957891219e-05, "loss": 0.4559, "step": 7650 }, { "epoch": 6.432748538011696, "grad_norm": 27.652978897094727, "learning_rate": 1.3568384994569305e-05, "loss": 0.3594, "step": 7700 }, { "epoch": 6.474519632414369, "grad_norm": 12.323234558105469, "learning_rate": 1.352661041022642e-05, "loss": 0.4166, "step": 7750 }, { "epoch": 6.516290726817043, "grad_norm": 19.783565521240234, "learning_rate": 1.3484835825883533e-05, "loss": 0.4232, "step": 7800 }, { "epoch": 6.558061821219716, "grad_norm": 14.999006271362305, "learning_rate": 1.3443061241540647e-05, "loss": 0.4427, "step": 7850 }, { "epoch": 6.59983291562239, "grad_norm": 12.637155532836914, "learning_rate": 1.3401286657197763e-05, "loss": 0.3311, "step": 7900 }, { "epoch": 6.641604010025063, "grad_norm": 24.413984298706055, "learning_rate": 1.3359512072854877e-05, "loss": 0.4073, "step": 7950 }, { "epoch": 6.683375104427736, "grad_norm": 20.41080665588379, "learning_rate": 1.331773748851199e-05, "loss": 0.4343, "step": 8000 }, { "epoch": 6.7251461988304095, "grad_norm": 32.916595458984375, "learning_rate": 1.3275962904169105e-05, "loss": 0.355, "step": 8050 }, { "epoch": 6.7669172932330826, "grad_norm": 13.167571067810059, "learning_rate": 1.3234188319826217e-05, "loss": 0.4479, "step": 8100 }, { "epoch": 6.8086883876357565, "grad_norm": 18.277618408203125, "learning_rate": 1.3192413735483334e-05, "loss": 0.3349, "step": 8150 }, { "epoch": 6.8504594820384295, "grad_norm": 18.956066131591797, "learning_rate": 1.3150639151140447e-05, "loss": 0.2944, "step": 8200 }, { "epoch": 6.8922305764411025, "grad_norm": 29.309457778930664, "learning_rate": 1.310886456679756e-05, "loss": 0.3497, "step": 8250 }, { "epoch": 6.934001670843776, "grad_norm": 0.8253272771835327, "learning_rate": 1.3067089982454675e-05, "loss": 0.411, "step": 8300 }, { "epoch": 6.975772765246449, "grad_norm": 38.29353332519531, "learning_rate": 1.302531539811179e-05, "loss": 0.3151, "step": 8350 }, { "epoch": 7.017543859649122, "grad_norm": 26.124536514282227, "learning_rate": 1.2983540813768904e-05, "loss": 0.322, "step": 8400 }, { "epoch": 7.059314954051796, "grad_norm": 24.413715362548828, "learning_rate": 1.2941766229426018e-05, "loss": 0.1929, "step": 8450 }, { "epoch": 7.101086048454469, "grad_norm": 28.574317932128906, "learning_rate": 1.2899991645083132e-05, "loss": 0.2357, "step": 8500 }, { "epoch": 7.142857142857143, "grad_norm": 13.467860221862793, "learning_rate": 1.2858217060740246e-05, "loss": 0.3188, "step": 8550 }, { "epoch": 7.184628237259816, "grad_norm": 23.04376983642578, "learning_rate": 1.2816442476397362e-05, "loss": 0.3352, "step": 8600 }, { "epoch": 7.226399331662489, "grad_norm": 0.04887882620096207, "learning_rate": 1.2774667892054476e-05, "loss": 0.356, "step": 8650 }, { "epoch": 7.268170426065163, "grad_norm": 15.562792778015137, "learning_rate": 1.2732893307711588e-05, "loss": 0.3169, "step": 8700 }, { "epoch": 7.309941520467836, "grad_norm": 7.763381004333496, "learning_rate": 1.2691118723368702e-05, "loss": 0.2877, "step": 8750 }, { "epoch": 7.351712614870509, "grad_norm": 42.26727294921875, "learning_rate": 1.2649344139025818e-05, "loss": 0.2561, "step": 8800 }, { "epoch": 7.393483709273183, "grad_norm": 16.665681838989258, "learning_rate": 1.2607569554682932e-05, "loss": 0.2958, "step": 8850 }, { "epoch": 7.435254803675856, "grad_norm": 44.200042724609375, "learning_rate": 1.2565794970340046e-05, "loss": 0.2894, "step": 8900 }, { "epoch": 7.47702589807853, "grad_norm": 14.340874671936035, "learning_rate": 1.252402038599716e-05, "loss": 0.3186, "step": 8950 }, { "epoch": 7.518796992481203, "grad_norm": 41.146812438964844, "learning_rate": 1.2482245801654274e-05, "loss": 0.2913, "step": 9000 }, { "epoch": 7.560568086883876, "grad_norm": 13.177423477172852, "learning_rate": 1.244047121731139e-05, "loss": 0.2311, "step": 9050 }, { "epoch": 7.60233918128655, "grad_norm": 19.08924102783203, "learning_rate": 1.2398696632968503e-05, "loss": 0.2501, "step": 9100 }, { "epoch": 7.644110275689223, "grad_norm": 13.80128002166748, "learning_rate": 1.2356922048625617e-05, "loss": 0.27, "step": 9150 }, { "epoch": 7.685881370091897, "grad_norm": 32.72663497924805, "learning_rate": 1.2315147464282731e-05, "loss": 0.268, "step": 9200 }, { "epoch": 7.72765246449457, "grad_norm": 28.15250587463379, "learning_rate": 1.2273372879939847e-05, "loss": 0.3212, "step": 9250 }, { "epoch": 7.769423558897243, "grad_norm": 13.896389961242676, "learning_rate": 1.2231598295596961e-05, "loss": 0.3293, "step": 9300 }, { "epoch": 7.811194653299917, "grad_norm": 14.249073028564453, "learning_rate": 1.2189823711254073e-05, "loss": 0.2786, "step": 9350 }, { "epoch": 7.85296574770259, "grad_norm": 19.6274356842041, "learning_rate": 1.2148049126911187e-05, "loss": 0.2812, "step": 9400 }, { "epoch": 7.894736842105263, "grad_norm": 15.745018005371094, "learning_rate": 1.2106274542568301e-05, "loss": 0.2721, "step": 9450 }, { "epoch": 7.936507936507937, "grad_norm": 11.491929054260254, "learning_rate": 1.2064499958225417e-05, "loss": 0.2833, "step": 9500 }, { "epoch": 7.97827903091061, "grad_norm": 4.299226760864258, "learning_rate": 1.2022725373882531e-05, "loss": 0.2335, "step": 9550 }, { "epoch": 8.020050125313283, "grad_norm": 11.306835174560547, "learning_rate": 1.1980950789539645e-05, "loss": 0.1985, "step": 9600 }, { "epoch": 8.061821219715956, "grad_norm": 45.70457077026367, "learning_rate": 1.1939176205196759e-05, "loss": 0.1748, "step": 9650 }, { "epoch": 8.10359231411863, "grad_norm": 140.19454956054688, "learning_rate": 1.1897401620853875e-05, "loss": 0.2033, "step": 9700 }, { "epoch": 8.145363408521304, "grad_norm": 16.645404815673828, "learning_rate": 1.1855627036510989e-05, "loss": 0.1642, "step": 9750 }, { "epoch": 8.187134502923977, "grad_norm": 16.9295654296875, "learning_rate": 1.1813852452168103e-05, "loss": 0.1904, "step": 9800 }, { "epoch": 8.22890559732665, "grad_norm": 34.388484954833984, "learning_rate": 1.1772077867825215e-05, "loss": 0.217, "step": 9850 }, { "epoch": 8.270676691729323, "grad_norm": 12.104029655456543, "learning_rate": 1.1730303283482329e-05, "loss": 0.2691, "step": 9900 }, { "epoch": 8.312447786131997, "grad_norm": 2.027843475341797, "learning_rate": 1.1688528699139445e-05, "loss": 0.2058, "step": 9950 }, { "epoch": 8.35421888053467, "grad_norm": 26.941038131713867, "learning_rate": 1.1646754114796558e-05, "loss": 0.1581, "step": 10000 }, { "epoch": 8.395989974937343, "grad_norm": 28.439321517944336, "learning_rate": 1.1604979530453672e-05, "loss": 0.2169, "step": 10050 }, { "epoch": 8.437761069340016, "grad_norm": 3.916905641555786, "learning_rate": 1.1563204946110786e-05, "loss": 0.1771, "step": 10100 }, { "epoch": 8.47953216374269, "grad_norm": 30.46429443359375, "learning_rate": 1.1521430361767902e-05, "loss": 0.2606, "step": 10150 }, { "epoch": 8.521303258145362, "grad_norm": 4.947411060333252, "learning_rate": 1.1479655777425016e-05, "loss": 0.203, "step": 10200 }, { "epoch": 8.563074352548037, "grad_norm": 12.219408988952637, "learning_rate": 1.143788119308213e-05, "loss": 0.2081, "step": 10250 }, { "epoch": 8.60484544695071, "grad_norm": 45.41501998901367, "learning_rate": 1.1396106608739244e-05, "loss": 0.1994, "step": 10300 }, { "epoch": 8.646616541353383, "grad_norm": 1.2857117652893066, "learning_rate": 1.1354332024396358e-05, "loss": 0.2014, "step": 10350 }, { "epoch": 8.688387635756056, "grad_norm": 18.340967178344727, "learning_rate": 1.1312557440053474e-05, "loss": 0.179, "step": 10400 }, { "epoch": 8.73015873015873, "grad_norm": 35.21903610229492, "learning_rate": 1.1270782855710588e-05, "loss": 0.1677, "step": 10450 }, { "epoch": 8.771929824561404, "grad_norm": 16.27471160888672, "learning_rate": 1.12290082713677e-05, "loss": 0.1792, "step": 10500 }, { "epoch": 8.813700918964077, "grad_norm": 22.30707550048828, "learning_rate": 1.1187233687024814e-05, "loss": 0.2007, "step": 10550 }, { "epoch": 8.85547201336675, "grad_norm": 2.8700950145721436, "learning_rate": 1.114545910268193e-05, "loss": 0.2363, "step": 10600 }, { "epoch": 8.897243107769423, "grad_norm": 24.8457088470459, "learning_rate": 1.1103684518339044e-05, "loss": 0.1915, "step": 10650 }, { "epoch": 8.939014202172096, "grad_norm": 9.145515441894531, "learning_rate": 1.1061909933996158e-05, "loss": 0.244, "step": 10700 }, { "epoch": 8.980785296574771, "grad_norm": 0.5550752282142639, "learning_rate": 1.1020135349653272e-05, "loss": 0.1613, "step": 10750 }, { "epoch": 9.022556390977444, "grad_norm": 18.79015350341797, "learning_rate": 1.0978360765310386e-05, "loss": 0.2604, "step": 10800 }, { "epoch": 9.064327485380117, "grad_norm": 3.671290397644043, "learning_rate": 1.0936586180967501e-05, "loss": 0.1305, "step": 10850 }, { "epoch": 9.10609857978279, "grad_norm": 10.10545539855957, "learning_rate": 1.0894811596624615e-05, "loss": 0.1584, "step": 10900 }, { "epoch": 9.147869674185463, "grad_norm": 0.5803263187408447, "learning_rate": 1.085303701228173e-05, "loss": 0.1038, "step": 10950 }, { "epoch": 9.189640768588138, "grad_norm": 13.21904468536377, "learning_rate": 1.0811262427938841e-05, "loss": 0.156, "step": 11000 }, { "epoch": 9.23141186299081, "grad_norm": 19.366302490234375, "learning_rate": 1.0769487843595955e-05, "loss": 0.1567, "step": 11050 }, { "epoch": 9.273182957393484, "grad_norm": 63.86299514770508, "learning_rate": 1.0727713259253071e-05, "loss": 0.134, "step": 11100 }, { "epoch": 9.314954051796157, "grad_norm": 9.471511840820312, "learning_rate": 1.0685938674910185e-05, "loss": 0.1439, "step": 11150 }, { "epoch": 9.35672514619883, "grad_norm": 17.094606399536133, "learning_rate": 1.0644164090567299e-05, "loss": 0.1504, "step": 11200 }, { "epoch": 9.398496240601503, "grad_norm": 0.7686102390289307, "learning_rate": 1.0602389506224413e-05, "loss": 0.198, "step": 11250 }, { "epoch": 9.440267335004178, "grad_norm": 4.632621765136719, "learning_rate": 1.0560614921881529e-05, "loss": 0.1299, "step": 11300 }, { "epoch": 9.48203842940685, "grad_norm": 17.580154418945312, "learning_rate": 1.0518840337538643e-05, "loss": 0.1221, "step": 11350 }, { "epoch": 9.523809523809524, "grad_norm": 28.859804153442383, "learning_rate": 1.0477065753195757e-05, "loss": 0.1497, "step": 11400 }, { "epoch": 9.565580618212197, "grad_norm": 3.4997804164886475, "learning_rate": 1.043529116885287e-05, "loss": 0.1228, "step": 11450 }, { "epoch": 9.60735171261487, "grad_norm": 21.530311584472656, "learning_rate": 1.0393516584509985e-05, "loss": 0.122, "step": 11500 }, { "epoch": 9.649122807017545, "grad_norm": 4.085865020751953, "learning_rate": 1.03517420001671e-05, "loss": 0.1625, "step": 11550 }, { "epoch": 9.690893901420218, "grad_norm": 13.100555419921875, "learning_rate": 1.0309967415824214e-05, "loss": 0.1361, "step": 11600 }, { "epoch": 9.73266499582289, "grad_norm": 22.437877655029297, "learning_rate": 1.0268192831481327e-05, "loss": 0.2052, "step": 11650 }, { "epoch": 9.774436090225564, "grad_norm": 24.361730575561523, "learning_rate": 1.022641824713844e-05, "loss": 0.1314, "step": 11700 }, { "epoch": 9.816207184628237, "grad_norm": 4.0465497970581055, "learning_rate": 1.0184643662795556e-05, "loss": 0.1526, "step": 11750 }, { "epoch": 9.857978279030911, "grad_norm": 0.056704986840486526, "learning_rate": 1.014286907845267e-05, "loss": 0.1579, "step": 11800 }, { "epoch": 9.899749373433584, "grad_norm": 1.126349687576294, "learning_rate": 1.0101094494109784e-05, "loss": 0.1602, "step": 11850 }, { "epoch": 9.941520467836257, "grad_norm": 42.518741607666016, "learning_rate": 1.0059319909766898e-05, "loss": 0.1495, "step": 11900 }, { "epoch": 9.98329156223893, "grad_norm": 15.24010944366455, "learning_rate": 1.0017545325424012e-05, "loss": 0.1109, "step": 11950 }, { "epoch": 10.025062656641603, "grad_norm": 2.6156651973724365, "learning_rate": 9.975770741081126e-06, "loss": 0.165, "step": 12000 }, { "epoch": 10.066833751044278, "grad_norm": 2.6075878143310547, "learning_rate": 9.933996156738242e-06, "loss": 0.0826, "step": 12050 }, { "epoch": 10.108604845446951, "grad_norm": 9.561171531677246, "learning_rate": 9.892221572395356e-06, "loss": 0.0878, "step": 12100 }, { "epoch": 10.150375939849624, "grad_norm": 3.180940628051758, "learning_rate": 9.85044698805247e-06, "loss": 0.0723, "step": 12150 }, { "epoch": 10.192147034252297, "grad_norm": 2.523481845855713, "learning_rate": 9.808672403709584e-06, "loss": 0.1363, "step": 12200 }, { "epoch": 10.23391812865497, "grad_norm": 14.353055000305176, "learning_rate": 9.766897819366698e-06, "loss": 0.099, "step": 12250 }, { "epoch": 10.275689223057643, "grad_norm": 1.3948580026626587, "learning_rate": 9.725123235023812e-06, "loss": 0.11, "step": 12300 }, { "epoch": 10.317460317460318, "grad_norm": 18.495405197143555, "learning_rate": 9.683348650680926e-06, "loss": 0.1414, "step": 12350 }, { "epoch": 10.359231411862991, "grad_norm": 33.3175163269043, "learning_rate": 9.641574066338041e-06, "loss": 0.126, "step": 12400 }, { "epoch": 10.401002506265664, "grad_norm": 6.8393235206604, "learning_rate": 9.599799481995154e-06, "loss": 0.1304, "step": 12450 }, { "epoch": 10.442773600668337, "grad_norm": 20.7825927734375, "learning_rate": 9.55802489765227e-06, "loss": 0.1272, "step": 12500 }, { "epoch": 10.48454469507101, "grad_norm": 19.048812866210938, "learning_rate": 9.516250313309383e-06, "loss": 0.108, "step": 12550 }, { "epoch": 10.526315789473685, "grad_norm": 19.517911911010742, "learning_rate": 9.474475728966497e-06, "loss": 0.1222, "step": 12600 }, { "epoch": 10.568086883876358, "grad_norm": 0.2565244734287262, "learning_rate": 9.432701144623611e-06, "loss": 0.0917, "step": 12650 }, { "epoch": 10.609857978279031, "grad_norm": 10.629936218261719, "learning_rate": 9.390926560280727e-06, "loss": 0.1145, "step": 12700 }, { "epoch": 10.651629072681704, "grad_norm": 1.9969385862350464, "learning_rate": 9.349151975937841e-06, "loss": 0.0877, "step": 12750 }, { "epoch": 10.693400167084377, "grad_norm": 13.717659950256348, "learning_rate": 9.307377391594953e-06, "loss": 0.0868, "step": 12800 }, { "epoch": 10.73517126148705, "grad_norm": 13.955449104309082, "learning_rate": 9.265602807252069e-06, "loss": 0.0922, "step": 12850 }, { "epoch": 10.776942355889725, "grad_norm": 9.980537414550781, "learning_rate": 9.223828222909183e-06, "loss": 0.0768, "step": 12900 }, { "epoch": 10.818713450292398, "grad_norm": 24.630155563354492, "learning_rate": 9.182053638566297e-06, "loss": 0.1128, "step": 12950 }, { "epoch": 10.86048454469507, "grad_norm": 2.8442764282226562, "learning_rate": 9.140279054223411e-06, "loss": 0.0794, "step": 13000 }, { "epoch": 10.902255639097744, "grad_norm": 39.42329025268555, "learning_rate": 9.098504469880527e-06, "loss": 0.0781, "step": 13050 }, { "epoch": 10.944026733500417, "grad_norm": 103.70584106445312, "learning_rate": 9.056729885537639e-06, "loss": 0.0872, "step": 13100 }, { "epoch": 10.985797827903092, "grad_norm": 0.15249623358249664, "learning_rate": 9.014955301194755e-06, "loss": 0.1059, "step": 13150 }, { "epoch": 11.027568922305765, "grad_norm": 10.956684112548828, "learning_rate": 8.973180716851868e-06, "loss": 0.0955, "step": 13200 }, { "epoch": 11.069340016708438, "grad_norm": 7.523435115814209, "learning_rate": 8.931406132508982e-06, "loss": 0.0671, "step": 13250 }, { "epoch": 11.11111111111111, "grad_norm": 7.550344467163086, "learning_rate": 8.889631548166096e-06, "loss": 0.0896, "step": 13300 }, { "epoch": 11.152882205513784, "grad_norm": 31.79115867614746, "learning_rate": 8.84785696382321e-06, "loss": 0.0464, "step": 13350 }, { "epoch": 11.194653299916459, "grad_norm": 3.681047201156616, "learning_rate": 8.806082379480324e-06, "loss": 0.0848, "step": 13400 }, { "epoch": 11.236424394319132, "grad_norm": 0.7510688304901123, "learning_rate": 8.764307795137438e-06, "loss": 0.0511, "step": 13450 }, { "epoch": 11.278195488721805, "grad_norm": 8.21001148223877, "learning_rate": 8.722533210794554e-06, "loss": 0.1051, "step": 13500 }, { "epoch": 11.319966583124478, "grad_norm": 8.100149154663086, "learning_rate": 8.680758626451668e-06, "loss": 0.0728, "step": 13550 }, { "epoch": 11.36173767752715, "grad_norm": 3.457951545715332, "learning_rate": 8.638984042108782e-06, "loss": 0.0934, "step": 13600 }, { "epoch": 11.403508771929825, "grad_norm": 1.1937059164047241, "learning_rate": 8.597209457765896e-06, "loss": 0.0834, "step": 13650 }, { "epoch": 11.445279866332498, "grad_norm": 0.9485940337181091, "learning_rate": 8.55543487342301e-06, "loss": 0.0755, "step": 13700 }, { "epoch": 11.487050960735171, "grad_norm": 12.561485290527344, "learning_rate": 8.513660289080124e-06, "loss": 0.0602, "step": 13750 }, { "epoch": 11.528822055137844, "grad_norm": 21.742900848388672, "learning_rate": 8.471885704737238e-06, "loss": 0.0639, "step": 13800 }, { "epoch": 11.570593149540517, "grad_norm": 1.016876220703125, "learning_rate": 8.430111120394354e-06, "loss": 0.0677, "step": 13850 }, { "epoch": 11.61236424394319, "grad_norm": 16.71541404724121, "learning_rate": 8.388336536051468e-06, "loss": 0.0618, "step": 13900 }, { "epoch": 11.654135338345865, "grad_norm": 14.695789337158203, "learning_rate": 8.346561951708582e-06, "loss": 0.0884, "step": 13950 }, { "epoch": 11.695906432748538, "grad_norm": 26.240062713623047, "learning_rate": 8.304787367365696e-06, "loss": 0.0623, "step": 14000 }, { "epoch": 11.737677527151211, "grad_norm": 0.9301555752754211, "learning_rate": 8.26301278302281e-06, "loss": 0.0856, "step": 14050 }, { "epoch": 11.779448621553884, "grad_norm": 0.215810164809227, "learning_rate": 8.221238198679924e-06, "loss": 0.0892, "step": 14100 }, { "epoch": 11.821219715956557, "grad_norm": 20.944414138793945, "learning_rate": 8.179463614337038e-06, "loss": 0.0455, "step": 14150 }, { "epoch": 11.862990810359232, "grad_norm": 16.5097599029541, "learning_rate": 8.137689029994153e-06, "loss": 0.0588, "step": 14200 }, { "epoch": 11.904761904761905, "grad_norm": 10.964118957519531, "learning_rate": 8.095914445651265e-06, "loss": 0.0798, "step": 14250 }, { "epoch": 11.946532999164578, "grad_norm": 4.6156134605407715, "learning_rate": 8.054139861308381e-06, "loss": 0.0681, "step": 14300 }, { "epoch": 11.988304093567251, "grad_norm": 15.3454008102417, "learning_rate": 8.012365276965495e-06, "loss": 0.0767, "step": 14350 }, { "epoch": 12.030075187969924, "grad_norm": 2.1442370414733887, "learning_rate": 7.970590692622609e-06, "loss": 0.0346, "step": 14400 }, { "epoch": 12.071846282372599, "grad_norm": 17.47662353515625, "learning_rate": 7.928816108279723e-06, "loss": 0.0359, "step": 14450 }, { "epoch": 12.113617376775272, "grad_norm": 9.307701110839844, "learning_rate": 7.887041523936837e-06, "loss": 0.0536, "step": 14500 }, { "epoch": 12.155388471177945, "grad_norm": 1.3280810117721558, "learning_rate": 7.845266939593951e-06, "loss": 0.0726, "step": 14550 }, { "epoch": 12.197159565580618, "grad_norm": 1.0069313049316406, "learning_rate": 7.803492355251065e-06, "loss": 0.0725, "step": 14600 }, { "epoch": 12.238930659983291, "grad_norm": 42.40116500854492, "learning_rate": 7.76171777090818e-06, "loss": 0.0281, "step": 14650 }, { "epoch": 12.280701754385966, "grad_norm": 0.04481910914182663, "learning_rate": 7.719943186565295e-06, "loss": 0.0288, "step": 14700 }, { "epoch": 12.322472848788639, "grad_norm": 1.7356605529785156, "learning_rate": 7.678168602222409e-06, "loss": 0.0698, "step": 14750 }, { "epoch": 12.364243943191312, "grad_norm": 13.85879135131836, "learning_rate": 7.636394017879523e-06, "loss": 0.0511, "step": 14800 }, { "epoch": 12.406015037593985, "grad_norm": 4.5668253898620605, "learning_rate": 7.5946194335366375e-06, "loss": 0.0349, "step": 14850 }, { "epoch": 12.447786131996658, "grad_norm": 0.4067160189151764, "learning_rate": 7.5528448491937515e-06, "loss": 0.0309, "step": 14900 }, { "epoch": 12.48955722639933, "grad_norm": 1.8756296634674072, "learning_rate": 7.511070264850865e-06, "loss": 0.089, "step": 14950 }, { "epoch": 12.531328320802006, "grad_norm": 0.7279142141342163, "learning_rate": 7.469295680507979e-06, "loss": 0.0592, "step": 15000 }, { "epoch": 12.573099415204679, "grad_norm": 20.90660285949707, "learning_rate": 7.427521096165093e-06, "loss": 0.0613, "step": 15050 }, { "epoch": 12.614870509607352, "grad_norm": 21.889909744262695, "learning_rate": 7.385746511822208e-06, "loss": 0.0526, "step": 15100 }, { "epoch": 12.656641604010025, "grad_norm": 0.16797950863838196, "learning_rate": 7.343971927479322e-06, "loss": 0.044, "step": 15150 }, { "epoch": 12.698412698412698, "grad_norm": 2.835975408554077, "learning_rate": 7.302197343136437e-06, "loss": 0.0481, "step": 15200 }, { "epoch": 12.740183792815372, "grad_norm": 17.579313278198242, "learning_rate": 7.26042275879355e-06, "loss": 0.0322, "step": 15250 }, { "epoch": 12.781954887218046, "grad_norm": 4.634361267089844, "learning_rate": 7.218648174450665e-06, "loss": 0.0523, "step": 15300 }, { "epoch": 12.823725981620719, "grad_norm": 0.9335712194442749, "learning_rate": 7.176873590107779e-06, "loss": 0.0618, "step": 15350 }, { "epoch": 12.865497076023392, "grad_norm": 0.37698858976364136, "learning_rate": 7.135099005764893e-06, "loss": 0.0698, "step": 15400 }, { "epoch": 12.907268170426065, "grad_norm": 12.935955047607422, "learning_rate": 7.093324421422008e-06, "loss": 0.0293, "step": 15450 }, { "epoch": 12.949039264828738, "grad_norm": 27.532899856567383, "learning_rate": 7.051549837079121e-06, "loss": 0.0345, "step": 15500 }, { "epoch": 12.990810359231412, "grad_norm": 13.847272872924805, "learning_rate": 7.009775252736236e-06, "loss": 0.0266, "step": 15550 }, { "epoch": 13.032581453634085, "grad_norm": 0.8636922240257263, "learning_rate": 6.96800066839335e-06, "loss": 0.0283, "step": 15600 }, { "epoch": 13.074352548036758, "grad_norm": 9.82494831085205, "learning_rate": 6.9262260840504646e-06, "loss": 0.0219, "step": 15650 }, { "epoch": 13.116123642439431, "grad_norm": 0.6327198147773743, "learning_rate": 6.8844514997075785e-06, "loss": 0.032, "step": 15700 }, { "epoch": 13.157894736842104, "grad_norm": 0.04686570540070534, "learning_rate": 6.8426769153646925e-06, "loss": 0.0247, "step": 15750 }, { "epoch": 13.19966583124478, "grad_norm": 3.148859977722168, "learning_rate": 6.800902331021807e-06, "loss": 0.0408, "step": 15800 }, { "epoch": 13.241436925647452, "grad_norm": 14.439146995544434, "learning_rate": 6.7591277466789205e-06, "loss": 0.0261, "step": 15850 }, { "epoch": 13.283208020050125, "grad_norm": 2.363865375518799, "learning_rate": 6.717353162336035e-06, "loss": 0.0497, "step": 15900 }, { "epoch": 13.324979114452798, "grad_norm": 11.770255088806152, "learning_rate": 6.675578577993149e-06, "loss": 0.0395, "step": 15950 }, { "epoch": 13.366750208855471, "grad_norm": 0.6963861584663391, "learning_rate": 6.633803993650264e-06, "loss": 0.0408, "step": 16000 }, { "epoch": 13.408521303258146, "grad_norm": 9.609217643737793, "learning_rate": 6.592029409307378e-06, "loss": 0.0364, "step": 16050 }, { "epoch": 13.450292397660819, "grad_norm": 7.188763618469238, "learning_rate": 6.550254824964493e-06, "loss": 0.0381, "step": 16100 }, { "epoch": 13.492063492063492, "grad_norm": 0.07387153059244156, "learning_rate": 6.508480240621606e-06, "loss": 0.0504, "step": 16150 }, { "epoch": 13.533834586466165, "grad_norm": 7.845670223236084, "learning_rate": 6.46670565627872e-06, "loss": 0.0326, "step": 16200 }, { "epoch": 13.575605680868838, "grad_norm": 6.502562522888184, "learning_rate": 6.424931071935835e-06, "loss": 0.0458, "step": 16250 }, { "epoch": 13.617376775271513, "grad_norm": 0.5432217121124268, "learning_rate": 6.383156487592949e-06, "loss": 0.0229, "step": 16300 }, { "epoch": 13.659147869674186, "grad_norm": 0.12599503993988037, "learning_rate": 6.341381903250064e-06, "loss": 0.0313, "step": 16350 }, { "epoch": 13.700918964076859, "grad_norm": 1.3932889699935913, "learning_rate": 6.299607318907177e-06, "loss": 0.036, "step": 16400 }, { "epoch": 13.742690058479532, "grad_norm": 8.858582496643066, "learning_rate": 6.257832734564292e-06, "loss": 0.0259, "step": 16450 }, { "epoch": 13.784461152882205, "grad_norm": 2.9520416259765625, "learning_rate": 6.216058150221406e-06, "loss": 0.0385, "step": 16500 }, { "epoch": 13.826232247284878, "grad_norm": 0.8918272852897644, "learning_rate": 6.1742835658785204e-06, "loss": 0.0216, "step": 16550 }, { "epoch": 13.868003341687553, "grad_norm": 24.075159072875977, "learning_rate": 6.1325089815356344e-06, "loss": 0.0164, "step": 16600 }, { "epoch": 13.909774436090226, "grad_norm": 7.580496311187744, "learning_rate": 6.0907343971927476e-06, "loss": 0.0266, "step": 16650 }, { "epoch": 13.951545530492899, "grad_norm": 19.381996154785156, "learning_rate": 6.048959812849862e-06, "loss": 0.0436, "step": 16700 }, { "epoch": 13.993316624895572, "grad_norm": 0.01642206870019436, "learning_rate": 6.007185228506976e-06, "loss": 0.0139, "step": 16750 }, { "epoch": 14.035087719298245, "grad_norm": 19.350568771362305, "learning_rate": 5.965410644164091e-06, "loss": 0.0173, "step": 16800 }, { "epoch": 14.07685881370092, "grad_norm": 0.03829874470829964, "learning_rate": 5.923636059821205e-06, "loss": 0.0285, "step": 16850 }, { "epoch": 14.118629908103593, "grad_norm": 0.529155433177948, "learning_rate": 5.88186147547832e-06, "loss": 0.0236, "step": 16900 }, { "epoch": 14.160401002506266, "grad_norm": 1.4649921655654907, "learning_rate": 5.840086891135434e-06, "loss": 0.0402, "step": 16950 }, { "epoch": 14.202172096908939, "grad_norm": 15.808168411254883, "learning_rate": 5.798312306792549e-06, "loss": 0.0397, "step": 17000 }, { "epoch": 14.243943191311612, "grad_norm": 0.01485319435596466, "learning_rate": 5.756537722449662e-06, "loss": 0.0259, "step": 17050 }, { "epoch": 14.285714285714286, "grad_norm": 0.036885835230350494, "learning_rate": 5.714763138106776e-06, "loss": 0.0116, "step": 17100 }, { "epoch": 14.32748538011696, "grad_norm": 0.09616250544786453, "learning_rate": 5.672988553763891e-06, "loss": 0.0165, "step": 17150 }, { "epoch": 14.369256474519633, "grad_norm": 0.14387579262256622, "learning_rate": 5.631213969421005e-06, "loss": 0.0357, "step": 17200 }, { "epoch": 14.411027568922306, "grad_norm": 0.9712551236152649, "learning_rate": 5.5894393850781196e-06, "loss": 0.0318, "step": 17250 }, { "epoch": 14.452798663324979, "grad_norm": 1.3842188119888306, "learning_rate": 5.547664800735233e-06, "loss": 0.0077, "step": 17300 }, { "epoch": 14.494569757727653, "grad_norm": 0.19776180386543274, "learning_rate": 5.5058902163923475e-06, "loss": 0.0215, "step": 17350 }, { "epoch": 14.536340852130326, "grad_norm": 0.09455841779708862, "learning_rate": 5.4641156320494615e-06, "loss": 0.0178, "step": 17400 }, { "epoch": 14.578111946533, "grad_norm": 0.8986994028091431, "learning_rate": 5.4223410477065755e-06, "loss": 0.0197, "step": 17450 }, { "epoch": 14.619883040935672, "grad_norm": 0.009756785817444324, "learning_rate": 5.38056646336369e-06, "loss": 0.0185, "step": 17500 }, { "epoch": 14.661654135338345, "grad_norm": 0.09191206842660904, "learning_rate": 5.3387918790208035e-06, "loss": 0.0149, "step": 17550 }, { "epoch": 14.703425229741018, "grad_norm": 1.768214225769043, "learning_rate": 5.297017294677918e-06, "loss": 0.0115, "step": 17600 }, { "epoch": 14.745196324143693, "grad_norm": 1.6435145139694214, "learning_rate": 5.255242710335032e-06, "loss": 0.0153, "step": 17650 }, { "epoch": 14.786967418546366, "grad_norm": 0.0256047360599041, "learning_rate": 5.213468125992147e-06, "loss": 0.0223, "step": 17700 }, { "epoch": 14.82873851294904, "grad_norm": 2.076021671295166, "learning_rate": 5.171693541649261e-06, "loss": 0.0229, "step": 17750 }, { "epoch": 14.870509607351712, "grad_norm": 0.2123277485370636, "learning_rate": 5.129918957306376e-06, "loss": 0.0101, "step": 17800 }, { "epoch": 14.912280701754385, "grad_norm": 0.03163406625390053, "learning_rate": 5.088144372963489e-06, "loss": 0.0186, "step": 17850 }, { "epoch": 14.95405179615706, "grad_norm": 0.5532212257385254, "learning_rate": 5.046369788620603e-06, "loss": 0.0191, "step": 17900 }, { "epoch": 14.995822890559733, "grad_norm": 0.05094052106142044, "learning_rate": 5.004595204277718e-06, "loss": 0.022, "step": 17950 }, { "epoch": 15.037593984962406, "grad_norm": 0.04656049981713295, "learning_rate": 4.962820619934832e-06, "loss": 0.0155, "step": 18000 }, { "epoch": 15.079365079365079, "grad_norm": 9.495512008666992, "learning_rate": 4.921046035591947e-06, "loss": 0.012, "step": 18050 }, { "epoch": 15.121136173767752, "grad_norm": 0.034731555730104446, "learning_rate": 4.879271451249061e-06, "loss": 0.0042, "step": 18100 }, { "epoch": 15.162907268170427, "grad_norm": 0.02487257681787014, "learning_rate": 4.837496866906175e-06, "loss": 0.0131, "step": 18150 }, { "epoch": 15.2046783625731, "grad_norm": 10.468315124511719, "learning_rate": 4.795722282563289e-06, "loss": 0.0167, "step": 18200 }, { "epoch": 15.246449456975773, "grad_norm": 3.7355704307556152, "learning_rate": 4.753947698220403e-06, "loss": 0.0062, "step": 18250 }, { "epoch": 15.288220551378446, "grad_norm": 0.007894457317888737, "learning_rate": 4.712173113877517e-06, "loss": 0.0102, "step": 18300 }, { "epoch": 15.329991645781119, "grad_norm": 0.007423860020935535, "learning_rate": 4.670398529534631e-06, "loss": 0.0187, "step": 18350 }, { "epoch": 15.371762740183792, "grad_norm": 0.5450906753540039, "learning_rate": 4.628623945191746e-06, "loss": 0.0278, "step": 18400 }, { "epoch": 15.413533834586467, "grad_norm": 0.5019950270652771, "learning_rate": 4.58684936084886e-06, "loss": 0.0299, "step": 18450 }, { "epoch": 15.45530492898914, "grad_norm": 1.3405569791793823, "learning_rate": 4.545074776505974e-06, "loss": 0.022, "step": 18500 }, { "epoch": 15.497076023391813, "grad_norm": 7.201114177703857, "learning_rate": 4.503300192163088e-06, "loss": 0.0144, "step": 18550 }, { "epoch": 15.538847117794486, "grad_norm": 1.0454356670379639, "learning_rate": 4.461525607820202e-06, "loss": 0.0144, "step": 18600 }, { "epoch": 15.580618212197159, "grad_norm": 1.4038639068603516, "learning_rate": 4.419751023477317e-06, "loss": 0.0219, "step": 18650 }, { "epoch": 15.622389306599834, "grad_norm": 0.048461660742759705, "learning_rate": 4.377976439134431e-06, "loss": 0.009, "step": 18700 }, { "epoch": 15.664160401002507, "grad_norm": 0.1847960352897644, "learning_rate": 4.336201854791545e-06, "loss": 0.0076, "step": 18750 }, { "epoch": 15.70593149540518, "grad_norm": 0.23470734059810638, "learning_rate": 4.29442727044866e-06, "loss": 0.0065, "step": 18800 }, { "epoch": 15.747702589807853, "grad_norm": 10.248291969299316, "learning_rate": 4.252652686105774e-06, "loss": 0.0114, "step": 18850 }, { "epoch": 15.789473684210526, "grad_norm": 0.04861776903271675, "learning_rate": 4.210878101762888e-06, "loss": 0.0083, "step": 18900 }, { "epoch": 15.8312447786132, "grad_norm": 0.00836202036589384, "learning_rate": 4.169103517420002e-06, "loss": 0.0204, "step": 18950 }, { "epoch": 15.873015873015873, "grad_norm": 39.85865783691406, "learning_rate": 4.127328933077116e-06, "loss": 0.0101, "step": 19000 }, { "epoch": 15.914786967418546, "grad_norm": 0.07122869789600372, "learning_rate": 4.0855543487342305e-06, "loss": 0.0087, "step": 19050 }, { "epoch": 15.95655806182122, "grad_norm": 0.07924563437700272, "learning_rate": 4.0437797643913445e-06, "loss": 0.0127, "step": 19100 }, { "epoch": 15.998329156223893, "grad_norm": 0.040103524923324585, "learning_rate": 4.0020051800484585e-06, "loss": 0.0114, "step": 19150 }, { "epoch": 16.040100250626566, "grad_norm": 0.3489997684955597, "learning_rate": 3.960230595705573e-06, "loss": 0.0034, "step": 19200 }, { "epoch": 16.08187134502924, "grad_norm": 0.0418890118598938, "learning_rate": 3.918456011362687e-06, "loss": 0.0168, "step": 19250 }, { "epoch": 16.12364243943191, "grad_norm": 0.024545153602957726, "learning_rate": 3.876681427019801e-06, "loss": 0.0121, "step": 19300 }, { "epoch": 16.165413533834588, "grad_norm": 8.744132041931152, "learning_rate": 3.834906842676916e-06, "loss": 0.0089, "step": 19350 }, { "epoch": 16.20718462823726, "grad_norm": 0.2575714886188507, "learning_rate": 3.7931322583340296e-06, "loss": 0.0097, "step": 19400 }, { "epoch": 16.248955722639934, "grad_norm": 0.00845133326947689, "learning_rate": 3.751357673991144e-06, "loss": 0.0158, "step": 19450 }, { "epoch": 16.290726817042607, "grad_norm": 2.029202699661255, "learning_rate": 3.709583089648258e-06, "loss": 0.0085, "step": 19500 }, { "epoch": 16.33249791144528, "grad_norm": 0.009376639500260353, "learning_rate": 3.6678085053053724e-06, "loss": 0.0038, "step": 19550 }, { "epoch": 16.374269005847953, "grad_norm": 0.05808446928858757, "learning_rate": 3.626033920962487e-06, "loss": 0.0179, "step": 19600 }, { "epoch": 16.416040100250626, "grad_norm": 0.11780844628810883, "learning_rate": 3.584259336619601e-06, "loss": 0.0087, "step": 19650 }, { "epoch": 16.4578111946533, "grad_norm": 0.15603607892990112, "learning_rate": 3.542484752276715e-06, "loss": 0.0041, "step": 19700 }, { "epoch": 16.499582289055972, "grad_norm": 0.8933451175689697, "learning_rate": 3.5007101679338296e-06, "loss": 0.0133, "step": 19750 }, { "epoch": 16.541353383458645, "grad_norm": 0.1364767998456955, "learning_rate": 3.458935583590943e-06, "loss": 0.0069, "step": 19800 }, { "epoch": 16.58312447786132, "grad_norm": 1.5579296350479126, "learning_rate": 3.4171609992480576e-06, "loss": 0.0086, "step": 19850 }, { "epoch": 16.624895572263995, "grad_norm": 0.11376599967479706, "learning_rate": 3.375386414905172e-06, "loss": 0.0018, "step": 19900 }, { "epoch": 16.666666666666668, "grad_norm": 3.527081251144409, "learning_rate": 3.333611830562286e-06, "loss": 0.0068, "step": 19950 }, { "epoch": 16.70843776106934, "grad_norm": 0.16554129123687744, "learning_rate": 3.2918372462194004e-06, "loss": 0.0058, "step": 20000 }, { "epoch": 16.750208855472014, "grad_norm": 7.927056312561035, "learning_rate": 3.2500626618765148e-06, "loss": 0.0041, "step": 20050 }, { "epoch": 16.791979949874687, "grad_norm": 6.927394390106201, "learning_rate": 3.2082880775336287e-06, "loss": 0.0112, "step": 20100 }, { "epoch": 16.83375104427736, "grad_norm": 3.62300181388855, "learning_rate": 3.166513493190743e-06, "loss": 0.0088, "step": 20150 }, { "epoch": 16.875522138680033, "grad_norm": 37.684913635253906, "learning_rate": 3.1247389088478576e-06, "loss": 0.0044, "step": 20200 }, { "epoch": 16.917293233082706, "grad_norm": 0.34140291810035706, "learning_rate": 3.082964324504971e-06, "loss": 0.0092, "step": 20250 }, { "epoch": 16.95906432748538, "grad_norm": 0.7771002650260925, "learning_rate": 3.0411897401620855e-06, "loss": 0.0084, "step": 20300 }, { "epoch": 17.000835421888052, "grad_norm": 0.10395874083042145, "learning_rate": 2.9994151558192e-06, "loss": 0.0062, "step": 20350 }, { "epoch": 17.04260651629073, "grad_norm": 0.12849818170070648, "learning_rate": 2.957640571476314e-06, "loss": 0.0101, "step": 20400 }, { "epoch": 17.0843776106934, "grad_norm": 0.010177390649914742, "learning_rate": 2.9158659871334283e-06, "loss": 0.0056, "step": 20450 }, { "epoch": 17.126148705096075, "grad_norm": 10.7208833694458, "learning_rate": 2.8740914027905427e-06, "loss": 0.0063, "step": 20500 }, { "epoch": 17.167919799498748, "grad_norm": 0.31796014308929443, "learning_rate": 2.8323168184476567e-06, "loss": 0.0082, "step": 20550 }, { "epoch": 17.20969089390142, "grad_norm": 0.3368360698223114, "learning_rate": 2.790542234104771e-06, "loss": 0.0027, "step": 20600 }, { "epoch": 17.251461988304094, "grad_norm": 0.871994137763977, "learning_rate": 2.7487676497618855e-06, "loss": 0.0057, "step": 20650 }, { "epoch": 17.293233082706767, "grad_norm": 3.6776578426361084, "learning_rate": 2.706993065418999e-06, "loss": 0.0039, "step": 20700 }, { "epoch": 17.33500417710944, "grad_norm": 0.03436708822846413, "learning_rate": 2.6652184810761135e-06, "loss": 0.0029, "step": 20750 }, { "epoch": 17.376775271512113, "grad_norm": 12.215385437011719, "learning_rate": 2.6234438967332274e-06, "loss": 0.0104, "step": 20800 }, { "epoch": 17.418546365914786, "grad_norm": 0.1122766062617302, "learning_rate": 2.581669312390342e-06, "loss": 0.0082, "step": 20850 }, { "epoch": 17.46031746031746, "grad_norm": 0.01590600423514843, "learning_rate": 2.5398947280474563e-06, "loss": 0.0059, "step": 20900 }, { "epoch": 17.502088554720135, "grad_norm": 8.229164123535156, "learning_rate": 2.4981201437045707e-06, "loss": 0.0014, "step": 20950 }, { "epoch": 17.54385964912281, "grad_norm": 0.13155962526798248, "learning_rate": 2.4563455593616846e-06, "loss": 0.0027, "step": 21000 }, { "epoch": 17.58563074352548, "grad_norm": 2.3657147884368896, "learning_rate": 2.4145709750187986e-06, "loss": 0.0061, "step": 21050 }, { "epoch": 17.627401837928154, "grad_norm": 0.07491889595985413, "learning_rate": 2.372796390675913e-06, "loss": 0.0006, "step": 21100 }, { "epoch": 17.669172932330827, "grad_norm": 0.2693181037902832, "learning_rate": 2.3310218063330274e-06, "loss": 0.0082, "step": 21150 }, { "epoch": 17.7109440267335, "grad_norm": 8.37026309967041, "learning_rate": 2.2892472219901414e-06, "loss": 0.0062, "step": 21200 }, { "epoch": 17.752715121136173, "grad_norm": 0.2343984991312027, "learning_rate": 2.2474726376472554e-06, "loss": 0.0047, "step": 21250 }, { "epoch": 17.794486215538846, "grad_norm": 0.3801390528678894, "learning_rate": 2.20569805330437e-06, "loss": 0.0054, "step": 21300 }, { "epoch": 17.83625730994152, "grad_norm": 0.00528654083609581, "learning_rate": 2.163923468961484e-06, "loss": 0.0008, "step": 21350 }, { "epoch": 17.878028404344192, "grad_norm": 0.014739965088665485, "learning_rate": 2.122148884618598e-06, "loss": 0.0031, "step": 21400 }, { "epoch": 17.91979949874687, "grad_norm": 0.04143223166465759, "learning_rate": 2.080374300275712e-06, "loss": 0.0013, "step": 21450 }, { "epoch": 17.961570593149542, "grad_norm": 0.11256618052721024, "learning_rate": 2.0385997159328266e-06, "loss": 0.0038, "step": 21500 }, { "epoch": 18.003341687552215, "grad_norm": 0.31975257396698, "learning_rate": 1.996825131589941e-06, "loss": 0.0076, "step": 21550 }, { "epoch": 18.045112781954888, "grad_norm": 0.0022968221455812454, "learning_rate": 1.9550505472470554e-06, "loss": 0.0031, "step": 21600 }, { "epoch": 18.08688387635756, "grad_norm": 0.06881808489561081, "learning_rate": 1.9132759629041693e-06, "loss": 0.005, "step": 21650 }, { "epoch": 18.128654970760234, "grad_norm": 0.26212799549102783, "learning_rate": 1.8715013785612835e-06, "loss": 0.0016, "step": 21700 }, { "epoch": 18.170426065162907, "grad_norm": 0.0171457901597023, "learning_rate": 1.8297267942183977e-06, "loss": 0.0067, "step": 21750 }, { "epoch": 18.21219715956558, "grad_norm": 0.2334776520729065, "learning_rate": 1.787952209875512e-06, "loss": 0.0027, "step": 21800 }, { "epoch": 18.253968253968253, "grad_norm": 0.08637866377830505, "learning_rate": 1.746177625532626e-06, "loss": 0.0053, "step": 21850 }, { "epoch": 18.295739348370926, "grad_norm": 0.11498326063156128, "learning_rate": 1.7044030411897403e-06, "loss": 0.0023, "step": 21900 }, { "epoch": 18.3375104427736, "grad_norm": 0.014128020964562893, "learning_rate": 1.6626284568468545e-06, "loss": 0.008, "step": 21950 }, { "epoch": 18.379281537176276, "grad_norm": 0.13422049582004547, "learning_rate": 1.620853872503969e-06, "loss": 0.0024, "step": 22000 }, { "epoch": 18.42105263157895, "grad_norm": 0.13545355200767517, "learning_rate": 1.5790792881610829e-06, "loss": 0.0039, "step": 22050 }, { "epoch": 18.46282372598162, "grad_norm": 0.10272631794214249, "learning_rate": 1.537304703818197e-06, "loss": 0.003, "step": 22100 }, { "epoch": 18.504594820384295, "grad_norm": 3.581731081008911, "learning_rate": 1.4955301194753113e-06, "loss": 0.003, "step": 22150 }, { "epoch": 18.546365914786968, "grad_norm": 0.019360538572072983, "learning_rate": 1.4537555351324257e-06, "loss": 0.001, "step": 22200 }, { "epoch": 18.58813700918964, "grad_norm": 0.20149047672748566, "learning_rate": 1.4119809507895399e-06, "loss": 0.0029, "step": 22250 }, { "epoch": 18.629908103592314, "grad_norm": 0.017035024240612984, "learning_rate": 1.3702063664466539e-06, "loss": 0.0014, "step": 22300 }, { "epoch": 18.671679197994987, "grad_norm": 0.325811505317688, "learning_rate": 1.3284317821037683e-06, "loss": 0.0052, "step": 22350 }, { "epoch": 18.71345029239766, "grad_norm": 0.020387643948197365, "learning_rate": 1.2866571977608824e-06, "loss": 0.0011, "step": 22400 }, { "epoch": 18.755221386800333, "grad_norm": 1.3581466674804688, "learning_rate": 1.2448826134179966e-06, "loss": 0.0013, "step": 22450 }, { "epoch": 18.796992481203006, "grad_norm": 0.05018683522939682, "learning_rate": 1.2031080290751108e-06, "loss": 0.0012, "step": 22500 }, { "epoch": 18.838763575605682, "grad_norm": 0.17220334708690643, "learning_rate": 1.161333444732225e-06, "loss": 0.003, "step": 22550 }, { "epoch": 18.880534670008355, "grad_norm": 0.02596069872379303, "learning_rate": 1.1195588603893392e-06, "loss": 0.0011, "step": 22600 }, { "epoch": 18.92230576441103, "grad_norm": 0.21498483419418335, "learning_rate": 1.0777842760464534e-06, "loss": 0.0049, "step": 22650 }, { "epoch": 18.9640768588137, "grad_norm": 0.21940931677818298, "learning_rate": 1.0360096917035676e-06, "loss": 0.0007, "step": 22700 }, { "epoch": 19.005847953216374, "grad_norm": 0.09297411888837814, "learning_rate": 9.942351073606818e-07, "loss": 0.0008, "step": 22750 }, { "epoch": 19.047619047619047, "grad_norm": 0.01873987540602684, "learning_rate": 9.524605230177961e-07, "loss": 0.0005, "step": 22800 }, { "epoch": 19.08939014202172, "grad_norm": 0.029663298279047012, "learning_rate": 9.106859386749102e-07, "loss": 0.0022, "step": 22850 }, { "epoch": 19.131161236424393, "grad_norm": 0.20812617242336273, "learning_rate": 8.689113543320245e-07, "loss": 0.0065, "step": 22900 }, { "epoch": 19.172932330827066, "grad_norm": 0.047941118478775024, "learning_rate": 8.271367699891386e-07, "loss": 0.0047, "step": 22950 }, { "epoch": 19.21470342522974, "grad_norm": 0.18795344233512878, "learning_rate": 7.853621856462529e-07, "loss": 0.0022, "step": 23000 }, { "epoch": 19.256474519632416, "grad_norm": 5.738811492919922, "learning_rate": 7.435876013033672e-07, "loss": 0.0025, "step": 23050 }, { "epoch": 19.29824561403509, "grad_norm": 0.030096910893917084, "learning_rate": 7.018130169604812e-07, "loss": 0.0006, "step": 23100 }, { "epoch": 19.340016708437762, "grad_norm": 0.07230346649885178, "learning_rate": 6.600384326175955e-07, "loss": 0.0006, "step": 23150 }, { "epoch": 19.381787802840435, "grad_norm": 0.1296992152929306, "learning_rate": 6.182638482747097e-07, "loss": 0.0003, "step": 23200 }, { "epoch": 19.423558897243108, "grad_norm": 0.15579549968242645, "learning_rate": 5.764892639318239e-07, "loss": 0.0013, "step": 23250 }, { "epoch": 19.46532999164578, "grad_norm": 0.018042083829641342, "learning_rate": 5.347146795889381e-07, "loss": 0.0016, "step": 23300 }, { "epoch": 19.507101086048454, "grad_norm": 0.4105440378189087, "learning_rate": 4.929400952460523e-07, "loss": 0.0006, "step": 23350 }, { "epoch": 19.548872180451127, "grad_norm": 0.038732532411813736, "learning_rate": 4.5116551090316656e-07, "loss": 0.004, "step": 23400 }, { "epoch": 19.5906432748538, "grad_norm": 2.204296827316284, "learning_rate": 4.093909265602808e-07, "loss": 0.0011, "step": 23450 }, { "epoch": 19.632414369256473, "grad_norm": 0.014660513959825039, "learning_rate": 3.67616342217395e-07, "loss": 0.0009, "step": 23500 }, { "epoch": 19.674185463659146, "grad_norm": 0.011966018006205559, "learning_rate": 3.258417578745092e-07, "loss": 0.0006, "step": 23550 }, { "epoch": 19.715956558061823, "grad_norm": 0.03099227510392666, "learning_rate": 2.840671735316234e-07, "loss": 0.0009, "step": 23600 }, { "epoch": 19.757727652464496, "grad_norm": 0.09417425096035004, "learning_rate": 2.422925891887376e-07, "loss": 0.0032, "step": 23650 }, { "epoch": 19.79949874686717, "grad_norm": 0.153071790933609, "learning_rate": 2.005180048458518e-07, "loss": 0.0021, "step": 23700 }, { "epoch": 19.841269841269842, "grad_norm": 0.015282063744962215, "learning_rate": 1.5874342050296602e-07, "loss": 0.0005, "step": 23750 }, { "epoch": 19.883040935672515, "grad_norm": 0.012147588655352592, "learning_rate": 1.1696883616008022e-07, "loss": 0.0041, "step": 23800 }, { "epoch": 19.924812030075188, "grad_norm": 0.16458311676979065, "learning_rate": 7.519425181719443e-08, "loss": 0.0018, "step": 23850 }, { "epoch": 19.96658312447786, "grad_norm": 0.11444168537855148, "learning_rate": 3.341966747430863e-08, "loss": 0.002, "step": 23900 } ], "logging_steps": 50, "max_steps": 23940, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.5544269721786112e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }