{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.705223668504519, "eval_steps": 500, "global_step": 30000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0028420394475075312, "grad_norm": 57.54774856567383, "learning_rate": 1.2787723785166242e-07, "loss": 0.2424, "step": 50 }, { "epoch": 0.0056840788950150625, "grad_norm": 65.50086975097656, "learning_rate": 2.6712134129013924e-07, "loss": 0.2428, "step": 100 }, { "epoch": 0.008526118342522595, "grad_norm": 0.10987269878387451, "learning_rate": 4.0920716112531976e-07, "loss": 0.2234, "step": 150 }, { "epoch": 0.011368157790030125, "grad_norm": 78.04790496826172, "learning_rate": 5.512929809605002e-07, "loss": 0.2119, "step": 200 }, { "epoch": 0.014210197237537657, "grad_norm": 6.256505489349365, "learning_rate": 6.933788007956807e-07, "loss": 0.204, "step": 250 }, { "epoch": 0.01705223668504519, "grad_norm": 0.00032545399153605103, "learning_rate": 8.354646206308611e-07, "loss": 0.1687, "step": 300 }, { "epoch": 0.01989427613255272, "grad_norm": 0.01708165928721428, "learning_rate": 9.775504404660415e-07, "loss": 0.2217, "step": 350 }, { "epoch": 0.02273631558006025, "grad_norm": 0.045344240963459015, "learning_rate": 1.119636260301222e-06, "loss": 0.1419, "step": 400 }, { "epoch": 0.025578355027567782, "grad_norm": 9.43325424194336, "learning_rate": 1.2617220801364026e-06, "loss": 0.1425, "step": 450 }, { "epoch": 0.028420394475075314, "grad_norm": 90.27054595947266, "learning_rate": 1.403807899971583e-06, "loss": 0.0719, "step": 500 }, { "epoch": 0.03126243392258284, "grad_norm": 7.323994159698486, "learning_rate": 1.5430520034100598e-06, "loss": 0.1488, "step": 550 }, { "epoch": 0.03410447337009038, "grad_norm": 34.19244384765625, "learning_rate": 1.6851378232452402e-06, "loss": 0.1622, "step": 600 }, { "epoch": 0.03694651281759791, "grad_norm": 1.936219573020935, "learning_rate": 1.8272236430804209e-06, "loss": 0.0807, "step": 650 }, { "epoch": 0.03978855226510544, "grad_norm": 3.6323654057923704e-05, "learning_rate": 1.9693094629156013e-06, "loss": 0.1124, "step": 700 }, { "epoch": 0.04263059171261297, "grad_norm": 0.5096821784973145, "learning_rate": 2.1113952827507815e-06, "loss": 0.1151, "step": 750 }, { "epoch": 0.0454726311601205, "grad_norm": 0.00015973972040228546, "learning_rate": 2.253481102585962e-06, "loss": 0.169, "step": 800 }, { "epoch": 0.048314670607628035, "grad_norm": 0.0007457645260728896, "learning_rate": 2.3955669224211427e-06, "loss": 0.1206, "step": 850 }, { "epoch": 0.051156710055135564, "grad_norm": 1.5516124963760376, "learning_rate": 2.537652742256323e-06, "loss": 0.1217, "step": 900 }, { "epoch": 0.0539987495026431, "grad_norm": 0.0006615748861804605, "learning_rate": 2.6797385620915036e-06, "loss": 0.0555, "step": 950 }, { "epoch": 0.05684078895015063, "grad_norm": 0.007951493375003338, "learning_rate": 2.81898266552998e-06, "loss": 0.0929, "step": 1000 }, { "epoch": 0.05968282839765816, "grad_norm": 0.09356316924095154, "learning_rate": 2.9610684853651604e-06, "loss": 0.1697, "step": 1050 }, { "epoch": 0.06252486784516569, "grad_norm": 1.9378662109375, "learning_rate": 3.103154305200341e-06, "loss": 0.1442, "step": 1100 }, { "epoch": 0.06536690729267322, "grad_norm": 56.01464080810547, "learning_rate": 3.245240125035522e-06, "loss": 0.0769, "step": 1150 }, { "epoch": 0.06820894674018076, "grad_norm": 0.0007533690659329295, "learning_rate": 3.3873259448707023e-06, "loss": 0.188, "step": 1200 }, { "epoch": 0.07105098618768829, "grad_norm": 51.44038391113281, "learning_rate": 3.529411764705883e-06, "loss": 0.0824, "step": 1250 }, { "epoch": 0.07389302563519581, "grad_norm": 5.564255237579346, "learning_rate": 3.6714975845410635e-06, "loss": 0.0542, "step": 1300 }, { "epoch": 0.07673506508270335, "grad_norm": 3.297595500946045, "learning_rate": 3.8135834043762437e-06, "loss": 0.1266, "step": 1350 }, { "epoch": 0.07957710453021088, "grad_norm": 0.000204528245376423, "learning_rate": 3.955669224211424e-06, "loss": 0.0458, "step": 1400 }, { "epoch": 0.0824191439777184, "grad_norm": 19.779905319213867, "learning_rate": 4.0977550440466045e-06, "loss": 0.0888, "step": 1450 }, { "epoch": 0.08526118342522594, "grad_norm": 135.17404174804688, "learning_rate": 4.239840863881785e-06, "loss": 0.1275, "step": 1500 }, { "epoch": 0.08810322287273348, "grad_norm": 0.0021571500692516565, "learning_rate": 4.381926683716966e-06, "loss": 0.1331, "step": 1550 }, { "epoch": 0.090945262320241, "grad_norm": 1.5359570681994228e-07, "learning_rate": 4.5240125035521456e-06, "loss": 0.0501, "step": 1600 }, { "epoch": 0.09378730176774853, "grad_norm": 9.060190200805664, "learning_rate": 4.666098323387326e-06, "loss": 0.1749, "step": 1650 }, { "epoch": 0.09662934121525607, "grad_norm": 6.6538286209106445, "learning_rate": 4.808184143222507e-06, "loss": 0.1299, "step": 1700 }, { "epoch": 0.0994713806627636, "grad_norm": 1.7331750392913818, "learning_rate": 4.9502699630576874e-06, "loss": 0.1119, "step": 1750 }, { "epoch": 0.10231342011027113, "grad_norm": 0.000718842027708888, "learning_rate": 5.092355782892868e-06, "loss": 0.1281, "step": 1800 }, { "epoch": 0.10515545955777866, "grad_norm": 0.0, "learning_rate": 5.234441602728049e-06, "loss": 0.1247, "step": 1850 }, { "epoch": 0.1079974990052862, "grad_norm": 5.815588519908488e-05, "learning_rate": 5.376527422563229e-06, "loss": 0.0826, "step": 1900 }, { "epoch": 0.11083953845279372, "grad_norm": 5.133784294128418, "learning_rate": 5.51861324239841e-06, "loss": 0.13, "step": 1950 }, { "epoch": 0.11368157790030126, "grad_norm": 0.00022838923905510455, "learning_rate": 5.66069906223359e-06, "loss": 0.0581, "step": 2000 }, { "epoch": 0.11652361734780879, "grad_norm": 1.6444566249847412, "learning_rate": 5.80278488206877e-06, "loss": 0.0852, "step": 2050 }, { "epoch": 0.11936565679531631, "grad_norm": 0.00024610653053969145, "learning_rate": 5.944870701903951e-06, "loss": 0.0439, "step": 2100 }, { "epoch": 0.12220769624282385, "grad_norm": 0.0024437729734927416, "learning_rate": 6.086956521739132e-06, "loss": 0.1028, "step": 2150 }, { "epoch": 0.12504973569033137, "grad_norm": 0.005096614360809326, "learning_rate": 6.229042341574311e-06, "loss": 0.1236, "step": 2200 }, { "epoch": 0.12789177513783892, "grad_norm": 12.081178665161133, "learning_rate": 6.371128161409492e-06, "loss": 0.1659, "step": 2250 }, { "epoch": 0.13073381458534644, "grad_norm": 0.0001074994434020482, "learning_rate": 6.513213981244673e-06, "loss": 0.1151, "step": 2300 }, { "epoch": 0.13357585403285396, "grad_norm": 6.296492576599121, "learning_rate": 6.655299801079853e-06, "loss": 0.1861, "step": 2350 }, { "epoch": 0.1364178934803615, "grad_norm": 0.00017095121438615024, "learning_rate": 6.797385620915034e-06, "loss": 0.0689, "step": 2400 }, { "epoch": 0.13925993292786903, "grad_norm": 10.775776863098145, "learning_rate": 6.939471440750214e-06, "loss": 0.1361, "step": 2450 }, { "epoch": 0.14210197237537658, "grad_norm": 0.0, "learning_rate": 7.081557260585394e-06, "loss": 0.0663, "step": 2500 }, { "epoch": 0.1449440118228841, "grad_norm": 0.005744911730289459, "learning_rate": 7.223643080420575e-06, "loss": 0.0922, "step": 2550 }, { "epoch": 0.14778605127039163, "grad_norm": 469.4988098144531, "learning_rate": 7.3657289002557555e-06, "loss": 0.1687, "step": 2600 }, { "epoch": 0.15062809071789918, "grad_norm": 0.016451528295874596, "learning_rate": 7.507814720090935e-06, "loss": 0.1479, "step": 2650 }, { "epoch": 0.1534701301654067, "grad_norm": 0.0014400017680600286, "learning_rate": 7.649900539926117e-06, "loss": 0.0615, "step": 2700 }, { "epoch": 0.15631216961291422, "grad_norm": 0.00020282872719690204, "learning_rate": 7.791986359761297e-06, "loss": 0.0617, "step": 2750 }, { "epoch": 0.15915420906042177, "grad_norm": 3.0903574952390045e-05, "learning_rate": 7.934072179596476e-06, "loss": 0.1239, "step": 2800 }, { "epoch": 0.1619962485079293, "grad_norm": 9.299596968048718e-07, "learning_rate": 8.076157999431657e-06, "loss": 0.0617, "step": 2850 }, { "epoch": 0.1648382879554368, "grad_norm": 0.11574897170066833, "learning_rate": 8.218243819266838e-06, "loss": 0.1036, "step": 2900 }, { "epoch": 0.16768032740294436, "grad_norm": 116.9762191772461, "learning_rate": 8.360329639102018e-06, "loss": 0.1664, "step": 2950 }, { "epoch": 0.17052236685045188, "grad_norm": 0.2586289942264557, "learning_rate": 8.502415458937199e-06, "loss": 0.0869, "step": 3000 }, { "epoch": 0.1733644062979594, "grad_norm": 8.557782173156738, "learning_rate": 8.64450127877238e-06, "loss": 0.0485, "step": 3050 }, { "epoch": 0.17620644574546696, "grad_norm": 0.15316586196422577, "learning_rate": 8.783745382210856e-06, "loss": 0.1106, "step": 3100 }, { "epoch": 0.17904848519297448, "grad_norm": 0.005629907362163067, "learning_rate": 8.925831202046037e-06, "loss": 0.0911, "step": 3150 }, { "epoch": 0.181890524640482, "grad_norm": 34.47971725463867, "learning_rate": 9.067917021881217e-06, "loss": 0.1148, "step": 3200 }, { "epoch": 0.18473256408798955, "grad_norm": 0.0, "learning_rate": 9.210002841716398e-06, "loss": 0.0532, "step": 3250 }, { "epoch": 0.18757460353549707, "grad_norm": 0.018848825246095657, "learning_rate": 9.352088661551579e-06, "loss": 0.1158, "step": 3300 }, { "epoch": 0.1904166429830046, "grad_norm": 0.0013218846870586276, "learning_rate": 9.49417448138676e-06, "loss": 0.1626, "step": 3350 }, { "epoch": 0.19325868243051214, "grad_norm": 18.353458404541016, "learning_rate": 9.63626030122194e-06, "loss": 0.2518, "step": 3400 }, { "epoch": 0.19610072187801966, "grad_norm": 0.3147176206111908, "learning_rate": 9.778346121057119e-06, "loss": 0.1259, "step": 3450 }, { "epoch": 0.1989427613255272, "grad_norm": 38.72705078125, "learning_rate": 9.9204319408923e-06, "loss": 0.0641, "step": 3500 }, { "epoch": 0.20178480077303473, "grad_norm": 19.4537296295166, "learning_rate": 9.993052704708372e-06, "loss": 0.1496, "step": 3550 }, { "epoch": 0.20462684022054226, "grad_norm": 4.483327865600586, "learning_rate": 9.977263397227397e-06, "loss": 0.3053, "step": 3600 }, { "epoch": 0.2074688796680498, "grad_norm": 0.0016082617221400142, "learning_rate": 9.961474089746424e-06, "loss": 0.0714, "step": 3650 }, { "epoch": 0.21031091911555733, "grad_norm": 0.03687046840786934, "learning_rate": 9.94568478226545e-06, "loss": 0.0565, "step": 3700 }, { "epoch": 0.21315295856306485, "grad_norm": 40.0361328125, "learning_rate": 9.929895474784477e-06, "loss": 0.0625, "step": 3750 }, { "epoch": 0.2159949980105724, "grad_norm": 5.997519016265869, "learning_rate": 9.914106167303503e-06, "loss": 0.1478, "step": 3800 }, { "epoch": 0.21883703745807992, "grad_norm": 0.00617703702300787, "learning_rate": 9.898316859822528e-06, "loss": 0.0988, "step": 3850 }, { "epoch": 0.22167907690558744, "grad_norm": 12.838679313659668, "learning_rate": 9.882527552341555e-06, "loss": 0.1475, "step": 3900 }, { "epoch": 0.224521116353095, "grad_norm": 11.926225662231445, "learning_rate": 9.866738244860581e-06, "loss": 0.1435, "step": 3950 }, { "epoch": 0.2273631558006025, "grad_norm": 91.65484619140625, "learning_rate": 9.850948937379608e-06, "loss": 0.1622, "step": 4000 }, { "epoch": 0.23020519524811003, "grad_norm": 3.657653513400305e-09, "learning_rate": 9.835159629898633e-06, "loss": 0.0968, "step": 4050 }, { "epoch": 0.23304723469561758, "grad_norm": 0.11928972601890564, "learning_rate": 9.81937032241766e-06, "loss": 0.0658, "step": 4100 }, { "epoch": 0.2358892741431251, "grad_norm": 21.343875885009766, "learning_rate": 9.803581014936686e-06, "loss": 0.0891, "step": 4150 }, { "epoch": 0.23873131359063263, "grad_norm": 0.12927766144275665, "learning_rate": 9.787791707455712e-06, "loss": 0.0523, "step": 4200 }, { "epoch": 0.24157335303814018, "grad_norm": 0.00678109098225832, "learning_rate": 9.772002399974739e-06, "loss": 0.1186, "step": 4250 }, { "epoch": 0.2444153924856477, "grad_norm": 0.013820752501487732, "learning_rate": 9.756528878643384e-06, "loss": 0.1553, "step": 4300 }, { "epoch": 0.24725743193315522, "grad_norm": 2.3970553874969482, "learning_rate": 9.74073957116241e-06, "loss": 0.1442, "step": 4350 }, { "epoch": 0.25009947138066274, "grad_norm": 4.65661096572876, "learning_rate": 9.724950263681435e-06, "loss": 0.0965, "step": 4400 }, { "epoch": 0.2529415108281703, "grad_norm": 0.03147602081298828, "learning_rate": 9.709160956200462e-06, "loss": 0.0287, "step": 4450 }, { "epoch": 0.25578355027567784, "grad_norm": 20.07131576538086, "learning_rate": 9.693371648719488e-06, "loss": 0.0311, "step": 4500 }, { "epoch": 0.25862558972318533, "grad_norm": 0.0002657310396898538, "learning_rate": 9.677582341238513e-06, "loss": 0.0466, "step": 4550 }, { "epoch": 0.2614676291706929, "grad_norm": 10.225363731384277, "learning_rate": 9.66179303375754e-06, "loss": 0.0621, "step": 4600 }, { "epoch": 0.26430966861820043, "grad_norm": 0.006805745884776115, "learning_rate": 9.646003726276566e-06, "loss": 0.094, "step": 4650 }, { "epoch": 0.2671517080657079, "grad_norm": 4.655883458326571e-05, "learning_rate": 9.630214418795593e-06, "loss": 0.0517, "step": 4700 }, { "epoch": 0.2699937475132155, "grad_norm": 0.045272473245859146, "learning_rate": 9.614425111314618e-06, "loss": 0.1587, "step": 4750 }, { "epoch": 0.272835786960723, "grad_norm": 0.01270443107932806, "learning_rate": 9.598635803833644e-06, "loss": 0.1364, "step": 4800 }, { "epoch": 0.2756778264082305, "grad_norm": 0.4563452899456024, "learning_rate": 9.58284649635267e-06, "loss": 0.0339, "step": 4850 }, { "epoch": 0.27851986585573807, "grad_norm": 0.18673022091388702, "learning_rate": 9.567057188871697e-06, "loss": 0.0691, "step": 4900 }, { "epoch": 0.2813619053032456, "grad_norm": 0.00010059810301754624, "learning_rate": 9.551267881390724e-06, "loss": 0.1812, "step": 4950 }, { "epoch": 0.28420394475075317, "grad_norm": 5.0043439865112305, "learning_rate": 9.535478573909749e-06, "loss": 0.1325, "step": 5000 }, { "epoch": 0.28704598419826066, "grad_norm": 31.060195922851562, "learning_rate": 9.519689266428775e-06, "loss": 0.1619, "step": 5050 }, { "epoch": 0.2898880236457682, "grad_norm": 0.00012152839917689562, "learning_rate": 9.503899958947802e-06, "loss": 0.1023, "step": 5100 }, { "epoch": 0.29273006309327576, "grad_norm": 14.489540100097656, "learning_rate": 9.488110651466828e-06, "loss": 0.1508, "step": 5150 }, { "epoch": 0.29557210254078325, "grad_norm": 0.019262468442320824, "learning_rate": 9.472321343985853e-06, "loss": 0.1213, "step": 5200 }, { "epoch": 0.2984141419882908, "grad_norm": 0.1599639654159546, "learning_rate": 9.45653203650488e-06, "loss": 0.0864, "step": 5250 }, { "epoch": 0.30125618143579835, "grad_norm": 0.5401346683502197, "learning_rate": 9.440742729023906e-06, "loss": 0.0361, "step": 5300 }, { "epoch": 0.30409822088330585, "grad_norm": 3.4409524687362136e-06, "learning_rate": 9.424953421542931e-06, "loss": 0.0481, "step": 5350 }, { "epoch": 0.3069402603308134, "grad_norm": 0.0, "learning_rate": 9.409164114061958e-06, "loss": 0.0139, "step": 5400 }, { "epoch": 0.30978229977832095, "grad_norm": 2.688269853591919, "learning_rate": 9.393374806580984e-06, "loss": 0.1695, "step": 5450 }, { "epoch": 0.31262433922582844, "grad_norm": 0.005981958005577326, "learning_rate": 9.37758549910001e-06, "loss": 0.1257, "step": 5500 }, { "epoch": 0.315466378673336, "grad_norm": 0.001644986099563539, "learning_rate": 9.361796191619036e-06, "loss": 0.0571, "step": 5550 }, { "epoch": 0.31830841812084354, "grad_norm": 0.694561243057251, "learning_rate": 9.346006884138062e-06, "loss": 0.0634, "step": 5600 }, { "epoch": 0.32115045756835103, "grad_norm": 2.125787119666711e-07, "learning_rate": 9.330217576657089e-06, "loss": 0.0685, "step": 5650 }, { "epoch": 0.3239924970158586, "grad_norm": 77.54013061523438, "learning_rate": 9.314428269176115e-06, "loss": 0.0829, "step": 5700 }, { "epoch": 0.32683453646336613, "grad_norm": 6.395081436494365e-05, "learning_rate": 9.298638961695142e-06, "loss": 0.0962, "step": 5750 }, { "epoch": 0.3296765759108736, "grad_norm": 0.044299863278865814, "learning_rate": 9.282849654214167e-06, "loss": 0.0724, "step": 5800 }, { "epoch": 0.3325186153583812, "grad_norm": 9.034195899963379, "learning_rate": 9.267060346733193e-06, "loss": 0.0446, "step": 5850 }, { "epoch": 0.3353606548058887, "grad_norm": 4.154941082000732, "learning_rate": 9.251271039252218e-06, "loss": 0.0458, "step": 5900 }, { "epoch": 0.3382026942533962, "grad_norm": 0.0, "learning_rate": 9.235481731771244e-06, "loss": 0.0919, "step": 5950 }, { "epoch": 0.34104473370090377, "grad_norm": 2.351287364959717, "learning_rate": 9.219692424290273e-06, "loss": 0.0731, "step": 6000 }, { "epoch": 0.3438867731484113, "grad_norm": 12.486845016479492, "learning_rate": 9.203903116809298e-06, "loss": 0.0798, "step": 6050 }, { "epoch": 0.3467288125959188, "grad_norm": 0.0013139373622834682, "learning_rate": 9.188113809328324e-06, "loss": 0.0301, "step": 6100 }, { "epoch": 0.34957085204342636, "grad_norm": 3.027071237564087, "learning_rate": 9.172324501847349e-06, "loss": 0.1212, "step": 6150 }, { "epoch": 0.3524128914909339, "grad_norm": 0.10458099842071533, "learning_rate": 9.156535194366376e-06, "loss": 0.1526, "step": 6200 }, { "epoch": 0.3552549309384414, "grad_norm": 137.82711791992188, "learning_rate": 9.140745886885402e-06, "loss": 0.0809, "step": 6250 }, { "epoch": 0.35809697038594895, "grad_norm": 0.0011035481002181768, "learning_rate": 9.124956579404429e-06, "loss": 0.1041, "step": 6300 }, { "epoch": 0.3609390098334565, "grad_norm": 67.53794860839844, "learning_rate": 9.109167271923455e-06, "loss": 0.1585, "step": 6350 }, { "epoch": 0.363781049280964, "grad_norm": 4.530911445617676, "learning_rate": 9.09337796444248e-06, "loss": 0.0381, "step": 6400 }, { "epoch": 0.36662308872847155, "grad_norm": 1.725799302221276e-05, "learning_rate": 9.077588656961507e-06, "loss": 0.1591, "step": 6450 }, { "epoch": 0.3694651281759791, "grad_norm": 0.6152939200401306, "learning_rate": 9.061799349480533e-06, "loss": 0.1556, "step": 6500 }, { "epoch": 0.3723071676234866, "grad_norm": 0.0009957764996215701, "learning_rate": 9.04601004199956e-06, "loss": 0.0495, "step": 6550 }, { "epoch": 0.37514920707099414, "grad_norm": 56.166805267333984, "learning_rate": 9.030220734518584e-06, "loss": 0.092, "step": 6600 }, { "epoch": 0.3779912465185017, "grad_norm": 1.8076647165798931e-06, "learning_rate": 9.014431427037611e-06, "loss": 0.1177, "step": 6650 }, { "epoch": 0.3808332859660092, "grad_norm": 0.33760207891464233, "learning_rate": 8.998642119556636e-06, "loss": 0.149, "step": 6700 }, { "epoch": 0.38367532541351673, "grad_norm": 0.07961608469486237, "learning_rate": 8.982852812075662e-06, "loss": 0.0512, "step": 6750 }, { "epoch": 0.3865173648610243, "grad_norm": 0.0025313154328614473, "learning_rate": 8.967063504594689e-06, "loss": 0.1012, "step": 6800 }, { "epoch": 0.3893594043085318, "grad_norm": 86.97044372558594, "learning_rate": 8.951274197113716e-06, "loss": 0.1753, "step": 6850 }, { "epoch": 0.3922014437560393, "grad_norm": 0.000364658742910251, "learning_rate": 8.935484889632742e-06, "loss": 0.0682, "step": 6900 }, { "epoch": 0.3950434832035469, "grad_norm": 2.1032631397247314, "learning_rate": 8.919695582151767e-06, "loss": 0.0982, "step": 6950 }, { "epoch": 0.3978855226510544, "grad_norm": 80.62301635742188, "learning_rate": 8.903906274670793e-06, "loss": 0.0643, "step": 7000 }, { "epoch": 0.4007275620985619, "grad_norm": 0.008460620418190956, "learning_rate": 8.88811696718982e-06, "loss": 0.1701, "step": 7050 }, { "epoch": 0.40356960154606947, "grad_norm": 0.28310349583625793, "learning_rate": 8.872327659708847e-06, "loss": 0.1009, "step": 7100 }, { "epoch": 0.406411640993577, "grad_norm": 1.7792461221688427e-05, "learning_rate": 8.856538352227873e-06, "loss": 0.1712, "step": 7150 }, { "epoch": 0.4092536804410845, "grad_norm": 0.23674604296684265, "learning_rate": 8.840749044746898e-06, "loss": 0.0986, "step": 7200 }, { "epoch": 0.41209571988859206, "grad_norm": 16.953800201416016, "learning_rate": 8.824959737265924e-06, "loss": 0.0325, "step": 7250 }, { "epoch": 0.4149377593360996, "grad_norm": 9.964702621800825e-05, "learning_rate": 8.80917042978495e-06, "loss": 0.0717, "step": 7300 }, { "epoch": 0.4177797987836071, "grad_norm": 0.001411137287504971, "learning_rate": 8.793381122303976e-06, "loss": 0.1559, "step": 7350 }, { "epoch": 0.42062183823111465, "grad_norm": 9.269462316296995e-05, "learning_rate": 8.777591814823002e-06, "loss": 0.0398, "step": 7400 }, { "epoch": 0.4234638776786222, "grad_norm": 3.6857614517211914, "learning_rate": 8.761802507342029e-06, "loss": 0.1754, "step": 7450 }, { "epoch": 0.4263059171261297, "grad_norm": 0.08209937065839767, "learning_rate": 8.746013199861055e-06, "loss": 0.2188, "step": 7500 }, { "epoch": 0.42914795657363725, "grad_norm": 1.8349966921960004e-05, "learning_rate": 8.73022389238008e-06, "loss": 0.1063, "step": 7550 }, { "epoch": 0.4319899960211448, "grad_norm": 16.519161224365234, "learning_rate": 8.714434584899107e-06, "loss": 0.1812, "step": 7600 }, { "epoch": 0.4348320354686523, "grad_norm": 17.88888168334961, "learning_rate": 8.698645277418133e-06, "loss": 0.129, "step": 7650 }, { "epoch": 0.43767407491615984, "grad_norm": 12.852983474731445, "learning_rate": 8.68285596993716e-06, "loss": 0.1197, "step": 7700 }, { "epoch": 0.4405161143636674, "grad_norm": 4.835977165384975e-07, "learning_rate": 8.667066662456185e-06, "loss": 0.085, "step": 7750 }, { "epoch": 0.4433581538111749, "grad_norm": 35.268035888671875, "learning_rate": 8.651277354975211e-06, "loss": 0.0604, "step": 7800 }, { "epoch": 0.44620019325868243, "grad_norm": 0.4759794771671295, "learning_rate": 8.635488047494238e-06, "loss": 0.132, "step": 7850 }, { "epoch": 0.44904223270619, "grad_norm": 0.00010199743701377884, "learning_rate": 8.619698740013264e-06, "loss": 0.0506, "step": 7900 }, { "epoch": 0.4518842721536975, "grad_norm": 36.29248809814453, "learning_rate": 8.603909432532291e-06, "loss": 0.0669, "step": 7950 }, { "epoch": 0.454726311601205, "grad_norm": 10.3939208984375, "learning_rate": 8.588120125051316e-06, "loss": 0.026, "step": 8000 }, { "epoch": 0.4575683510487126, "grad_norm": 0.008960050530731678, "learning_rate": 8.572330817570342e-06, "loss": 0.1339, "step": 8050 }, { "epoch": 0.46041039049622007, "grad_norm": 0.04594314098358154, "learning_rate": 8.556541510089367e-06, "loss": 0.11, "step": 8100 }, { "epoch": 0.4632524299437276, "grad_norm": 0.0010862394701689482, "learning_rate": 8.540752202608394e-06, "loss": 0.1289, "step": 8150 }, { "epoch": 0.46609446939123517, "grad_norm": 5.387957571656443e-05, "learning_rate": 8.52496289512742e-06, "loss": 0.1622, "step": 8200 }, { "epoch": 0.46893650883874266, "grad_norm": 0.2264394611120224, "learning_rate": 8.509173587646447e-06, "loss": 0.1995, "step": 8250 }, { "epoch": 0.4717785482862502, "grad_norm": 0.0063866786658763885, "learning_rate": 8.493384280165473e-06, "loss": 0.1046, "step": 8300 }, { "epoch": 0.47462058773375776, "grad_norm": 0.0015529695665463805, "learning_rate": 8.477594972684498e-06, "loss": 0.0418, "step": 8350 }, { "epoch": 0.47746262718126525, "grad_norm": 0.0007633518544025719, "learning_rate": 8.461805665203525e-06, "loss": 0.1287, "step": 8400 }, { "epoch": 0.4803046666287728, "grad_norm": 0.6651237607002258, "learning_rate": 8.446016357722551e-06, "loss": 0.0839, "step": 8450 }, { "epoch": 0.48314670607628035, "grad_norm": 3.4617685741977766e-05, "learning_rate": 8.430227050241578e-06, "loss": 0.0385, "step": 8500 }, { "epoch": 0.48598874552378785, "grad_norm": 5.1106840146530885e-06, "learning_rate": 8.414437742760603e-06, "loss": 0.0937, "step": 8550 }, { "epoch": 0.4888307849712954, "grad_norm": 8.78021240234375, "learning_rate": 8.39864843527963e-06, "loss": 0.1118, "step": 8600 }, { "epoch": 0.49167282441880295, "grad_norm": 0.8296887874603271, "learning_rate": 8.382859127798656e-06, "loss": 0.057, "step": 8650 }, { "epoch": 0.49451486386631044, "grad_norm": 0.00016456704179290682, "learning_rate": 8.36706982031768e-06, "loss": 0.0419, "step": 8700 }, { "epoch": 0.497356903313818, "grad_norm": 33.392677307128906, "learning_rate": 8.351280512836707e-06, "loss": 0.0601, "step": 8750 }, { "epoch": 0.5001989427613255, "grad_norm": 3.6260087490081787, "learning_rate": 8.335491205355734e-06, "loss": 0.0204, "step": 8800 }, { "epoch": 0.5030409822088331, "grad_norm": 7.736544609069824, "learning_rate": 8.31970189787476e-06, "loss": 0.0247, "step": 8850 }, { "epoch": 0.5058830216563406, "grad_norm": 14.83883285522461, "learning_rate": 8.303912590393785e-06, "loss": 0.1137, "step": 8900 }, { "epoch": 0.5087250611038481, "grad_norm": 6.366983143379912e-05, "learning_rate": 8.288123282912812e-06, "loss": 0.0366, "step": 8950 }, { "epoch": 0.5115671005513557, "grad_norm": 0.0004063360102009028, "learning_rate": 8.272333975431838e-06, "loss": 0.115, "step": 9000 }, { "epoch": 0.5144091399988632, "grad_norm": 0.02094779722392559, "learning_rate": 8.256544667950865e-06, "loss": 0.1327, "step": 9050 }, { "epoch": 0.5172511794463707, "grad_norm": 0.001826010993681848, "learning_rate": 8.240755360469891e-06, "loss": 0.0476, "step": 9100 }, { "epoch": 0.5200932188938783, "grad_norm": 118.02986907958984, "learning_rate": 8.224966052988916e-06, "loss": 0.1068, "step": 9150 }, { "epoch": 0.5229352583413858, "grad_norm": 0.033574797213077545, "learning_rate": 8.209176745507943e-06, "loss": 0.0931, "step": 9200 }, { "epoch": 0.5257772977888933, "grad_norm": 7.4097010838158894e-06, "learning_rate": 8.19338743802697e-06, "loss": 0.0992, "step": 9250 }, { "epoch": 0.5286193372364009, "grad_norm": 2.317996025085449, "learning_rate": 8.177598130545996e-06, "loss": 0.0587, "step": 9300 }, { "epoch": 0.5314613766839084, "grad_norm": 11.575976371765137, "learning_rate": 8.16180882306502e-06, "loss": 0.1835, "step": 9350 }, { "epoch": 0.5343034161314159, "grad_norm": 0.006680720020085573, "learning_rate": 8.146019515584047e-06, "loss": 0.1228, "step": 9400 }, { "epoch": 0.5371454555789235, "grad_norm": 9.924306141328998e-06, "learning_rate": 8.130230208103074e-06, "loss": 0.0339, "step": 9450 }, { "epoch": 0.539987495026431, "grad_norm": 0.005702057387679815, "learning_rate": 8.114440900622099e-06, "loss": 0.0756, "step": 9500 }, { "epoch": 0.5428295344739384, "grad_norm": 0.00034645816776901484, "learning_rate": 8.098651593141125e-06, "loss": 0.0622, "step": 9550 }, { "epoch": 0.545671573921446, "grad_norm": 1.937679735419806e-05, "learning_rate": 8.082862285660152e-06, "loss": 0.1943, "step": 9600 }, { "epoch": 0.5485136133689535, "grad_norm": 0.00980337243527174, "learning_rate": 8.067072978179178e-06, "loss": 0.1115, "step": 9650 }, { "epoch": 0.551355652816461, "grad_norm": 0.0001525239204056561, "learning_rate": 8.051283670698203e-06, "loss": 0.0439, "step": 9700 }, { "epoch": 0.5541976922639686, "grad_norm": 7.418223368915733e-09, "learning_rate": 8.03549436321723e-06, "loss": 0.0743, "step": 9750 }, { "epoch": 0.5570397317114761, "grad_norm": 0.00019548562704585493, "learning_rate": 8.019705055736256e-06, "loss": 0.0746, "step": 9800 }, { "epoch": 0.5598817711589836, "grad_norm": 32.85011672973633, "learning_rate": 8.003915748255283e-06, "loss": 0.0919, "step": 9850 }, { "epoch": 0.5627238106064912, "grad_norm": 3.377399206161499, "learning_rate": 7.98812644077431e-06, "loss": 0.0775, "step": 9900 }, { "epoch": 0.5655658500539987, "grad_norm": 3.1045570373535156, "learning_rate": 7.972337133293334e-06, "loss": 0.0148, "step": 9950 }, { "epoch": 0.5684078895015063, "grad_norm": 1.1402869404264493e-07, "learning_rate": 7.95654782581236e-06, "loss": 0.1409, "step": 10000 }, { "epoch": 0.5712499289490138, "grad_norm": 0.08472077548503876, "learning_rate": 7.940758518331385e-06, "loss": 0.1262, "step": 10050 }, { "epoch": 0.5740919683965213, "grad_norm": 3.7520876503549516e-05, "learning_rate": 7.924969210850412e-06, "loss": 0.1203, "step": 10100 }, { "epoch": 0.5769340078440289, "grad_norm": 1.6792670294307754e-06, "learning_rate": 7.90917990336944e-06, "loss": 0.0585, "step": 10150 }, { "epoch": 0.5797760472915364, "grad_norm": 37.07989501953125, "learning_rate": 7.893706382038085e-06, "loss": 0.109, "step": 10200 }, { "epoch": 0.5826180867390439, "grad_norm": 0.17833800613880157, "learning_rate": 7.877917074557112e-06, "loss": 0.061, "step": 10250 }, { "epoch": 0.5854601261865515, "grad_norm": 0.006795084569603205, "learning_rate": 7.862127767076137e-06, "loss": 0.0365, "step": 10300 }, { "epoch": 0.588302165634059, "grad_norm": 0.06681282073259354, "learning_rate": 7.846338459595163e-06, "loss": 0.0365, "step": 10350 }, { "epoch": 0.5911442050815665, "grad_norm": 0.7294309139251709, "learning_rate": 7.830549152114188e-06, "loss": 0.1649, "step": 10400 }, { "epoch": 0.5939862445290741, "grad_norm": 0.08393588662147522, "learning_rate": 7.814759844633214e-06, "loss": 0.0394, "step": 10450 }, { "epoch": 0.5968282839765816, "grad_norm": 0.0002703500213101506, "learning_rate": 7.798970537152241e-06, "loss": 0.1197, "step": 10500 }, { "epoch": 0.5996703234240891, "grad_norm": 8.633947372436523, "learning_rate": 7.783181229671268e-06, "loss": 0.0285, "step": 10550 }, { "epoch": 0.6025123628715967, "grad_norm": 7.161734538385645e-05, "learning_rate": 7.767391922190294e-06, "loss": 0.1152, "step": 10600 }, { "epoch": 0.6053544023191042, "grad_norm": 5.04560157423839e-05, "learning_rate": 7.751602614709319e-06, "loss": 0.117, "step": 10650 }, { "epoch": 0.6081964417666117, "grad_norm": 0.01567472331225872, "learning_rate": 7.735813307228345e-06, "loss": 0.0394, "step": 10700 }, { "epoch": 0.6110384812141193, "grad_norm": 2.0176475048065186, "learning_rate": 7.720023999747372e-06, "loss": 0.0204, "step": 10750 }, { "epoch": 0.6138805206616268, "grad_norm": 0.00012829320621676743, "learning_rate": 7.704234692266399e-06, "loss": 0.0992, "step": 10800 }, { "epoch": 0.6167225601091343, "grad_norm": 3.674347400665283, "learning_rate": 7.688445384785423e-06, "loss": 0.1526, "step": 10850 }, { "epoch": 0.6195645995566419, "grad_norm": 7.29197267901327e-07, "learning_rate": 7.67265607730445e-06, "loss": 0.0499, "step": 10900 }, { "epoch": 0.6224066390041494, "grad_norm": 0.7920209169387817, "learning_rate": 7.656866769823476e-06, "loss": 0.0199, "step": 10950 }, { "epoch": 0.6252486784516569, "grad_norm": 0.0007642278214916587, "learning_rate": 7.641077462342501e-06, "loss": 0.0235, "step": 11000 }, { "epoch": 0.6280907178991645, "grad_norm": 0.00012507560313679278, "learning_rate": 7.625288154861529e-06, "loss": 0.0386, "step": 11050 }, { "epoch": 0.630932757346672, "grad_norm": 13.769145965576172, "learning_rate": 7.6094988473805544e-06, "loss": 0.0587, "step": 11100 }, { "epoch": 0.6337747967941795, "grad_norm": 0.000428153172833845, "learning_rate": 7.593709539899581e-06, "loss": 0.0064, "step": 11150 }, { "epoch": 0.6366168362416871, "grad_norm": 0.3749142587184906, "learning_rate": 7.577920232418607e-06, "loss": 0.1439, "step": 11200 }, { "epoch": 0.6394588756891946, "grad_norm": 0.020476723089814186, "learning_rate": 7.562130924937633e-06, "loss": 0.1217, "step": 11250 }, { "epoch": 0.6423009151367021, "grad_norm": 0.001190952374599874, "learning_rate": 7.54634161745666e-06, "loss": 0.0628, "step": 11300 }, { "epoch": 0.6451429545842097, "grad_norm": 92.49651336669922, "learning_rate": 7.530552309975685e-06, "loss": 0.09, "step": 11350 }, { "epoch": 0.6479849940317172, "grad_norm": 1.3711645603179932, "learning_rate": 7.514763002494712e-06, "loss": 0.0637, "step": 11400 }, { "epoch": 0.6508270334792247, "grad_norm": 0.023164603859186172, "learning_rate": 7.498973695013737e-06, "loss": 0.0724, "step": 11450 }, { "epoch": 0.6536690729267323, "grad_norm": 0.029271801933646202, "learning_rate": 7.483184387532763e-06, "loss": 0.0863, "step": 11500 }, { "epoch": 0.6565111123742398, "grad_norm": 2.1439642906188965, "learning_rate": 7.467395080051789e-06, "loss": 0.058, "step": 11550 }, { "epoch": 0.6593531518217473, "grad_norm": 0.046266451478004456, "learning_rate": 7.451605772570816e-06, "loss": 0.1189, "step": 11600 }, { "epoch": 0.6621951912692549, "grad_norm": 0.024902399629354477, "learning_rate": 7.435816465089841e-06, "loss": 0.0322, "step": 11650 }, { "epoch": 0.6650372307167624, "grad_norm": 2.5316705176692267e-08, "learning_rate": 7.420027157608868e-06, "loss": 0.1323, "step": 11700 }, { "epoch": 0.6678792701642698, "grad_norm": 0.004433684982359409, "learning_rate": 7.404237850127894e-06, "loss": 0.1089, "step": 11750 }, { "epoch": 0.6707213096117774, "grad_norm": 5.489003456204955e-07, "learning_rate": 7.38844854264692e-06, "loss": 0.0541, "step": 11800 }, { "epoch": 0.6735633490592849, "grad_norm": 1.6806092162369168e-07, "learning_rate": 7.372659235165947e-06, "loss": 0.1402, "step": 11850 }, { "epoch": 0.6764053885067924, "grad_norm": 104.10470581054688, "learning_rate": 7.356869927684972e-06, "loss": 0.0989, "step": 11900 }, { "epoch": 0.6792474279543, "grad_norm": 0.02749534137547016, "learning_rate": 7.341080620203999e-06, "loss": 0.128, "step": 11950 }, { "epoch": 0.6820894674018075, "grad_norm": 3.8121328316265135e-08, "learning_rate": 7.325291312723024e-06, "loss": 0.0741, "step": 12000 }, { "epoch": 0.684931506849315, "grad_norm": 0.007383296731859446, "learning_rate": 7.30950200524205e-06, "loss": 0.0224, "step": 12050 }, { "epoch": 0.6877735462968226, "grad_norm": 0.5884593725204468, "learning_rate": 7.293712697761078e-06, "loss": 0.1348, "step": 12100 }, { "epoch": 0.6906155857443301, "grad_norm": 1.046032428741455, "learning_rate": 7.2779233902801025e-06, "loss": 0.0473, "step": 12150 }, { "epoch": 0.6934576251918376, "grad_norm": 7.873534923419356e-05, "learning_rate": 7.262134082799129e-06, "loss": 0.0492, "step": 12200 }, { "epoch": 0.6962996646393452, "grad_norm": 7.752254962921143, "learning_rate": 7.246344775318155e-06, "loss": 0.1462, "step": 12250 }, { "epoch": 0.6991417040868527, "grad_norm": 2.2376983165740967, "learning_rate": 7.230555467837181e-06, "loss": 0.0378, "step": 12300 }, { "epoch": 0.7019837435343602, "grad_norm": 0.35472461581230164, "learning_rate": 7.214766160356207e-06, "loss": 0.0216, "step": 12350 }, { "epoch": 0.7048257829818678, "grad_norm": 0.04186001047492027, "learning_rate": 7.1989768528752335e-06, "loss": 0.1067, "step": 12400 }, { "epoch": 0.7076678224293753, "grad_norm": 16.21009635925293, "learning_rate": 7.18318754539426e-06, "loss": 0.1441, "step": 12450 }, { "epoch": 0.7105098618768828, "grad_norm": 14.107819557189941, "learning_rate": 7.167398237913286e-06, "loss": 0.0753, "step": 12500 }, { "epoch": 0.7133519013243904, "grad_norm": 0.01802966371178627, "learning_rate": 7.151608930432312e-06, "loss": 0.0667, "step": 12550 }, { "epoch": 0.7161939407718979, "grad_norm": 0.003547944827005267, "learning_rate": 7.135819622951338e-06, "loss": 0.0373, "step": 12600 }, { "epoch": 0.7190359802194054, "grad_norm": 7.837246084818617e-05, "learning_rate": 7.1200303154703646e-06, "loss": 0.0838, "step": 12650 }, { "epoch": 0.721878019666913, "grad_norm": 17.419485092163086, "learning_rate": 7.104241007989389e-06, "loss": 0.0929, "step": 12700 }, { "epoch": 0.7247200591144205, "grad_norm": 16.272008895874023, "learning_rate": 7.088451700508416e-06, "loss": 0.0836, "step": 12750 }, { "epoch": 0.727562098561928, "grad_norm": 0.4332881271839142, "learning_rate": 7.072662393027442e-06, "loss": 0.084, "step": 12800 }, { "epoch": 0.7304041380094356, "grad_norm": 26.405973434448242, "learning_rate": 7.056873085546468e-06, "loss": 0.1307, "step": 12850 }, { "epoch": 0.7332461774569431, "grad_norm": 3.659460844573914e-06, "learning_rate": 7.041083778065495e-06, "loss": 0.0403, "step": 12900 }, { "epoch": 0.7360882169044506, "grad_norm": 0.07355604320764542, "learning_rate": 7.02561025673414e-06, "loss": 0.0219, "step": 12950 }, { "epoch": 0.7389302563519582, "grad_norm": 15.638680458068848, "learning_rate": 7.009820949253167e-06, "loss": 0.0685, "step": 13000 }, { "epoch": 0.7417722957994657, "grad_norm": 3.5204797313781455e-05, "learning_rate": 6.994031641772192e-06, "loss": 0.0855, "step": 13050 }, { "epoch": 0.7446143352469732, "grad_norm": 0.004958641715347767, "learning_rate": 6.978242334291218e-06, "loss": 0.1128, "step": 13100 }, { "epoch": 0.7474563746944808, "grad_norm": 0.0, "learning_rate": 6.962453026810244e-06, "loss": 0.085, "step": 13150 }, { "epoch": 0.7502984141419883, "grad_norm": 58.35037612915039, "learning_rate": 6.946663719329271e-06, "loss": 0.1085, "step": 13200 }, { "epoch": 0.7531404535894958, "grad_norm": 0.00835405569523573, "learning_rate": 6.930874411848297e-06, "loss": 0.0568, "step": 13250 }, { "epoch": 0.7559824930370034, "grad_norm": 0.04398033395409584, "learning_rate": 6.915085104367323e-06, "loss": 0.0882, "step": 13300 }, { "epoch": 0.7588245324845109, "grad_norm": 0.10432946681976318, "learning_rate": 6.899295796886349e-06, "loss": 0.0702, "step": 13350 }, { "epoch": 0.7616665719320184, "grad_norm": 0.006950623821467161, "learning_rate": 6.883506489405375e-06, "loss": 0.0776, "step": 13400 }, { "epoch": 0.764508611379526, "grad_norm": 4.063835513079539e-05, "learning_rate": 6.867717181924402e-06, "loss": 0.1236, "step": 13450 }, { "epoch": 0.7673506508270335, "grad_norm": 0.007932694628834724, "learning_rate": 6.851927874443427e-06, "loss": 0.026, "step": 13500 }, { "epoch": 0.770192690274541, "grad_norm": 19.588890075683594, "learning_rate": 6.836138566962454e-06, "loss": 0.0918, "step": 13550 }, { "epoch": 0.7730347297220486, "grad_norm": 3.374583101845019e-08, "learning_rate": 6.8203492594814804e-06, "loss": 0.0674, "step": 13600 }, { "epoch": 0.7758767691695561, "grad_norm": 0.013583102263510227, "learning_rate": 6.804559952000505e-06, "loss": 0.1481, "step": 13650 }, { "epoch": 0.7787188086170636, "grad_norm": 0.005707134492695332, "learning_rate": 6.788770644519533e-06, "loss": 0.0763, "step": 13700 }, { "epoch": 0.7815608480645712, "grad_norm": 6.749715805053711, "learning_rate": 6.7729813370385575e-06, "loss": 0.0702, "step": 13750 }, { "epoch": 0.7844028875120787, "grad_norm": 1.4639660548709799e-05, "learning_rate": 6.757192029557584e-06, "loss": 0.1161, "step": 13800 }, { "epoch": 0.7872449269595861, "grad_norm": 41.65675354003906, "learning_rate": 6.74140272207661e-06, "loss": 0.0103, "step": 13850 }, { "epoch": 0.7900869664070937, "grad_norm": 2.1764416694641113, "learning_rate": 6.725613414595636e-06, "loss": 0.1131, "step": 13900 }, { "epoch": 0.7929290058546012, "grad_norm": 0.10050699859857559, "learning_rate": 6.709824107114663e-06, "loss": 0.0279, "step": 13950 }, { "epoch": 0.7957710453021088, "grad_norm": 11.600699424743652, "learning_rate": 6.6940347996336885e-06, "loss": 0.1378, "step": 14000 }, { "epoch": 0.7986130847496163, "grad_norm": 0.03505561128258705, "learning_rate": 6.678245492152715e-06, "loss": 0.0564, "step": 14050 }, { "epoch": 0.8014551241971238, "grad_norm": 0.0009206988615915179, "learning_rate": 6.662456184671741e-06, "loss": 0.0696, "step": 14100 }, { "epoch": 0.8042971636446314, "grad_norm": 0.0005649039521813393, "learning_rate": 6.646666877190767e-06, "loss": 0.0485, "step": 14150 }, { "epoch": 0.8071392030921389, "grad_norm": 0.33858436346054077, "learning_rate": 6.630877569709793e-06, "loss": 0.0709, "step": 14200 }, { "epoch": 0.8099812425396464, "grad_norm": 0.00026180280838161707, "learning_rate": 6.6150882622288196e-06, "loss": 0.0319, "step": 14250 }, { "epoch": 0.812823281987154, "grad_norm": 16.801115036010742, "learning_rate": 6.599298954747844e-06, "loss": 0.0738, "step": 14300 }, { "epoch": 0.8156653214346615, "grad_norm": 2.7744860649108887, "learning_rate": 6.583509647266871e-06, "loss": 0.118, "step": 14350 }, { "epoch": 0.818507360882169, "grad_norm": 0.009649834595620632, "learning_rate": 6.567720339785898e-06, "loss": 0.0634, "step": 14400 }, { "epoch": 0.8213494003296766, "grad_norm": 0.05146016180515289, "learning_rate": 6.551931032304923e-06, "loss": 0.0418, "step": 14450 }, { "epoch": 0.8241914397771841, "grad_norm": 0.00019788475765381008, "learning_rate": 6.53614172482395e-06, "loss": 0.0531, "step": 14500 }, { "epoch": 0.8270334792246916, "grad_norm": 13.125266075134277, "learning_rate": 6.5203524173429754e-06, "loss": 0.1156, "step": 14550 }, { "epoch": 0.8298755186721992, "grad_norm": 6.222808837890625, "learning_rate": 6.504563109862002e-06, "loss": 0.1122, "step": 14600 }, { "epoch": 0.8327175581197067, "grad_norm": 0.0, "learning_rate": 6.488773802381028e-06, "loss": 0.0167, "step": 14650 }, { "epoch": 0.8355595975672142, "grad_norm": 4.431889533996582, "learning_rate": 6.472984494900054e-06, "loss": 0.091, "step": 14700 }, { "epoch": 0.8384016370147218, "grad_norm": 1.2102794926249771e-06, "learning_rate": 6.457195187419081e-06, "loss": 0.0298, "step": 14750 }, { "epoch": 0.8412436764622293, "grad_norm": 3.0437583923339844, "learning_rate": 6.4414058799381065e-06, "loss": 0.0563, "step": 14800 }, { "epoch": 0.8440857159097368, "grad_norm": 1.0684104267966177e-08, "learning_rate": 6.425616572457133e-06, "loss": 0.081, "step": 14850 }, { "epoch": 0.8469277553572444, "grad_norm": 0.0, "learning_rate": 6.409827264976159e-06, "loss": 0.1041, "step": 14900 }, { "epoch": 0.8497697948047519, "grad_norm": 1.7737058442435227e-05, "learning_rate": 6.394037957495185e-06, "loss": 0.0392, "step": 14950 }, { "epoch": 0.8526118342522594, "grad_norm": 0.0064972261898219585, "learning_rate": 6.37824865001421e-06, "loss": 0.0737, "step": 15000 }, { "epoch": 0.855453873699767, "grad_norm": 0.0786210373044014, "learning_rate": 6.362459342533237e-06, "loss": 0.1005, "step": 15050 }, { "epoch": 0.8582959131472745, "grad_norm": 11.170258522033691, "learning_rate": 6.346670035052264e-06, "loss": 0.0359, "step": 15100 }, { "epoch": 0.861137952594782, "grad_norm": 0.0001765089255059138, "learning_rate": 6.330880727571289e-06, "loss": 0.0978, "step": 15150 }, { "epoch": 0.8639799920422896, "grad_norm": 0.008763090707361698, "learning_rate": 6.3150914200903154e-06, "loss": 0.092, "step": 15200 }, { "epoch": 0.8668220314897971, "grad_norm": 0.15193457901477814, "learning_rate": 6.299302112609341e-06, "loss": 0.0651, "step": 15250 }, { "epoch": 0.8696640709373046, "grad_norm": 3.0488173961639404, "learning_rate": 6.283512805128368e-06, "loss": 0.1541, "step": 15300 }, { "epoch": 0.8725061103848122, "grad_norm": 0.009500974789261818, "learning_rate": 6.267723497647393e-06, "loss": 0.1415, "step": 15350 }, { "epoch": 0.8753481498323197, "grad_norm": 0.0, "learning_rate": 6.25193419016642e-06, "loss": 0.0618, "step": 15400 }, { "epoch": 0.8781901892798272, "grad_norm": 1.0989417205564678e-05, "learning_rate": 6.236144882685446e-06, "loss": 0.0635, "step": 15450 }, { "epoch": 0.8810322287273348, "grad_norm": 0.0017705713398754597, "learning_rate": 6.220355575204472e-06, "loss": 0.1652, "step": 15500 }, { "epoch": 0.8838742681748423, "grad_norm": 4.008788164355792e-05, "learning_rate": 6.204566267723499e-06, "loss": 0.1281, "step": 15550 }, { "epoch": 0.8867163076223498, "grad_norm": 0.042210742831230164, "learning_rate": 6.188776960242524e-06, "loss": 0.0348, "step": 15600 }, { "epoch": 0.8895583470698574, "grad_norm": 4.716152398032136e-05, "learning_rate": 6.172987652761551e-06, "loss": 0.1308, "step": 15650 }, { "epoch": 0.8924003865173649, "grad_norm": 0.07087603211402893, "learning_rate": 6.157198345280576e-06, "loss": 0.0361, "step": 15700 }, { "epoch": 0.8952424259648724, "grad_norm": 11.82944107055664, "learning_rate": 6.141409037799602e-06, "loss": 0.0311, "step": 15750 }, { "epoch": 0.89808446541238, "grad_norm": 0.00012191317364340648, "learning_rate": 6.125619730318628e-06, "loss": 0.1061, "step": 15800 }, { "epoch": 0.9009265048598875, "grad_norm": 23.001506805419922, "learning_rate": 6.1098304228376546e-06, "loss": 0.1868, "step": 15850 }, { "epoch": 0.903768544307395, "grad_norm": 2.36151686294761e-06, "learning_rate": 6.094041115356681e-06, "loss": 0.1393, "step": 15900 }, { "epoch": 0.9066105837549026, "grad_norm": 3.5732333660125732, "learning_rate": 6.078251807875707e-06, "loss": 0.0392, "step": 15950 }, { "epoch": 0.90945262320241, "grad_norm": 1.3125786781311035, "learning_rate": 6.062462500394733e-06, "loss": 0.1064, "step": 16000 }, { "epoch": 0.9122946626499175, "grad_norm": 2.166292667388916, "learning_rate": 6.046673192913759e-06, "loss": 0.0266, "step": 16050 }, { "epoch": 0.9151367020974251, "grad_norm": 24.00419807434082, "learning_rate": 6.030883885432786e-06, "loss": 0.0539, "step": 16100 }, { "epoch": 0.9179787415449326, "grad_norm": 0.09000074118375778, "learning_rate": 6.015094577951811e-06, "loss": 0.0383, "step": 16150 }, { "epoch": 0.9208207809924401, "grad_norm": 1.0749312423286028e-05, "learning_rate": 5.999305270470838e-06, "loss": 0.0342, "step": 16200 }, { "epoch": 0.9236628204399477, "grad_norm": 26.545013427734375, "learning_rate": 5.983515962989864e-06, "loss": 0.1773, "step": 16250 }, { "epoch": 0.9265048598874552, "grad_norm": 3.1472824048250914e-05, "learning_rate": 5.96772665550889e-06, "loss": 0.0289, "step": 16300 }, { "epoch": 0.9293468993349627, "grad_norm": 0.002298567909747362, "learning_rate": 5.951937348027917e-06, "loss": 0.0352, "step": 16350 }, { "epoch": 0.9321889387824703, "grad_norm": 0.01259683445096016, "learning_rate": 5.9361480405469415e-06, "loss": 0.0968, "step": 16400 }, { "epoch": 0.9350309782299778, "grad_norm": 0.002880501328036189, "learning_rate": 5.920358733065969e-06, "loss": 0.021, "step": 16450 }, { "epoch": 0.9378730176774853, "grad_norm": 0.0010927625698968768, "learning_rate": 5.904569425584994e-06, "loss": 0.1326, "step": 16500 }, { "epoch": 0.9407150571249929, "grad_norm": 0.0001717354025458917, "learning_rate": 5.88878011810402e-06, "loss": 0.0726, "step": 16550 }, { "epoch": 0.9435570965725004, "grad_norm": 9.740878158481792e-05, "learning_rate": 5.872990810623047e-06, "loss": 0.122, "step": 16600 }, { "epoch": 0.9463991360200079, "grad_norm": 0.2150394767522812, "learning_rate": 5.8572015031420725e-06, "loss": 0.048, "step": 16650 }, { "epoch": 0.9492411754675155, "grad_norm": 0.004849886521697044, "learning_rate": 5.841412195661099e-06, "loss": 0.0753, "step": 16700 }, { "epoch": 0.952083214915023, "grad_norm": 69.4601821899414, "learning_rate": 5.825622888180125e-06, "loss": 0.1706, "step": 16750 }, { "epoch": 0.9549252543625305, "grad_norm": 8.404773712158203, "learning_rate": 5.809833580699151e-06, "loss": 0.0946, "step": 16800 }, { "epoch": 0.9577672938100381, "grad_norm": 3.914994158549234e-05, "learning_rate": 5.794044273218177e-06, "loss": 0.051, "step": 16850 }, { "epoch": 0.9606093332575456, "grad_norm": 20.864805221557617, "learning_rate": 5.7782549657372035e-06, "loss": 0.0442, "step": 16900 }, { "epoch": 0.9634513727050531, "grad_norm": 0.0, "learning_rate": 5.762781444405848e-06, "loss": 0.0238, "step": 16950 }, { "epoch": 0.9662934121525607, "grad_norm": 3.836441209159602e-08, "learning_rate": 5.746992136924875e-06, "loss": 0.0203, "step": 17000 }, { "epoch": 0.9691354516000682, "grad_norm": 0.11022733151912689, "learning_rate": 5.731518615593521e-06, "loss": 0.0633, "step": 17050 }, { "epoch": 0.9719774910475757, "grad_norm": 6.875392864458263e-05, "learning_rate": 5.715729308112546e-06, "loss": 0.0258, "step": 17100 }, { "epoch": 0.9748195304950833, "grad_norm": 0.5252150297164917, "learning_rate": 5.699940000631573e-06, "loss": 0.0757, "step": 17150 }, { "epoch": 0.9776615699425908, "grad_norm": 0.0, "learning_rate": 5.6841506931505985e-06, "loss": 0.0724, "step": 17200 }, { "epoch": 0.9805036093900983, "grad_norm": 13.295694351196289, "learning_rate": 5.668361385669625e-06, "loss": 0.063, "step": 17250 }, { "epoch": 0.9833456488376059, "grad_norm": 34.442378997802734, "learning_rate": 5.652572078188651e-06, "loss": 0.0818, "step": 17300 }, { "epoch": 0.9861876882851134, "grad_norm": 2.82287366815126e-08, "learning_rate": 5.636782770707677e-06, "loss": 0.1122, "step": 17350 }, { "epoch": 0.9890297277326209, "grad_norm": 0.0009897969430312514, "learning_rate": 5.620993463226704e-06, "loss": 0.0318, "step": 17400 }, { "epoch": 0.9918717671801285, "grad_norm": 0.039634283632040024, "learning_rate": 5.6052041557457296e-06, "loss": 0.0564, "step": 17450 }, { "epoch": 0.994713806627636, "grad_norm": 4.96626091003418, "learning_rate": 5.589414848264756e-06, "loss": 0.0139, "step": 17500 }, { "epoch": 0.9975558460751435, "grad_norm": 0.39126285910606384, "learning_rate": 5.573625540783781e-06, "loss": 0.1943, "step": 17550 }, { "epoch": 1.000397885522651, "grad_norm": 0.017159905284643173, "learning_rate": 5.557836233302808e-06, "loss": 0.0975, "step": 17600 }, { "epoch": 1.0032399249701587, "grad_norm": 41.97768020629883, "learning_rate": 5.542046925821833e-06, "loss": 0.0861, "step": 17650 }, { "epoch": 1.0060819644176662, "grad_norm": 5.398248390520166e-07, "learning_rate": 5.52625761834086e-06, "loss": 0.0694, "step": 17700 }, { "epoch": 1.0089240038651737, "grad_norm": 23.04943084716797, "learning_rate": 5.510468310859886e-06, "loss": 0.0654, "step": 17750 }, { "epoch": 1.0117660433126812, "grad_norm": 0.0, "learning_rate": 5.494679003378912e-06, "loss": 0.0055, "step": 17800 }, { "epoch": 1.0146080827601887, "grad_norm": 0.0006691432208754122, "learning_rate": 5.4788896958979385e-06, "loss": 0.0648, "step": 17850 }, { "epoch": 1.0174501222076962, "grad_norm": 0.00031599996145814657, "learning_rate": 5.463100388416964e-06, "loss": 0.0884, "step": 17900 }, { "epoch": 1.0202921616552039, "grad_norm": 0.012487998232245445, "learning_rate": 5.447311080935991e-06, "loss": 0.0247, "step": 17950 }, { "epoch": 1.0231342011027114, "grad_norm": 0.00032109449966810644, "learning_rate": 5.4315217734550165e-06, "loss": 0.0202, "step": 18000 }, { "epoch": 1.0259762405502189, "grad_norm": 6.941709041595459, "learning_rate": 5.415732465974043e-06, "loss": 0.0439, "step": 18050 }, { "epoch": 1.0288182799977263, "grad_norm": 12.864021301269531, "learning_rate": 5.399943158493069e-06, "loss": 0.1079, "step": 18100 }, { "epoch": 1.0316603194452338, "grad_norm": 100.53852844238281, "learning_rate": 5.384153851012095e-06, "loss": 0.0697, "step": 18150 }, { "epoch": 1.0345023588927413, "grad_norm": 0.0044522699899971485, "learning_rate": 5.368364543531122e-06, "loss": 0.1182, "step": 18200 }, { "epoch": 1.037344398340249, "grad_norm": 0.0, "learning_rate": 5.352575236050147e-06, "loss": 0.048, "step": 18250 }, { "epoch": 1.0401864377877565, "grad_norm": 0.006007139105349779, "learning_rate": 5.336785928569174e-06, "loss": 0.0383, "step": 18300 }, { "epoch": 1.043028477235264, "grad_norm": 0.17537954449653625, "learning_rate": 5.320996621088199e-06, "loss": 0.1162, "step": 18350 }, { "epoch": 1.0458705166827715, "grad_norm": 0.25201526284217834, "learning_rate": 5.305207313607225e-06, "loss": 0.0501, "step": 18400 }, { "epoch": 1.048712556130279, "grad_norm": 0.09729473292827606, "learning_rate": 5.289418006126251e-06, "loss": 0.0249, "step": 18450 }, { "epoch": 1.0515545955777865, "grad_norm": 0.0008796801557764411, "learning_rate": 5.273628698645278e-06, "loss": 0.0382, "step": 18500 }, { "epoch": 1.0543966350252942, "grad_norm": 0.22367514669895172, "learning_rate": 5.257839391164304e-06, "loss": 0.0164, "step": 18550 }, { "epoch": 1.0572386744728017, "grad_norm": 1.985193787135131e-09, "learning_rate": 5.24205008368333e-06, "loss": 0.0627, "step": 18600 }, { "epoch": 1.0600807139203092, "grad_norm": 0.008963909931480885, "learning_rate": 5.2262607762023564e-06, "loss": 0.0234, "step": 18650 }, { "epoch": 1.0629227533678167, "grad_norm": 2.174706220626831, "learning_rate": 5.210471468721382e-06, "loss": 0.0741, "step": 18700 }, { "epoch": 1.0657647928153242, "grad_norm": 0.002892330288887024, "learning_rate": 5.194682161240409e-06, "loss": 0.0683, "step": 18750 }, { "epoch": 1.0686068322628317, "grad_norm": 11.141602516174316, "learning_rate": 5.178892853759434e-06, "loss": 0.0813, "step": 18800 }, { "epoch": 1.0714488717103394, "grad_norm": 0.9745445251464844, "learning_rate": 5.163103546278461e-06, "loss": 0.0394, "step": 18850 }, { "epoch": 1.074290911157847, "grad_norm": 0.38554519414901733, "learning_rate": 5.1473142387974875e-06, "loss": 0.0333, "step": 18900 }, { "epoch": 1.0771329506053544, "grad_norm": 0.19123269617557526, "learning_rate": 5.131524931316513e-06, "loss": 0.1176, "step": 18950 }, { "epoch": 1.079974990052862, "grad_norm": 1.6241638434166816e-07, "learning_rate": 5.11573562383554e-06, "loss": 0.0636, "step": 19000 }, { "epoch": 1.0828170295003694, "grad_norm": 1.685952838670346e-06, "learning_rate": 5.0999463163545646e-06, "loss": 0.0237, "step": 19050 }, { "epoch": 1.085659068947877, "grad_norm": 0.0023898116778582335, "learning_rate": 5.084157008873591e-06, "loss": 0.0273, "step": 19100 }, { "epoch": 1.0885011083953846, "grad_norm": 0.0, "learning_rate": 5.068367701392617e-06, "loss": 0.043, "step": 19150 }, { "epoch": 1.091343147842892, "grad_norm": 0.3458022475242615, "learning_rate": 5.052578393911643e-06, "loss": 0.0843, "step": 19200 }, { "epoch": 1.0941851872903996, "grad_norm": 0.9719235897064209, "learning_rate": 5.03678908643067e-06, "loss": 0.0222, "step": 19250 }, { "epoch": 1.097027226737907, "grad_norm": 1.172130168924923e-06, "learning_rate": 5.020999778949696e-06, "loss": 0.0503, "step": 19300 }, { "epoch": 1.0998692661854146, "grad_norm": 0.011351634748280048, "learning_rate": 5.005210471468722e-06, "loss": 0.024, "step": 19350 }, { "epoch": 1.1027113056329223, "grad_norm": 5.623274803161621, "learning_rate": 4.989736950137367e-06, "loss": 0.0955, "step": 19400 }, { "epoch": 1.1055533450804298, "grad_norm": 9.029454231262207, "learning_rate": 4.9739476426563935e-06, "loss": 0.0208, "step": 19450 }, { "epoch": 1.1083953845279373, "grad_norm": 0.28683456778526306, "learning_rate": 4.95815833517542e-06, "loss": 0.0629, "step": 19500 }, { "epoch": 1.1112374239754448, "grad_norm": 0.09671737253665924, "learning_rate": 4.942369027694446e-06, "loss": 0.0206, "step": 19550 }, { "epoch": 1.1140794634229523, "grad_norm": 1.800557402020786e-05, "learning_rate": 4.9265797202134715e-06, "loss": 0.1151, "step": 19600 }, { "epoch": 1.1169215028704598, "grad_norm": 6.646442489000037e-05, "learning_rate": 4.910790412732498e-06, "loss": 0.0679, "step": 19650 }, { "epoch": 1.1197635423179673, "grad_norm": 0.011751332320272923, "learning_rate": 4.895001105251524e-06, "loss": 0.035, "step": 19700 }, { "epoch": 1.122605581765475, "grad_norm": 0.4591018259525299, "learning_rate": 4.87921179777055e-06, "loss": 0.0971, "step": 19750 }, { "epoch": 1.1254476212129825, "grad_norm": 0.0011384054087102413, "learning_rate": 4.863422490289576e-06, "loss": 0.0254, "step": 19800 }, { "epoch": 1.12828966066049, "grad_norm": 0.0007807065267115831, "learning_rate": 4.8476331828086025e-06, "loss": 0.0274, "step": 19850 }, { "epoch": 1.1311317001079975, "grad_norm": 0.00026429726858623326, "learning_rate": 4.831843875327629e-06, "loss": 0.0357, "step": 19900 }, { "epoch": 1.133973739555505, "grad_norm": 2.2718031686963513e-05, "learning_rate": 4.816054567846655e-06, "loss": 0.1093, "step": 19950 }, { "epoch": 1.1368157790030127, "grad_norm": 13.4371976852417, "learning_rate": 4.80026526036568e-06, "loss": 0.0034, "step": 20000 }, { "epoch": 1.1396578184505202, "grad_norm": 5.951151251792908e-07, "learning_rate": 4.784475952884707e-06, "loss": 0.1306, "step": 20050 }, { "epoch": 1.1424998578980277, "grad_norm": 0.40396201610565186, "learning_rate": 4.768686645403733e-06, "loss": 0.0353, "step": 20100 }, { "epoch": 1.1453418973455352, "grad_norm": 1.299700379371643, "learning_rate": 4.752897337922759e-06, "loss": 0.1621, "step": 20150 }, { "epoch": 1.1481839367930426, "grad_norm": 3.819512858171947e-05, "learning_rate": 4.737108030441786e-06, "loss": 0.0524, "step": 20200 }, { "epoch": 1.1510259762405501, "grad_norm": 0.20907586812973022, "learning_rate": 4.7213187229608114e-06, "loss": 0.0874, "step": 20250 }, { "epoch": 1.1538680156880576, "grad_norm": 8.881078247213736e-05, "learning_rate": 4.705529415479837e-06, "loss": 0.0295, "step": 20300 }, { "epoch": 1.1567100551355654, "grad_norm": 2.5526607036590576, "learning_rate": 4.689740107998864e-06, "loss": 0.0356, "step": 20350 }, { "epoch": 1.1595520945830728, "grad_norm": 5.97112830291735e-06, "learning_rate": 4.673950800517889e-06, "loss": 0.0568, "step": 20400 }, { "epoch": 1.1623941340305803, "grad_norm": 0.003886153921484947, "learning_rate": 4.658161493036916e-06, "loss": 0.0842, "step": 20450 }, { "epoch": 1.1652361734780878, "grad_norm": 50.715335845947266, "learning_rate": 4.642372185555942e-06, "loss": 0.0163, "step": 20500 }, { "epoch": 1.1680782129255953, "grad_norm": 2.8300539156589366e-07, "learning_rate": 4.626582878074968e-06, "loss": 0.1058, "step": 20550 }, { "epoch": 1.170920252373103, "grad_norm": 0.002394747221842408, "learning_rate": 4.610793570593995e-06, "loss": 0.0125, "step": 20600 }, { "epoch": 1.1737622918206105, "grad_norm": 57.08656692504883, "learning_rate": 4.59500426311302e-06, "loss": 0.009, "step": 20650 }, { "epoch": 1.176604331268118, "grad_norm": 6.404465675354004, "learning_rate": 4.579214955632046e-06, "loss": 0.095, "step": 20700 }, { "epoch": 1.1794463707156255, "grad_norm": 7.977810309967026e-05, "learning_rate": 4.563425648151073e-06, "loss": 0.0397, "step": 20750 }, { "epoch": 1.182288410163133, "grad_norm": 10.515766143798828, "learning_rate": 4.547636340670098e-06, "loss": 0.0379, "step": 20800 }, { "epoch": 1.1851304496106405, "grad_norm": 1.9584187269210815, "learning_rate": 4.531847033189125e-06, "loss": 0.0077, "step": 20850 }, { "epoch": 1.187972489058148, "grad_norm": 0.00011891603207914159, "learning_rate": 4.516057725708151e-06, "loss": 0.0245, "step": 20900 }, { "epoch": 1.1908145285056557, "grad_norm": 0.3122955858707428, "learning_rate": 4.500268418227177e-06, "loss": 0.0591, "step": 20950 }, { "epoch": 1.1936565679531632, "grad_norm": 16.025440216064453, "learning_rate": 4.484479110746203e-06, "loss": 0.0758, "step": 21000 }, { "epoch": 1.1964986074006707, "grad_norm": 3.662534027171205e-07, "learning_rate": 4.468689803265229e-06, "loss": 0.039, "step": 21050 }, { "epoch": 1.1993406468481782, "grad_norm": 4.489954221753578e-07, "learning_rate": 4.452900495784255e-06, "loss": 0.0575, "step": 21100 }, { "epoch": 1.2021826862956857, "grad_norm": 0.011245720088481903, "learning_rate": 4.437111188303282e-06, "loss": 0.054, "step": 21150 }, { "epoch": 1.2050247257431934, "grad_norm": 3.188164843237473e-09, "learning_rate": 4.421321880822307e-06, "loss": 0.0181, "step": 21200 }, { "epoch": 1.207866765190701, "grad_norm": 0.026144633069634438, "learning_rate": 4.405532573341334e-06, "loss": 0.0479, "step": 21250 }, { "epoch": 1.2107088046382084, "grad_norm": 6.747923180228099e-05, "learning_rate": 4.3897432658603595e-06, "loss": 0.0522, "step": 21300 }, { "epoch": 1.213550844085716, "grad_norm": 0.0006858771084807813, "learning_rate": 4.373953958379386e-06, "loss": 0.0839, "step": 21350 }, { "epoch": 1.2163928835332234, "grad_norm": 0.0, "learning_rate": 4.358164650898412e-06, "loss": 0.0449, "step": 21400 }, { "epoch": 1.2192349229807309, "grad_norm": 0.0, "learning_rate": 4.342375343417438e-06, "loss": 0.0279, "step": 21450 }, { "epoch": 1.2220769624282386, "grad_norm": 4.745191795052506e-09, "learning_rate": 4.326586035936464e-06, "loss": 0.0281, "step": 21500 }, { "epoch": 1.224919001875746, "grad_norm": 0.9357084035873413, "learning_rate": 4.3107967284554906e-06, "loss": 0.0262, "step": 21550 }, { "epoch": 1.2277610413232536, "grad_norm": 0.07754403352737427, "learning_rate": 4.295007420974516e-06, "loss": 0.0196, "step": 21600 }, { "epoch": 1.230603080770761, "grad_norm": 19.192798614501953, "learning_rate": 4.279218113493542e-06, "loss": 0.1323, "step": 21650 }, { "epoch": 1.2334451202182686, "grad_norm": 0.22631697356700897, "learning_rate": 4.2634288060125685e-06, "loss": 0.0064, "step": 21700 }, { "epoch": 1.236287159665776, "grad_norm": 0.002126089995726943, "learning_rate": 4.247639498531595e-06, "loss": 0.0126, "step": 21750 }, { "epoch": 1.2391291991132838, "grad_norm": 0.00010815684800036252, "learning_rate": 4.231850191050621e-06, "loss": 0.1006, "step": 21800 }, { "epoch": 1.2419712385607913, "grad_norm": 0.0022329194471240044, "learning_rate": 4.216060883569647e-06, "loss": 0.0015, "step": 21850 }, { "epoch": 1.2448132780082988, "grad_norm": 9.05926513671875, "learning_rate": 4.200271576088673e-06, "loss": 0.0082, "step": 21900 }, { "epoch": 1.2476553174558063, "grad_norm": 2.0575951609203003e-08, "learning_rate": 4.1844822686076995e-06, "loss": 0.0227, "step": 21950 }, { "epoch": 1.2504973569033138, "grad_norm": 0.30678364634513855, "learning_rate": 4.168692961126725e-06, "loss": 0.0661, "step": 22000 }, { "epoch": 1.2533393963508215, "grad_norm": 11.889669418334961, "learning_rate": 4.152903653645751e-06, "loss": 0.1212, "step": 22050 }, { "epoch": 1.2561814357983287, "grad_norm": 0.0029533756896853447, "learning_rate": 4.1371143461647775e-06, "loss": 0.0344, "step": 22100 }, { "epoch": 1.2590234752458365, "grad_norm": 0.03896043822169304, "learning_rate": 4.121325038683804e-06, "loss": 0.0451, "step": 22150 }, { "epoch": 1.261865514693344, "grad_norm": 4.8546731704846025e-05, "learning_rate": 4.10553573120283e-06, "loss": 0.0652, "step": 22200 }, { "epoch": 1.2647075541408515, "grad_norm": 4.843131065368652, "learning_rate": 4.089746423721856e-06, "loss": 0.0979, "step": 22250 }, { "epoch": 1.267549593588359, "grad_norm": 35.316070556640625, "learning_rate": 4.073957116240882e-06, "loss": 0.1115, "step": 22300 }, { "epoch": 1.2703916330358664, "grad_norm": 0.015540444292128086, "learning_rate": 4.058167808759908e-06, "loss": 0.1143, "step": 22350 }, { "epoch": 1.2732336724833742, "grad_norm": 0.0, "learning_rate": 4.042378501278934e-06, "loss": 0.0605, "step": 22400 }, { "epoch": 1.2760757119308817, "grad_norm": 0.11483113467693329, "learning_rate": 4.02658919379796e-06, "loss": 0.0694, "step": 22450 }, { "epoch": 1.2789177513783891, "grad_norm": 0.0, "learning_rate": 4.0111156724666064e-06, "loss": 0.0244, "step": 22500 }, { "epoch": 1.2817597908258966, "grad_norm": 12.185845375061035, "learning_rate": 3.995326364985632e-06, "loss": 0.0255, "step": 22550 }, { "epoch": 1.2846018302734041, "grad_norm": 0.0005437165382318199, "learning_rate": 3.979537057504658e-06, "loss": 0.0106, "step": 22600 }, { "epoch": 1.2874438697209118, "grad_norm": 56.67045211791992, "learning_rate": 3.963747750023684e-06, "loss": 0.0206, "step": 22650 }, { "epoch": 1.2902859091684191, "grad_norm": 0.03569436073303223, "learning_rate": 3.94795844254271e-06, "loss": 0.0786, "step": 22700 }, { "epoch": 1.2931279486159268, "grad_norm": 6.000441074371338, "learning_rate": 3.932169135061737e-06, "loss": 0.0555, "step": 22750 }, { "epoch": 1.2959699880634343, "grad_norm": 1.0103756189346313, "learning_rate": 3.916379827580762e-06, "loss": 0.0322, "step": 22800 }, { "epoch": 1.2988120275109418, "grad_norm": 26.3443660736084, "learning_rate": 3.900590520099789e-06, "loss": 0.0515, "step": 22850 }, { "epoch": 1.3016540669584493, "grad_norm": 0.0, "learning_rate": 3.884801212618815e-06, "loss": 0.0095, "step": 22900 }, { "epoch": 1.3044961064059568, "grad_norm": 5.664130640070653e-07, "learning_rate": 3.869011905137841e-06, "loss": 0.0217, "step": 22950 }, { "epoch": 1.3073381458534645, "grad_norm": 17.054153442382812, "learning_rate": 3.853222597656867e-06, "loss": 0.0293, "step": 23000 }, { "epoch": 1.310180185300972, "grad_norm": 8.179932046914473e-05, "learning_rate": 3.837433290175893e-06, "loss": 0.0244, "step": 23050 }, { "epoch": 1.3130222247484795, "grad_norm": 0.0, "learning_rate": 3.821643982694919e-06, "loss": 0.0364, "step": 23100 }, { "epoch": 1.315864264195987, "grad_norm": 1.6720003259251826e-05, "learning_rate": 3.805854675213945e-06, "loss": 0.0105, "step": 23150 }, { "epoch": 1.3187063036434945, "grad_norm": 0.009736117906868458, "learning_rate": 3.7900653677329713e-06, "loss": 0.0575, "step": 23200 }, { "epoch": 1.3215483430910022, "grad_norm": 0.024012545123696327, "learning_rate": 3.774276060251998e-06, "loss": 0.0041, "step": 23250 }, { "epoch": 1.3243903825385097, "grad_norm": 0.17959673702716827, "learning_rate": 3.758486752771024e-06, "loss": 0.0043, "step": 23300 }, { "epoch": 1.3272324219860172, "grad_norm": 4.943171977996826, "learning_rate": 3.74269744529005e-06, "loss": 0.038, "step": 23350 }, { "epoch": 1.3300744614335247, "grad_norm": 3.026745787337859e-07, "learning_rate": 3.726908137809076e-06, "loss": 0.0904, "step": 23400 }, { "epoch": 1.3329165008810322, "grad_norm": 0.00022610007727053016, "learning_rate": 3.711118830328102e-06, "loss": 0.0339, "step": 23450 }, { "epoch": 1.3357585403285397, "grad_norm": 0.005768894217908382, "learning_rate": 3.695329522847128e-06, "loss": 0.015, "step": 23500 }, { "epoch": 1.3386005797760472, "grad_norm": 0.0002275512961205095, "learning_rate": 3.679540215366154e-06, "loss": 0.0295, "step": 23550 }, { "epoch": 1.341442619223555, "grad_norm": 0.00039534183451905847, "learning_rate": 3.6637509078851806e-06, "loss": 0.0608, "step": 23600 }, { "epoch": 1.3442846586710624, "grad_norm": 0.14771956205368042, "learning_rate": 3.6479616004042068e-06, "loss": 0.0424, "step": 23650 }, { "epoch": 1.3471266981185699, "grad_norm": 44.124168395996094, "learning_rate": 3.632172292923233e-06, "loss": 0.0198, "step": 23700 }, { "epoch": 1.3499687375660774, "grad_norm": 10.954936027526855, "learning_rate": 3.616382985442259e-06, "loss": 0.0862, "step": 23750 }, { "epoch": 1.3528107770135849, "grad_norm": 4.782924634127994e-08, "learning_rate": 3.6005936779612847e-06, "loss": 0.0176, "step": 23800 }, { "epoch": 1.3556528164610926, "grad_norm": 0.0013809322845190763, "learning_rate": 3.584804370480311e-06, "loss": 0.0198, "step": 23850 }, { "epoch": 1.3584948559086, "grad_norm": 0.0003040616284124553, "learning_rate": 3.569015062999337e-06, "loss": 0.038, "step": 23900 }, { "epoch": 1.3613368953561076, "grad_norm": 9.038470238920127e-07, "learning_rate": 3.553225755518363e-06, "loss": 0.0641, "step": 23950 }, { "epoch": 1.364178934803615, "grad_norm": 0.0, "learning_rate": 3.5374364480373896e-06, "loss": 0.0146, "step": 24000 }, { "epoch": 1.3670209742511226, "grad_norm": 3.145011578453705e-05, "learning_rate": 3.5216471405564157e-06, "loss": 0.0137, "step": 24050 }, { "epoch": 1.36986301369863, "grad_norm": 4.684041500091553, "learning_rate": 3.505857833075442e-06, "loss": 0.032, "step": 24100 }, { "epoch": 1.3727050531461376, "grad_norm": 0.015789663419127464, "learning_rate": 3.4900685255944675e-06, "loss": 0.0633, "step": 24150 }, { "epoch": 1.3755470925936453, "grad_norm": 0.7576214075088501, "learning_rate": 3.4742792181134937e-06, "loss": 0.0384, "step": 24200 }, { "epoch": 1.3783891320411528, "grad_norm": 0.0016375042032450438, "learning_rate": 3.4584899106325198e-06, "loss": 0.0086, "step": 24250 }, { "epoch": 1.3812311714886603, "grad_norm": 1.0054521560668945, "learning_rate": 3.442700603151546e-06, "loss": 0.0322, "step": 24300 }, { "epoch": 1.3840732109361678, "grad_norm": 0.0003854624228551984, "learning_rate": 3.426911295670572e-06, "loss": 0.0674, "step": 24350 }, { "epoch": 1.3869152503836752, "grad_norm": 13.511655807495117, "learning_rate": 3.4111219881895986e-06, "loss": 0.0099, "step": 24400 }, { "epoch": 1.389757289831183, "grad_norm": 59.527740478515625, "learning_rate": 3.3953326807086247e-06, "loss": 0.0336, "step": 24450 }, { "epoch": 1.3925993292786905, "grad_norm": 40.474220275878906, "learning_rate": 3.3795433732276504e-06, "loss": 0.0696, "step": 24500 }, { "epoch": 1.395441368726198, "grad_norm": 0.0, "learning_rate": 3.3637540657466765e-06, "loss": 0.0793, "step": 24550 }, { "epoch": 1.3982834081737054, "grad_norm": 4.2510448110988364e-05, "learning_rate": 3.3479647582657026e-06, "loss": 0.0256, "step": 24600 }, { "epoch": 1.401125447621213, "grad_norm": 1.3378597497940063, "learning_rate": 3.3321754507847287e-06, "loss": 0.0258, "step": 24650 }, { "epoch": 1.4039674870687204, "grad_norm": 2.330773713765666e-05, "learning_rate": 3.316386143303755e-06, "loss": 0.0153, "step": 24700 }, { "epoch": 1.406809526516228, "grad_norm": 0.46982133388519287, "learning_rate": 3.3005968358227814e-06, "loss": 0.024, "step": 24750 }, { "epoch": 1.4096515659637356, "grad_norm": 10.896574974060059, "learning_rate": 3.2848075283418075e-06, "loss": 0.0144, "step": 24800 }, { "epoch": 1.4124936054112431, "grad_norm": 0.0, "learning_rate": 3.2690182208608332e-06, "loss": 0.035, "step": 24850 }, { "epoch": 1.4153356448587506, "grad_norm": 0.0003001675650011748, "learning_rate": 3.2532289133798593e-06, "loss": 0.1234, "step": 24900 }, { "epoch": 1.4181776843062581, "grad_norm": 31.084274291992188, "learning_rate": 3.2374396058988855e-06, "loss": 0.0675, "step": 24950 }, { "epoch": 1.4210197237537656, "grad_norm": 47.6000862121582, "learning_rate": 3.2216502984179116e-06, "loss": 0.2793, "step": 25000 }, { "epoch": 1.4238617632012733, "grad_norm": 6.876327037811279, "learning_rate": 3.2058609909369377e-06, "loss": 0.0427, "step": 25050 }, { "epoch": 1.4267038026487808, "grad_norm": 0.4660356640815735, "learning_rate": 3.190071683455964e-06, "loss": 0.0831, "step": 25100 }, { "epoch": 1.4295458420962883, "grad_norm": 0.00010924594971584156, "learning_rate": 3.1742823759749904e-06, "loss": 0.0358, "step": 25150 }, { "epoch": 1.4323878815437958, "grad_norm": 63.72904968261719, "learning_rate": 3.1584930684940165e-06, "loss": 0.0245, "step": 25200 }, { "epoch": 1.4352299209913033, "grad_norm": 0.10768713057041168, "learning_rate": 3.142703761013042e-06, "loss": 0.0913, "step": 25250 }, { "epoch": 1.438071960438811, "grad_norm": 0.008805965073406696, "learning_rate": 3.1269144535320683e-06, "loss": 0.0541, "step": 25300 }, { "epoch": 1.4409139998863183, "grad_norm": 12.953926086425781, "learning_rate": 3.1111251460510944e-06, "loss": 0.063, "step": 25350 }, { "epoch": 1.443756039333826, "grad_norm": 1.0326876020982922e-09, "learning_rate": 3.0953358385701205e-06, "loss": 0.0773, "step": 25400 }, { "epoch": 1.4465980787813335, "grad_norm": 0.2626782953739166, "learning_rate": 3.0795465310891467e-06, "loss": 0.0148, "step": 25450 }, { "epoch": 1.449440118228841, "grad_norm": 3.7921090126037598, "learning_rate": 3.063757223608173e-06, "loss": 0.0316, "step": 25500 }, { "epoch": 1.4522821576763485, "grad_norm": 0.00019790299120359123, "learning_rate": 3.0479679161271993e-06, "loss": 0.0278, "step": 25550 }, { "epoch": 1.455124197123856, "grad_norm": 0.013136902824044228, "learning_rate": 3.032178608646225e-06, "loss": 0.0257, "step": 25600 }, { "epoch": 1.4579662365713637, "grad_norm": 1.2140697240829468, "learning_rate": 3.016389301165251e-06, "loss": 0.0163, "step": 25650 }, { "epoch": 1.4608082760188712, "grad_norm": 3.1799275875091553, "learning_rate": 3.0005999936842773e-06, "loss": 0.0293, "step": 25700 }, { "epoch": 1.4636503154663787, "grad_norm": 7.747816562652588, "learning_rate": 2.9848106862033034e-06, "loss": 0.0483, "step": 25750 }, { "epoch": 1.4664923549138862, "grad_norm": 2.0789725851955154e-07, "learning_rate": 2.9690213787223295e-06, "loss": 0.0473, "step": 25800 }, { "epoch": 1.4693343943613937, "grad_norm": 0.23084664344787598, "learning_rate": 2.953232071241355e-06, "loss": 0.0239, "step": 25850 }, { "epoch": 1.4721764338089014, "grad_norm": 0.0003376183158252388, "learning_rate": 2.937442763760382e-06, "loss": 0.0176, "step": 25900 }, { "epoch": 1.4750184732564087, "grad_norm": 6.0435391787905246e-05, "learning_rate": 2.921653456279408e-06, "loss": 0.0492, "step": 25950 }, { "epoch": 1.4778605127039164, "grad_norm": 0.0005897375522181392, "learning_rate": 2.905864148798434e-06, "loss": 0.0205, "step": 26000 }, { "epoch": 1.4807025521514239, "grad_norm": 9.022598266601562, "learning_rate": 2.89007484131746e-06, "loss": 0.0061, "step": 26050 }, { "epoch": 1.4835445915989314, "grad_norm": 15.136653900146484, "learning_rate": 2.874285533836486e-06, "loss": 0.0072, "step": 26100 }, { "epoch": 1.4863866310464389, "grad_norm": 0.25347283482551575, "learning_rate": 2.8584962263555123e-06, "loss": 0.0679, "step": 26150 }, { "epoch": 1.4892286704939464, "grad_norm": 1.6028684377670288, "learning_rate": 2.842706918874538e-06, "loss": 0.0308, "step": 26200 }, { "epoch": 1.492070709941454, "grad_norm": 0.049529995769262314, "learning_rate": 2.826917611393564e-06, "loss": 0.137, "step": 26250 }, { "epoch": 1.4949127493889616, "grad_norm": 5.941664695739746, "learning_rate": 2.8111283039125907e-06, "loss": 0.0137, "step": 26300 }, { "epoch": 1.497754788836469, "grad_norm": 0.010181618854403496, "learning_rate": 2.795338996431617e-06, "loss": 0.0175, "step": 26350 }, { "epoch": 1.5005968282839766, "grad_norm": 9.913866233546287e-05, "learning_rate": 2.779549688950643e-06, "loss": 0.0906, "step": 26400 }, { "epoch": 1.503438867731484, "grad_norm": 1.6712743189373214e-08, "learning_rate": 2.763760381469669e-06, "loss": 0.029, "step": 26450 }, { "epoch": 1.5062809071789918, "grad_norm": 7.696108514210209e-05, "learning_rate": 2.747971073988695e-06, "loss": 0.0605, "step": 26500 }, { "epoch": 1.509122946626499, "grad_norm": 85.16954040527344, "learning_rate": 2.732181766507721e-06, "loss": 0.0232, "step": 26550 }, { "epoch": 1.5119649860740068, "grad_norm": 0.16279102861881256, "learning_rate": 2.716392459026747e-06, "loss": 0.0315, "step": 26600 }, { "epoch": 1.5148070255215142, "grad_norm": 0.0, "learning_rate": 2.700918937695393e-06, "loss": 0.1029, "step": 26650 }, { "epoch": 1.5176490649690217, "grad_norm": 0.00065478595206514, "learning_rate": 2.6851296302144192e-06, "loss": 0.0315, "step": 26700 }, { "epoch": 1.5204911044165295, "grad_norm": 0.6491963863372803, "learning_rate": 2.6693403227334454e-06, "loss": 0.0609, "step": 26750 }, { "epoch": 1.5233331438640367, "grad_norm": 0.001989447046071291, "learning_rate": 2.6535510152524715e-06, "loss": 0.0594, "step": 26800 }, { "epoch": 1.5261751833115444, "grad_norm": 1.6186556816101074, "learning_rate": 2.637761707771497e-06, "loss": 0.0182, "step": 26850 }, { "epoch": 1.529017222759052, "grad_norm": 0.1283130794763565, "learning_rate": 2.6219724002905233e-06, "loss": 0.0569, "step": 26900 }, { "epoch": 1.5318592622065594, "grad_norm": 105.00154876708984, "learning_rate": 2.6061830928095494e-06, "loss": 0.0629, "step": 26950 }, { "epoch": 1.534701301654067, "grad_norm": Infinity, "learning_rate": 2.5903937853285755e-06, "loss": 0.0781, "step": 27000 }, { "epoch": 1.5375433411015744, "grad_norm": 5.722159102106161e-08, "learning_rate": 2.5749202639972217e-06, "loss": 0.1001, "step": 27050 }, { "epoch": 1.5403853805490821, "grad_norm": 0.35033631324768066, "learning_rate": 2.5591309565162474e-06, "loss": 0.0788, "step": 27100 }, { "epoch": 1.5432274199965894, "grad_norm": 6.983494677115232e-05, "learning_rate": 2.5433416490352735e-06, "loss": 0.0205, "step": 27150 }, { "epoch": 1.5460694594440971, "grad_norm": 0.059845082461833954, "learning_rate": 2.5275523415542996e-06, "loss": 0.0428, "step": 27200 }, { "epoch": 1.5489114988916046, "grad_norm": 0.0, "learning_rate": 2.5117630340733257e-06, "loss": 0.0226, "step": 27250 }, { "epoch": 1.5517535383391121, "grad_norm": 1.1369974828312479e-07, "learning_rate": 2.495973726592352e-06, "loss": 0.0412, "step": 27300 }, { "epoch": 1.5545955777866198, "grad_norm": 3.52167672801329e-09, "learning_rate": 2.480184419111378e-06, "loss": 0.063, "step": 27350 }, { "epoch": 1.557437617234127, "grad_norm": 0.2713753283023834, "learning_rate": 2.464395111630404e-06, "loss": 0.0641, "step": 27400 }, { "epoch": 1.5602796566816348, "grad_norm": 0.0, "learning_rate": 2.44860580414943e-06, "loss": 0.0709, "step": 27450 }, { "epoch": 1.5631216961291423, "grad_norm": 8.362411563211936e-08, "learning_rate": 2.4328164966684563e-06, "loss": 0.0802, "step": 27500 }, { "epoch": 1.5659637355766498, "grad_norm": 3.1302518355147413e-09, "learning_rate": 2.4170271891874824e-06, "loss": 0.0137, "step": 27550 }, { "epoch": 1.5688057750241573, "grad_norm": 4.806706419913098e-05, "learning_rate": 2.4012378817065086e-06, "loss": 0.1185, "step": 27600 }, { "epoch": 1.5716478144716648, "grad_norm": 0.00012610612611752003, "learning_rate": 2.3854485742255347e-06, "loss": 0.0443, "step": 27650 }, { "epoch": 1.5744898539191725, "grad_norm": 0.0010404565837234259, "learning_rate": 2.369659266744561e-06, "loss": 0.0207, "step": 27700 }, { "epoch": 1.5773318933666798, "grad_norm": 0.0, "learning_rate": 2.353869959263587e-06, "loss": 0.0393, "step": 27750 }, { "epoch": 1.5801739328141875, "grad_norm": 1.3449719517666381e-05, "learning_rate": 2.338080651782613e-06, "loss": 0.0149, "step": 27800 }, { "epoch": 1.583015972261695, "grad_norm": 1.4916712045669556, "learning_rate": 2.322291344301639e-06, "loss": 0.052, "step": 27850 }, { "epoch": 1.5858580117092025, "grad_norm": 0.0, "learning_rate": 2.3065020368206653e-06, "loss": 0.0102, "step": 27900 }, { "epoch": 1.5887000511567102, "grad_norm": 0.0003791541967075318, "learning_rate": 2.2907127293396914e-06, "loss": 0.0403, "step": 27950 }, { "epoch": 1.5915420906042175, "grad_norm": 9.415854454040527, "learning_rate": 2.2749234218587175e-06, "loss": 0.0706, "step": 28000 }, { "epoch": 1.5943841300517252, "grad_norm": 1.28175975078193e-06, "learning_rate": 2.2591341143777436e-06, "loss": 0.0578, "step": 28050 }, { "epoch": 1.5972261694992327, "grad_norm": 37.6640739440918, "learning_rate": 2.2433448068967698e-06, "loss": 0.0572, "step": 28100 }, { "epoch": 1.6000682089467402, "grad_norm": 1.626979947090149, "learning_rate": 2.227555499415796e-06, "loss": 0.0493, "step": 28150 }, { "epoch": 1.6029102483942477, "grad_norm": 13.51364803314209, "learning_rate": 2.211766191934822e-06, "loss": 0.027, "step": 28200 }, { "epoch": 1.6057522878417552, "grad_norm": 12.810977935791016, "learning_rate": 2.195976884453848e-06, "loss": 0.0872, "step": 28250 }, { "epoch": 1.6085943272892629, "grad_norm": 0.00011065157741541043, "learning_rate": 2.1801875769728742e-06, "loss": 0.0116, "step": 28300 }, { "epoch": 1.6114363667367702, "grad_norm": 1.6890456890905625e-06, "learning_rate": 2.1643982694919004e-06, "loss": 0.0039, "step": 28350 }, { "epoch": 1.6142784061842779, "grad_norm": 0.014668595045804977, "learning_rate": 2.1486089620109265e-06, "loss": 0.0415, "step": 28400 }, { "epoch": 1.6171204456317854, "grad_norm": 0.0, "learning_rate": 2.1328196545299526e-06, "loss": 0.1203, "step": 28450 }, { "epoch": 1.6199624850792929, "grad_norm": 0.022854184731841087, "learning_rate": 2.1170303470489787e-06, "loss": 0.0953, "step": 28500 }, { "epoch": 1.6228045245268006, "grad_norm": 0.08891911804676056, "learning_rate": 2.101241039568005e-06, "loss": 0.0383, "step": 28550 }, { "epoch": 1.6256465639743078, "grad_norm": 0.0, "learning_rate": 2.0854517320870305e-06, "loss": 0.0203, "step": 28600 }, { "epoch": 1.6284886034218156, "grad_norm": 0.0, "learning_rate": 2.069662424606057e-06, "loss": 0.0135, "step": 28650 }, { "epoch": 1.631330642869323, "grad_norm": 0.15456999838352203, "learning_rate": 2.053873117125083e-06, "loss": 0.0332, "step": 28700 }, { "epoch": 1.6341726823168305, "grad_norm": 10.664115905761719, "learning_rate": 2.0380838096441093e-06, "loss": 0.13, "step": 28750 }, { "epoch": 1.637014721764338, "grad_norm": 0.0007495724712498486, "learning_rate": 2.022294502163135e-06, "loss": 0.0863, "step": 28800 }, { "epoch": 1.6398567612118455, "grad_norm": 3.1083343029022217, "learning_rate": 2.0065051946821616e-06, "loss": 0.0204, "step": 28850 }, { "epoch": 1.6426988006593533, "grad_norm": 0.0011413018219172955, "learning_rate": 1.9907158872011877e-06, "loss": 0.0497, "step": 28900 }, { "epoch": 1.6455408401068605, "grad_norm": 0.5448282361030579, "learning_rate": 1.9749265797202134e-06, "loss": 0.0141, "step": 28950 }, { "epoch": 1.6483828795543682, "grad_norm": 2.421827957732603e-05, "learning_rate": 1.95913727223924e-06, "loss": 0.0758, "step": 29000 }, { "epoch": 1.6512249190018757, "grad_norm": 2.173584789488814e-06, "learning_rate": 1.943347964758266e-06, "loss": 0.0717, "step": 29050 }, { "epoch": 1.6540669584493832, "grad_norm": 0.00021623531938530505, "learning_rate": 1.927558657277292e-06, "loss": 0.0236, "step": 29100 }, { "epoch": 1.656908997896891, "grad_norm": 0.003926194738596678, "learning_rate": 1.911769349796318e-06, "loss": 0.0391, "step": 29150 }, { "epoch": 1.6597510373443982, "grad_norm": 0.0, "learning_rate": 1.8959800423153444e-06, "loss": 0.0407, "step": 29200 }, { "epoch": 1.662593076791906, "grad_norm": 1.2383061402942985e-05, "learning_rate": 1.8801907348343703e-06, "loss": 0.0302, "step": 29250 }, { "epoch": 1.6654351162394134, "grad_norm": 5.248524392875709e-10, "learning_rate": 1.8644014273533964e-06, "loss": 0.0136, "step": 29300 }, { "epoch": 1.668277155686921, "grad_norm": 18.064258575439453, "learning_rate": 1.8486121198724225e-06, "loss": 0.0027, "step": 29350 }, { "epoch": 1.6711191951344284, "grad_norm": 2.318140745162964, "learning_rate": 1.8331385985410682e-06, "loss": 0.0253, "step": 29400 }, { "epoch": 1.673961234581936, "grad_norm": 0.006675691809505224, "learning_rate": 1.8173492910600944e-06, "loss": 0.0188, "step": 29450 }, { "epoch": 1.6768032740294436, "grad_norm": 0.0, "learning_rate": 1.8015599835791203e-06, "loss": 0.0972, "step": 29500 }, { "epoch": 1.679645313476951, "grad_norm": 0.00033315145992673934, "learning_rate": 1.7857706760981464e-06, "loss": 0.0953, "step": 29550 }, { "epoch": 1.6824873529244586, "grad_norm": 32.545352935791016, "learning_rate": 1.7699813686171727e-06, "loss": 0.0512, "step": 29600 }, { "epoch": 1.685329392371966, "grad_norm": 0.003776384051889181, "learning_rate": 1.7541920611361986e-06, "loss": 0.0956, "step": 29650 }, { "epoch": 1.6881714318194736, "grad_norm": 7.56796362111345e-05, "learning_rate": 1.7384027536552248e-06, "loss": 0.0529, "step": 29700 }, { "epoch": 1.6910134712669813, "grad_norm": 0.0019209177698940039, "learning_rate": 1.7226134461742509e-06, "loss": 0.0833, "step": 29750 }, { "epoch": 1.6938555107144886, "grad_norm": 29.27110481262207, "learning_rate": 1.7068241386932772e-06, "loss": 0.0195, "step": 29800 }, { "epoch": 1.6966975501619963, "grad_norm": 14.960783004760742, "learning_rate": 1.6910348312123031e-06, "loss": 0.0339, "step": 29850 }, { "epoch": 1.6995395896095038, "grad_norm": 0.0008143290760926902, "learning_rate": 1.6752455237313292e-06, "loss": 0.0417, "step": 29900 }, { "epoch": 1.7023816290570113, "grad_norm": 0.03020702302455902, "learning_rate": 1.6594562162503556e-06, "loss": 0.0453, "step": 29950 }, { "epoch": 1.705223668504519, "grad_norm": 0.005310211796313524, "learning_rate": 1.6436669087693815e-06, "loss": 0.0126, "step": 30000 } ], "logging_steps": 50, "max_steps": 35186, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 6000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }