{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 12620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01584786053882726, "grad_norm": 6.662916660308838, "learning_rate": 1.584786053882726e-06, "loss": 1.0759, "step": 10 }, { "epoch": 0.03169572107765452, "grad_norm": 5.666072845458984, "learning_rate": 3.169572107765452e-06, "loss": 1.0851, "step": 20 }, { "epoch": 0.04754358161648178, "grad_norm": 5.542662620544434, "learning_rate": 4.754358161648178e-06, "loss": 1.0049, "step": 30 }, { "epoch": 0.06339144215530904, "grad_norm": 3.666578769683838, "learning_rate": 6.339144215530904e-06, "loss": 0.7551, "step": 40 }, { "epoch": 0.07923930269413629, "grad_norm": 1.8371094465255737, "learning_rate": 7.923930269413629e-06, "loss": 0.5051, "step": 50 }, { "epoch": 0.09508716323296355, "grad_norm": 1.8207030296325684, "learning_rate": 9.508716323296357e-06, "loss": 0.3629, "step": 60 }, { "epoch": 0.1109350237717908, "grad_norm": 1.8528467416763306, "learning_rate": 1.109350237717908e-05, "loss": 0.2969, "step": 70 }, { "epoch": 0.12678288431061807, "grad_norm": 1.4607429504394531, "learning_rate": 1.2678288431061808e-05, "loss": 0.2692, "step": 80 }, { "epoch": 0.14263074484944532, "grad_norm": 1.288856029510498, "learning_rate": 1.4263074484944532e-05, "loss": 0.2774, "step": 90 }, { "epoch": 0.15847860538827258, "grad_norm": 1.119730830192566, "learning_rate": 1.5847860538827258e-05, "loss": 0.2193, "step": 100 }, { "epoch": 0.17432646592709986, "grad_norm": 0.9573730826377869, "learning_rate": 1.7432646592709986e-05, "loss": 0.2068, "step": 110 }, { "epoch": 0.1901743264659271, "grad_norm": 0.9363383054733276, "learning_rate": 1.9017432646592713e-05, "loss": 0.2061, "step": 120 }, { "epoch": 0.20602218700475436, "grad_norm": 0.9630652070045471, "learning_rate": 2.0602218700475437e-05, "loss": 0.1937, "step": 130 }, { "epoch": 0.2218700475435816, "grad_norm": 1.1261632442474365, "learning_rate": 2.218700475435816e-05, "loss": 0.1769, "step": 140 }, { "epoch": 0.23771790808240886, "grad_norm": 0.859064519405365, "learning_rate": 2.377179080824089e-05, "loss": 0.1788, "step": 150 }, { "epoch": 0.25356576862123614, "grad_norm": 0.8056842684745789, "learning_rate": 2.5356576862123617e-05, "loss": 0.1486, "step": 160 }, { "epoch": 0.2694136291600634, "grad_norm": 0.889215886592865, "learning_rate": 2.694136291600634e-05, "loss": 0.1597, "step": 170 }, { "epoch": 0.28526148969889065, "grad_norm": 0.90467369556427, "learning_rate": 2.8526148969889065e-05, "loss": 0.1571, "step": 180 }, { "epoch": 0.3011093502377179, "grad_norm": 0.9579144716262817, "learning_rate": 3.0110935023771792e-05, "loss": 0.1378, "step": 190 }, { "epoch": 0.31695721077654515, "grad_norm": 1.2036852836608887, "learning_rate": 3.1695721077654516e-05, "loss": 0.1315, "step": 200 }, { "epoch": 0.3328050713153724, "grad_norm": 0.9379674196243286, "learning_rate": 3.3280507131537244e-05, "loss": 0.128, "step": 210 }, { "epoch": 0.3486529318541997, "grad_norm": 0.9713570475578308, "learning_rate": 3.486529318541997e-05, "loss": 0.124, "step": 220 }, { "epoch": 0.36450079239302696, "grad_norm": 0.9157624244689941, "learning_rate": 3.64500792393027e-05, "loss": 0.1264, "step": 230 }, { "epoch": 0.3803486529318542, "grad_norm": 0.5934288501739502, "learning_rate": 3.8034865293185427e-05, "loss": 0.1121, "step": 240 }, { "epoch": 0.39619651347068147, "grad_norm": 0.865558922290802, "learning_rate": 3.961965134706815e-05, "loss": 0.1222, "step": 250 }, { "epoch": 0.4120443740095087, "grad_norm": 0.8881043195724487, "learning_rate": 4.1204437400950875e-05, "loss": 0.1067, "step": 260 }, { "epoch": 0.42789223454833597, "grad_norm": 0.7673732042312622, "learning_rate": 4.27892234548336e-05, "loss": 0.1088, "step": 270 }, { "epoch": 0.4437400950871632, "grad_norm": 0.5540139079093933, "learning_rate": 4.437400950871632e-05, "loss": 0.094, "step": 280 }, { "epoch": 0.4595879556259905, "grad_norm": 0.8689896464347839, "learning_rate": 4.595879556259905e-05, "loss": 0.0978, "step": 290 }, { "epoch": 0.4754358161648177, "grad_norm": 0.6194612979888916, "learning_rate": 4.754358161648178e-05, "loss": 0.0791, "step": 300 }, { "epoch": 0.49128367670364503, "grad_norm": 0.698275089263916, "learning_rate": 4.9128367670364506e-05, "loss": 0.0883, "step": 310 }, { "epoch": 0.5071315372424723, "grad_norm": 0.7928034663200378, "learning_rate": 5.071315372424723e-05, "loss": 0.0994, "step": 320 }, { "epoch": 0.5229793977812995, "grad_norm": 0.5640034675598145, "learning_rate": 5.2297939778129954e-05, "loss": 0.0832, "step": 330 }, { "epoch": 0.5388272583201268, "grad_norm": 0.8224833011627197, "learning_rate": 5.388272583201268e-05, "loss": 0.0926, "step": 340 }, { "epoch": 0.554675118858954, "grad_norm": 0.9649167656898499, "learning_rate": 5.546751188589541e-05, "loss": 0.0896, "step": 350 }, { "epoch": 0.5705229793977813, "grad_norm": 0.7821329832077026, "learning_rate": 5.705229793977813e-05, "loss": 0.1003, "step": 360 }, { "epoch": 0.5863708399366085, "grad_norm": 0.5526576638221741, "learning_rate": 5.863708399366086e-05, "loss": 0.0826, "step": 370 }, { "epoch": 0.6022187004754358, "grad_norm": 0.6012312769889832, "learning_rate": 6.0221870047543585e-05, "loss": 0.0808, "step": 380 }, { "epoch": 0.618066561014263, "grad_norm": 0.6598588824272156, "learning_rate": 6.18066561014263e-05, "loss": 0.0781, "step": 390 }, { "epoch": 0.6339144215530903, "grad_norm": 0.6164723634719849, "learning_rate": 6.339144215530903e-05, "loss": 0.0725, "step": 400 }, { "epoch": 0.6497622820919176, "grad_norm": 0.6792150139808655, "learning_rate": 6.497622820919176e-05, "loss": 0.0694, "step": 410 }, { "epoch": 0.6656101426307448, "grad_norm": 0.5863011479377747, "learning_rate": 6.656101426307449e-05, "loss": 0.0771, "step": 420 }, { "epoch": 0.6814580031695721, "grad_norm": 0.6146591305732727, "learning_rate": 6.814580031695722e-05, "loss": 0.0768, "step": 430 }, { "epoch": 0.6973058637083994, "grad_norm": 0.4906776547431946, "learning_rate": 6.973058637083994e-05, "loss": 0.073, "step": 440 }, { "epoch": 0.7131537242472267, "grad_norm": 0.7824636101722717, "learning_rate": 7.131537242472267e-05, "loss": 0.074, "step": 450 }, { "epoch": 0.7290015847860539, "grad_norm": 0.5947498679161072, "learning_rate": 7.29001584786054e-05, "loss": 0.0786, "step": 460 }, { "epoch": 0.7448494453248812, "grad_norm": 0.49313023686408997, "learning_rate": 7.448494453248813e-05, "loss": 0.0694, "step": 470 }, { "epoch": 0.7606973058637084, "grad_norm": 0.36435437202453613, "learning_rate": 7.606973058637085e-05, "loss": 0.067, "step": 480 }, { "epoch": 0.7765451664025357, "grad_norm": 0.6767722368240356, "learning_rate": 7.765451664025357e-05, "loss": 0.0747, "step": 490 }, { "epoch": 0.7923930269413629, "grad_norm": 0.5377907752990723, "learning_rate": 7.92393026941363e-05, "loss": 0.0709, "step": 500 }, { "epoch": 0.8082408874801902, "grad_norm": 0.740249752998352, "learning_rate": 8.082408874801902e-05, "loss": 0.0762, "step": 510 }, { "epoch": 0.8240887480190174, "grad_norm": 0.5422998666763306, "learning_rate": 8.240887480190175e-05, "loss": 0.0636, "step": 520 }, { "epoch": 0.8399366085578447, "grad_norm": 0.7832110524177551, "learning_rate": 8.399366085578448e-05, "loss": 0.0673, "step": 530 }, { "epoch": 0.8557844690966719, "grad_norm": 0.8280210494995117, "learning_rate": 8.55784469096672e-05, "loss": 0.0729, "step": 540 }, { "epoch": 0.8716323296354992, "grad_norm": 0.4729553461074829, "learning_rate": 8.716323296354992e-05, "loss": 0.0802, "step": 550 }, { "epoch": 0.8874801901743264, "grad_norm": 0.49598968029022217, "learning_rate": 8.874801901743265e-05, "loss": 0.0707, "step": 560 }, { "epoch": 0.9033280507131537, "grad_norm": 0.5164231657981873, "learning_rate": 9.033280507131537e-05, "loss": 0.0723, "step": 570 }, { "epoch": 0.919175911251981, "grad_norm": 0.8574791550636292, "learning_rate": 9.19175911251981e-05, "loss": 0.0691, "step": 580 }, { "epoch": 0.9350237717908082, "grad_norm": 0.6576387286186218, "learning_rate": 9.350237717908083e-05, "loss": 0.0652, "step": 590 }, { "epoch": 0.9508716323296355, "grad_norm": 0.5376480221748352, "learning_rate": 9.508716323296356e-05, "loss": 0.0665, "step": 600 }, { "epoch": 0.9667194928684627, "grad_norm": 0.3735605776309967, "learning_rate": 9.667194928684627e-05, "loss": 0.0663, "step": 610 }, { "epoch": 0.9825673534072901, "grad_norm": 0.4635787606239319, "learning_rate": 9.825673534072901e-05, "loss": 0.0641, "step": 620 }, { "epoch": 0.9984152139461173, "grad_norm": 0.7257930040359497, "learning_rate": 9.984152139461174e-05, "loss": 0.0689, "step": 630 }, { "epoch": 1.0142630744849446, "grad_norm": 0.4881118834018707, "learning_rate": 9.999986095395153e-05, "loss": 0.0612, "step": 640 }, { "epoch": 1.0301109350237718, "grad_norm": 0.4648587107658386, "learning_rate": 9.99993803019373e-05, "loss": 0.0638, "step": 650 }, { "epoch": 1.045958795562599, "grad_norm": 0.47810834646224976, "learning_rate": 9.999855633063904e-05, "loss": 0.0594, "step": 660 }, { "epoch": 1.0618066561014263, "grad_norm": 0.5248630046844482, "learning_rate": 9.999738904571453e-05, "loss": 0.0613, "step": 670 }, { "epoch": 1.0776545166402536, "grad_norm": 0.5785868167877197, "learning_rate": 9.999587845517889e-05, "loss": 0.0658, "step": 680 }, { "epoch": 1.0935023771790808, "grad_norm": 0.5674669742584229, "learning_rate": 9.999402456940454e-05, "loss": 0.0584, "step": 690 }, { "epoch": 1.109350237717908, "grad_norm": 0.5009925961494446, "learning_rate": 9.999182740112115e-05, "loss": 0.061, "step": 700 }, { "epoch": 1.1251980982567353, "grad_norm": 0.4827341139316559, "learning_rate": 9.99892869654155e-05, "loss": 0.0668, "step": 710 }, { "epoch": 1.1410459587955626, "grad_norm": 0.5092630982398987, "learning_rate": 9.998640327973141e-05, "loss": 0.0644, "step": 720 }, { "epoch": 1.1568938193343898, "grad_norm": 0.5155372619628906, "learning_rate": 9.998317636386964e-05, "loss": 0.0629, "step": 730 }, { "epoch": 1.172741679873217, "grad_norm": 0.49632275104522705, "learning_rate": 9.997960623998772e-05, "loss": 0.0617, "step": 740 }, { "epoch": 1.1885895404120443, "grad_norm": 0.44003790616989136, "learning_rate": 9.997569293259977e-05, "loss": 0.0492, "step": 750 }, { "epoch": 1.2044374009508716, "grad_norm": 0.5962253212928772, "learning_rate": 9.997143646857643e-05, "loss": 0.0606, "step": 760 }, { "epoch": 1.2202852614896988, "grad_norm": 0.5411946177482605, "learning_rate": 9.996683687714457e-05, "loss": 0.0613, "step": 770 }, { "epoch": 1.236133122028526, "grad_norm": 0.38321027159690857, "learning_rate": 9.996189418988715e-05, "loss": 0.0524, "step": 780 }, { "epoch": 1.2519809825673534, "grad_norm": 0.41845589876174927, "learning_rate": 9.9956608440743e-05, "loss": 0.0703, "step": 790 }, { "epoch": 1.2678288431061806, "grad_norm": 0.5127869844436646, "learning_rate": 9.995097966600655e-05, "loss": 0.0644, "step": 800 }, { "epoch": 1.2836767036450079, "grad_norm": 0.46290555596351624, "learning_rate": 9.994500790432762e-05, "loss": 0.0645, "step": 810 }, { "epoch": 1.299524564183835, "grad_norm": 0.4053901731967926, "learning_rate": 9.993869319671114e-05, "loss": 0.0595, "step": 820 }, { "epoch": 1.3153724247226624, "grad_norm": 0.5436721444129944, "learning_rate": 9.993203558651687e-05, "loss": 0.0549, "step": 830 }, { "epoch": 1.3312202852614896, "grad_norm": 0.47135990858078003, "learning_rate": 9.992503511945907e-05, "loss": 0.0716, "step": 840 }, { "epoch": 1.3470681458003169, "grad_norm": 0.5177005529403687, "learning_rate": 9.991769184360629e-05, "loss": 0.0616, "step": 850 }, { "epoch": 1.3629160063391441, "grad_norm": 0.49199798703193665, "learning_rate": 9.991000580938087e-05, "loss": 0.0559, "step": 860 }, { "epoch": 1.3787638668779714, "grad_norm": 0.4700946509838104, "learning_rate": 9.990197706955877e-05, "loss": 0.0534, "step": 870 }, { "epoch": 1.3946117274167986, "grad_norm": 0.49670886993408203, "learning_rate": 9.98936056792691e-05, "loss": 0.0494, "step": 880 }, { "epoch": 1.4104595879556259, "grad_norm": 0.42340198159217834, "learning_rate": 9.988489169599373e-05, "loss": 0.0577, "step": 890 }, { "epoch": 1.4263074484944531, "grad_norm": 0.395857036113739, "learning_rate": 9.987583517956703e-05, "loss": 0.0465, "step": 900 }, { "epoch": 1.4421553090332804, "grad_norm": 0.4153169095516205, "learning_rate": 9.986643619217524e-05, "loss": 0.0497, "step": 910 }, { "epoch": 1.4580031695721076, "grad_norm": 0.36922216415405273, "learning_rate": 9.985669479835629e-05, "loss": 0.055, "step": 920 }, { "epoch": 1.473851030110935, "grad_norm": 0.43492555618286133, "learning_rate": 9.984661106499913e-05, "loss": 0.0459, "step": 930 }, { "epoch": 1.4896988906497624, "grad_norm": 0.4480608403682709, "learning_rate": 9.983618506134344e-05, "loss": 0.0567, "step": 940 }, { "epoch": 1.5055467511885894, "grad_norm": 0.4628249704837799, "learning_rate": 9.98254168589791e-05, "loss": 0.0595, "step": 950 }, { "epoch": 1.5213946117274166, "grad_norm": 0.2762962877750397, "learning_rate": 9.98143065318456e-05, "loss": 0.055, "step": 960 }, { "epoch": 1.537242472266244, "grad_norm": 0.4730890095233917, "learning_rate": 9.980285415623172e-05, "loss": 0.0571, "step": 970 }, { "epoch": 1.5530903328050714, "grad_norm": 0.5532105565071106, "learning_rate": 9.979105981077483e-05, "loss": 0.05, "step": 980 }, { "epoch": 1.5689381933438986, "grad_norm": 0.3818027675151825, "learning_rate": 9.977892357646049e-05, "loss": 0.0564, "step": 990 }, { "epoch": 1.5847860538827259, "grad_norm": 0.4600647985935211, "learning_rate": 9.976644553662178e-05, "loss": 0.0517, "step": 1000 }, { "epoch": 1.6006339144215531, "grad_norm": 0.39071592688560486, "learning_rate": 9.975362577693879e-05, "loss": 0.0583, "step": 1010 }, { "epoch": 1.6164817749603804, "grad_norm": 0.5415641069412231, "learning_rate": 9.974046438543805e-05, "loss": 0.0549, "step": 1020 }, { "epoch": 1.6323296354992076, "grad_norm": 0.4827348291873932, "learning_rate": 9.972696145249185e-05, "loss": 0.0537, "step": 1030 }, { "epoch": 1.6481774960380349, "grad_norm": 0.4411364495754242, "learning_rate": 9.971311707081769e-05, "loss": 0.0474, "step": 1040 }, { "epoch": 1.6640253565768621, "grad_norm": 0.46820345520973206, "learning_rate": 9.96989313354776e-05, "loss": 0.0491, "step": 1050 }, { "epoch": 1.6798732171156894, "grad_norm": 0.38286513090133667, "learning_rate": 9.968440434387756e-05, "loss": 0.0536, "step": 1060 }, { "epoch": 1.6957210776545166, "grad_norm": 0.45724204182624817, "learning_rate": 9.966953619576667e-05, "loss": 0.0543, "step": 1070 }, { "epoch": 1.7115689381933439, "grad_norm": 0.4794733226299286, "learning_rate": 9.965432699323669e-05, "loss": 0.0456, "step": 1080 }, { "epoch": 1.7274167987321711, "grad_norm": 0.4839860498905182, "learning_rate": 9.963877684072113e-05, "loss": 0.0558, "step": 1090 }, { "epoch": 1.7432646592709984, "grad_norm": 0.4146821200847626, "learning_rate": 9.962288584499466e-05, "loss": 0.0492, "step": 1100 }, { "epoch": 1.7591125198098256, "grad_norm": 0.4119221568107605, "learning_rate": 9.960665411517235e-05, "loss": 0.045, "step": 1110 }, { "epoch": 1.7749603803486529, "grad_norm": 0.5363435745239258, "learning_rate": 9.959008176270892e-05, "loss": 0.0508, "step": 1120 }, { "epoch": 1.7908082408874801, "grad_norm": 1.4820852279663086, "learning_rate": 9.957316890139792e-05, "loss": 0.0546, "step": 1130 }, { "epoch": 1.8066561014263076, "grad_norm": 1.0511815547943115, "learning_rate": 9.955591564737099e-05, "loss": 0.0624, "step": 1140 }, { "epoch": 1.8225039619651349, "grad_norm": 1.2796475887298584, "learning_rate": 9.953832211909715e-05, "loss": 0.0498, "step": 1150 }, { "epoch": 1.8383518225039621, "grad_norm": 0.44021984934806824, "learning_rate": 9.952038843738181e-05, "loss": 0.0545, "step": 1160 }, { "epoch": 1.8541996830427894, "grad_norm": 0.3289697468280792, "learning_rate": 9.950211472536609e-05, "loss": 0.0525, "step": 1170 }, { "epoch": 1.8700475435816166, "grad_norm": 0.33451518416404724, "learning_rate": 9.948350110852587e-05, "loss": 0.0532, "step": 1180 }, { "epoch": 1.8858954041204439, "grad_norm": 0.28288567066192627, "learning_rate": 9.946454771467104e-05, "loss": 0.0554, "step": 1190 }, { "epoch": 1.9017432646592711, "grad_norm": 0.43782156705856323, "learning_rate": 9.944525467394452e-05, "loss": 0.0464, "step": 1200 }, { "epoch": 1.9175911251980984, "grad_norm": 0.43314260244369507, "learning_rate": 9.94256221188214e-05, "loss": 0.0457, "step": 1210 }, { "epoch": 1.9334389857369256, "grad_norm": 0.3398579955101013, "learning_rate": 9.940565018410805e-05, "loss": 0.0479, "step": 1220 }, { "epoch": 1.9492868462757529, "grad_norm": 0.4561791718006134, "learning_rate": 9.938533900694118e-05, "loss": 0.0524, "step": 1230 }, { "epoch": 1.9651347068145801, "grad_norm": 0.4242919981479645, "learning_rate": 9.93646887267869e-05, "loss": 0.0472, "step": 1240 }, { "epoch": 1.9809825673534074, "grad_norm": 0.4337559938430786, "learning_rate": 9.934369948543972e-05, "loss": 0.0483, "step": 1250 }, { "epoch": 1.9968304278922346, "grad_norm": 0.3361351490020752, "learning_rate": 9.93223714270217e-05, "loss": 0.0584, "step": 1260 }, { "epoch": 2.012678288431062, "grad_norm": 0.38473933935165405, "learning_rate": 9.93007046979813e-05, "loss": 0.0464, "step": 1270 }, { "epoch": 2.028526148969889, "grad_norm": 0.4139537811279297, "learning_rate": 9.92786994470925e-05, "loss": 0.0482, "step": 1280 }, { "epoch": 2.0443740095087164, "grad_norm": 0.4201923906803131, "learning_rate": 9.92563558254537e-05, "loss": 0.0385, "step": 1290 }, { "epoch": 2.0602218700475436, "grad_norm": 0.36632242798805237, "learning_rate": 9.923367398648671e-05, "loss": 0.0482, "step": 1300 }, { "epoch": 2.076069730586371, "grad_norm": 0.5546051859855652, "learning_rate": 9.921065408593574e-05, "loss": 0.0522, "step": 1310 }, { "epoch": 2.091917591125198, "grad_norm": 0.34690526127815247, "learning_rate": 9.918729628186628e-05, "loss": 0.0493, "step": 1320 }, { "epoch": 2.1077654516640254, "grad_norm": 0.32220637798309326, "learning_rate": 9.916360073466397e-05, "loss": 0.0445, "step": 1330 }, { "epoch": 2.1236133122028527, "grad_norm": 0.3459847569465637, "learning_rate": 9.913956760703363e-05, "loss": 0.0528, "step": 1340 }, { "epoch": 2.13946117274168, "grad_norm": 0.5802285671234131, "learning_rate": 9.911519706399798e-05, "loss": 0.0517, "step": 1350 }, { "epoch": 2.155309033280507, "grad_norm": 0.49526548385620117, "learning_rate": 9.909048927289668e-05, "loss": 0.0521, "step": 1360 }, { "epoch": 2.1711568938193344, "grad_norm": 0.48161160945892334, "learning_rate": 9.906544440338504e-05, "loss": 0.0486, "step": 1370 }, { "epoch": 2.1870047543581617, "grad_norm": 0.45289692282676697, "learning_rate": 9.904006262743293e-05, "loss": 0.0541, "step": 1380 }, { "epoch": 2.202852614896989, "grad_norm": 0.2760493755340576, "learning_rate": 9.901434411932358e-05, "loss": 0.0488, "step": 1390 }, { "epoch": 2.218700475435816, "grad_norm": 0.3549060523509979, "learning_rate": 9.898828905565236e-05, "loss": 0.0428, "step": 1400 }, { "epoch": 2.2345483359746434, "grad_norm": 0.39921554923057556, "learning_rate": 9.896189761532563e-05, "loss": 0.046, "step": 1410 }, { "epoch": 2.2503961965134707, "grad_norm": 0.31093716621398926, "learning_rate": 9.89351699795594e-05, "loss": 0.0507, "step": 1420 }, { "epoch": 2.266244057052298, "grad_norm": 0.48712223768234253, "learning_rate": 9.890810633187825e-05, "loss": 0.0537, "step": 1430 }, { "epoch": 2.282091917591125, "grad_norm": 0.2943997085094452, "learning_rate": 9.888070685811389e-05, "loss": 0.0434, "step": 1440 }, { "epoch": 2.2979397781299524, "grad_norm": 0.5522529482841492, "learning_rate": 9.885297174640401e-05, "loss": 0.0508, "step": 1450 }, { "epoch": 2.3137876386687797, "grad_norm": 0.43696942925453186, "learning_rate": 9.882490118719095e-05, "loss": 0.0469, "step": 1460 }, { "epoch": 2.329635499207607, "grad_norm": 0.34890133142471313, "learning_rate": 9.87964953732204e-05, "loss": 0.0406, "step": 1470 }, { "epoch": 2.345483359746434, "grad_norm": 0.4267130494117737, "learning_rate": 9.876775449954003e-05, "loss": 0.0482, "step": 1480 }, { "epoch": 2.3613312202852614, "grad_norm": 0.44068190455436707, "learning_rate": 9.873867876349822e-05, "loss": 0.0509, "step": 1490 }, { "epoch": 2.3771790808240887, "grad_norm": 0.6220930814743042, "learning_rate": 9.870926836474265e-05, "loss": 0.0451, "step": 1500 }, { "epoch": 2.393026941362916, "grad_norm": 0.33855652809143066, "learning_rate": 9.867952350521899e-05, "loss": 0.0489, "step": 1510 }, { "epoch": 2.408874801901743, "grad_norm": 0.35320836305618286, "learning_rate": 9.864944438916943e-05, "loss": 0.0553, "step": 1520 }, { "epoch": 2.4247226624405704, "grad_norm": 0.44654354453086853, "learning_rate": 9.861903122313132e-05, "loss": 0.0523, "step": 1530 }, { "epoch": 2.4405705229793977, "grad_norm": 0.41467657685279846, "learning_rate": 9.858828421593582e-05, "loss": 0.0457, "step": 1540 }, { "epoch": 2.456418383518225, "grad_norm": 0.4683903157711029, "learning_rate": 9.855720357870635e-05, "loss": 0.0451, "step": 1550 }, { "epoch": 2.472266244057052, "grad_norm": 0.3570249378681183, "learning_rate": 9.852578952485716e-05, "loss": 0.0496, "step": 1560 }, { "epoch": 2.4881141045958794, "grad_norm": 0.31039777398109436, "learning_rate": 9.849404227009196e-05, "loss": 0.047, "step": 1570 }, { "epoch": 2.5039619651347067, "grad_norm": 0.35661402344703674, "learning_rate": 9.846196203240234e-05, "loss": 0.0451, "step": 1580 }, { "epoch": 2.519809825673534, "grad_norm": 0.2695527970790863, "learning_rate": 9.842954903206634e-05, "loss": 0.0405, "step": 1590 }, { "epoch": 2.535657686212361, "grad_norm": 0.2955043613910675, "learning_rate": 9.839680349164684e-05, "loss": 0.0378, "step": 1600 }, { "epoch": 2.5515055467511885, "grad_norm": 0.32479894161224365, "learning_rate": 9.836372563599017e-05, "loss": 0.0398, "step": 1610 }, { "epoch": 2.5673534072900157, "grad_norm": 0.2761167883872986, "learning_rate": 9.833031569222443e-05, "loss": 0.0373, "step": 1620 }, { "epoch": 2.583201267828843, "grad_norm": 0.42928674817085266, "learning_rate": 9.829657388975803e-05, "loss": 0.0445, "step": 1630 }, { "epoch": 2.59904912836767, "grad_norm": 0.27622172236442566, "learning_rate": 9.826250046027809e-05, "loss": 0.038, "step": 1640 }, { "epoch": 2.6148969889064975, "grad_norm": 0.3467934727668762, "learning_rate": 9.822809563774881e-05, "loss": 0.0417, "step": 1650 }, { "epoch": 2.6307448494453247, "grad_norm": 0.3505828380584717, "learning_rate": 9.81933596584099e-05, "loss": 0.0424, "step": 1660 }, { "epoch": 2.6465927099841524, "grad_norm": 0.38430240750312805, "learning_rate": 9.815829276077492e-05, "loss": 0.0407, "step": 1670 }, { "epoch": 2.662440570522979, "grad_norm": 0.280718058347702, "learning_rate": 9.812289518562975e-05, "loss": 0.0415, "step": 1680 }, { "epoch": 2.678288431061807, "grad_norm": 0.3132197856903076, "learning_rate": 9.808716717603076e-05, "loss": 0.0467, "step": 1690 }, { "epoch": 2.6941362916006337, "grad_norm": 0.43638864159584045, "learning_rate": 9.80511089773033e-05, "loss": 0.0488, "step": 1700 }, { "epoch": 2.7099841521394614, "grad_norm": 0.39665859937667847, "learning_rate": 9.801472083703993e-05, "loss": 0.043, "step": 1710 }, { "epoch": 2.7258320126782882, "grad_norm": 0.43067046999931335, "learning_rate": 9.797800300509879e-05, "loss": 0.044, "step": 1720 }, { "epoch": 2.741679873217116, "grad_norm": 0.4771805703639984, "learning_rate": 9.794095573360173e-05, "loss": 0.0428, "step": 1730 }, { "epoch": 2.7575277337559427, "grad_norm": 0.28175461292266846, "learning_rate": 9.790357927693282e-05, "loss": 0.0407, "step": 1740 }, { "epoch": 2.7733755942947704, "grad_norm": 0.3774772584438324, "learning_rate": 9.786587389173639e-05, "loss": 0.0526, "step": 1750 }, { "epoch": 2.7892234548335972, "grad_norm": 0.38130345940589905, "learning_rate": 9.782783983691534e-05, "loss": 0.0397, "step": 1760 }, { "epoch": 2.805071315372425, "grad_norm": 0.3435608744621277, "learning_rate": 9.778947737362942e-05, "loss": 0.0421, "step": 1770 }, { "epoch": 2.8209191759112517, "grad_norm": 0.3259428143501282, "learning_rate": 9.775078676529338e-05, "loss": 0.0534, "step": 1780 }, { "epoch": 2.8367670364500794, "grad_norm": 0.37528592348098755, "learning_rate": 9.771176827757512e-05, "loss": 0.0397, "step": 1790 }, { "epoch": 2.8526148969889062, "grad_norm": 0.22234618663787842, "learning_rate": 9.767242217839397e-05, "loss": 0.0425, "step": 1800 }, { "epoch": 2.868462757527734, "grad_norm": 0.366315096616745, "learning_rate": 9.763274873791874e-05, "loss": 0.048, "step": 1810 }, { "epoch": 2.8843106180665607, "grad_norm": 0.2822195887565613, "learning_rate": 9.759274822856598e-05, "loss": 0.0394, "step": 1820 }, { "epoch": 2.9001584786053884, "grad_norm": 0.2494378387928009, "learning_rate": 9.7552420924998e-05, "loss": 0.0525, "step": 1830 }, { "epoch": 2.9160063391442153, "grad_norm": 0.3741839528083801, "learning_rate": 9.751176710412106e-05, "loss": 0.0406, "step": 1840 }, { "epoch": 2.931854199683043, "grad_norm": 0.436900794506073, "learning_rate": 9.747078704508343e-05, "loss": 0.04, "step": 1850 }, { "epoch": 2.94770206022187, "grad_norm": 0.4009973704814911, "learning_rate": 9.742948102927351e-05, "loss": 0.0379, "step": 1860 }, { "epoch": 2.9635499207606975, "grad_norm": 0.34224581718444824, "learning_rate": 9.738784934031781e-05, "loss": 0.0383, "step": 1870 }, { "epoch": 2.9793977812995247, "grad_norm": 0.32024386525154114, "learning_rate": 9.734589226407913e-05, "loss": 0.0421, "step": 1880 }, { "epoch": 2.995245641838352, "grad_norm": 0.2870291769504547, "learning_rate": 9.730361008865452e-05, "loss": 0.038, "step": 1890 }, { "epoch": 3.011093502377179, "grad_norm": 0.30015242099761963, "learning_rate": 9.726100310437327e-05, "loss": 0.0427, "step": 1900 }, { "epoch": 3.0269413629160065, "grad_norm": 0.37298670411109924, "learning_rate": 9.721807160379503e-05, "loss": 0.0368, "step": 1910 }, { "epoch": 3.0427892234548337, "grad_norm": 0.3384378254413605, "learning_rate": 9.717481588170765e-05, "loss": 0.0338, "step": 1920 }, { "epoch": 3.058637083993661, "grad_norm": 0.456691175699234, "learning_rate": 9.713123623512532e-05, "loss": 0.0406, "step": 1930 }, { "epoch": 3.074484944532488, "grad_norm": 0.28940680623054504, "learning_rate": 9.70873329632864e-05, "loss": 0.039, "step": 1940 }, { "epoch": 3.0903328050713155, "grad_norm": 0.3975684940814972, "learning_rate": 9.704310636765142e-05, "loss": 0.0445, "step": 1950 }, { "epoch": 3.1061806656101427, "grad_norm": 0.33805158734321594, "learning_rate": 9.699855675190099e-05, "loss": 0.0452, "step": 1960 }, { "epoch": 3.12202852614897, "grad_norm": 0.4213513433933258, "learning_rate": 9.695368442193378e-05, "loss": 0.0371, "step": 1970 }, { "epoch": 3.1378763866877972, "grad_norm": 0.3289247751235962, "learning_rate": 9.69084896858643e-05, "loss": 0.0345, "step": 1980 }, { "epoch": 3.1537242472266245, "grad_norm": 0.3328181505203247, "learning_rate": 9.68629728540209e-05, "loss": 0.0436, "step": 1990 }, { "epoch": 3.1695721077654517, "grad_norm": 0.37982073426246643, "learning_rate": 9.681713423894359e-05, "loss": 0.0415, "step": 2000 }, { "epoch": 3.185419968304279, "grad_norm": 0.4652085602283478, "learning_rate": 9.677097415538186e-05, "loss": 0.0391, "step": 2010 }, { "epoch": 3.2012678288431062, "grad_norm": 0.44633859395980835, "learning_rate": 9.672449292029257e-05, "loss": 0.0375, "step": 2020 }, { "epoch": 3.2171156893819335, "grad_norm": 0.4091266095638275, "learning_rate": 9.66776908528378e-05, "loss": 0.042, "step": 2030 }, { "epoch": 3.2329635499207607, "grad_norm": 0.37333956360816956, "learning_rate": 9.663056827438252e-05, "loss": 0.0416, "step": 2040 }, { "epoch": 3.248811410459588, "grad_norm": 0.3241555988788605, "learning_rate": 9.65831255084926e-05, "loss": 0.0375, "step": 2050 }, { "epoch": 3.2646592709984152, "grad_norm": 0.3356478214263916, "learning_rate": 9.653536288093237e-05, "loss": 0.0379, "step": 2060 }, { "epoch": 3.2805071315372425, "grad_norm": 0.3036734163761139, "learning_rate": 9.648728071966251e-05, "loss": 0.0401, "step": 2070 }, { "epoch": 3.2963549920760697, "grad_norm": 0.3258291780948639, "learning_rate": 9.64388793548378e-05, "loss": 0.0403, "step": 2080 }, { "epoch": 3.312202852614897, "grad_norm": 0.24130631983280182, "learning_rate": 9.639015911880478e-05, "loss": 0.0429, "step": 2090 }, { "epoch": 3.3280507131537242, "grad_norm": 0.32036861777305603, "learning_rate": 9.634112034609955e-05, "loss": 0.044, "step": 2100 }, { "epoch": 3.3438985736925515, "grad_norm": 0.4092555046081543, "learning_rate": 9.629176337344538e-05, "loss": 0.04, "step": 2110 }, { "epoch": 3.3597464342313788, "grad_norm": 0.2772470712661743, "learning_rate": 9.62420885397505e-05, "loss": 0.0441, "step": 2120 }, { "epoch": 3.375594294770206, "grad_norm": 0.26047825813293457, "learning_rate": 9.619209618610569e-05, "loss": 0.0465, "step": 2130 }, { "epoch": 3.3914421553090333, "grad_norm": 0.3633308410644531, "learning_rate": 9.614178665578199e-05, "loss": 0.0366, "step": 2140 }, { "epoch": 3.4072900158478605, "grad_norm": 0.3024093806743622, "learning_rate": 9.609116029422834e-05, "loss": 0.0394, "step": 2150 }, { "epoch": 3.4231378763866878, "grad_norm": 0.30647802352905273, "learning_rate": 9.604021744906915e-05, "loss": 0.0378, "step": 2160 }, { "epoch": 3.438985736925515, "grad_norm": 0.36456671357154846, "learning_rate": 9.598895847010198e-05, "loss": 0.0378, "step": 2170 }, { "epoch": 3.4548335974643423, "grad_norm": 0.28538623452186584, "learning_rate": 9.593738370929513e-05, "loss": 0.0367, "step": 2180 }, { "epoch": 3.4706814580031695, "grad_norm": 0.3848700523376465, "learning_rate": 9.588549352078517e-05, "loss": 0.0376, "step": 2190 }, { "epoch": 3.4865293185419968, "grad_norm": 0.4077780842781067, "learning_rate": 9.583328826087456e-05, "loss": 0.0401, "step": 2200 }, { "epoch": 3.502377179080824, "grad_norm": 0.29360130429267883, "learning_rate": 9.578076828802922e-05, "loss": 0.0377, "step": 2210 }, { "epoch": 3.5182250396196513, "grad_norm": 0.23208092153072357, "learning_rate": 9.572793396287598e-05, "loss": 0.0456, "step": 2220 }, { "epoch": 3.5340729001584785, "grad_norm": 0.2558813691139221, "learning_rate": 9.567478564820019e-05, "loss": 0.032, "step": 2230 }, { "epoch": 3.5499207606973058, "grad_norm": 0.24153469502925873, "learning_rate": 9.562132370894321e-05, "loss": 0.0374, "step": 2240 }, { "epoch": 3.565768621236133, "grad_norm": 0.3506905436515808, "learning_rate": 9.55675485121999e-05, "loss": 0.0455, "step": 2250 }, { "epoch": 3.5816164817749603, "grad_norm": 0.3719404637813568, "learning_rate": 9.551346042721604e-05, "loss": 0.042, "step": 2260 }, { "epoch": 3.5974643423137875, "grad_norm": 0.27092769742012024, "learning_rate": 9.545905982538592e-05, "loss": 0.0351, "step": 2270 }, { "epoch": 3.613312202852615, "grad_norm": 0.492046982049942, "learning_rate": 9.540434708024966e-05, "loss": 0.0414, "step": 2280 }, { "epoch": 3.629160063391442, "grad_norm": 0.3070124387741089, "learning_rate": 9.534932256749074e-05, "loss": 0.0396, "step": 2290 }, { "epoch": 3.6450079239302693, "grad_norm": 0.30400729179382324, "learning_rate": 9.529398666493336e-05, "loss": 0.0356, "step": 2300 }, { "epoch": 3.6608557844690965, "grad_norm": 0.26050692796707153, "learning_rate": 9.523833975253988e-05, "loss": 0.0419, "step": 2310 }, { "epoch": 3.676703645007924, "grad_norm": 0.27897489070892334, "learning_rate": 9.51823822124082e-05, "loss": 0.034, "step": 2320 }, { "epoch": 3.692551505546751, "grad_norm": 0.3234636187553406, "learning_rate": 9.512611442876914e-05, "loss": 0.0428, "step": 2330 }, { "epoch": 3.7083993660855783, "grad_norm": 0.3589284121990204, "learning_rate": 9.506953678798378e-05, "loss": 0.0387, "step": 2340 }, { "epoch": 3.7242472266244055, "grad_norm": 0.4306239187717438, "learning_rate": 9.501264967854084e-05, "loss": 0.0474, "step": 2350 }, { "epoch": 3.740095087163233, "grad_norm": 0.22645992040634155, "learning_rate": 9.495545349105401e-05, "loss": 0.0385, "step": 2360 }, { "epoch": 3.75594294770206, "grad_norm": 0.3852635622024536, "learning_rate": 9.489794861825923e-05, "loss": 0.0345, "step": 2370 }, { "epoch": 3.7717908082408877, "grad_norm": 0.27143415808677673, "learning_rate": 9.484013545501203e-05, "loss": 0.0361, "step": 2380 }, { "epoch": 3.7876386687797146, "grad_norm": 0.28437864780426025, "learning_rate": 9.47820143982848e-05, "loss": 0.0341, "step": 2390 }, { "epoch": 3.8034865293185423, "grad_norm": 0.2932587265968323, "learning_rate": 9.472358584716408e-05, "loss": 0.0462, "step": 2400 }, { "epoch": 3.819334389857369, "grad_norm": 0.4200306236743927, "learning_rate": 9.466485020284782e-05, "loss": 0.0421, "step": 2410 }, { "epoch": 3.8351822503961968, "grad_norm": 0.41429468989372253, "learning_rate": 9.46058078686426e-05, "loss": 0.0429, "step": 2420 }, { "epoch": 3.8510301109350236, "grad_norm": 0.29475075006484985, "learning_rate": 9.454645924996087e-05, "loss": 0.036, "step": 2430 }, { "epoch": 3.8668779714738513, "grad_norm": 0.36007851362228394, "learning_rate": 9.448680475431819e-05, "loss": 0.0343, "step": 2440 }, { "epoch": 3.882725832012678, "grad_norm": 0.31012412905693054, "learning_rate": 9.442684479133044e-05, "loss": 0.0328, "step": 2450 }, { "epoch": 3.8985736925515058, "grad_norm": 0.3186264932155609, "learning_rate": 9.436657977271093e-05, "loss": 0.0357, "step": 2460 }, { "epoch": 3.9144215530903326, "grad_norm": 0.289347380399704, "learning_rate": 9.430601011226763e-05, "loss": 0.0322, "step": 2470 }, { "epoch": 3.9302694136291603, "grad_norm": 0.8456028699874878, "learning_rate": 9.424513622590038e-05, "loss": 0.0368, "step": 2480 }, { "epoch": 3.946117274167987, "grad_norm": 0.2324322909116745, "learning_rate": 9.418395853159793e-05, "loss": 0.0334, "step": 2490 }, { "epoch": 3.9619651347068148, "grad_norm": 0.24869747459888458, "learning_rate": 9.412247744943512e-05, "loss": 0.0333, "step": 2500 }, { "epoch": 3.9778129952456416, "grad_norm": 0.31198471784591675, "learning_rate": 9.406069340157003e-05, "loss": 0.0335, "step": 2510 }, { "epoch": 3.9936608557844693, "grad_norm": 0.3861044645309448, "learning_rate": 9.399860681224098e-05, "loss": 0.0363, "step": 2520 }, { "epoch": 4.009508716323296, "grad_norm": 0.21961505711078644, "learning_rate": 9.393621810776376e-05, "loss": 0.0353, "step": 2530 }, { "epoch": 4.025356576862124, "grad_norm": 0.28296446800231934, "learning_rate": 9.387352771652856e-05, "loss": 0.0438, "step": 2540 }, { "epoch": 4.041204437400951, "grad_norm": 0.48964765667915344, "learning_rate": 9.381053606899713e-05, "loss": 0.0346, "step": 2550 }, { "epoch": 4.057052297939778, "grad_norm": 0.38259637355804443, "learning_rate": 9.374724359769979e-05, "loss": 0.0342, "step": 2560 }, { "epoch": 4.072900158478605, "grad_norm": 0.29834380745887756, "learning_rate": 9.368365073723241e-05, "loss": 0.031, "step": 2570 }, { "epoch": 4.088748019017433, "grad_norm": 0.41095930337905884, "learning_rate": 9.361975792425356e-05, "loss": 0.0344, "step": 2580 }, { "epoch": 4.10459587955626, "grad_norm": 0.21189731359481812, "learning_rate": 9.355556559748133e-05, "loss": 0.0367, "step": 2590 }, { "epoch": 4.120443740095087, "grad_norm": 0.2741738557815552, "learning_rate": 9.349107419769048e-05, "loss": 0.0366, "step": 2600 }, { "epoch": 4.136291600633914, "grad_norm": 0.28367432951927185, "learning_rate": 9.342628416770928e-05, "loss": 0.0301, "step": 2610 }, { "epoch": 4.152139461172742, "grad_norm": 0.29805341362953186, "learning_rate": 9.336119595241665e-05, "loss": 0.0332, "step": 2620 }, { "epoch": 4.167987321711569, "grad_norm": 0.340262770652771, "learning_rate": 9.329580999873887e-05, "loss": 0.0332, "step": 2630 }, { "epoch": 4.183835182250396, "grad_norm": 0.2894122302532196, "learning_rate": 9.323012675564668e-05, "loss": 0.0333, "step": 2640 }, { "epoch": 4.199683042789223, "grad_norm": 0.2781189978122711, "learning_rate": 9.316414667415216e-05, "loss": 0.0348, "step": 2650 }, { "epoch": 4.215530903328051, "grad_norm": 0.321756511926651, "learning_rate": 9.309787020730562e-05, "loss": 0.0303, "step": 2660 }, { "epoch": 4.231378763866878, "grad_norm": 0.275852233171463, "learning_rate": 9.303129781019249e-05, "loss": 0.0407, "step": 2670 }, { "epoch": 4.247226624405705, "grad_norm": 0.44196420907974243, "learning_rate": 9.296442993993015e-05, "loss": 0.0395, "step": 2680 }, { "epoch": 4.263074484944532, "grad_norm": 0.2846081852912903, "learning_rate": 9.289726705566491e-05, "loss": 0.0344, "step": 2690 }, { "epoch": 4.27892234548336, "grad_norm": 0.31943535804748535, "learning_rate": 9.282980961856875e-05, "loss": 0.0388, "step": 2700 }, { "epoch": 4.294770206022187, "grad_norm": 0.4001297354698181, "learning_rate": 9.276205809183618e-05, "loss": 0.0366, "step": 2710 }, { "epoch": 4.310618066561014, "grad_norm": 0.2874903976917267, "learning_rate": 9.26940129406811e-05, "loss": 0.0302, "step": 2720 }, { "epoch": 4.326465927099841, "grad_norm": 0.3430187404155731, "learning_rate": 9.262567463233352e-05, "loss": 0.0368, "step": 2730 }, { "epoch": 4.342313787638669, "grad_norm": 0.3248710632324219, "learning_rate": 9.255704363603645e-05, "loss": 0.0337, "step": 2740 }, { "epoch": 4.358161648177496, "grad_norm": 0.3309313654899597, "learning_rate": 9.248812042304263e-05, "loss": 0.0328, "step": 2750 }, { "epoch": 4.374009508716323, "grad_norm": 0.2918064594268799, "learning_rate": 9.24189054666113e-05, "loss": 0.0394, "step": 2760 }, { "epoch": 4.38985736925515, "grad_norm": 0.35082730650901794, "learning_rate": 9.23493992420049e-05, "loss": 0.0406, "step": 2770 }, { "epoch": 4.405705229793978, "grad_norm": 0.32973727583885193, "learning_rate": 9.227960222648593e-05, "loss": 0.034, "step": 2780 }, { "epoch": 4.4215530903328055, "grad_norm": 0.23779386281967163, "learning_rate": 9.220951489931352e-05, "loss": 0.0371, "step": 2790 }, { "epoch": 4.437400950871632, "grad_norm": 0.2471320629119873, "learning_rate": 9.213913774174028e-05, "loss": 0.0317, "step": 2800 }, { "epoch": 4.453248811410459, "grad_norm": 0.3636610805988312, "learning_rate": 9.20684712370089e-05, "loss": 0.0356, "step": 2810 }, { "epoch": 4.469096671949287, "grad_norm": 0.18174231052398682, "learning_rate": 9.199751587034887e-05, "loss": 0.0258, "step": 2820 }, { "epoch": 4.4849445324881145, "grad_norm": 0.20908503234386444, "learning_rate": 9.192627212897315e-05, "loss": 0.0368, "step": 2830 }, { "epoch": 4.500792393026941, "grad_norm": 0.27427220344543457, "learning_rate": 9.185474050207478e-05, "loss": 0.0382, "step": 2840 }, { "epoch": 4.516640253565768, "grad_norm": 0.35455378890037537, "learning_rate": 9.178292148082362e-05, "loss": 0.0338, "step": 2850 }, { "epoch": 4.532488114104596, "grad_norm": 0.3077165484428406, "learning_rate": 9.171081555836287e-05, "loss": 0.032, "step": 2860 }, { "epoch": 4.5483359746434235, "grad_norm": 0.29954010248184204, "learning_rate": 9.163842322980573e-05, "loss": 0.0363, "step": 2870 }, { "epoch": 4.56418383518225, "grad_norm": 0.23956748843193054, "learning_rate": 9.156574499223202e-05, "loss": 0.0319, "step": 2880 }, { "epoch": 4.580031695721077, "grad_norm": 0.24991659820079803, "learning_rate": 9.149278134468472e-05, "loss": 0.0351, "step": 2890 }, { "epoch": 4.595879556259905, "grad_norm": 0.35879701375961304, "learning_rate": 9.141953278816661e-05, "loss": 0.0364, "step": 2900 }, { "epoch": 4.6117274167987325, "grad_norm": 0.2529746890068054, "learning_rate": 9.134599982563674e-05, "loss": 0.0357, "step": 2910 }, { "epoch": 4.627575277337559, "grad_norm": 0.23599006235599518, "learning_rate": 9.127218296200705e-05, "loss": 0.0363, "step": 2920 }, { "epoch": 4.643423137876387, "grad_norm": 0.3693040907382965, "learning_rate": 9.119808270413891e-05, "loss": 0.036, "step": 2930 }, { "epoch": 4.659270998415214, "grad_norm": 0.37512966990470886, "learning_rate": 9.112369956083953e-05, "loss": 0.0379, "step": 2940 }, { "epoch": 4.675118858954042, "grad_norm": 0.35540756583213806, "learning_rate": 9.104903404285862e-05, "loss": 0.0305, "step": 2950 }, { "epoch": 4.690966719492868, "grad_norm": 0.4176557660102844, "learning_rate": 9.097408666288475e-05, "loss": 0.0355, "step": 2960 }, { "epoch": 4.706814580031696, "grad_norm": 0.28811272978782654, "learning_rate": 9.089885793554195e-05, "loss": 0.0376, "step": 2970 }, { "epoch": 4.722662440570523, "grad_norm": 0.3358956575393677, "learning_rate": 9.082334837738607e-05, "loss": 0.0368, "step": 2980 }, { "epoch": 4.738510301109351, "grad_norm": 0.3090055584907532, "learning_rate": 9.074755850690127e-05, "loss": 0.0326, "step": 2990 }, { "epoch": 4.754358161648177, "grad_norm": 0.24217335879802704, "learning_rate": 9.067148884449647e-05, "loss": 0.0271, "step": 3000 }, { "epoch": 4.770206022187005, "grad_norm": 0.361965149641037, "learning_rate": 9.059513991250181e-05, "loss": 0.0361, "step": 3010 }, { "epoch": 4.786053882725832, "grad_norm": 0.36846402287483215, "learning_rate": 9.051851223516501e-05, "loss": 0.0381, "step": 3020 }, { "epoch": 4.80190174326466, "grad_norm": 0.3030705451965332, "learning_rate": 9.044160633864776e-05, "loss": 0.0363, "step": 3030 }, { "epoch": 4.817749603803486, "grad_norm": 0.40651705861091614, "learning_rate": 9.036442275102213e-05, "loss": 0.0305, "step": 3040 }, { "epoch": 4.833597464342314, "grad_norm": 0.2696928381919861, "learning_rate": 9.0286962002267e-05, "loss": 0.0386, "step": 3050 }, { "epoch": 4.849445324881141, "grad_norm": 0.3362119197845459, "learning_rate": 9.020922462426433e-05, "loss": 0.0318, "step": 3060 }, { "epoch": 4.865293185419969, "grad_norm": 0.21661606431007385, "learning_rate": 9.013121115079557e-05, "loss": 0.0338, "step": 3070 }, { "epoch": 4.881141045958795, "grad_norm": 0.2977627217769623, "learning_rate": 9.005292211753792e-05, "loss": 0.0323, "step": 3080 }, { "epoch": 4.896988906497623, "grad_norm": 0.3265908658504486, "learning_rate": 8.997435806206078e-05, "loss": 0.032, "step": 3090 }, { "epoch": 4.91283676703645, "grad_norm": 0.45224496722221375, "learning_rate": 8.989551952382192e-05, "loss": 0.0347, "step": 3100 }, { "epoch": 4.928684627575278, "grad_norm": 0.3116205930709839, "learning_rate": 8.981640704416385e-05, "loss": 0.0278, "step": 3110 }, { "epoch": 4.944532488114104, "grad_norm": 0.38788729906082153, "learning_rate": 8.97370211663101e-05, "loss": 0.0356, "step": 3120 }, { "epoch": 4.960380348652932, "grad_norm": 0.3053205609321594, "learning_rate": 8.965736243536152e-05, "loss": 0.0298, "step": 3130 }, { "epoch": 4.976228209191759, "grad_norm": 0.3261253535747528, "learning_rate": 8.957743139829243e-05, "loss": 0.038, "step": 3140 }, { "epoch": 4.992076069730587, "grad_norm": 0.3000582158565521, "learning_rate": 8.949722860394693e-05, "loss": 0.0485, "step": 3150 }, { "epoch": 5.007923930269413, "grad_norm": 0.3081798553466797, "learning_rate": 8.941675460303522e-05, "loss": 0.0401, "step": 3160 }, { "epoch": 5.023771790808241, "grad_norm": 0.29715317487716675, "learning_rate": 8.933600994812965e-05, "loss": 0.0314, "step": 3170 }, { "epoch": 5.039619651347068, "grad_norm": 0.20959503948688507, "learning_rate": 8.925499519366102e-05, "loss": 0.0344, "step": 3180 }, { "epoch": 5.055467511885896, "grad_norm": 0.34640997648239136, "learning_rate": 8.917371089591482e-05, "loss": 0.0324, "step": 3190 }, { "epoch": 5.071315372424722, "grad_norm": 0.29564642906188965, "learning_rate": 8.909215761302728e-05, "loss": 0.0404, "step": 3200 }, { "epoch": 5.08716323296355, "grad_norm": 0.29282501339912415, "learning_rate": 8.90103359049816e-05, "loss": 0.0317, "step": 3210 }, { "epoch": 5.103011093502377, "grad_norm": 0.3910326063632965, "learning_rate": 8.892824633360419e-05, "loss": 0.0297, "step": 3220 }, { "epoch": 5.118858954041205, "grad_norm": 0.30237722396850586, "learning_rate": 8.884588946256069e-05, "loss": 0.0372, "step": 3230 }, { "epoch": 5.134706814580031, "grad_norm": 0.3003133535385132, "learning_rate": 8.876326585735213e-05, "loss": 0.0332, "step": 3240 }, { "epoch": 5.150554675118859, "grad_norm": 0.2812441885471344, "learning_rate": 8.868037608531108e-05, "loss": 0.0315, "step": 3250 }, { "epoch": 5.166402535657686, "grad_norm": 0.2651035785675049, "learning_rate": 8.859722071559777e-05, "loss": 0.0292, "step": 3260 }, { "epoch": 5.182250396196514, "grad_norm": 0.31288737058639526, "learning_rate": 8.85138003191961e-05, "loss": 0.0294, "step": 3270 }, { "epoch": 5.19809825673534, "grad_norm": 0.2833364009857178, "learning_rate": 8.843011546890978e-05, "loss": 0.0331, "step": 3280 }, { "epoch": 5.213946117274168, "grad_norm": 0.25718948245048523, "learning_rate": 8.834616673935839e-05, "loss": 0.0281, "step": 3290 }, { "epoch": 5.229793977812995, "grad_norm": 0.28992629051208496, "learning_rate": 8.82619547069734e-05, "loss": 0.034, "step": 3300 }, { "epoch": 5.245641838351823, "grad_norm": 0.2499540150165558, "learning_rate": 8.817747994999432e-05, "loss": 0.027, "step": 3310 }, { "epoch": 5.261489698890649, "grad_norm": 0.25445619225502014, "learning_rate": 8.80927430484646e-05, "loss": 0.0316, "step": 3320 }, { "epoch": 5.277337559429477, "grad_norm": 0.28179076313972473, "learning_rate": 8.800774458422765e-05, "loss": 0.035, "step": 3330 }, { "epoch": 5.293185419968304, "grad_norm": 0.30823758244514465, "learning_rate": 8.792248514092299e-05, "loss": 0.0259, "step": 3340 }, { "epoch": 5.309033280507132, "grad_norm": 0.3379741311073303, "learning_rate": 8.783696530398207e-05, "loss": 0.033, "step": 3350 }, { "epoch": 5.324881141045958, "grad_norm": 0.29917508363723755, "learning_rate": 8.775118566062435e-05, "loss": 0.0278, "step": 3360 }, { "epoch": 5.340729001584786, "grad_norm": 0.15989099442958832, "learning_rate": 8.766514679985325e-05, "loss": 0.0315, "step": 3370 }, { "epoch": 5.356576862123613, "grad_norm": 0.2137162983417511, "learning_rate": 8.757884931245211e-05, "loss": 0.0333, "step": 3380 }, { "epoch": 5.372424722662441, "grad_norm": 0.30674856901168823, "learning_rate": 8.749229379098008e-05, "loss": 0.0308, "step": 3390 }, { "epoch": 5.3882725832012675, "grad_norm": 0.23785285651683807, "learning_rate": 8.740548082976814e-05, "loss": 0.0278, "step": 3400 }, { "epoch": 5.404120443740095, "grad_norm": 0.25887709856033325, "learning_rate": 8.731841102491494e-05, "loss": 0.0283, "step": 3410 }, { "epoch": 5.419968304278922, "grad_norm": 0.3679006099700928, "learning_rate": 8.723108497428276e-05, "loss": 0.0273, "step": 3420 }, { "epoch": 5.43581616481775, "grad_norm": 0.40523847937583923, "learning_rate": 8.714350327749337e-05, "loss": 0.0319, "step": 3430 }, { "epoch": 5.4516640253565765, "grad_norm": 0.2975967228412628, "learning_rate": 8.705566653592393e-05, "loss": 0.0382, "step": 3440 }, { "epoch": 5.467511885895404, "grad_norm": 0.27645203471183777, "learning_rate": 8.696757535270285e-05, "loss": 0.0413, "step": 3450 }, { "epoch": 5.483359746434231, "grad_norm": 0.23291446268558502, "learning_rate": 8.68792303327057e-05, "loss": 0.0306, "step": 3460 }, { "epoch": 5.499207606973059, "grad_norm": 0.34922730922698975, "learning_rate": 8.679063208255095e-05, "loss": 0.0299, "step": 3470 }, { "epoch": 5.5150554675118855, "grad_norm": 0.2651195228099823, "learning_rate": 8.67017812105959e-05, "loss": 0.0279, "step": 3480 }, { "epoch": 5.530903328050713, "grad_norm": 0.23726455867290497, "learning_rate": 8.661267832693247e-05, "loss": 0.0311, "step": 3490 }, { "epoch": 5.546751188589541, "grad_norm": 0.22650249302387238, "learning_rate": 8.6523324043383e-05, "loss": 0.0319, "step": 3500 }, { "epoch": 5.562599049128368, "grad_norm": 0.275462806224823, "learning_rate": 8.643371897349609e-05, "loss": 0.0328, "step": 3510 }, { "epoch": 5.5784469096671945, "grad_norm": 0.30848929286003113, "learning_rate": 8.63438637325423e-05, "loss": 0.0353, "step": 3520 }, { "epoch": 5.594294770206022, "grad_norm": 0.22483864426612854, "learning_rate": 8.625375893751005e-05, "loss": 0.0291, "step": 3530 }, { "epoch": 5.61014263074485, "grad_norm": 0.2007935345172882, "learning_rate": 8.616340520710124e-05, "loss": 0.0287, "step": 3540 }, { "epoch": 5.625990491283677, "grad_norm": 0.24104808270931244, "learning_rate": 8.607280316172717e-05, "loss": 0.0296, "step": 3550 }, { "epoch": 5.6418383518225035, "grad_norm": 0.25262153148651123, "learning_rate": 8.598195342350413e-05, "loss": 0.0332, "step": 3560 }, { "epoch": 5.657686212361331, "grad_norm": 0.2854628264904022, "learning_rate": 8.589085661624915e-05, "loss": 0.0287, "step": 3570 }, { "epoch": 5.673534072900159, "grad_norm": 0.27987590432167053, "learning_rate": 8.579951336547583e-05, "loss": 0.0358, "step": 3580 }, { "epoch": 5.689381933438986, "grad_norm": 0.28694331645965576, "learning_rate": 8.570792429838994e-05, "loss": 0.0301, "step": 3590 }, { "epoch": 5.705229793977813, "grad_norm": 0.4414514899253845, "learning_rate": 8.561609004388511e-05, "loss": 0.0276, "step": 3600 }, { "epoch": 5.72107765451664, "grad_norm": 0.36731958389282227, "learning_rate": 8.552401123253857e-05, "loss": 0.0326, "step": 3610 }, { "epoch": 5.736925515055468, "grad_norm": 0.3216352164745331, "learning_rate": 8.543168849660682e-05, "loss": 0.0351, "step": 3620 }, { "epoch": 5.752773375594295, "grad_norm": 0.2965521812438965, "learning_rate": 8.533912247002116e-05, "loss": 0.0336, "step": 3630 }, { "epoch": 5.768621236133122, "grad_norm": 0.37146931886672974, "learning_rate": 8.524631378838357e-05, "loss": 0.041, "step": 3640 }, { "epoch": 5.784469096671949, "grad_norm": 0.27054694294929504, "learning_rate": 8.515326308896213e-05, "loss": 0.0333, "step": 3650 }, { "epoch": 5.800316957210777, "grad_norm": 0.30338549613952637, "learning_rate": 8.505997101068675e-05, "loss": 0.0305, "step": 3660 }, { "epoch": 5.816164817749604, "grad_norm": 0.2014935314655304, "learning_rate": 8.496643819414476e-05, "loss": 0.0292, "step": 3670 }, { "epoch": 5.832012678288431, "grad_norm": 0.3620418906211853, "learning_rate": 8.48726652815765e-05, "loss": 0.0345, "step": 3680 }, { "epoch": 5.847860538827258, "grad_norm": 0.22847791016101837, "learning_rate": 8.477865291687095e-05, "loss": 0.038, "step": 3690 }, { "epoch": 5.863708399366086, "grad_norm": 0.42736053466796875, "learning_rate": 8.468440174556127e-05, "loss": 0.0341, "step": 3700 }, { "epoch": 5.879556259904913, "grad_norm": 0.2668206989765167, "learning_rate": 8.458991241482036e-05, "loss": 0.0365, "step": 3710 }, { "epoch": 5.89540412044374, "grad_norm": 0.24107444286346436, "learning_rate": 8.449518557345645e-05, "loss": 0.033, "step": 3720 }, { "epoch": 5.911251980982567, "grad_norm": 0.2556779384613037, "learning_rate": 8.440022187190864e-05, "loss": 0.0336, "step": 3730 }, { "epoch": 5.927099841521395, "grad_norm": 0.2224377542734146, "learning_rate": 8.43050219622424e-05, "loss": 0.0257, "step": 3740 }, { "epoch": 5.942947702060222, "grad_norm": 0.247999370098114, "learning_rate": 8.420958649814513e-05, "loss": 0.0325, "step": 3750 }, { "epoch": 5.958795562599049, "grad_norm": 0.3033657670021057, "learning_rate": 8.411391613492165e-05, "loss": 0.0336, "step": 3760 }, { "epoch": 5.974643423137876, "grad_norm": 0.3270326852798462, "learning_rate": 8.401801152948973e-05, "loss": 0.0302, "step": 3770 }, { "epoch": 5.990491283676704, "grad_norm": 0.23401206731796265, "learning_rate": 8.392187334037555e-05, "loss": 0.0308, "step": 3780 }, { "epoch": 6.006339144215531, "grad_norm": 0.2145588994026184, "learning_rate": 8.382550222770915e-05, "loss": 0.035, "step": 3790 }, { "epoch": 6.022187004754358, "grad_norm": 0.27132412791252136, "learning_rate": 8.372889885321996e-05, "loss": 0.0313, "step": 3800 }, { "epoch": 6.038034865293185, "grad_norm": 0.21529650688171387, "learning_rate": 8.363206388023224e-05, "loss": 0.0297, "step": 3810 }, { "epoch": 6.053882725832013, "grad_norm": 0.25313499569892883, "learning_rate": 8.353499797366051e-05, "loss": 0.0255, "step": 3820 }, { "epoch": 6.06973058637084, "grad_norm": 0.19570957124233246, "learning_rate": 8.343770180000497e-05, "loss": 0.0275, "step": 3830 }, { "epoch": 6.085578446909667, "grad_norm": 0.24506336450576782, "learning_rate": 8.334017602734697e-05, "loss": 0.0268, "step": 3840 }, { "epoch": 6.101426307448494, "grad_norm": 0.21346315741539001, "learning_rate": 8.324242132534435e-05, "loss": 0.024, "step": 3850 }, { "epoch": 6.117274167987322, "grad_norm": 0.3212679624557495, "learning_rate": 8.314443836522692e-05, "loss": 0.036, "step": 3860 }, { "epoch": 6.133122028526149, "grad_norm": 0.24916702508926392, "learning_rate": 8.304622781979183e-05, "loss": 0.0271, "step": 3870 }, { "epoch": 6.148969889064976, "grad_norm": 0.30624908208847046, "learning_rate": 8.294779036339893e-05, "loss": 0.0318, "step": 3880 }, { "epoch": 6.164817749603803, "grad_norm": 0.2676468789577484, "learning_rate": 8.284912667196612e-05, "loss": 0.0294, "step": 3890 }, { "epoch": 6.180665610142631, "grad_norm": 0.24745798110961914, "learning_rate": 8.275023742296474e-05, "loss": 0.0303, "step": 3900 }, { "epoch": 6.196513470681458, "grad_norm": 0.2466627061367035, "learning_rate": 8.265112329541495e-05, "loss": 0.0255, "step": 3910 }, { "epoch": 6.212361331220285, "grad_norm": 0.3070094883441925, "learning_rate": 8.255178496988101e-05, "loss": 0.0284, "step": 3920 }, { "epoch": 6.228209191759112, "grad_norm": 0.3049757778644562, "learning_rate": 8.245222312846663e-05, "loss": 0.0286, "step": 3930 }, { "epoch": 6.24405705229794, "grad_norm": 0.3167661428451538, "learning_rate": 8.235243845481029e-05, "loss": 0.0256, "step": 3940 }, { "epoch": 6.259904912836767, "grad_norm": 0.2966691851615906, "learning_rate": 8.225243163408051e-05, "loss": 0.0332, "step": 3950 }, { "epoch": 6.2757527733755945, "grad_norm": 0.29441869258880615, "learning_rate": 8.215220335297124e-05, "loss": 0.0279, "step": 3960 }, { "epoch": 6.291600633914421, "grad_norm": 0.2598278522491455, "learning_rate": 8.205175429969701e-05, "loss": 0.0327, "step": 3970 }, { "epoch": 6.307448494453249, "grad_norm": 0.3308967351913452, "learning_rate": 8.195108516398834e-05, "loss": 0.0301, "step": 3980 }, { "epoch": 6.323296354992076, "grad_norm": 0.2924744486808777, "learning_rate": 8.185019663708689e-05, "loss": 0.035, "step": 3990 }, { "epoch": 6.3391442155309035, "grad_norm": 0.29859915375709534, "learning_rate": 8.174908941174078e-05, "loss": 0.0293, "step": 4000 }, { "epoch": 6.35499207606973, "grad_norm": 0.2642618715763092, "learning_rate": 8.164776418219982e-05, "loss": 0.0377, "step": 4010 }, { "epoch": 6.370839936608558, "grad_norm": 0.25345122814178467, "learning_rate": 8.154622164421075e-05, "loss": 0.0321, "step": 4020 }, { "epoch": 6.386687797147385, "grad_norm": 0.27396586537361145, "learning_rate": 8.144446249501244e-05, "loss": 0.0362, "step": 4030 }, { "epoch": 6.4025356576862125, "grad_norm": 0.23460988700389862, "learning_rate": 8.13424874333311e-05, "loss": 0.0288, "step": 4040 }, { "epoch": 6.418383518225039, "grad_norm": 0.268079549074173, "learning_rate": 8.124029715937552e-05, "loss": 0.0337, "step": 4050 }, { "epoch": 6.434231378763867, "grad_norm": 0.23016807436943054, "learning_rate": 8.113789237483224e-05, "loss": 0.0297, "step": 4060 }, { "epoch": 6.450079239302694, "grad_norm": 0.21488989889621735, "learning_rate": 8.103527378286071e-05, "loss": 0.0226, "step": 4070 }, { "epoch": 6.4659270998415215, "grad_norm": 0.3006250262260437, "learning_rate": 8.093244208808847e-05, "loss": 0.0323, "step": 4080 }, { "epoch": 6.481774960380348, "grad_norm": 0.31131377816200256, "learning_rate": 8.082939799660641e-05, "loss": 0.0263, "step": 4090 }, { "epoch": 6.497622820919176, "grad_norm": 0.3602330982685089, "learning_rate": 8.072614221596372e-05, "loss": 0.0327, "step": 4100 }, { "epoch": 6.513470681458003, "grad_norm": 0.24554632604122162, "learning_rate": 8.062267545516323e-05, "loss": 0.0307, "step": 4110 }, { "epoch": 6.5293185419968305, "grad_norm": 0.3024232089519501, "learning_rate": 8.05189984246564e-05, "loss": 0.031, "step": 4120 }, { "epoch": 6.545166402535658, "grad_norm": 0.20746688544750214, "learning_rate": 8.041511183633855e-05, "loss": 0.0296, "step": 4130 }, { "epoch": 6.561014263074485, "grad_norm": 0.2613235414028168, "learning_rate": 8.03110164035439e-05, "loss": 0.0349, "step": 4140 }, { "epoch": 6.576862123613312, "grad_norm": 0.41507190465927124, "learning_rate": 8.020671284104072e-05, "loss": 0.0377, "step": 4150 }, { "epoch": 6.5927099841521395, "grad_norm": 0.2900952696800232, "learning_rate": 8.010220186502635e-05, "loss": 0.0296, "step": 4160 }, { "epoch": 6.608557844690967, "grad_norm": 0.26226314902305603, "learning_rate": 7.999748419312234e-05, "loss": 0.0289, "step": 4170 }, { "epoch": 6.624405705229794, "grad_norm": 0.3070898950099945, "learning_rate": 7.989256054436956e-05, "loss": 0.0298, "step": 4180 }, { "epoch": 6.640253565768621, "grad_norm": 0.2827918231487274, "learning_rate": 7.978743163922316e-05, "loss": 0.0299, "step": 4190 }, { "epoch": 6.6561014263074485, "grad_norm": 0.2928052842617035, "learning_rate": 7.968209819954768e-05, "loss": 0.0337, "step": 4200 }, { "epoch": 6.671949286846276, "grad_norm": 0.23168888688087463, "learning_rate": 7.957656094861214e-05, "loss": 0.0334, "step": 4210 }, { "epoch": 6.687797147385103, "grad_norm": 0.24511629343032837, "learning_rate": 7.947082061108497e-05, "loss": 0.0302, "step": 4220 }, { "epoch": 6.70364500792393, "grad_norm": 0.24456819891929626, "learning_rate": 7.93648779130291e-05, "loss": 0.0311, "step": 4230 }, { "epoch": 6.7194928684627575, "grad_norm": 0.26930612325668335, "learning_rate": 7.925873358189699e-05, "loss": 0.0291, "step": 4240 }, { "epoch": 6.735340729001585, "grad_norm": 0.18482516705989838, "learning_rate": 7.91523883465256e-05, "loss": 0.0313, "step": 4250 }, { "epoch": 6.751188589540412, "grad_norm": 0.36619842052459717, "learning_rate": 7.904584293713134e-05, "loss": 0.0298, "step": 4260 }, { "epoch": 6.767036450079239, "grad_norm": 0.28840282559394836, "learning_rate": 7.893909808530518e-05, "loss": 0.0318, "step": 4270 }, { "epoch": 6.7828843106180665, "grad_norm": 0.2239818572998047, "learning_rate": 7.883215452400752e-05, "loss": 0.0295, "step": 4280 }, { "epoch": 6.798732171156894, "grad_norm": 0.21004091203212738, "learning_rate": 7.872501298756319e-05, "loss": 0.0284, "step": 4290 }, { "epoch": 6.814580031695721, "grad_norm": 0.21372993290424347, "learning_rate": 7.861767421165644e-05, "loss": 0.031, "step": 4300 }, { "epoch": 6.830427892234549, "grad_norm": 0.20823988318443298, "learning_rate": 7.851013893332584e-05, "loss": 0.0275, "step": 4310 }, { "epoch": 6.8462757527733755, "grad_norm": 0.24077993631362915, "learning_rate": 7.84024078909592e-05, "loss": 0.0267, "step": 4320 }, { "epoch": 6.862123613312203, "grad_norm": 0.29702138900756836, "learning_rate": 7.82944818242886e-05, "loss": 0.0293, "step": 4330 }, { "epoch": 6.87797147385103, "grad_norm": 0.23424126207828522, "learning_rate": 7.818636147438523e-05, "loss": 0.0254, "step": 4340 }, { "epoch": 6.893819334389858, "grad_norm": 0.28826698660850525, "learning_rate": 7.807804758365431e-05, "loss": 0.028, "step": 4350 }, { "epoch": 6.9096671949286845, "grad_norm": 0.25839823484420776, "learning_rate": 7.796954089583e-05, "loss": 0.0339, "step": 4360 }, { "epoch": 6.925515055467512, "grad_norm": 0.25523653626441956, "learning_rate": 7.786084215597029e-05, "loss": 0.0283, "step": 4370 }, { "epoch": 6.941362916006339, "grad_norm": 0.23376896977424622, "learning_rate": 7.775195211045193e-05, "loss": 0.0287, "step": 4380 }, { "epoch": 6.957210776545167, "grad_norm": 0.2951514720916748, "learning_rate": 7.764287150696523e-05, "loss": 0.0279, "step": 4390 }, { "epoch": 6.9730586370839935, "grad_norm": 0.3112223446369171, "learning_rate": 7.753360109450893e-05, "loss": 0.0348, "step": 4400 }, { "epoch": 6.988906497622821, "grad_norm": 0.3574570119380951, "learning_rate": 7.742414162338519e-05, "loss": 0.0315, "step": 4410 }, { "epoch": 7.004754358161648, "grad_norm": 0.25105416774749756, "learning_rate": 7.73144938451942e-05, "loss": 0.0259, "step": 4420 }, { "epoch": 7.020602218700476, "grad_norm": 0.313162624835968, "learning_rate": 7.720465851282927e-05, "loss": 0.0293, "step": 4430 }, { "epoch": 7.0364500792393025, "grad_norm": 0.2756791412830353, "learning_rate": 7.70946363804715e-05, "loss": 0.032, "step": 4440 }, { "epoch": 7.05229793977813, "grad_norm": 0.2672293484210968, "learning_rate": 7.698442820358463e-05, "loss": 0.0295, "step": 4450 }, { "epoch": 7.068145800316957, "grad_norm": 0.27197128534317017, "learning_rate": 7.687403473890988e-05, "loss": 0.0329, "step": 4460 }, { "epoch": 7.083993660855785, "grad_norm": 0.3267204761505127, "learning_rate": 7.676345674446077e-05, "loss": 0.0336, "step": 4470 }, { "epoch": 7.0998415213946116, "grad_norm": 0.3577364683151245, "learning_rate": 7.665269497951787e-05, "loss": 0.0253, "step": 4480 }, { "epoch": 7.115689381933439, "grad_norm": 0.25939124822616577, "learning_rate": 7.65417502046236e-05, "loss": 0.0257, "step": 4490 }, { "epoch": 7.131537242472266, "grad_norm": 0.211978480219841, "learning_rate": 7.6430623181577e-05, "loss": 0.0276, "step": 4500 }, { "epoch": 7.147385103011094, "grad_norm": 0.22676114737987518, "learning_rate": 7.631931467342853e-05, "loss": 0.0264, "step": 4510 }, { "epoch": 7.163232963549921, "grad_norm": 0.3186163604259491, "learning_rate": 7.620782544447483e-05, "loss": 0.0312, "step": 4520 }, { "epoch": 7.179080824088748, "grad_norm": 0.2680210769176483, "learning_rate": 7.609615626025342e-05, "loss": 0.0297, "step": 4530 }, { "epoch": 7.194928684627575, "grad_norm": 0.25488680601119995, "learning_rate": 7.598430788753748e-05, "loss": 0.0309, "step": 4540 }, { "epoch": 7.210776545166403, "grad_norm": 0.25716468691825867, "learning_rate": 7.587228109433061e-05, "loss": 0.0295, "step": 4550 }, { "epoch": 7.22662440570523, "grad_norm": 0.17865824699401855, "learning_rate": 7.576007664986149e-05, "loss": 0.0275, "step": 4560 }, { "epoch": 7.242472266244057, "grad_norm": 0.25337857007980347, "learning_rate": 7.56476953245787e-05, "loss": 0.0309, "step": 4570 }, { "epoch": 7.258320126782884, "grad_norm": 0.23190538585186005, "learning_rate": 7.553513789014531e-05, "loss": 0.0326, "step": 4580 }, { "epoch": 7.274167987321712, "grad_norm": 0.23697835206985474, "learning_rate": 7.542240511943362e-05, "loss": 0.0289, "step": 4590 }, { "epoch": 7.290015847860539, "grad_norm": 0.19046033918857574, "learning_rate": 7.530949778651995e-05, "loss": 0.0272, "step": 4600 }, { "epoch": 7.305863708399366, "grad_norm": 0.2411852329969406, "learning_rate": 7.519641666667918e-05, "loss": 0.0281, "step": 4610 }, { "epoch": 7.321711568938193, "grad_norm": 0.2323843538761139, "learning_rate": 7.508316253637951e-05, "loss": 0.0286, "step": 4620 }, { "epoch": 7.337559429477021, "grad_norm": 0.2985825538635254, "learning_rate": 7.496973617327714e-05, "loss": 0.027, "step": 4630 }, { "epoch": 7.353407290015848, "grad_norm": 0.2772405743598938, "learning_rate": 7.485613835621088e-05, "loss": 0.0287, "step": 4640 }, { "epoch": 7.369255150554675, "grad_norm": 0.28249087929725647, "learning_rate": 7.474236986519679e-05, "loss": 0.029, "step": 4650 }, { "epoch": 7.385103011093502, "grad_norm": 0.2735413908958435, "learning_rate": 7.462843148142292e-05, "loss": 0.0285, "step": 4660 }, { "epoch": 7.40095087163233, "grad_norm": 0.3959973454475403, "learning_rate": 7.451432398724384e-05, "loss": 0.0314, "step": 4670 }, { "epoch": 7.416798732171157, "grad_norm": 0.23869942128658295, "learning_rate": 7.440004816617533e-05, "loss": 0.0302, "step": 4680 }, { "epoch": 7.432646592709984, "grad_norm": 0.2646492123603821, "learning_rate": 7.428560480288896e-05, "loss": 0.0277, "step": 4690 }, { "epoch": 7.448494453248811, "grad_norm": 0.23564158380031586, "learning_rate": 7.417099468320676e-05, "loss": 0.0284, "step": 4700 }, { "epoch": 7.464342313787639, "grad_norm": 0.19051893055438995, "learning_rate": 7.405621859409577e-05, "loss": 0.031, "step": 4710 }, { "epoch": 7.480190174326466, "grad_norm": 0.5017970204353333, "learning_rate": 7.394127732366264e-05, "loss": 0.028, "step": 4720 }, { "epoch": 7.496038034865293, "grad_norm": 0.24149303138256073, "learning_rate": 7.382617166114826e-05, "loss": 0.0263, "step": 4730 }, { "epoch": 7.51188589540412, "grad_norm": 0.2918100357055664, "learning_rate": 7.371090239692228e-05, "loss": 0.029, "step": 4740 }, { "epoch": 7.527733755942948, "grad_norm": 0.41638660430908203, "learning_rate": 7.359547032247773e-05, "loss": 0.0279, "step": 4750 }, { "epoch": 7.543581616481775, "grad_norm": 0.24228066205978394, "learning_rate": 7.347987623042561e-05, "loss": 0.0249, "step": 4760 }, { "epoch": 7.559429477020602, "grad_norm": 0.3426589369773865, "learning_rate": 7.336412091448936e-05, "loss": 0.0291, "step": 4770 }, { "epoch": 7.575277337559429, "grad_norm": 0.381527841091156, "learning_rate": 7.324820516949946e-05, "loss": 0.0329, "step": 4780 }, { "epoch": 7.591125198098257, "grad_norm": 0.26290562748908997, "learning_rate": 7.3132129791388e-05, "loss": 0.0305, "step": 4790 }, { "epoch": 7.606973058637084, "grad_norm": 0.28301799297332764, "learning_rate": 7.301589557718315e-05, "loss": 0.0224, "step": 4800 }, { "epoch": 7.622820919175911, "grad_norm": 0.33471032977104187, "learning_rate": 7.28995033250038e-05, "loss": 0.0356, "step": 4810 }, { "epoch": 7.638668779714738, "grad_norm": 0.219041109085083, "learning_rate": 7.278295383405389e-05, "loss": 0.0278, "step": 4820 }, { "epoch": 7.654516640253566, "grad_norm": 0.27412205934524536, "learning_rate": 7.266624790461713e-05, "loss": 0.0271, "step": 4830 }, { "epoch": 7.6703645007923935, "grad_norm": 0.27656254172325134, "learning_rate": 7.254938633805137e-05, "loss": 0.0296, "step": 4840 }, { "epoch": 7.68621236133122, "grad_norm": 0.23747026920318604, "learning_rate": 7.243236993678311e-05, "loss": 0.0217, "step": 4850 }, { "epoch": 7.702060221870047, "grad_norm": 0.29850152134895325, "learning_rate": 7.231519950430212e-05, "loss": 0.0297, "step": 4860 }, { "epoch": 7.717908082408875, "grad_norm": 0.2872811555862427, "learning_rate": 7.219787584515567e-05, "loss": 0.0274, "step": 4870 }, { "epoch": 7.7337559429477025, "grad_norm": 0.26487553119659424, "learning_rate": 7.208039976494329e-05, "loss": 0.0267, "step": 4880 }, { "epoch": 7.749603803486529, "grad_norm": 0.32571732997894287, "learning_rate": 7.196277207031103e-05, "loss": 0.031, "step": 4890 }, { "epoch": 7.765451664025356, "grad_norm": 0.2101273387670517, "learning_rate": 7.184499356894606e-05, "loss": 0.0261, "step": 4900 }, { "epoch": 7.781299524564184, "grad_norm": 0.3179239332675934, "learning_rate": 7.172706506957095e-05, "loss": 0.0303, "step": 4910 }, { "epoch": 7.7971473851030115, "grad_norm": 0.1984127014875412, "learning_rate": 7.160898738193833e-05, "loss": 0.0226, "step": 4920 }, { "epoch": 7.812995245641838, "grad_norm": 0.19061654806137085, "learning_rate": 7.149076131682521e-05, "loss": 0.0219, "step": 4930 }, { "epoch": 7.828843106180665, "grad_norm": 0.27196112275123596, "learning_rate": 7.137238768602739e-05, "loss": 0.0327, "step": 4940 }, { "epoch": 7.844690966719493, "grad_norm": 0.2761131525039673, "learning_rate": 7.125386730235395e-05, "loss": 0.0258, "step": 4950 }, { "epoch": 7.8605388272583205, "grad_norm": 0.22716206312179565, "learning_rate": 7.113520097962165e-05, "loss": 0.0306, "step": 4960 }, { "epoch": 7.876386687797147, "grad_norm": 0.278010755777359, "learning_rate": 7.101638953264933e-05, "loss": 0.0261, "step": 4970 }, { "epoch": 7.892234548335974, "grad_norm": 0.19748617708683014, "learning_rate": 7.08974337772523e-05, "loss": 0.0216, "step": 4980 }, { "epoch": 7.908082408874802, "grad_norm": 0.35271981358528137, "learning_rate": 7.077833453023678e-05, "loss": 0.0236, "step": 4990 }, { "epoch": 7.9239302694136295, "grad_norm": 0.33073899149894714, "learning_rate": 7.065909260939429e-05, "loss": 0.0274, "step": 5000 }, { "epoch": 7.939778129952456, "grad_norm": 0.36262351274490356, "learning_rate": 7.053970883349599e-05, "loss": 0.0229, "step": 5010 }, { "epoch": 7.955625990491284, "grad_norm": 0.4560012221336365, "learning_rate": 7.04201840222871e-05, "loss": 0.027, "step": 5020 }, { "epoch": 7.971473851030111, "grad_norm": 0.3530636727809906, "learning_rate": 7.03005189964812e-05, "loss": 0.0307, "step": 5030 }, { "epoch": 7.9873217115689386, "grad_norm": 0.2944605052471161, "learning_rate": 7.018071457775474e-05, "loss": 0.0254, "step": 5040 }, { "epoch": 8.003169572107765, "grad_norm": 0.25718453526496887, "learning_rate": 7.006077158874124e-05, "loss": 0.0289, "step": 5050 }, { "epoch": 8.019017432646592, "grad_norm": 0.23285925388336182, "learning_rate": 6.994069085302573e-05, "loss": 0.0278, "step": 5060 }, { "epoch": 8.03486529318542, "grad_norm": 0.2729281485080719, "learning_rate": 6.98204731951391e-05, "loss": 0.0259, "step": 5070 }, { "epoch": 8.050713153724248, "grad_norm": 0.2978493869304657, "learning_rate": 6.970011944055234e-05, "loss": 0.0231, "step": 5080 }, { "epoch": 8.066561014263074, "grad_norm": 0.20820550620555878, "learning_rate": 6.9579630415671e-05, "loss": 0.0281, "step": 5090 }, { "epoch": 8.082408874801901, "grad_norm": 0.23685221374034882, "learning_rate": 6.945900694782949e-05, "loss": 0.0251, "step": 5100 }, { "epoch": 8.09825673534073, "grad_norm": 0.25722959637641907, "learning_rate": 6.933824986528527e-05, "loss": 0.0302, "step": 5110 }, { "epoch": 8.114104595879557, "grad_norm": 0.28215500712394714, "learning_rate": 6.921735999721338e-05, "loss": 0.0218, "step": 5120 }, { "epoch": 8.129952456418383, "grad_norm": 0.24379587173461914, "learning_rate": 6.909633817370051e-05, "loss": 0.0274, "step": 5130 }, { "epoch": 8.14580031695721, "grad_norm": 0.295631468296051, "learning_rate": 6.897518522573951e-05, "loss": 0.0226, "step": 5140 }, { "epoch": 8.161648177496039, "grad_norm": 0.24112898111343384, "learning_rate": 6.885390198522356e-05, "loss": 0.027, "step": 5150 }, { "epoch": 8.177496038034866, "grad_norm": 0.2933104336261749, "learning_rate": 6.873248928494046e-05, "loss": 0.0257, "step": 5160 }, { "epoch": 8.193343898573692, "grad_norm": 0.29547762870788574, "learning_rate": 6.8610947958567e-05, "loss": 0.0242, "step": 5170 }, { "epoch": 8.20919175911252, "grad_norm": 0.27927926182746887, "learning_rate": 6.848927884066311e-05, "loss": 0.0257, "step": 5180 }, { "epoch": 8.225039619651348, "grad_norm": 0.2721002697944641, "learning_rate": 6.836748276666627e-05, "loss": 0.0244, "step": 5190 }, { "epoch": 8.240887480190175, "grad_norm": 0.25311270356178284, "learning_rate": 6.824556057288563e-05, "loss": 0.0279, "step": 5200 }, { "epoch": 8.256735340729001, "grad_norm": 0.23902995884418488, "learning_rate": 6.81235130964964e-05, "loss": 0.0312, "step": 5210 }, { "epoch": 8.272583201267828, "grad_norm": 0.30612844228744507, "learning_rate": 6.8001341175534e-05, "loss": 0.0357, "step": 5220 }, { "epoch": 8.288431061806657, "grad_norm": 0.19130030274391174, "learning_rate": 6.787904564888837e-05, "loss": 0.0242, "step": 5230 }, { "epoch": 8.304278922345484, "grad_norm": 0.2579098045825958, "learning_rate": 6.775662735629816e-05, "loss": 0.0329, "step": 5240 }, { "epoch": 8.32012678288431, "grad_norm": 0.3037128150463104, "learning_rate": 6.763408713834498e-05, "loss": 0.0262, "step": 5250 }, { "epoch": 8.335974643423137, "grad_norm": 0.2066265344619751, "learning_rate": 6.751142583644767e-05, "loss": 0.0311, "step": 5260 }, { "epoch": 8.351822503961966, "grad_norm": 0.19183726608753204, "learning_rate": 6.738864429285648e-05, "loss": 0.0291, "step": 5270 }, { "epoch": 8.367670364500793, "grad_norm": 0.2202986180782318, "learning_rate": 6.72657433506473e-05, "loss": 0.0224, "step": 5280 }, { "epoch": 8.38351822503962, "grad_norm": 0.2542373538017273, "learning_rate": 6.714272385371585e-05, "loss": 0.0254, "step": 5290 }, { "epoch": 8.399366085578446, "grad_norm": 0.33272790908813477, "learning_rate": 6.701958664677191e-05, "loss": 0.0245, "step": 5300 }, { "epoch": 8.415213946117275, "grad_norm": 0.25956010818481445, "learning_rate": 6.68963325753335e-05, "loss": 0.0255, "step": 5310 }, { "epoch": 8.431061806656102, "grad_norm": 0.314311683177948, "learning_rate": 6.677296248572112e-05, "loss": 0.0248, "step": 5320 }, { "epoch": 8.446909667194928, "grad_norm": 0.28039562702178955, "learning_rate": 6.664947722505188e-05, "loss": 0.0282, "step": 5330 }, { "epoch": 8.462757527733755, "grad_norm": 0.23970749974250793, "learning_rate": 6.652587764123373e-05, "loss": 0.0273, "step": 5340 }, { "epoch": 8.478605388272584, "grad_norm": 0.1702006310224533, "learning_rate": 6.640216458295958e-05, "loss": 0.0291, "step": 5350 }, { "epoch": 8.49445324881141, "grad_norm": 0.13902607560157776, "learning_rate": 6.627833889970155e-05, "loss": 0.0241, "step": 5360 }, { "epoch": 8.510301109350237, "grad_norm": 0.2187580019235611, "learning_rate": 6.615440144170502e-05, "loss": 0.027, "step": 5370 }, { "epoch": 8.526148969889064, "grad_norm": 0.2224210649728775, "learning_rate": 6.603035305998301e-05, "loss": 0.0235, "step": 5380 }, { "epoch": 8.541996830427893, "grad_norm": 0.32996585965156555, "learning_rate": 6.590619460631005e-05, "loss": 0.0267, "step": 5390 }, { "epoch": 8.55784469096672, "grad_norm": 0.31346139311790466, "learning_rate": 6.578192693321656e-05, "loss": 0.0194, "step": 5400 }, { "epoch": 8.573692551505546, "grad_norm": 0.198611781001091, "learning_rate": 6.565755089398285e-05, "loss": 0.0256, "step": 5410 }, { "epoch": 8.589540412044373, "grad_norm": 0.2415742725133896, "learning_rate": 6.553306734263342e-05, "loss": 0.0233, "step": 5420 }, { "epoch": 8.605388272583202, "grad_norm": 0.3221810460090637, "learning_rate": 6.540847713393088e-05, "loss": 0.025, "step": 5430 }, { "epoch": 8.621236133122029, "grad_norm": 0.17353218793869019, "learning_rate": 6.528378112337031e-05, "loss": 0.0229, "step": 5440 }, { "epoch": 8.637083993660855, "grad_norm": 0.31122300028800964, "learning_rate": 6.515898016717318e-05, "loss": 0.0229, "step": 5450 }, { "epoch": 8.652931854199682, "grad_norm": 0.27111196517944336, "learning_rate": 6.50340751222816e-05, "loss": 0.0329, "step": 5460 }, { "epoch": 8.66877971473851, "grad_norm": 0.29258912801742554, "learning_rate": 6.49090668463525e-05, "loss": 0.0251, "step": 5470 }, { "epoch": 8.684627575277338, "grad_norm": 0.23192371428012848, "learning_rate": 6.478395619775145e-05, "loss": 0.0294, "step": 5480 }, { "epoch": 8.700475435816164, "grad_norm": 0.31985238194465637, "learning_rate": 6.465874403554711e-05, "loss": 0.0242, "step": 5490 }, { "epoch": 8.716323296354991, "grad_norm": 0.23439311981201172, "learning_rate": 6.453343121950513e-05, "loss": 0.0267, "step": 5500 }, { "epoch": 8.73217115689382, "grad_norm": 0.18457037210464478, "learning_rate": 6.44080186100823e-05, "loss": 0.0232, "step": 5510 }, { "epoch": 8.748019017432647, "grad_norm": 0.2508156895637512, "learning_rate": 6.428250706842064e-05, "loss": 0.0365, "step": 5520 }, { "epoch": 8.763866877971473, "grad_norm": 0.2573819160461426, "learning_rate": 6.415689745634147e-05, "loss": 0.029, "step": 5530 }, { "epoch": 8.7797147385103, "grad_norm": 0.2110164314508438, "learning_rate": 6.403119063633956e-05, "loss": 0.0254, "step": 5540 }, { "epoch": 8.795562599049129, "grad_norm": 0.3200654089450836, "learning_rate": 6.390538747157706e-05, "loss": 0.028, "step": 5550 }, { "epoch": 8.811410459587956, "grad_norm": 0.2371603101491928, "learning_rate": 6.377948882587777e-05, "loss": 0.0217, "step": 5560 }, { "epoch": 8.827258320126782, "grad_norm": 0.2176957130432129, "learning_rate": 6.365349556372105e-05, "loss": 0.0319, "step": 5570 }, { "epoch": 8.843106180665611, "grad_norm": 0.2418396770954132, "learning_rate": 6.352740855023594e-05, "loss": 0.0258, "step": 5580 }, { "epoch": 8.858954041204438, "grad_norm": 0.24693243205547333, "learning_rate": 6.340122865119524e-05, "loss": 0.0293, "step": 5590 }, { "epoch": 8.874801901743265, "grad_norm": 0.249970942735672, "learning_rate": 6.327495673300957e-05, "loss": 0.0276, "step": 5600 }, { "epoch": 8.890649762282091, "grad_norm": 0.21087859570980072, "learning_rate": 6.314859366272132e-05, "loss": 0.0234, "step": 5610 }, { "epoch": 8.906497622820918, "grad_norm": 0.2701822817325592, "learning_rate": 6.302214030799883e-05, "loss": 0.022, "step": 5620 }, { "epoch": 8.922345483359747, "grad_norm": 0.261089950799942, "learning_rate": 6.28955975371304e-05, "loss": 0.0264, "step": 5630 }, { "epoch": 8.938193343898574, "grad_norm": 0.3843868672847748, "learning_rate": 6.276896621901825e-05, "loss": 0.0272, "step": 5640 }, { "epoch": 8.9540412044374, "grad_norm": 0.3247261643409729, "learning_rate": 6.26422472231726e-05, "loss": 0.0275, "step": 5650 }, { "epoch": 8.969889064976229, "grad_norm": 0.27681615948677063, "learning_rate": 6.251544141970578e-05, "loss": 0.0281, "step": 5660 }, { "epoch": 8.985736925515056, "grad_norm": 0.255501925945282, "learning_rate": 6.238854967932612e-05, "loss": 0.0249, "step": 5670 }, { "epoch": 9.001584786053883, "grad_norm": 0.2693521976470947, "learning_rate": 6.2261572873332e-05, "loss": 0.0202, "step": 5680 }, { "epoch": 9.01743264659271, "grad_norm": 0.21597042679786682, "learning_rate": 6.213451187360601e-05, "loss": 0.0238, "step": 5690 }, { "epoch": 9.033280507131538, "grad_norm": 0.3910636007785797, "learning_rate": 6.200736755260877e-05, "loss": 0.023, "step": 5700 }, { "epoch": 9.049128367670365, "grad_norm": 0.22803229093551636, "learning_rate": 6.188014078337305e-05, "loss": 0.0227, "step": 5710 }, { "epoch": 9.064976228209192, "grad_norm": 0.22921766340732574, "learning_rate": 6.175283243949772e-05, "loss": 0.0225, "step": 5720 }, { "epoch": 9.080824088748018, "grad_norm": 0.2634933590888977, "learning_rate": 6.162544339514183e-05, "loss": 0.0304, "step": 5730 }, { "epoch": 9.096671949286847, "grad_norm": 0.5331051349639893, "learning_rate": 6.149797452501851e-05, "loss": 0.0282, "step": 5740 }, { "epoch": 9.112519809825674, "grad_norm": 0.2564757466316223, "learning_rate": 6.137042670438907e-05, "loss": 0.0262, "step": 5750 }, { "epoch": 9.1283676703645, "grad_norm": 0.24122044444084167, "learning_rate": 6.124280080905685e-05, "loss": 0.0243, "step": 5760 }, { "epoch": 9.144215530903328, "grad_norm": 0.20856255292892456, "learning_rate": 6.111509771536138e-05, "loss": 0.0255, "step": 5770 }, { "epoch": 9.160063391442156, "grad_norm": 0.39979806542396545, "learning_rate": 6.098731830017217e-05, "loss": 0.0281, "step": 5780 }, { "epoch": 9.175911251980983, "grad_norm": 0.16420406103134155, "learning_rate": 6.0859463440882866e-05, "loss": 0.0217, "step": 5790 }, { "epoch": 9.19175911251981, "grad_norm": 0.25281447172164917, "learning_rate": 6.073153401540512e-05, "loss": 0.0279, "step": 5800 }, { "epoch": 9.207606973058637, "grad_norm": 0.25699812173843384, "learning_rate": 6.060353090216261e-05, "loss": 0.0258, "step": 5810 }, { "epoch": 9.223454833597465, "grad_norm": 0.19040873646736145, "learning_rate": 6.0475454980084945e-05, "loss": 0.0233, "step": 5820 }, { "epoch": 9.239302694136292, "grad_norm": 0.21894507110118866, "learning_rate": 6.0347307128601716e-05, "loss": 0.0203, "step": 5830 }, { "epoch": 9.255150554675119, "grad_norm": 0.35552018880844116, "learning_rate": 6.021908822763641e-05, "loss": 0.0238, "step": 5840 }, { "epoch": 9.270998415213946, "grad_norm": 0.328046053647995, "learning_rate": 6.0090799157600354e-05, "loss": 0.0249, "step": 5850 }, { "epoch": 9.286846275752774, "grad_norm": 0.23552384972572327, "learning_rate": 5.996244079938671e-05, "loss": 0.0236, "step": 5860 }, { "epoch": 9.302694136291601, "grad_norm": 0.2591778337955475, "learning_rate": 5.983401403436437e-05, "loss": 0.0248, "step": 5870 }, { "epoch": 9.318541996830428, "grad_norm": 0.16465957462787628, "learning_rate": 5.970551974437198e-05, "loss": 0.0208, "step": 5880 }, { "epoch": 9.334389857369255, "grad_norm": 0.25457292795181274, "learning_rate": 5.957695881171184e-05, "loss": 0.033, "step": 5890 }, { "epoch": 9.350237717908083, "grad_norm": 0.19111283123493195, "learning_rate": 5.944833211914382e-05, "loss": 0.0318, "step": 5900 }, { "epoch": 9.36608557844691, "grad_norm": 0.30721551179885864, "learning_rate": 5.931964054987935e-05, "loss": 0.0224, "step": 5910 }, { "epoch": 9.381933438985737, "grad_norm": 0.25978097319602966, "learning_rate": 5.9190884987575336e-05, "loss": 0.0251, "step": 5920 }, { "epoch": 9.397781299524564, "grad_norm": 0.2720729112625122, "learning_rate": 5.906206631632807e-05, "loss": 0.025, "step": 5930 }, { "epoch": 9.413629160063392, "grad_norm": 0.26405835151672363, "learning_rate": 5.8933185420667217e-05, "loss": 0.0266, "step": 5940 }, { "epoch": 9.429477020602219, "grad_norm": 0.27683427929878235, "learning_rate": 5.880424318554967e-05, "loss": 0.0256, "step": 5950 }, { "epoch": 9.445324881141046, "grad_norm": 0.2533441185951233, "learning_rate": 5.867524049635352e-05, "loss": 0.0255, "step": 5960 }, { "epoch": 9.461172741679873, "grad_norm": 0.3351084589958191, "learning_rate": 5.854617823887196e-05, "loss": 0.0257, "step": 5970 }, { "epoch": 9.477020602218701, "grad_norm": 0.2585383951663971, "learning_rate": 5.841705729930721e-05, "loss": 0.0257, "step": 5980 }, { "epoch": 9.492868462757528, "grad_norm": 0.2588648796081543, "learning_rate": 5.828787856426444e-05, "loss": 0.0226, "step": 5990 }, { "epoch": 9.508716323296355, "grad_norm": 0.2622322738170624, "learning_rate": 5.8158642920745655e-05, "loss": 0.0221, "step": 6000 }, { "epoch": 9.524564183835182, "grad_norm": 0.23283162713050842, "learning_rate": 5.802935125614361e-05, "loss": 0.0177, "step": 6010 }, { "epoch": 9.54041204437401, "grad_norm": 0.265953928232193, "learning_rate": 5.790000445823576e-05, "loss": 0.0237, "step": 6020 }, { "epoch": 9.556259904912837, "grad_norm": 0.23547948896884918, "learning_rate": 5.777060341517811e-05, "loss": 0.0254, "step": 6030 }, { "epoch": 9.572107765451664, "grad_norm": 0.3150040805339813, "learning_rate": 5.764114901549914e-05, "loss": 0.0298, "step": 6040 }, { "epoch": 9.58795562599049, "grad_norm": 0.23534265160560608, "learning_rate": 5.7511642148093704e-05, "loss": 0.0208, "step": 6050 }, { "epoch": 9.60380348652932, "grad_norm": 0.2798217833042145, "learning_rate": 5.7382083702216925e-05, "loss": 0.0264, "step": 6060 }, { "epoch": 9.619651347068146, "grad_norm": 0.2324879914522171, "learning_rate": 5.725247456747809e-05, "loss": 0.0315, "step": 6070 }, { "epoch": 9.635499207606973, "grad_norm": 0.25599566102027893, "learning_rate": 5.7122815633834506e-05, "loss": 0.0227, "step": 6080 }, { "epoch": 9.6513470681458, "grad_norm": 0.1766338348388672, "learning_rate": 5.699310779158551e-05, "loss": 0.0222, "step": 6090 }, { "epoch": 9.667194928684628, "grad_norm": 0.2305234670639038, "learning_rate": 5.686335193136616e-05, "loss": 0.0229, "step": 6100 }, { "epoch": 9.683042789223455, "grad_norm": 0.24864676594734192, "learning_rate": 5.673354894414129e-05, "loss": 0.0259, "step": 6110 }, { "epoch": 9.698890649762282, "grad_norm": 0.25202295184135437, "learning_rate": 5.660369972119933e-05, "loss": 0.0237, "step": 6120 }, { "epoch": 9.714738510301109, "grad_norm": 0.32556819915771484, "learning_rate": 5.6473805154146174e-05, "loss": 0.02, "step": 6130 }, { "epoch": 9.730586370839937, "grad_norm": 0.2521624267101288, "learning_rate": 5.634386613489908e-05, "loss": 0.0242, "step": 6140 }, { "epoch": 9.746434231378764, "grad_norm": 0.25148093700408936, "learning_rate": 5.6213883555680516e-05, "loss": 0.0269, "step": 6150 }, { "epoch": 9.76228209191759, "grad_norm": 0.22112874686717987, "learning_rate": 5.608385830901206e-05, "loss": 0.0285, "step": 6160 }, { "epoch": 9.778129952456418, "grad_norm": 0.33593472838401794, "learning_rate": 5.5953791287708254e-05, "loss": 0.03, "step": 6170 }, { "epoch": 9.793977812995246, "grad_norm": 0.306130975484848, "learning_rate": 5.5823683384870554e-05, "loss": 0.0244, "step": 6180 }, { "epoch": 9.809825673534073, "grad_norm": 0.3085562288761139, "learning_rate": 5.569353549388103e-05, "loss": 0.027, "step": 6190 }, { "epoch": 9.8256735340729, "grad_norm": 0.2247430682182312, "learning_rate": 5.556334850839637e-05, "loss": 0.0234, "step": 6200 }, { "epoch": 9.841521394611727, "grad_norm": 0.26314494013786316, "learning_rate": 5.543312332234174e-05, "loss": 0.024, "step": 6210 }, { "epoch": 9.857369255150555, "grad_norm": 0.22496825456619263, "learning_rate": 5.530286082990454e-05, "loss": 0.0194, "step": 6220 }, { "epoch": 9.873217115689382, "grad_norm": 0.29987284541130066, "learning_rate": 5.5172561925528386e-05, "loss": 0.0252, "step": 6230 }, { "epoch": 9.889064976228209, "grad_norm": 0.3042098581790924, "learning_rate": 5.5042227503906894e-05, "loss": 0.0246, "step": 6240 }, { "epoch": 9.904912836767036, "grad_norm": 0.22687886655330658, "learning_rate": 5.491185845997757e-05, "loss": 0.026, "step": 6250 }, { "epoch": 9.920760697305864, "grad_norm": 0.2479943484067917, "learning_rate": 5.478145568891562e-05, "loss": 0.0289, "step": 6260 }, { "epoch": 9.936608557844691, "grad_norm": 0.20297874510288239, "learning_rate": 5.465102008612789e-05, "loss": 0.0233, "step": 6270 }, { "epoch": 9.952456418383518, "grad_norm": 0.17246457934379578, "learning_rate": 5.452055254724664e-05, "loss": 0.0253, "step": 6280 }, { "epoch": 9.968304278922346, "grad_norm": 0.24328118562698364, "learning_rate": 5.4390053968123386e-05, "loss": 0.025, "step": 6290 }, { "epoch": 9.984152139461173, "grad_norm": 0.18752968311309814, "learning_rate": 5.425952524482283e-05, "loss": 0.024, "step": 6300 }, { "epoch": 10.0, "grad_norm": 0.18232440948486328, "learning_rate": 5.4128967273616625e-05, "loss": 0.0241, "step": 6310 }, { "epoch": 10.015847860538827, "grad_norm": 0.22801880538463593, "learning_rate": 5.3998380950977266e-05, "loss": 0.0209, "step": 6320 }, { "epoch": 10.031695721077655, "grad_norm": 0.21135802567005157, "learning_rate": 5.386776717357193e-05, "loss": 0.0234, "step": 6330 }, { "epoch": 10.047543581616482, "grad_norm": 0.2743472754955292, "learning_rate": 5.373712683825629e-05, "loss": 0.0237, "step": 6340 }, { "epoch": 10.063391442155309, "grad_norm": 0.2664951682090759, "learning_rate": 5.3606460842068426e-05, "loss": 0.0249, "step": 6350 }, { "epoch": 10.079239302694136, "grad_norm": 0.20999731123447418, "learning_rate": 5.347577008222253e-05, "loss": 0.0244, "step": 6360 }, { "epoch": 10.095087163232964, "grad_norm": 0.18719319999217987, "learning_rate": 5.334505545610293e-05, "loss": 0.0239, "step": 6370 }, { "epoch": 10.110935023771791, "grad_norm": 0.17207162082195282, "learning_rate": 5.321431786125778e-05, "loss": 0.0218, "step": 6380 }, { "epoch": 10.126782884310618, "grad_norm": 0.21071314811706543, "learning_rate": 5.3083558195392936e-05, "loss": 0.021, "step": 6390 }, { "epoch": 10.142630744849445, "grad_norm": 0.21377994120121002, "learning_rate": 5.295277735636583e-05, "loss": 0.0226, "step": 6400 }, { "epoch": 10.158478605388273, "grad_norm": 0.16608726978302002, "learning_rate": 5.282197624217928e-05, "loss": 0.0227, "step": 6410 }, { "epoch": 10.1743264659271, "grad_norm": 0.19757942855358124, "learning_rate": 5.2691155750975316e-05, "loss": 0.0196, "step": 6420 }, { "epoch": 10.190174326465927, "grad_norm": 0.1993936449289322, "learning_rate": 5.2560316781029005e-05, "loss": 0.0199, "step": 6430 }, { "epoch": 10.206022187004754, "grad_norm": 0.20808455348014832, "learning_rate": 5.2429460230742346e-05, "loss": 0.0214, "step": 6440 }, { "epoch": 10.221870047543582, "grad_norm": 0.1672813892364502, "learning_rate": 5.2298586998637956e-05, "loss": 0.0243, "step": 6450 }, { "epoch": 10.23771790808241, "grad_norm": 0.26778897643089294, "learning_rate": 5.216769798335311e-05, "loss": 0.025, "step": 6460 }, { "epoch": 10.253565768621236, "grad_norm": 0.22870604693889618, "learning_rate": 5.203679408363341e-05, "loss": 0.021, "step": 6470 }, { "epoch": 10.269413629160063, "grad_norm": 0.2953716516494751, "learning_rate": 5.190587619832664e-05, "loss": 0.0215, "step": 6480 }, { "epoch": 10.285261489698891, "grad_norm": 0.3255462944507599, "learning_rate": 5.1774945226376624e-05, "loss": 0.0166, "step": 6490 }, { "epoch": 10.301109350237718, "grad_norm": 0.17969000339508057, "learning_rate": 5.1644002066817063e-05, "loss": 0.0205, "step": 6500 }, { "epoch": 10.316957210776545, "grad_norm": 0.2460571676492691, "learning_rate": 5.151304761876536e-05, "loss": 0.0201, "step": 6510 }, { "epoch": 10.332805071315372, "grad_norm": 0.178553506731987, "learning_rate": 5.1382082781416396e-05, "loss": 0.0203, "step": 6520 }, { "epoch": 10.3486529318542, "grad_norm": 0.18054994940757751, "learning_rate": 5.125110845403638e-05, "loss": 0.0204, "step": 6530 }, { "epoch": 10.364500792393027, "grad_norm": 0.2226029634475708, "learning_rate": 5.112012553595671e-05, "loss": 0.0202, "step": 6540 }, { "epoch": 10.380348652931854, "grad_norm": 0.23070666193962097, "learning_rate": 5.0989134926567785e-05, "loss": 0.0205, "step": 6550 }, { "epoch": 10.39619651347068, "grad_norm": 0.1447778195142746, "learning_rate": 5.085813752531278e-05, "loss": 0.0273, "step": 6560 }, { "epoch": 10.41204437400951, "grad_norm": 0.18221695721149445, "learning_rate": 5.072713423168154e-05, "loss": 0.0196, "step": 6570 }, { "epoch": 10.427892234548336, "grad_norm": 0.2584993839263916, "learning_rate": 5.0596125945204334e-05, "loss": 0.0205, "step": 6580 }, { "epoch": 10.443740095087163, "grad_norm": 0.19126753509044647, "learning_rate": 5.046511356544574e-05, "loss": 0.0226, "step": 6590 }, { "epoch": 10.45958795562599, "grad_norm": 0.19277669489383698, "learning_rate": 5.033409799199844e-05, "loss": 0.0195, "step": 6600 }, { "epoch": 10.475435816164818, "grad_norm": 0.22546206414699554, "learning_rate": 5.020308012447704e-05, "loss": 0.022, "step": 6610 }, { "epoch": 10.491283676703645, "grad_norm": 0.26715290546417236, "learning_rate": 5.0072060862511893e-05, "loss": 0.0232, "step": 6620 }, { "epoch": 10.507131537242472, "grad_norm": 0.23546898365020752, "learning_rate": 4.994104110574295e-05, "loss": 0.0233, "step": 6630 }, { "epoch": 10.522979397781299, "grad_norm": 0.38194459676742554, "learning_rate": 4.981002175381352e-05, "loss": 0.0266, "step": 6640 }, { "epoch": 10.538827258320127, "grad_norm": 0.17723363637924194, "learning_rate": 4.9679003706364185e-05, "loss": 0.0249, "step": 6650 }, { "epoch": 10.554675118858954, "grad_norm": 0.30575594305992126, "learning_rate": 4.9547987863026507e-05, "loss": 0.0268, "step": 6660 }, { "epoch": 10.570522979397781, "grad_norm": 0.2724224328994751, "learning_rate": 4.9416975123416966e-05, "loss": 0.0216, "step": 6670 }, { "epoch": 10.586370839936608, "grad_norm": 0.3302716910839081, "learning_rate": 4.92859663871307e-05, "loss": 0.0222, "step": 6680 }, { "epoch": 10.602218700475436, "grad_norm": 0.182839035987854, "learning_rate": 4.915496255373537e-05, "loss": 0.0241, "step": 6690 }, { "epoch": 10.618066561014263, "grad_norm": 0.18011973798274994, "learning_rate": 4.902396452276498e-05, "loss": 0.0166, "step": 6700 }, { "epoch": 10.63391442155309, "grad_norm": 0.2910979688167572, "learning_rate": 4.8892973193713684e-05, "loss": 0.0268, "step": 6710 }, { "epoch": 10.649762282091917, "grad_norm": 0.20945270359516144, "learning_rate": 4.876198946602963e-05, "loss": 0.0243, "step": 6720 }, { "epoch": 10.665610142630745, "grad_norm": 0.2104242444038391, "learning_rate": 4.86310142391087e-05, "loss": 0.0217, "step": 6730 }, { "epoch": 10.681458003169572, "grad_norm": 0.22012865543365479, "learning_rate": 4.850004841228852e-05, "loss": 0.0187, "step": 6740 }, { "epoch": 10.697305863708399, "grad_norm": 0.252900093793869, "learning_rate": 4.836909288484208e-05, "loss": 0.0284, "step": 6750 }, { "epoch": 10.713153724247226, "grad_norm": 0.2362486571073532, "learning_rate": 4.8238148555971704e-05, "loss": 0.0178, "step": 6760 }, { "epoch": 10.729001584786054, "grad_norm": 0.28352028131484985, "learning_rate": 4.81072163248028e-05, "loss": 0.0281, "step": 6770 }, { "epoch": 10.744849445324881, "grad_norm": 0.31054121255874634, "learning_rate": 4.7976297090377706e-05, "loss": 0.0271, "step": 6780 }, { "epoch": 10.760697305863708, "grad_norm": 0.15438808500766754, "learning_rate": 4.7845391751649505e-05, "loss": 0.0256, "step": 6790 }, { "epoch": 10.776545166402535, "grad_norm": 0.17651043832302094, "learning_rate": 4.7714501207475884e-05, "loss": 0.0218, "step": 6800 }, { "epoch": 10.792393026941363, "grad_norm": 0.2993830740451813, "learning_rate": 4.7583626356612954e-05, "loss": 0.0219, "step": 6810 }, { "epoch": 10.80824088748019, "grad_norm": 0.21443192660808563, "learning_rate": 4.745276809770905e-05, "loss": 0.0198, "step": 6820 }, { "epoch": 10.824088748019017, "grad_norm": 0.22990483045578003, "learning_rate": 4.732192732929858e-05, "loss": 0.024, "step": 6830 }, { "epoch": 10.839936608557844, "grad_norm": 0.2523830831050873, "learning_rate": 4.7191104949795845e-05, "loss": 0.02, "step": 6840 }, { "epoch": 10.855784469096672, "grad_norm": 0.19074945151805878, "learning_rate": 4.706030185748894e-05, "loss": 0.0235, "step": 6850 }, { "epoch": 10.8716323296355, "grad_norm": 0.17805525660514832, "learning_rate": 4.692951895053342e-05, "loss": 0.024, "step": 6860 }, { "epoch": 10.887480190174326, "grad_norm": 0.25457364320755005, "learning_rate": 4.6798757126946324e-05, "loss": 0.0225, "step": 6870 }, { "epoch": 10.903328050713153, "grad_norm": 0.2769658863544464, "learning_rate": 4.6668017284599866e-05, "loss": 0.0186, "step": 6880 }, { "epoch": 10.919175911251982, "grad_norm": 0.27840906381607056, "learning_rate": 4.653730032121539e-05, "loss": 0.0213, "step": 6890 }, { "epoch": 10.935023771790808, "grad_norm": 0.31035539507865906, "learning_rate": 4.640660713435709e-05, "loss": 0.022, "step": 6900 }, { "epoch": 10.950871632329635, "grad_norm": 0.2523256540298462, "learning_rate": 4.627593862142594e-05, "loss": 0.0261, "step": 6910 }, { "epoch": 10.966719492868462, "grad_norm": 0.2741487920284271, "learning_rate": 4.61452956796534e-05, "loss": 0.0243, "step": 6920 }, { "epoch": 10.98256735340729, "grad_norm": 0.18995286524295807, "learning_rate": 4.601467920609547e-05, "loss": 0.0261, "step": 6930 }, { "epoch": 10.998415213946117, "grad_norm": 0.33396896719932556, "learning_rate": 4.588409009762634e-05, "loss": 0.0268, "step": 6940 }, { "epoch": 11.014263074484944, "grad_norm": 0.2645708918571472, "learning_rate": 4.575352925093229e-05, "loss": 0.0221, "step": 6950 }, { "epoch": 11.030110935023771, "grad_norm": 0.21601872146129608, "learning_rate": 4.562299756250557e-05, "loss": 0.0197, "step": 6960 }, { "epoch": 11.0459587955626, "grad_norm": 0.26823803782463074, "learning_rate": 4.549249592863822e-05, "loss": 0.0318, "step": 6970 }, { "epoch": 11.061806656101426, "grad_norm": 0.40468984842300415, "learning_rate": 4.536202524541588e-05, "loss": 0.0201, "step": 6980 }, { "epoch": 11.077654516640253, "grad_norm": 0.2228170931339264, "learning_rate": 4.5231586408711684e-05, "loss": 0.0232, "step": 6990 }, { "epoch": 11.09350237717908, "grad_norm": 0.17821644246578217, "learning_rate": 4.510118031418009e-05, "loss": 0.0193, "step": 7000 }, { "epoch": 11.109350237717909, "grad_norm": 0.22201032936573029, "learning_rate": 4.4970807857250745e-05, "loss": 0.0235, "step": 7010 }, { "epoch": 11.125198098256735, "grad_norm": 0.16020157933235168, "learning_rate": 4.4840469933122314e-05, "loss": 0.0206, "step": 7020 }, { "epoch": 11.141045958795562, "grad_norm": 0.18815340101718903, "learning_rate": 4.471016743675633e-05, "loss": 0.0202, "step": 7030 }, { "epoch": 11.15689381933439, "grad_norm": 0.2237204611301422, "learning_rate": 4.457990126287112e-05, "loss": 0.021, "step": 7040 }, { "epoch": 11.172741679873218, "grad_norm": 0.2936099171638489, "learning_rate": 4.444967230593551e-05, "loss": 0.0203, "step": 7050 }, { "epoch": 11.188589540412044, "grad_norm": 0.1436583399772644, "learning_rate": 4.431948146016286e-05, "loss": 0.0197, "step": 7060 }, { "epoch": 11.204437400950871, "grad_norm": 0.2675095796585083, "learning_rate": 4.418932961950478e-05, "loss": 0.02, "step": 7070 }, { "epoch": 11.2202852614897, "grad_norm": 0.23882818222045898, "learning_rate": 4.405921767764511e-05, "loss": 0.0217, "step": 7080 }, { "epoch": 11.236133122028527, "grad_norm": 0.2709539830684662, "learning_rate": 4.392914652799368e-05, "loss": 0.0209, "step": 7090 }, { "epoch": 11.251980982567353, "grad_norm": 0.18802231550216675, "learning_rate": 4.3799117063680254e-05, "loss": 0.0173, "step": 7100 }, { "epoch": 11.26782884310618, "grad_norm": 0.25173911452293396, "learning_rate": 4.366913017754836e-05, "loss": 0.0228, "step": 7110 }, { "epoch": 11.283676703645009, "grad_norm": 0.2181670218706131, "learning_rate": 4.3539186762149106e-05, "loss": 0.016, "step": 7120 }, { "epoch": 11.299524564183836, "grad_norm": 0.18725943565368652, "learning_rate": 4.3409287709735204e-05, "loss": 0.0234, "step": 7130 }, { "epoch": 11.315372424722662, "grad_norm": 0.3149115741252899, "learning_rate": 4.3279433912254675e-05, "loss": 0.0213, "step": 7140 }, { "epoch": 11.33122028526149, "grad_norm": 0.2042395919561386, "learning_rate": 4.314962626134484e-05, "loss": 0.0206, "step": 7150 }, { "epoch": 11.347068145800318, "grad_norm": 0.14478328824043274, "learning_rate": 4.301986564832613e-05, "loss": 0.0203, "step": 7160 }, { "epoch": 11.362916006339145, "grad_norm": 0.20697103440761566, "learning_rate": 4.289015296419603e-05, "loss": 0.0156, "step": 7170 }, { "epoch": 11.378763866877971, "grad_norm": 0.2516174912452698, "learning_rate": 4.276048909962286e-05, "loss": 0.021, "step": 7180 }, { "epoch": 11.394611727416798, "grad_norm": 0.30749985575675964, "learning_rate": 4.263087494493977e-05, "loss": 0.0189, "step": 7190 }, { "epoch": 11.410459587955627, "grad_norm": 0.2317238450050354, "learning_rate": 4.2501311390138574e-05, "loss": 0.0245, "step": 7200 }, { "epoch": 11.426307448494454, "grad_norm": 0.24530279636383057, "learning_rate": 4.2371799324863614e-05, "loss": 0.0185, "step": 7210 }, { "epoch": 11.44215530903328, "grad_norm": 0.16856257617473602, "learning_rate": 4.224233963840574e-05, "loss": 0.0223, "step": 7220 }, { "epoch": 11.458003169572107, "grad_norm": 0.15289132297039032, "learning_rate": 4.2112933219696106e-05, "loss": 0.0157, "step": 7230 }, { "epoch": 11.473851030110936, "grad_norm": 0.17484936118125916, "learning_rate": 4.198358095730006e-05, "loss": 0.0212, "step": 7240 }, { "epoch": 11.489698890649763, "grad_norm": 0.18419259786605835, "learning_rate": 4.185428373941115e-05, "loss": 0.0207, "step": 7250 }, { "epoch": 11.50554675118859, "grad_norm": 0.2928980588912964, "learning_rate": 4.172504245384496e-05, "loss": 0.0217, "step": 7260 }, { "epoch": 11.521394611727416, "grad_norm": 0.19275160133838654, "learning_rate": 4.1595857988033e-05, "loss": 0.0194, "step": 7270 }, { "epoch": 11.537242472266245, "grad_norm": 0.3847340941429138, "learning_rate": 4.146673122901662e-05, "loss": 0.0199, "step": 7280 }, { "epoch": 11.553090332805072, "grad_norm": 0.25312259793281555, "learning_rate": 4.1337663063440946e-05, "loss": 0.0174, "step": 7290 }, { "epoch": 11.568938193343898, "grad_norm": 0.274879515171051, "learning_rate": 4.120865437754877e-05, "loss": 0.0238, "step": 7300 }, { "epoch": 11.584786053882725, "grad_norm": 0.22004622220993042, "learning_rate": 4.1079706057174455e-05, "loss": 0.0231, "step": 7310 }, { "epoch": 11.600633914421554, "grad_norm": 0.4630294740200043, "learning_rate": 4.095081898773787e-05, "loss": 0.022, "step": 7320 }, { "epoch": 11.61648177496038, "grad_norm": 0.15254133939743042, "learning_rate": 4.0821994054238325e-05, "loss": 0.0218, "step": 7330 }, { "epoch": 11.632329635499207, "grad_norm": 0.18909721076488495, "learning_rate": 4.069323214124845e-05, "loss": 0.0241, "step": 7340 }, { "epoch": 11.648177496038034, "grad_norm": 0.18203580379486084, "learning_rate": 4.0564534132908164e-05, "loss": 0.0206, "step": 7350 }, { "epoch": 11.664025356576863, "grad_norm": 0.31021520495414734, "learning_rate": 4.04359009129186e-05, "loss": 0.0229, "step": 7360 }, { "epoch": 11.67987321711569, "grad_norm": 0.21043580770492554, "learning_rate": 4.0307333364535973e-05, "loss": 0.0243, "step": 7370 }, { "epoch": 11.695721077654516, "grad_norm": 0.17714616656303406, "learning_rate": 4.017883237056561e-05, "loss": 0.02, "step": 7380 }, { "epoch": 11.711568938193343, "grad_norm": 0.23153972625732422, "learning_rate": 4.005039881335583e-05, "loss": 0.0178, "step": 7390 }, { "epoch": 11.727416798732172, "grad_norm": 0.7659839391708374, "learning_rate": 3.99220335747919e-05, "loss": 0.0213, "step": 7400 }, { "epoch": 11.743264659270999, "grad_norm": 0.2092520147562027, "learning_rate": 3.979373753628999e-05, "loss": 0.023, "step": 7410 }, { "epoch": 11.759112519809825, "grad_norm": 0.3415199816226959, "learning_rate": 3.9665511578791096e-05, "loss": 0.021, "step": 7420 }, { "epoch": 11.774960380348652, "grad_norm": 0.31222307682037354, "learning_rate": 3.9537356582755034e-05, "loss": 0.0214, "step": 7430 }, { "epoch": 11.79080824088748, "grad_norm": 0.18112266063690186, "learning_rate": 3.940927342815428e-05, "loss": 0.0234, "step": 7440 }, { "epoch": 11.806656101426308, "grad_norm": 0.28897473216056824, "learning_rate": 3.9281262994468114e-05, "loss": 0.0258, "step": 7450 }, { "epoch": 11.822503961965134, "grad_norm": 0.28549882769584656, "learning_rate": 3.915332616067643e-05, "loss": 0.0188, "step": 7460 }, { "epoch": 11.838351822503961, "grad_norm": 0.19967828691005707, "learning_rate": 3.9025463805253765e-05, "loss": 0.0201, "step": 7470 }, { "epoch": 11.85419968304279, "grad_norm": 0.27357855439186096, "learning_rate": 3.889767680616324e-05, "loss": 0.0193, "step": 7480 }, { "epoch": 11.870047543581617, "grad_norm": 0.202061265707016, "learning_rate": 3.8769966040850566e-05, "loss": 0.0188, "step": 7490 }, { "epoch": 11.885895404120443, "grad_norm": 0.24488794803619385, "learning_rate": 3.864233238623796e-05, "loss": 0.0177, "step": 7500 }, { "epoch": 11.90174326465927, "grad_norm": 0.23348113894462585, "learning_rate": 3.851477671871818e-05, "loss": 0.0189, "step": 7510 }, { "epoch": 11.917591125198099, "grad_norm": 0.31944724917411804, "learning_rate": 3.838729991414852e-05, "loss": 0.0211, "step": 7520 }, { "epoch": 11.933438985736926, "grad_norm": 0.24721786379814148, "learning_rate": 3.82599028478447e-05, "loss": 0.0159, "step": 7530 }, { "epoch": 11.949286846275752, "grad_norm": 0.2412160336971283, "learning_rate": 3.8132586394574974e-05, "loss": 0.0231, "step": 7540 }, { "epoch": 11.96513470681458, "grad_norm": 0.2842359244823456, "learning_rate": 3.8005351428554036e-05, "loss": 0.0179, "step": 7550 }, { "epoch": 11.980982567353408, "grad_norm": 0.19113971292972565, "learning_rate": 3.78781988234371e-05, "loss": 0.0178, "step": 7560 }, { "epoch": 11.996830427892235, "grad_norm": 0.24129873514175415, "learning_rate": 3.775112945231377e-05, "loss": 0.0214, "step": 7570 }, { "epoch": 12.012678288431061, "grad_norm": 0.30563119053840637, "learning_rate": 3.7624144187702174e-05, "loss": 0.0207, "step": 7580 }, { "epoch": 12.028526148969888, "grad_norm": 0.16946931183338165, "learning_rate": 3.7497243901542934e-05, "loss": 0.0194, "step": 7590 }, { "epoch": 12.044374009508717, "grad_norm": 0.23966370522975922, "learning_rate": 3.7370429465193154e-05, "loss": 0.0198, "step": 7600 }, { "epoch": 12.060221870047544, "grad_norm": 0.2549941837787628, "learning_rate": 3.724370174942047e-05, "loss": 0.023, "step": 7610 }, { "epoch": 12.07606973058637, "grad_norm": 0.2220945656299591, "learning_rate": 3.711706162439704e-05, "loss": 0.0174, "step": 7620 }, { "epoch": 12.091917591125197, "grad_norm": 0.16276349127292633, "learning_rate": 3.699050995969354e-05, "loss": 0.0192, "step": 7630 }, { "epoch": 12.107765451664026, "grad_norm": 0.3065180778503418, "learning_rate": 3.6864047624273325e-05, "loss": 0.019, "step": 7640 }, { "epoch": 12.123613312202853, "grad_norm": 0.19206896424293518, "learning_rate": 3.67376754864863e-05, "loss": 0.0149, "step": 7650 }, { "epoch": 12.13946117274168, "grad_norm": 0.21416613459587097, "learning_rate": 3.6611394414063074e-05, "loss": 0.0179, "step": 7660 }, { "epoch": 12.155309033280506, "grad_norm": 0.2737729251384735, "learning_rate": 3.6485205274108936e-05, "loss": 0.0235, "step": 7670 }, { "epoch": 12.171156893819335, "grad_norm": 0.17268019914627075, "learning_rate": 3.635910893309792e-05, "loss": 0.0162, "step": 7680 }, { "epoch": 12.187004754358162, "grad_norm": 0.23836471140384674, "learning_rate": 3.6233106256866895e-05, "loss": 0.0174, "step": 7690 }, { "epoch": 12.202852614896988, "grad_norm": 0.447587788105011, "learning_rate": 3.610719811060952e-05, "loss": 0.0189, "step": 7700 }, { "epoch": 12.218700475435817, "grad_norm": 0.21118977665901184, "learning_rate": 3.598138535887041e-05, "loss": 0.0183, "step": 7710 }, { "epoch": 12.234548335974644, "grad_norm": 0.257715106010437, "learning_rate": 3.585566886553917e-05, "loss": 0.0209, "step": 7720 }, { "epoch": 12.25039619651347, "grad_norm": 0.295749694108963, "learning_rate": 3.5730049493844405e-05, "loss": 0.0261, "step": 7730 }, { "epoch": 12.266244057052297, "grad_norm": 0.3179740607738495, "learning_rate": 3.560452810634787e-05, "loss": 0.0214, "step": 7740 }, { "epoch": 12.282091917591124, "grad_norm": 0.1746010035276413, "learning_rate": 3.547910556493852e-05, "loss": 0.0208, "step": 7750 }, { "epoch": 12.297939778129953, "grad_norm": 0.2330365628004074, "learning_rate": 3.535378273082656e-05, "loss": 0.0208, "step": 7760 }, { "epoch": 12.31378763866878, "grad_norm": 0.39738985896110535, "learning_rate": 3.5228560464537535e-05, "loss": 0.0239, "step": 7770 }, { "epoch": 12.329635499207606, "grad_norm": 0.2947781980037689, "learning_rate": 3.510343962590653e-05, "loss": 0.0191, "step": 7780 }, { "epoch": 12.345483359746435, "grad_norm": 0.21791400015354156, "learning_rate": 3.49784210740721e-05, "loss": 0.0264, "step": 7790 }, { "epoch": 12.361331220285262, "grad_norm": 0.19092513620853424, "learning_rate": 3.485350566747049e-05, "loss": 0.0248, "step": 7800 }, { "epoch": 12.377179080824089, "grad_norm": 0.35505980253219604, "learning_rate": 3.4728694263829684e-05, "loss": 0.0199, "step": 7810 }, { "epoch": 12.393026941362915, "grad_norm": 0.1710539311170578, "learning_rate": 3.460398772016355e-05, "loss": 0.019, "step": 7820 }, { "epoch": 12.408874801901744, "grad_norm": 0.33750495314598083, "learning_rate": 3.4479386892765905e-05, "loss": 0.0205, "step": 7830 }, { "epoch": 12.42472266244057, "grad_norm": 0.2829129099845886, "learning_rate": 3.43548926372047e-05, "loss": 0.0198, "step": 7840 }, { "epoch": 12.440570522979398, "grad_norm": 0.18969641625881195, "learning_rate": 3.423050580831611e-05, "loss": 0.0205, "step": 7850 }, { "epoch": 12.456418383518225, "grad_norm": 0.2330506592988968, "learning_rate": 3.410622726019865e-05, "loss": 0.0213, "step": 7860 }, { "epoch": 12.472266244057053, "grad_norm": 0.2536896765232086, "learning_rate": 3.398205784620735e-05, "loss": 0.0207, "step": 7870 }, { "epoch": 12.48811410459588, "grad_norm": 0.16537010669708252, "learning_rate": 3.3857998418947864e-05, "loss": 0.0169, "step": 7880 }, { "epoch": 12.503961965134707, "grad_norm": 0.2565062344074249, "learning_rate": 3.373404983027062e-05, "loss": 0.0214, "step": 7890 }, { "epoch": 12.519809825673534, "grad_norm": 0.18320074677467346, "learning_rate": 3.361021293126497e-05, "loss": 0.0166, "step": 7900 }, { "epoch": 12.535657686212362, "grad_norm": 0.2510707378387451, "learning_rate": 3.3486488572253385e-05, "loss": 0.0173, "step": 7910 }, { "epoch": 12.551505546751189, "grad_norm": 0.24890565872192383, "learning_rate": 3.3362877602785524e-05, "loss": 0.0196, "step": 7920 }, { "epoch": 12.567353407290016, "grad_norm": 0.25348639488220215, "learning_rate": 3.3239380871632543e-05, "loss": 0.0201, "step": 7930 }, { "epoch": 12.583201267828843, "grad_norm": 0.2547270655632019, "learning_rate": 3.3115999226781135e-05, "loss": 0.0163, "step": 7940 }, { "epoch": 12.599049128367671, "grad_norm": 0.1903742551803589, "learning_rate": 3.299273351542773e-05, "loss": 0.0162, "step": 7950 }, { "epoch": 12.614896988906498, "grad_norm": 0.14592960476875305, "learning_rate": 3.286958458397273e-05, "loss": 0.0218, "step": 7960 }, { "epoch": 12.630744849445325, "grad_norm": 0.220992311835289, "learning_rate": 3.27465532780147e-05, "loss": 0.0193, "step": 7970 }, { "epoch": 12.646592709984152, "grad_norm": 0.3510618209838867, "learning_rate": 3.2623640442344505e-05, "loss": 0.021, "step": 7980 }, { "epoch": 12.66244057052298, "grad_norm": 0.1398414969444275, "learning_rate": 3.250084692093953e-05, "loss": 0.0199, "step": 7990 }, { "epoch": 12.678288431061807, "grad_norm": 0.24324694275856018, "learning_rate": 3.237817355695791e-05, "loss": 0.0172, "step": 8000 }, { "epoch": 12.694136291600634, "grad_norm": 0.20084106922149658, "learning_rate": 3.225562119273272e-05, "loss": 0.0134, "step": 8010 }, { "epoch": 12.70998415213946, "grad_norm": 0.20435374975204468, "learning_rate": 3.213319066976617e-05, "loss": 0.019, "step": 8020 }, { "epoch": 12.72583201267829, "grad_norm": 0.21612811088562012, "learning_rate": 3.201088282872387e-05, "loss": 0.0159, "step": 8030 }, { "epoch": 12.741679873217116, "grad_norm": 0.2342618703842163, "learning_rate": 3.188869850942905e-05, "loss": 0.0186, "step": 8040 }, { "epoch": 12.757527733755943, "grad_norm": 0.20277902483940125, "learning_rate": 3.176663855085677e-05, "loss": 0.0209, "step": 8050 }, { "epoch": 12.77337559429477, "grad_norm": 0.2995304763317108, "learning_rate": 3.164470379112816e-05, "loss": 0.0247, "step": 8060 }, { "epoch": 12.789223454833598, "grad_norm": 0.23769770562648773, "learning_rate": 3.15228950675047e-05, "loss": 0.0152, "step": 8070 }, { "epoch": 12.805071315372425, "grad_norm": 0.1370396465063095, "learning_rate": 3.140121321638241e-05, "loss": 0.0177, "step": 8080 }, { "epoch": 12.820919175911252, "grad_norm": 0.4313637614250183, "learning_rate": 3.127965907328617e-05, "loss": 0.0154, "step": 8090 }, { "epoch": 12.836767036450079, "grad_norm": 0.2073371410369873, "learning_rate": 3.115823347286397e-05, "loss": 0.0165, "step": 8100 }, { "epoch": 12.852614896988907, "grad_norm": 0.32266175746917725, "learning_rate": 3.103693724888112e-05, "loss": 0.0212, "step": 8110 }, { "epoch": 12.868462757527734, "grad_norm": 0.1806778460741043, "learning_rate": 3.091577123421462e-05, "loss": 0.0145, "step": 8120 }, { "epoch": 12.88431061806656, "grad_norm": 0.25016674399375916, "learning_rate": 3.079473626084737e-05, "loss": 0.0211, "step": 8130 }, { "epoch": 12.900158478605388, "grad_norm": 0.16698500514030457, "learning_rate": 3.067383315986249e-05, "loss": 0.0228, "step": 8140 }, { "epoch": 12.916006339144216, "grad_norm": 0.22536715865135193, "learning_rate": 3.055306276143754e-05, "loss": 0.0213, "step": 8150 }, { "epoch": 12.931854199683043, "grad_norm": 0.17826388776302338, "learning_rate": 3.0432425894838977e-05, "loss": 0.023, "step": 8160 }, { "epoch": 12.94770206022187, "grad_norm": 0.22973258793354034, "learning_rate": 3.031192338841631e-05, "loss": 0.0188, "step": 8170 }, { "epoch": 12.963549920760697, "grad_norm": 0.3207305669784546, "learning_rate": 3.0191556069596476e-05, "loss": 0.0199, "step": 8180 }, { "epoch": 12.979397781299525, "grad_norm": 0.19772501289844513, "learning_rate": 3.0071324764878155e-05, "loss": 0.0177, "step": 8190 }, { "epoch": 12.995245641838352, "grad_norm": 0.19332300126552582, "learning_rate": 2.99512302998261e-05, "loss": 0.0243, "step": 8200 }, { "epoch": 13.011093502377179, "grad_norm": 0.22696681320667267, "learning_rate": 2.9831273499065422e-05, "loss": 0.0178, "step": 8210 }, { "epoch": 13.026941362916006, "grad_norm": 0.2711600065231323, "learning_rate": 2.9711455186275998e-05, "loss": 0.0149, "step": 8220 }, { "epoch": 13.042789223454834, "grad_norm": 0.22301819920539856, "learning_rate": 2.959177618418678e-05, "loss": 0.0201, "step": 8230 }, { "epoch": 13.058637083993661, "grad_norm": 0.1777944713830948, "learning_rate": 2.9472237314570134e-05, "loss": 0.0187, "step": 8240 }, { "epoch": 13.074484944532488, "grad_norm": 0.24867452681064606, "learning_rate": 2.935283939823621e-05, "loss": 0.0217, "step": 8250 }, { "epoch": 13.090332805071315, "grad_norm": 0.24219559133052826, "learning_rate": 2.9233583255027313e-05, "loss": 0.013, "step": 8260 }, { "epoch": 13.106180665610143, "grad_norm": 0.14742301404476166, "learning_rate": 2.9114469703812292e-05, "loss": 0.0199, "step": 8270 }, { "epoch": 13.12202852614897, "grad_norm": 0.167776420712471, "learning_rate": 2.8995499562480842e-05, "loss": 0.0183, "step": 8280 }, { "epoch": 13.137876386687797, "grad_norm": 0.29319486021995544, "learning_rate": 2.8876673647937945e-05, "loss": 0.0208, "step": 8290 }, { "epoch": 13.153724247226624, "grad_norm": 0.1555861234664917, "learning_rate": 2.875799277609832e-05, "loss": 0.0194, "step": 8300 }, { "epoch": 13.169572107765452, "grad_norm": 0.1766081005334854, "learning_rate": 2.863945776188065e-05, "loss": 0.0182, "step": 8310 }, { "epoch": 13.185419968304279, "grad_norm": 0.2022436112165451, "learning_rate": 2.8521069419202195e-05, "loss": 0.0161, "step": 8320 }, { "epoch": 13.201267828843106, "grad_norm": 0.1649257242679596, "learning_rate": 2.840282856097304e-05, "loss": 0.0168, "step": 8330 }, { "epoch": 13.217115689381933, "grad_norm": 0.24146905541419983, "learning_rate": 2.828473599909055e-05, "loss": 0.0178, "step": 8340 }, { "epoch": 13.232963549920761, "grad_norm": 0.20440474152565002, "learning_rate": 2.8166792544433894e-05, "loss": 0.0251, "step": 8350 }, { "epoch": 13.248811410459588, "grad_norm": 0.21215130388736725, "learning_rate": 2.8048999006858323e-05, "loss": 0.0225, "step": 8360 }, { "epoch": 13.264659270998415, "grad_norm": 0.17490635812282562, "learning_rate": 2.7931356195189735e-05, "loss": 0.0151, "step": 8370 }, { "epoch": 13.280507131537242, "grad_norm": 0.2777180075645447, "learning_rate": 2.781386491721908e-05, "loss": 0.0178, "step": 8380 }, { "epoch": 13.29635499207607, "grad_norm": 0.23932000994682312, "learning_rate": 2.7696525979696752e-05, "loss": 0.0147, "step": 8390 }, { "epoch": 13.312202852614897, "grad_norm": 0.19922451674938202, "learning_rate": 2.7579340188327186e-05, "loss": 0.0168, "step": 8400 }, { "epoch": 13.328050713153724, "grad_norm": 0.2395889014005661, "learning_rate": 2.7462308347763127e-05, "loss": 0.017, "step": 8410 }, { "epoch": 13.343898573692552, "grad_norm": 0.23529374599456787, "learning_rate": 2.7345431261600317e-05, "loss": 0.0197, "step": 8420 }, { "epoch": 13.35974643423138, "grad_norm": 0.2671940326690674, "learning_rate": 2.7228709732371886e-05, "loss": 0.0155, "step": 8430 }, { "epoch": 13.375594294770206, "grad_norm": 0.2091439962387085, "learning_rate": 2.7112144561542757e-05, "loss": 0.0205, "step": 8440 }, { "epoch": 13.391442155309033, "grad_norm": 0.20118452608585358, "learning_rate": 2.6995736549504315e-05, "loss": 0.015, "step": 8450 }, { "epoch": 13.407290015847861, "grad_norm": 0.15710382163524628, "learning_rate": 2.687948649556874e-05, "loss": 0.0192, "step": 8460 }, { "epoch": 13.423137876386688, "grad_norm": 0.22499555349349976, "learning_rate": 2.6763395197963626e-05, "loss": 0.0268, "step": 8470 }, { "epoch": 13.438985736925515, "grad_norm": 0.17233209311962128, "learning_rate": 2.6647463453826505e-05, "loss": 0.0191, "step": 8480 }, { "epoch": 13.454833597464342, "grad_norm": 0.28862184286117554, "learning_rate": 2.6531692059199275e-05, "loss": 0.0196, "step": 8490 }, { "epoch": 13.47068145800317, "grad_norm": 0.19401662051677704, "learning_rate": 2.6416081809022887e-05, "loss": 0.0171, "step": 8500 }, { "epoch": 13.486529318541997, "grad_norm": 0.21995659172534943, "learning_rate": 2.6300633497131687e-05, "loss": 0.0195, "step": 8510 }, { "epoch": 13.502377179080824, "grad_norm": 0.2321847379207611, "learning_rate": 2.618534791624816e-05, "loss": 0.0209, "step": 8520 }, { "epoch": 13.51822503961965, "grad_norm": 0.21036501228809357, "learning_rate": 2.6070225857977428e-05, "loss": 0.0204, "step": 8530 }, { "epoch": 13.53407290015848, "grad_norm": 0.2640347480773926, "learning_rate": 2.5955268112801656e-05, "loss": 0.0158, "step": 8540 }, { "epoch": 13.549920760697306, "grad_norm": 0.30468320846557617, "learning_rate": 2.58404754700749e-05, "loss": 0.0151, "step": 8550 }, { "epoch": 13.565768621236133, "grad_norm": 0.19475166499614716, "learning_rate": 2.5725848718017454e-05, "loss": 0.0194, "step": 8560 }, { "epoch": 13.58161648177496, "grad_norm": 0.18407198786735535, "learning_rate": 2.561138864371057e-05, "loss": 0.017, "step": 8570 }, { "epoch": 13.597464342313788, "grad_norm": 0.197821244597435, "learning_rate": 2.549709603309104e-05, "loss": 0.0192, "step": 8580 }, { "epoch": 13.613312202852615, "grad_norm": 0.19414368271827698, "learning_rate": 2.53829716709457e-05, "loss": 0.0161, "step": 8590 }, { "epoch": 13.629160063391442, "grad_norm": 0.32657763361930847, "learning_rate": 2.5269016340906138e-05, "loss": 0.0193, "step": 8600 }, { "epoch": 13.645007923930269, "grad_norm": 0.17926651239395142, "learning_rate": 2.5155230825443332e-05, "loss": 0.0172, "step": 8610 }, { "epoch": 13.660855784469097, "grad_norm": 0.1641903668642044, "learning_rate": 2.504161590586217e-05, "loss": 0.0171, "step": 8620 }, { "epoch": 13.676703645007924, "grad_norm": 0.23365381360054016, "learning_rate": 2.4928172362296205e-05, "loss": 0.0149, "step": 8630 }, { "epoch": 13.692551505546751, "grad_norm": 0.2839002311229706, "learning_rate": 2.4814900973702183e-05, "loss": 0.0198, "step": 8640 }, { "epoch": 13.708399366085578, "grad_norm": 0.233973428606987, "learning_rate": 2.4701802517854822e-05, "loss": 0.022, "step": 8650 }, { "epoch": 13.724247226624406, "grad_norm": 0.2717144191265106, "learning_rate": 2.458887777134134e-05, "loss": 0.0199, "step": 8660 }, { "epoch": 13.740095087163233, "grad_norm": 0.2552318274974823, "learning_rate": 2.44761275095562e-05, "loss": 0.019, "step": 8670 }, { "epoch": 13.75594294770206, "grad_norm": 0.17286346852779388, "learning_rate": 2.4363552506695814e-05, "loss": 0.0182, "step": 8680 }, { "epoch": 13.771790808240887, "grad_norm": 0.1892533153295517, "learning_rate": 2.4251153535753107e-05, "loss": 0.0212, "step": 8690 }, { "epoch": 13.787638668779715, "grad_norm": 0.15570400655269623, "learning_rate": 2.4138931368512375e-05, "loss": 0.0178, "step": 8700 }, { "epoch": 13.803486529318542, "grad_norm": 0.287626177072525, "learning_rate": 2.402688677554381e-05, "loss": 0.0166, "step": 8710 }, { "epoch": 13.819334389857369, "grad_norm": 0.3084344267845154, "learning_rate": 2.3915020526198373e-05, "loss": 0.0148, "step": 8720 }, { "epoch": 13.835182250396196, "grad_norm": 0.13890209794044495, "learning_rate": 2.3803333388602372e-05, "loss": 0.0158, "step": 8730 }, { "epoch": 13.851030110935024, "grad_norm": 0.24919134378433228, "learning_rate": 2.3691826129652267e-05, "loss": 0.0202, "step": 8740 }, { "epoch": 13.866877971473851, "grad_norm": 0.19362711906433105, "learning_rate": 2.3580499515009408e-05, "loss": 0.0186, "step": 8750 }, { "epoch": 13.882725832012678, "grad_norm": 0.23859569430351257, "learning_rate": 2.346935430909476e-05, "loss": 0.018, "step": 8760 }, { "epoch": 13.898573692551505, "grad_norm": 0.41652438044548035, "learning_rate": 2.335839127508359e-05, "loss": 0.018, "step": 8770 }, { "epoch": 13.914421553090333, "grad_norm": 0.19404253363609314, "learning_rate": 2.3247611174900375e-05, "loss": 0.0144, "step": 8780 }, { "epoch": 13.93026941362916, "grad_norm": 0.27209949493408203, "learning_rate": 2.3137014769213415e-05, "loss": 0.0181, "step": 8790 }, { "epoch": 13.946117274167987, "grad_norm": 0.15419328212738037, "learning_rate": 2.3026602817429677e-05, "loss": 0.0176, "step": 8800 }, { "epoch": 13.961965134706814, "grad_norm": 0.22414186596870422, "learning_rate": 2.291637607768964e-05, "loss": 0.0224, "step": 8810 }, { "epoch": 13.977812995245642, "grad_norm": 0.16095861792564392, "learning_rate": 2.280633530686195e-05, "loss": 0.0152, "step": 8820 }, { "epoch": 13.99366085578447, "grad_norm": 0.1415528804063797, "learning_rate": 2.2696481260538393e-05, "loss": 0.0156, "step": 8830 }, { "epoch": 14.009508716323296, "grad_norm": 0.1570771187543869, "learning_rate": 2.2586814693028524e-05, "loss": 0.0173, "step": 8840 }, { "epoch": 14.025356576862123, "grad_norm": 0.2337312251329422, "learning_rate": 2.247733635735466e-05, "loss": 0.0197, "step": 8850 }, { "epoch": 14.041204437400951, "grad_norm": 0.2519458532333374, "learning_rate": 2.2368047005246585e-05, "loss": 0.0177, "step": 8860 }, { "epoch": 14.057052297939778, "grad_norm": 0.26522183418273926, "learning_rate": 2.2258947387136415e-05, "loss": 0.0192, "step": 8870 }, { "epoch": 14.072900158478605, "grad_norm": 0.12336030602455139, "learning_rate": 2.2150038252153533e-05, "loss": 0.0175, "step": 8880 }, { "epoch": 14.088748019017432, "grad_norm": 0.15576300024986267, "learning_rate": 2.204132034811929e-05, "loss": 0.0174, "step": 8890 }, { "epoch": 14.10459587955626, "grad_norm": 0.21424925327301025, "learning_rate": 2.1932794421542018e-05, "loss": 0.0142, "step": 8900 }, { "epoch": 14.120443740095087, "grad_norm": 0.21682120859622955, "learning_rate": 2.182446121761186e-05, "loss": 0.0191, "step": 8910 }, { "epoch": 14.136291600633914, "grad_norm": 0.25047534704208374, "learning_rate": 2.171632148019552e-05, "loss": 0.0154, "step": 8920 }, { "epoch": 14.152139461172741, "grad_norm": 0.2971823513507843, "learning_rate": 2.1608375951831383e-05, "loss": 0.0227, "step": 8930 }, { "epoch": 14.16798732171157, "grad_norm": 0.2523512542247772, "learning_rate": 2.1500625373724286e-05, "loss": 0.0144, "step": 8940 }, { "epoch": 14.183835182250396, "grad_norm": 0.21813775599002838, "learning_rate": 2.1393070485740386e-05, "loss": 0.0154, "step": 8950 }, { "epoch": 14.199683042789223, "grad_norm": 0.2209501713514328, "learning_rate": 2.1285712026402215e-05, "loss": 0.0137, "step": 8960 }, { "epoch": 14.21553090332805, "grad_norm": 0.1733659952878952, "learning_rate": 2.117855073288346e-05, "loss": 0.0133, "step": 8970 }, { "epoch": 14.231378763866879, "grad_norm": 0.19718633592128754, "learning_rate": 2.1071587341004058e-05, "loss": 0.0212, "step": 8980 }, { "epoch": 14.247226624405705, "grad_norm": 0.23138895630836487, "learning_rate": 2.0964822585224987e-05, "loss": 0.0218, "step": 8990 }, { "epoch": 14.263074484944532, "grad_norm": 0.22604243457317352, "learning_rate": 2.08582571986433e-05, "loss": 0.0165, "step": 9000 }, { "epoch": 14.278922345483359, "grad_norm": 0.21740014851093292, "learning_rate": 2.075189191298716e-05, "loss": 0.018, "step": 9010 }, { "epoch": 14.294770206022188, "grad_norm": 0.5042977333068848, "learning_rate": 2.0645727458610646e-05, "loss": 0.015, "step": 9020 }, { "epoch": 14.310618066561014, "grad_norm": 0.17162521183490753, "learning_rate": 2.0539764564488927e-05, "loss": 0.0147, "step": 9030 }, { "epoch": 14.326465927099841, "grad_norm": 0.23630589246749878, "learning_rate": 2.04340039582131e-05, "loss": 0.0168, "step": 9040 }, { "epoch": 14.342313787638668, "grad_norm": 0.22610369324684143, "learning_rate": 2.0328446365985253e-05, "loss": 0.019, "step": 9050 }, { "epoch": 14.358161648177497, "grad_norm": 0.23171366751194, "learning_rate": 2.022309251261355e-05, "loss": 0.0185, "step": 9060 }, { "epoch": 14.374009508716323, "grad_norm": 0.20405028760433197, "learning_rate": 2.0117943121507117e-05, "loss": 0.018, "step": 9070 }, { "epoch": 14.38985736925515, "grad_norm": 0.20171862840652466, "learning_rate": 2.0012998914671182e-05, "loss": 0.0156, "step": 9080 }, { "epoch": 14.405705229793977, "grad_norm": 0.2580902874469757, "learning_rate": 1.99082606127021e-05, "loss": 0.018, "step": 9090 }, { "epoch": 14.421553090332806, "grad_norm": 0.16781866550445557, "learning_rate": 1.9803728934782323e-05, "loss": 0.0178, "step": 9100 }, { "epoch": 14.437400950871632, "grad_norm": 0.21224135160446167, "learning_rate": 1.969940459867562e-05, "loss": 0.0169, "step": 9110 }, { "epoch": 14.45324881141046, "grad_norm": 0.16903094947338104, "learning_rate": 1.9595288320721923e-05, "loss": 0.0138, "step": 9120 }, { "epoch": 14.469096671949288, "grad_norm": 0.2130252569913864, "learning_rate": 1.949138081583265e-05, "loss": 0.0175, "step": 9130 }, { "epoch": 14.484944532488115, "grad_norm": 0.2133990377187729, "learning_rate": 1.938768279748566e-05, "loss": 0.0169, "step": 9140 }, { "epoch": 14.500792393026941, "grad_norm": 0.19141750037670135, "learning_rate": 1.9284194977720344e-05, "loss": 0.0139, "step": 9150 }, { "epoch": 14.516640253565768, "grad_norm": 0.18053506314754486, "learning_rate": 1.9180918067132813e-05, "loss": 0.0202, "step": 9160 }, { "epoch": 14.532488114104595, "grad_norm": 0.2015606015920639, "learning_rate": 1.9077852774870945e-05, "loss": 0.0188, "step": 9170 }, { "epoch": 14.548335974643424, "grad_norm": 0.2063121348619461, "learning_rate": 1.8974999808629545e-05, "loss": 0.0141, "step": 9180 }, { "epoch": 14.56418383518225, "grad_norm": 0.14588534832000732, "learning_rate": 1.887235987464553e-05, "loss": 0.0147, "step": 9190 }, { "epoch": 14.580031695721077, "grad_norm": 0.17593805491924286, "learning_rate": 1.876993367769297e-05, "loss": 0.0139, "step": 9200 }, { "epoch": 14.595879556259906, "grad_norm": 0.15790753066539764, "learning_rate": 1.8667721921078397e-05, "loss": 0.0123, "step": 9210 }, { "epoch": 14.611727416798733, "grad_norm": 0.23879548907279968, "learning_rate": 1.8565725306635806e-05, "loss": 0.0186, "step": 9220 }, { "epoch": 14.62757527733756, "grad_norm": 0.23344580829143524, "learning_rate": 1.8463944534722e-05, "loss": 0.0158, "step": 9230 }, { "epoch": 14.643423137876386, "grad_norm": 0.219131201505661, "learning_rate": 1.83623803042117e-05, "loss": 0.0197, "step": 9240 }, { "epoch": 14.659270998415215, "grad_norm": 0.17857685685157776, "learning_rate": 1.826103331249267e-05, "loss": 0.0128, "step": 9250 }, { "epoch": 14.675118858954042, "grad_norm": 0.19189006090164185, "learning_rate": 1.8159904255461108e-05, "loss": 0.0172, "step": 9260 }, { "epoch": 14.690966719492868, "grad_norm": 0.18938252329826355, "learning_rate": 1.8058993827516697e-05, "loss": 0.0212, "step": 9270 }, { "epoch": 14.706814580031695, "grad_norm": 0.20771273970603943, "learning_rate": 1.795830272155796e-05, "loss": 0.0248, "step": 9280 }, { "epoch": 14.722662440570524, "grad_norm": 0.22910486161708832, "learning_rate": 1.7857831628977456e-05, "loss": 0.015, "step": 9290 }, { "epoch": 14.73851030110935, "grad_norm": 0.20048457384109497, "learning_rate": 1.7757581239656984e-05, "loss": 0.0168, "step": 9300 }, { "epoch": 14.754358161648177, "grad_norm": 0.21910695731639862, "learning_rate": 1.7657552241962904e-05, "loss": 0.0119, "step": 9310 }, { "epoch": 14.770206022187004, "grad_norm": 0.214069664478302, "learning_rate": 1.7557745322741433e-05, "loss": 0.0167, "step": 9320 }, { "epoch": 14.786053882725833, "grad_norm": 0.20221184194087982, "learning_rate": 1.745816116731383e-05, "loss": 0.0153, "step": 9330 }, { "epoch": 14.80190174326466, "grad_norm": 0.1907825767993927, "learning_rate": 1.735880045947183e-05, "loss": 0.016, "step": 9340 }, { "epoch": 14.817749603803486, "grad_norm": 0.2389329969882965, "learning_rate": 1.7259663881472787e-05, "loss": 0.0168, "step": 9350 }, { "epoch": 14.833597464342313, "grad_norm": 0.2041391283273697, "learning_rate": 1.716075211403516e-05, "loss": 0.0166, "step": 9360 }, { "epoch": 14.849445324881142, "grad_norm": 0.3064650595188141, "learning_rate": 1.7062065836333696e-05, "loss": 0.0166, "step": 9370 }, { "epoch": 14.865293185419969, "grad_norm": 0.25269177556037903, "learning_rate": 1.6963605725994807e-05, "loss": 0.0179, "step": 9380 }, { "epoch": 14.881141045958795, "grad_norm": 0.13689862191677094, "learning_rate": 1.686537245909201e-05, "loss": 0.0136, "step": 9390 }, { "epoch": 14.896988906497622, "grad_norm": 0.2099904716014862, "learning_rate": 1.6767366710141125e-05, "loss": 0.0188, "step": 9400 }, { "epoch": 14.91283676703645, "grad_norm": 0.20536595582962036, "learning_rate": 1.666958915209578e-05, "loss": 0.0161, "step": 9410 }, { "epoch": 14.928684627575278, "grad_norm": 0.18782939016819, "learning_rate": 1.6572040456342737e-05, "loss": 0.0249, "step": 9420 }, { "epoch": 14.944532488114104, "grad_norm": 0.29753440618515015, "learning_rate": 1.6474721292697247e-05, "loss": 0.0174, "step": 9430 }, { "epoch": 14.960380348652931, "grad_norm": 0.14820578694343567, "learning_rate": 1.6377632329398507e-05, "loss": 0.0229, "step": 9440 }, { "epoch": 14.97622820919176, "grad_norm": 0.26186251640319824, "learning_rate": 1.628077423310503e-05, "loss": 0.0203, "step": 9450 }, { "epoch": 14.992076069730587, "grad_norm": 0.2948777675628662, "learning_rate": 1.6184147668890116e-05, "loss": 0.0192, "step": 9460 }, { "epoch": 15.007923930269413, "grad_norm": 0.20523428916931152, "learning_rate": 1.608775330023727e-05, "loss": 0.0171, "step": 9470 }, { "epoch": 15.02377179080824, "grad_norm": 0.28263282775878906, "learning_rate": 1.599159178903557e-05, "loss": 0.0149, "step": 9480 }, { "epoch": 15.039619651347069, "grad_norm": 0.2222396433353424, "learning_rate": 1.5895663795575255e-05, "loss": 0.0174, "step": 9490 }, { "epoch": 15.055467511885896, "grad_norm": 0.2283553034067154, "learning_rate": 1.5799969978543072e-05, "loss": 0.0152, "step": 9500 }, { "epoch": 15.071315372424722, "grad_norm": 0.19190463423728943, "learning_rate": 1.570451099501781e-05, "loss": 0.0193, "step": 9510 }, { "epoch": 15.08716323296355, "grad_norm": 0.2034788280725479, "learning_rate": 1.560928750046582e-05, "loss": 0.0142, "step": 9520 }, { "epoch": 15.103011093502378, "grad_norm": 0.1533176153898239, "learning_rate": 1.5514300148736405e-05, "loss": 0.0147, "step": 9530 }, { "epoch": 15.118858954041205, "grad_norm": 0.16323472559452057, "learning_rate": 1.5419549592057485e-05, "loss": 0.0128, "step": 9540 }, { "epoch": 15.134706814580031, "grad_norm": 0.1336495280265808, "learning_rate": 1.532503648103095e-05, "loss": 0.0152, "step": 9550 }, { "epoch": 15.150554675118858, "grad_norm": 0.23295193910598755, "learning_rate": 1.5230761464628351e-05, "loss": 0.0202, "step": 9560 }, { "epoch": 15.166402535657687, "grad_norm": 0.21971255540847778, "learning_rate": 1.5136725190186312e-05, "loss": 0.0127, "step": 9570 }, { "epoch": 15.182250396196514, "grad_norm": 0.12831509113311768, "learning_rate": 1.5042928303402155e-05, "loss": 0.0131, "step": 9580 }, { "epoch": 15.19809825673534, "grad_norm": 0.2782778739929199, "learning_rate": 1.4949371448329491e-05, "loss": 0.0134, "step": 9590 }, { "epoch": 15.213946117274167, "grad_norm": 0.15872108936309814, "learning_rate": 1.4856055267373704e-05, "loss": 0.0126, "step": 9600 }, { "epoch": 15.229793977812996, "grad_norm": 0.1593102514743805, "learning_rate": 1.476298040128763e-05, "loss": 0.0168, "step": 9610 }, { "epoch": 15.245641838351823, "grad_norm": 0.21707729995250702, "learning_rate": 1.4670147489167157e-05, "loss": 0.0128, "step": 9620 }, { "epoch": 15.26148969889065, "grad_norm": 0.13602186739444733, "learning_rate": 1.4577557168446704e-05, "loss": 0.0163, "step": 9630 }, { "epoch": 15.277337559429476, "grad_norm": 0.15380342304706573, "learning_rate": 1.4485210074895028e-05, "loss": 0.0131, "step": 9640 }, { "epoch": 15.293185419968305, "grad_norm": 0.23396658897399902, "learning_rate": 1.4393106842610765e-05, "loss": 0.0182, "step": 9650 }, { "epoch": 15.309033280507132, "grad_norm": 0.351018488407135, "learning_rate": 1.4301248104018039e-05, "loss": 0.0163, "step": 9660 }, { "epoch": 15.324881141045958, "grad_norm": 0.15941226482391357, "learning_rate": 1.4209634489862228e-05, "loss": 0.0151, "step": 9670 }, { "epoch": 15.340729001584785, "grad_norm": 0.31737878918647766, "learning_rate": 1.4118266629205501e-05, "loss": 0.016, "step": 9680 }, { "epoch": 15.356576862123614, "grad_norm": 0.1942298859357834, "learning_rate": 1.4027145149422637e-05, "loss": 0.0138, "step": 9690 }, { "epoch": 15.37242472266244, "grad_norm": 0.20650826394557953, "learning_rate": 1.3936270676196605e-05, "loss": 0.0196, "step": 9700 }, { "epoch": 15.388272583201267, "grad_norm": 0.13685113191604614, "learning_rate": 1.3845643833514294e-05, "loss": 0.015, "step": 9710 }, { "epoch": 15.404120443740094, "grad_norm": 0.22127866744995117, "learning_rate": 1.3755265243662308e-05, "loss": 0.0146, "step": 9720 }, { "epoch": 15.419968304278923, "grad_norm": 0.1102658063173294, "learning_rate": 1.3665135527222566e-05, "loss": 0.0132, "step": 9730 }, { "epoch": 15.43581616481775, "grad_norm": 0.17032739520072937, "learning_rate": 1.3575255303068157e-05, "loss": 0.0168, "step": 9740 }, { "epoch": 15.451664025356576, "grad_norm": 0.20449472963809967, "learning_rate": 1.3485625188359008e-05, "loss": 0.0155, "step": 9750 }, { "epoch": 15.467511885895403, "grad_norm": 0.2856760323047638, "learning_rate": 1.3396245798537655e-05, "loss": 0.0174, "step": 9760 }, { "epoch": 15.483359746434232, "grad_norm": 0.17707166075706482, "learning_rate": 1.3307117747325104e-05, "loss": 0.0145, "step": 9770 }, { "epoch": 15.499207606973059, "grad_norm": 0.2179175168275833, "learning_rate": 1.321824164671649e-05, "loss": 0.0142, "step": 9780 }, { "epoch": 15.515055467511885, "grad_norm": 0.14933204650878906, "learning_rate": 1.3129618106976966e-05, "loss": 0.0166, "step": 9790 }, { "epoch": 15.530903328050712, "grad_norm": 0.23230569064617157, "learning_rate": 1.3041247736637497e-05, "loss": 0.02, "step": 9800 }, { "epoch": 15.54675118858954, "grad_norm": 0.2146037369966507, "learning_rate": 1.2953131142490621e-05, "loss": 0.0187, "step": 9810 }, { "epoch": 15.562599049128368, "grad_norm": 0.27099379897117615, "learning_rate": 1.2865268929586399e-05, "loss": 0.0175, "step": 9820 }, { "epoch": 15.578446909667194, "grad_norm": 0.21641230583190918, "learning_rate": 1.2777661701228094e-05, "loss": 0.0131, "step": 9830 }, { "epoch": 15.594294770206023, "grad_norm": 0.206056609749794, "learning_rate": 1.2690310058968208e-05, "loss": 0.0124, "step": 9840 }, { "epoch": 15.61014263074485, "grad_norm": 0.2695901095867157, "learning_rate": 1.2603214602604251e-05, "loss": 0.017, "step": 9850 }, { "epoch": 15.625990491283677, "grad_norm": 0.24454373121261597, "learning_rate": 1.2516375930174607e-05, "loss": 0.0185, "step": 9860 }, { "epoch": 15.641838351822503, "grad_norm": 0.24143637716770172, "learning_rate": 1.2429794637954505e-05, "loss": 0.0167, "step": 9870 }, { "epoch": 15.65768621236133, "grad_norm": 0.24098831415176392, "learning_rate": 1.234347132045185e-05, "loss": 0.0156, "step": 9880 }, { "epoch": 15.673534072900159, "grad_norm": 0.2231469452381134, "learning_rate": 1.2257406570403158e-05, "loss": 0.0162, "step": 9890 }, { "epoch": 15.689381933438986, "grad_norm": 0.18433237075805664, "learning_rate": 1.217160097876956e-05, "loss": 0.0148, "step": 9900 }, { "epoch": 15.705229793977812, "grad_norm": 0.24673160910606384, "learning_rate": 1.2086055134732604e-05, "loss": 0.0156, "step": 9910 }, { "epoch": 15.721077654516641, "grad_norm": 0.2098625749349594, "learning_rate": 1.2000769625690367e-05, "loss": 0.0123, "step": 9920 }, { "epoch": 15.736925515055468, "grad_norm": 0.19441257417201996, "learning_rate": 1.1915745037253273e-05, "loss": 0.0149, "step": 9930 }, { "epoch": 15.752773375594295, "grad_norm": 0.30163636803627014, "learning_rate": 1.1830981953240183e-05, "loss": 0.0145, "step": 9940 }, { "epoch": 15.768621236133121, "grad_norm": 0.2016548216342926, "learning_rate": 1.1746480955674371e-05, "loss": 0.0157, "step": 9950 }, { "epoch": 15.78446909667195, "grad_norm": 0.16448210179805756, "learning_rate": 1.1662242624779413e-05, "loss": 0.0093, "step": 9960 }, { "epoch": 15.800316957210777, "grad_norm": 0.1529219001531601, "learning_rate": 1.1578267538975384e-05, "loss": 0.016, "step": 9970 }, { "epoch": 15.816164817749604, "grad_norm": 0.11220666021108627, "learning_rate": 1.1494556274874736e-05, "loss": 0.0151, "step": 9980 }, { "epoch": 15.83201267828843, "grad_norm": 0.1833869069814682, "learning_rate": 1.1411109407278425e-05, "loss": 0.0126, "step": 9990 }, { "epoch": 15.847860538827259, "grad_norm": 0.24351130425930023, "learning_rate": 1.1327927509171948e-05, "loss": 0.0148, "step": 10000 }, { "epoch": 15.863708399366086, "grad_norm": 0.18271566927433014, "learning_rate": 1.1245011151721358e-05, "loss": 0.0153, "step": 10010 }, { "epoch": 15.879556259904913, "grad_norm": 0.17010100185871124, "learning_rate": 1.1162360904269399e-05, "loss": 0.0139, "step": 10020 }, { "epoch": 15.89540412044374, "grad_norm": 0.20020832121372223, "learning_rate": 1.1079977334331593e-05, "loss": 0.014, "step": 10030 }, { "epoch": 15.911251980982568, "grad_norm": 0.31756097078323364, "learning_rate": 1.0997861007592297e-05, "loss": 0.0137, "step": 10040 }, { "epoch": 15.927099841521395, "grad_norm": 0.20857271552085876, "learning_rate": 1.0916012487900901e-05, "loss": 0.0187, "step": 10050 }, { "epoch": 15.942947702060222, "grad_norm": 0.21330268681049347, "learning_rate": 1.0834432337267835e-05, "loss": 0.0182, "step": 10060 }, { "epoch": 15.958795562599049, "grad_norm": 0.2602750360965729, "learning_rate": 1.0753121115860859e-05, "loss": 0.0126, "step": 10070 }, { "epoch": 15.974643423137877, "grad_norm": 0.10706225037574768, "learning_rate": 1.0672079382001076e-05, "loss": 0.0141, "step": 10080 }, { "epoch": 15.990491283676704, "grad_norm": 0.18691207468509674, "learning_rate": 1.0591307692159175e-05, "loss": 0.018, "step": 10090 }, { "epoch": 16.00633914421553, "grad_norm": 0.16258151829242706, "learning_rate": 1.0510806600951634e-05, "loss": 0.0138, "step": 10100 }, { "epoch": 16.022187004754358, "grad_norm": 0.22133781015872955, "learning_rate": 1.0430576661136809e-05, "loss": 0.0136, "step": 10110 }, { "epoch": 16.038034865293184, "grad_norm": 0.14174553751945496, "learning_rate": 1.0350618423611258e-05, "loss": 0.012, "step": 10120 }, { "epoch": 16.05388272583201, "grad_norm": 0.21903228759765625, "learning_rate": 1.0270932437405894e-05, "loss": 0.0162, "step": 10130 }, { "epoch": 16.06973058637084, "grad_norm": 0.15532748401165009, "learning_rate": 1.0191519249682202e-05, "loss": 0.0129, "step": 10140 }, { "epoch": 16.08557844690967, "grad_norm": 0.2952392101287842, "learning_rate": 1.0112379405728512e-05, "loss": 0.014, "step": 10150 }, { "epoch": 16.101426307448495, "grad_norm": 0.1566477119922638, "learning_rate": 1.003351344895624e-05, "loss": 0.0168, "step": 10160 }, { "epoch": 16.117274167987322, "grad_norm": 0.18433576822280884, "learning_rate": 9.954921920896181e-06, "loss": 0.0141, "step": 10170 }, { "epoch": 16.13312202852615, "grad_norm": 0.1970781683921814, "learning_rate": 9.876605361194784e-06, "loss": 0.014, "step": 10180 }, { "epoch": 16.148969889064976, "grad_norm": 0.22483587265014648, "learning_rate": 9.798564307610397e-06, "loss": 0.0172, "step": 10190 }, { "epoch": 16.164817749603802, "grad_norm": 0.12307272851467133, "learning_rate": 9.720799296009652e-06, "loss": 0.0142, "step": 10200 }, { "epoch": 16.18066561014263, "grad_norm": 0.09929801523685455, "learning_rate": 9.64331086036372e-06, "loss": 0.0157, "step": 10210 }, { "epoch": 16.19651347068146, "grad_norm": 0.22220948338508606, "learning_rate": 9.566099532744666e-06, "loss": 0.0144, "step": 10220 }, { "epoch": 16.212361331220286, "grad_norm": 0.21739843487739563, "learning_rate": 9.48916584332184e-06, "loss": 0.0141, "step": 10230 }, { "epoch": 16.228209191759113, "grad_norm": 0.20657970011234283, "learning_rate": 9.412510320358148e-06, "loss": 0.0125, "step": 10240 }, { "epoch": 16.24405705229794, "grad_norm": 0.1589168906211853, "learning_rate": 9.336133490206527e-06, "loss": 0.0146, "step": 10250 }, { "epoch": 16.259904912836767, "grad_norm": 0.20785082876682281, "learning_rate": 9.260035877306222e-06, "loss": 0.015, "step": 10260 }, { "epoch": 16.275752773375594, "grad_norm": 0.3436870872974396, "learning_rate": 9.184218004179296e-06, "loss": 0.0142, "step": 10270 }, { "epoch": 16.29160063391442, "grad_norm": 0.19214791059494019, "learning_rate": 9.108680391426944e-06, "loss": 0.0153, "step": 10280 }, { "epoch": 16.307448494453247, "grad_norm": 0.18752476572990417, "learning_rate": 9.033423557725968e-06, "loss": 0.0198, "step": 10290 }, { "epoch": 16.323296354992078, "grad_norm": 0.2008536010980606, "learning_rate": 8.958448019825238e-06, "loss": 0.0139, "step": 10300 }, { "epoch": 16.339144215530904, "grad_norm": 0.3124418258666992, "learning_rate": 8.883754292542073e-06, "loss": 0.0184, "step": 10310 }, { "epoch": 16.35499207606973, "grad_norm": 0.18249309062957764, "learning_rate": 8.809342888758787e-06, "loss": 0.012, "step": 10320 }, { "epoch": 16.370839936608558, "grad_norm": 0.27810513973236084, "learning_rate": 8.735214319419122e-06, "loss": 0.012, "step": 10330 }, { "epoch": 16.386687797147385, "grad_norm": 0.25395792722702026, "learning_rate": 8.66136909352469e-06, "loss": 0.0175, "step": 10340 }, { "epoch": 16.40253565768621, "grad_norm": 0.10935286432504654, "learning_rate": 8.587807718131607e-06, "loss": 0.0138, "step": 10350 }, { "epoch": 16.41838351822504, "grad_norm": 0.20935213565826416, "learning_rate": 8.514530698346911e-06, "loss": 0.0149, "step": 10360 }, { "epoch": 16.434231378763865, "grad_norm": 0.15524841845035553, "learning_rate": 8.4415385373251e-06, "loss": 0.016, "step": 10370 }, { "epoch": 16.450079239302696, "grad_norm": 0.17828898131847382, "learning_rate": 8.368831736264738e-06, "loss": 0.0155, "step": 10380 }, { "epoch": 16.465927099841522, "grad_norm": 0.11186101287603378, "learning_rate": 8.296410794404925e-06, "loss": 0.0146, "step": 10390 }, { "epoch": 16.48177496038035, "grad_norm": 0.1628289818763733, "learning_rate": 8.22427620902197e-06, "loss": 0.0138, "step": 10400 }, { "epoch": 16.497622820919176, "grad_norm": 0.20246130228042603, "learning_rate": 8.152428475425876e-06, "loss": 0.017, "step": 10410 }, { "epoch": 16.513470681458003, "grad_norm": 0.2126418799161911, "learning_rate": 8.080868086957e-06, "loss": 0.0181, "step": 10420 }, { "epoch": 16.52931854199683, "grad_norm": 0.27646327018737793, "learning_rate": 8.009595534982684e-06, "loss": 0.0138, "step": 10430 }, { "epoch": 16.545166402535656, "grad_norm": 0.23372896015644073, "learning_rate": 7.938611308893796e-06, "loss": 0.0206, "step": 10440 }, { "epoch": 16.561014263074483, "grad_norm": 0.21742697060108185, "learning_rate": 7.867915896101475e-06, "loss": 0.0117, "step": 10450 }, { "epoch": 16.576862123613314, "grad_norm": 0.30523791909217834, "learning_rate": 7.797509782033696e-06, "loss": 0.0189, "step": 10460 }, { "epoch": 16.59270998415214, "grad_norm": 0.33640623092651367, "learning_rate": 7.727393450131976e-06, "loss": 0.0147, "step": 10470 }, { "epoch": 16.608557844690967, "grad_norm": 0.14561405777931213, "learning_rate": 7.65756738184808e-06, "loss": 0.0119, "step": 10480 }, { "epoch": 16.624405705229794, "grad_norm": 0.27383899688720703, "learning_rate": 7.588032056640643e-06, "loss": 0.0181, "step": 10490 }, { "epoch": 16.64025356576862, "grad_norm": 0.2113339751958847, "learning_rate": 7.518787951971951e-06, "loss": 0.0151, "step": 10500 }, { "epoch": 16.656101426307448, "grad_norm": 0.22912786900997162, "learning_rate": 7.449835543304645e-06, "loss": 0.013, "step": 10510 }, { "epoch": 16.671949286846274, "grad_norm": 0.24694296717643738, "learning_rate": 7.381175304098398e-06, "loss": 0.0124, "step": 10520 }, { "epoch": 16.687797147385105, "grad_norm": 0.14873796701431274, "learning_rate": 7.3128077058067675e-06, "loss": 0.0166, "step": 10530 }, { "epoch": 16.70364500792393, "grad_norm": 0.14333923161029816, "learning_rate": 7.244733217873834e-06, "loss": 0.0128, "step": 10540 }, { "epoch": 16.71949286846276, "grad_norm": 0.17385222017765045, "learning_rate": 7.1769523077310885e-06, "loss": 0.0172, "step": 10550 }, { "epoch": 16.735340729001585, "grad_norm": 0.1889476180076599, "learning_rate": 7.1094654407941945e-06, "loss": 0.0105, "step": 10560 }, { "epoch": 16.751188589540412, "grad_norm": 0.13638252019882202, "learning_rate": 7.042273080459716e-06, "loss": 0.0137, "step": 10570 }, { "epoch": 16.76703645007924, "grad_norm": 0.16387833654880524, "learning_rate": 6.97537568810207e-06, "loss": 0.0121, "step": 10580 }, { "epoch": 16.782884310618066, "grad_norm": 0.18849371373653412, "learning_rate": 6.908773723070228e-06, "loss": 0.0112, "step": 10590 }, { "epoch": 16.798732171156892, "grad_norm": 0.2580081522464752, "learning_rate": 6.842467642684619e-06, "loss": 0.0164, "step": 10600 }, { "epoch": 16.814580031695723, "grad_norm": 0.19095416367053986, "learning_rate": 6.7764579022340405e-06, "loss": 0.0156, "step": 10610 }, { "epoch": 16.83042789223455, "grad_norm": 0.38263216614723206, "learning_rate": 6.71074495497242e-06, "loss": 0.0141, "step": 10620 }, { "epoch": 16.846275752773376, "grad_norm": 0.19752560555934906, "learning_rate": 6.645329252115812e-06, "loss": 0.0134, "step": 10630 }, { "epoch": 16.862123613312203, "grad_norm": 0.21061812341213226, "learning_rate": 6.580211242839207e-06, "loss": 0.0161, "step": 10640 }, { "epoch": 16.87797147385103, "grad_norm": 0.30705246329307556, "learning_rate": 6.515391374273522e-06, "loss": 0.0136, "step": 10650 }, { "epoch": 16.893819334389857, "grad_norm": 0.16327637434005737, "learning_rate": 6.4508700915025145e-06, "loss": 0.0178, "step": 10660 }, { "epoch": 16.909667194928684, "grad_norm": 0.19477631151676178, "learning_rate": 6.3866478375596454e-06, "loss": 0.0155, "step": 10670 }, { "epoch": 16.92551505546751, "grad_norm": 0.10037015378475189, "learning_rate": 6.322725053425166e-06, "loss": 0.0141, "step": 10680 }, { "epoch": 16.94136291600634, "grad_norm": 0.14681270718574524, "learning_rate": 6.259102178023019e-06, "loss": 0.0132, "step": 10690 }, { "epoch": 16.957210776545168, "grad_norm": 0.14646220207214355, "learning_rate": 6.1957796482177865e-06, "loss": 0.015, "step": 10700 }, { "epoch": 16.973058637083994, "grad_norm": 0.14095987379550934, "learning_rate": 6.1327578988118086e-06, "loss": 0.0117, "step": 10710 }, { "epoch": 16.98890649762282, "grad_norm": 0.17115886509418488, "learning_rate": 6.070037362542058e-06, "loss": 0.0113, "step": 10720 }, { "epoch": 17.004754358161648, "grad_norm": 0.18565335869789124, "learning_rate": 6.00761847007727e-06, "loss": 0.0144, "step": 10730 }, { "epoch": 17.020602218700475, "grad_norm": 0.13037702441215515, "learning_rate": 5.945501650014951e-06, "loss": 0.0137, "step": 10740 }, { "epoch": 17.0364500792393, "grad_norm": 0.18481898307800293, "learning_rate": 5.883687328878423e-06, "loss": 0.0133, "step": 10750 }, { "epoch": 17.05229793977813, "grad_norm": 0.2438468635082245, "learning_rate": 5.822175931113933e-06, "loss": 0.0163, "step": 10760 }, { "epoch": 17.06814580031696, "grad_norm": 0.18955622613430023, "learning_rate": 5.760967879087675e-06, "loss": 0.0113, "step": 10770 }, { "epoch": 17.083993660855786, "grad_norm": 0.3023121953010559, "learning_rate": 5.700063593082971e-06, "loss": 0.0142, "step": 10780 }, { "epoch": 17.099841521394612, "grad_norm": 0.19684407114982605, "learning_rate": 5.639463491297314e-06, "loss": 0.0183, "step": 10790 }, { "epoch": 17.11568938193344, "grad_norm": 0.1771165281534195, "learning_rate": 5.579167989839512e-06, "loss": 0.0149, "step": 10800 }, { "epoch": 17.131537242472266, "grad_norm": 0.31700730323791504, "learning_rate": 5.519177502726897e-06, "loss": 0.0149, "step": 10810 }, { "epoch": 17.147385103011093, "grad_norm": 0.24914953112602234, "learning_rate": 5.459492441882369e-06, "loss": 0.0096, "step": 10820 }, { "epoch": 17.16323296354992, "grad_norm": 0.17742785811424255, "learning_rate": 5.400113217131669e-06, "loss": 0.0126, "step": 10830 }, { "epoch": 17.179080824088746, "grad_norm": 0.19636410474777222, "learning_rate": 5.341040236200512e-06, "loss": 0.0148, "step": 10840 }, { "epoch": 17.194928684627577, "grad_norm": 0.16673442721366882, "learning_rate": 5.282273904711793e-06, "loss": 0.0149, "step": 10850 }, { "epoch": 17.210776545166404, "grad_norm": 0.17502924799919128, "learning_rate": 5.223814626182804e-06, "loss": 0.0156, "step": 10860 }, { "epoch": 17.22662440570523, "grad_norm": 0.24374344944953918, "learning_rate": 5.165662802022469e-06, "loss": 0.0156, "step": 10870 }, { "epoch": 17.242472266244057, "grad_norm": 0.20077760517597198, "learning_rate": 5.107818831528593e-06, "loss": 0.0152, "step": 10880 }, { "epoch": 17.258320126782884, "grad_norm": 0.19688129425048828, "learning_rate": 5.050283111885123e-06, "loss": 0.0108, "step": 10890 }, { "epoch": 17.27416798732171, "grad_norm": 0.2499351054430008, "learning_rate": 4.9930560381593825e-06, "loss": 0.0174, "step": 10900 }, { "epoch": 17.290015847860538, "grad_norm": 0.11787986755371094, "learning_rate": 4.936138003299412e-06, "loss": 0.011, "step": 10910 }, { "epoch": 17.305863708399364, "grad_norm": 0.10276877880096436, "learning_rate": 4.879529398131227e-06, "loss": 0.0151, "step": 10920 }, { "epoch": 17.321711568938195, "grad_norm": 0.21218866109848022, "learning_rate": 4.823230611356155e-06, "loss": 0.0188, "step": 10930 }, { "epoch": 17.33755942947702, "grad_norm": 0.12643927335739136, "learning_rate": 4.767242029548186e-06, "loss": 0.0137, "step": 10940 }, { "epoch": 17.35340729001585, "grad_norm": 0.22125521302223206, "learning_rate": 4.711564037151261e-06, "loss": 0.0137, "step": 10950 }, { "epoch": 17.369255150554675, "grad_norm": 0.18663623929023743, "learning_rate": 4.656197016476716e-06, "loss": 0.0169, "step": 10960 }, { "epoch": 17.385103011093502, "grad_norm": 0.1977252960205078, "learning_rate": 4.60114134770055e-06, "loss": 0.0142, "step": 10970 }, { "epoch": 17.40095087163233, "grad_norm": 0.1531880646944046, "learning_rate": 4.54639740886093e-06, "loss": 0.0134, "step": 10980 }, { "epoch": 17.416798732171156, "grad_norm": 0.25299400091171265, "learning_rate": 4.4919655758555055e-06, "loss": 0.0115, "step": 10990 }, { "epoch": 17.432646592709983, "grad_norm": 0.15232089161872864, "learning_rate": 4.4378462224388514e-06, "loss": 0.0121, "step": 11000 }, { "epoch": 17.448494453248813, "grad_norm": 0.2122395932674408, "learning_rate": 4.3840397202199515e-06, "loss": 0.0138, "step": 11010 }, { "epoch": 17.46434231378764, "grad_norm": 0.4084971845149994, "learning_rate": 4.330546438659555e-06, "loss": 0.0169, "step": 11020 }, { "epoch": 17.480190174326466, "grad_norm": 0.13414064049720764, "learning_rate": 4.2773667450677346e-06, "loss": 0.0115, "step": 11030 }, { "epoch": 17.496038034865293, "grad_norm": 0.24400712549686432, "learning_rate": 4.224501004601311e-06, "loss": 0.0165, "step": 11040 }, { "epoch": 17.51188589540412, "grad_norm": 0.15812645852565765, "learning_rate": 4.1719495802613254e-06, "loss": 0.0139, "step": 11050 }, { "epoch": 17.527733755942947, "grad_norm": 0.32170212268829346, "learning_rate": 4.119712832890599e-06, "loss": 0.0173, "step": 11060 }, { "epoch": 17.543581616481774, "grad_norm": 0.26727718114852905, "learning_rate": 4.0677911211712494e-06, "loss": 0.0137, "step": 11070 }, { "epoch": 17.5594294770206, "grad_norm": 0.2177404910326004, "learning_rate": 4.0161848016221804e-06, "loss": 0.0115, "step": 11080 }, { "epoch": 17.57527733755943, "grad_norm": 0.08245435357093811, "learning_rate": 3.964894228596683e-06, "loss": 0.0125, "step": 11090 }, { "epoch": 17.591125198098258, "grad_norm": 0.2010851800441742, "learning_rate": 3.913919754279966e-06, "loss": 0.0196, "step": 11100 }, { "epoch": 17.606973058637085, "grad_norm": 0.333839476108551, "learning_rate": 3.8632617286867845e-06, "loss": 0.0168, "step": 11110 }, { "epoch": 17.62282091917591, "grad_norm": 0.09080642461776733, "learning_rate": 3.8129204996589894e-06, "loss": 0.017, "step": 11120 }, { "epoch": 17.638668779714738, "grad_norm": 0.12128207087516785, "learning_rate": 3.7628964128631428e-06, "loss": 0.0146, "step": 11130 }, { "epoch": 17.654516640253565, "grad_norm": 0.15360169112682343, "learning_rate": 3.7131898117881924e-06, "loss": 0.0125, "step": 11140 }, { "epoch": 17.67036450079239, "grad_norm": 0.18307170271873474, "learning_rate": 3.6638010377430476e-06, "loss": 0.012, "step": 11150 }, { "epoch": 17.686212361331222, "grad_norm": 0.17119954526424408, "learning_rate": 3.6147304298542963e-06, "loss": 0.0159, "step": 11160 }, { "epoch": 17.70206022187005, "grad_norm": 0.18213894963264465, "learning_rate": 3.5659783250638344e-06, "loss": 0.0119, "step": 11170 }, { "epoch": 17.717908082408876, "grad_norm": 0.22571374475955963, "learning_rate": 3.517545058126548e-06, "loss": 0.0142, "step": 11180 }, { "epoch": 17.733755942947703, "grad_norm": 0.1815493106842041, "learning_rate": 3.4694309616080665e-06, "loss": 0.02, "step": 11190 }, { "epoch": 17.74960380348653, "grad_norm": 0.18275891244411469, "learning_rate": 3.4216363658824136e-06, "loss": 0.0135, "step": 11200 }, { "epoch": 17.765451664025356, "grad_norm": 0.14396269619464874, "learning_rate": 3.3741615991297938e-06, "loss": 0.0138, "step": 11210 }, { "epoch": 17.781299524564183, "grad_norm": 0.16869209706783295, "learning_rate": 3.327006987334308e-06, "loss": 0.015, "step": 11220 }, { "epoch": 17.79714738510301, "grad_norm": 0.2123693972826004, "learning_rate": 3.2801728542817155e-06, "loss": 0.0125, "step": 11230 }, { "epoch": 17.812995245641837, "grad_norm": 0.16149067878723145, "learning_rate": 3.2336595215572364e-06, "loss": 0.013, "step": 11240 }, { "epoch": 17.828843106180667, "grad_norm": 0.21204307675361633, "learning_rate": 3.1874673085432848e-06, "loss": 0.0165, "step": 11250 }, { "epoch": 17.844690966719494, "grad_norm": 0.16572032868862152, "learning_rate": 3.1415965324173567e-06, "loss": 0.0148, "step": 11260 }, { "epoch": 17.86053882725832, "grad_norm": 0.11765672266483307, "learning_rate": 3.0960475081497966e-06, "loss": 0.0174, "step": 11270 }, { "epoch": 17.876386687797147, "grad_norm": 0.11968225240707397, "learning_rate": 3.0508205485016426e-06, "loss": 0.0144, "step": 11280 }, { "epoch": 17.892234548335974, "grad_norm": 0.2578866183757782, "learning_rate": 3.0059159640225097e-06, "loss": 0.0146, "step": 11290 }, { "epoch": 17.9080824088748, "grad_norm": 0.21918439865112305, "learning_rate": 2.961334063048393e-06, "loss": 0.0155, "step": 11300 }, { "epoch": 17.923930269413628, "grad_norm": 0.16526588797569275, "learning_rate": 2.917075151699622e-06, "loss": 0.0177, "step": 11310 }, { "epoch": 17.939778129952458, "grad_norm": 0.21584181487560272, "learning_rate": 2.8731395338787215e-06, "loss": 0.0164, "step": 11320 }, { "epoch": 17.955625990491285, "grad_norm": 0.19519634544849396, "learning_rate": 2.8295275112683207e-06, "loss": 0.0166, "step": 11330 }, { "epoch": 17.97147385103011, "grad_norm": 0.21799765527248383, "learning_rate": 2.7862393833291036e-06, "loss": 0.0152, "step": 11340 }, { "epoch": 17.98732171156894, "grad_norm": 0.13291296362876892, "learning_rate": 2.743275447297733e-06, "loss": 0.0142, "step": 11350 }, { "epoch": 18.003169572107765, "grad_norm": 0.21296854317188263, "learning_rate": 2.7006359981848196e-06, "loss": 0.0135, "step": 11360 }, { "epoch": 18.019017432646592, "grad_norm": 0.16779382526874542, "learning_rate": 2.6583213287729115e-06, "loss": 0.0107, "step": 11370 }, { "epoch": 18.03486529318542, "grad_norm": 0.3331531882286072, "learning_rate": 2.616331729614424e-06, "loss": 0.015, "step": 11380 }, { "epoch": 18.050713153724246, "grad_norm": 0.18505552411079407, "learning_rate": 2.574667489029725e-06, "loss": 0.0143, "step": 11390 }, { "epoch": 18.066561014263076, "grad_norm": 0.2790921628475189, "learning_rate": 2.533328893105108e-06, "loss": 0.0161, "step": 11400 }, { "epoch": 18.082408874801903, "grad_norm": 0.2316390722990036, "learning_rate": 2.492316225690827e-06, "loss": 0.0158, "step": 11410 }, { "epoch": 18.09825673534073, "grad_norm": 0.2133951485157013, "learning_rate": 2.4516297683991773e-06, "loss": 0.014, "step": 11420 }, { "epoch": 18.114104595879557, "grad_norm": 0.23200847208499908, "learning_rate": 2.411269800602517e-06, "loss": 0.013, "step": 11430 }, { "epoch": 18.129952456418383, "grad_norm": 0.18895216286182404, "learning_rate": 2.371236599431387e-06, "loss": 0.0163, "step": 11440 }, { "epoch": 18.14580031695721, "grad_norm": 0.18238064646720886, "learning_rate": 2.3315304397726e-06, "loss": 0.0124, "step": 11450 }, { "epoch": 18.161648177496037, "grad_norm": 0.1541883647441864, "learning_rate": 2.2921515942673276e-06, "loss": 0.0119, "step": 11460 }, { "epoch": 18.177496038034864, "grad_norm": 0.1508757770061493, "learning_rate": 2.2531003333092826e-06, "loss": 0.0179, "step": 11470 }, { "epoch": 18.193343898573694, "grad_norm": 0.18820171058177948, "learning_rate": 2.2143769250427883e-06, "loss": 0.0123, "step": 11480 }, { "epoch": 18.20919175911252, "grad_norm": 0.15870815515518188, "learning_rate": 2.175981635361013e-06, "loss": 0.0126, "step": 11490 }, { "epoch": 18.225039619651348, "grad_norm": 0.16902989149093628, "learning_rate": 2.1379147279040777e-06, "loss": 0.0123, "step": 11500 }, { "epoch": 18.240887480190175, "grad_norm": 0.12179669737815857, "learning_rate": 2.1001764640572963e-06, "loss": 0.0154, "step": 11510 }, { "epoch": 18.256735340729, "grad_norm": 0.23037730157375336, "learning_rate": 2.0627671029493535e-06, "loss": 0.0153, "step": 11520 }, { "epoch": 18.272583201267828, "grad_norm": 0.21997253596782684, "learning_rate": 2.02568690145053e-06, "loss": 0.0123, "step": 11530 }, { "epoch": 18.288431061806655, "grad_norm": 0.1361498236656189, "learning_rate": 1.988936114170953e-06, "loss": 0.0179, "step": 11540 }, { "epoch": 18.304278922345482, "grad_norm": 0.20903484523296356, "learning_rate": 1.9525149934588314e-06, "loss": 0.0118, "step": 11550 }, { "epoch": 18.320126782884312, "grad_norm": 0.34163960814476013, "learning_rate": 1.916423789398725e-06, "loss": 0.0145, "step": 11560 }, { "epoch": 18.33597464342314, "grad_norm": 0.226941779255867, "learning_rate": 1.8806627498098305e-06, "loss": 0.0139, "step": 11570 }, { "epoch": 18.351822503961966, "grad_norm": 0.1451648324728012, "learning_rate": 1.8452321202442724e-06, "loss": 0.0116, "step": 11580 }, { "epoch": 18.367670364500793, "grad_norm": 0.19719494879245758, "learning_rate": 1.810132143985438e-06, "loss": 0.0123, "step": 11590 }, { "epoch": 18.38351822503962, "grad_norm": 0.1801900416612625, "learning_rate": 1.7753630620463035e-06, "loss": 0.0094, "step": 11600 }, { "epoch": 18.399366085578446, "grad_norm": 0.17285539209842682, "learning_rate": 1.740925113167735e-06, "loss": 0.0184, "step": 11610 }, { "epoch": 18.415213946117273, "grad_norm": 0.1344527304172516, "learning_rate": 1.7068185338169174e-06, "loss": 0.0123, "step": 11620 }, { "epoch": 18.4310618066561, "grad_norm": 0.21449725329875946, "learning_rate": 1.6730435581856719e-06, "loss": 0.0127, "step": 11630 }, { "epoch": 18.44690966719493, "grad_norm": 0.153366357088089, "learning_rate": 1.6396004181888803e-06, "loss": 0.0115, "step": 11640 }, { "epoch": 18.462757527733757, "grad_norm": 0.20724429190158844, "learning_rate": 1.6064893434628914e-06, "loss": 0.0213, "step": 11650 }, { "epoch": 18.478605388272584, "grad_norm": 0.20763236284255981, "learning_rate": 1.5737105613639336e-06, "loss": 0.0165, "step": 11660 }, { "epoch": 18.49445324881141, "grad_norm": 0.1959114670753479, "learning_rate": 1.5412642969665546e-06, "loss": 0.0165, "step": 11670 }, { "epoch": 18.510301109350237, "grad_norm": 0.1801106333732605, "learning_rate": 1.5091507730620735e-06, "loss": 0.0201, "step": 11680 }, { "epoch": 18.526148969889064, "grad_norm": 0.23710688948631287, "learning_rate": 1.4773702101570807e-06, "loss": 0.0127, "step": 11690 }, { "epoch": 18.54199683042789, "grad_norm": 0.19984515011310577, "learning_rate": 1.4459228264718683e-06, "loss": 0.0134, "step": 11700 }, { "epoch": 18.557844690966718, "grad_norm": 0.34799715876579285, "learning_rate": 1.41480883793898e-06, "loss": 0.0139, "step": 11710 }, { "epoch": 18.573692551505548, "grad_norm": 0.23937344551086426, "learning_rate": 1.3840284582017193e-06, "loss": 0.0168, "step": 11720 }, { "epoch": 18.589540412044375, "grad_norm": 0.1457284539937973, "learning_rate": 1.3535818986126492e-06, "loss": 0.0149, "step": 11730 }, { "epoch": 18.605388272583202, "grad_norm": 0.14516040682792664, "learning_rate": 1.3234693682321886e-06, "loss": 0.0136, "step": 11740 }, { "epoch": 18.62123613312203, "grad_norm": 0.1960999220609665, "learning_rate": 1.2936910738271524e-06, "loss": 0.0197, "step": 11750 }, { "epoch": 18.637083993660855, "grad_norm": 0.17722713947296143, "learning_rate": 1.264247219869319e-06, "loss": 0.0101, "step": 11760 }, { "epoch": 18.652931854199682, "grad_norm": 0.12621328234672546, "learning_rate": 1.2351380085340592e-06, "loss": 0.0099, "step": 11770 }, { "epoch": 18.66877971473851, "grad_norm": 0.19239826500415802, "learning_rate": 1.206363639698921e-06, "loss": 0.017, "step": 11780 }, { "epoch": 18.684627575277336, "grad_norm": 0.21515102684497833, "learning_rate": 1.1779243109422632e-06, "loss": 0.016, "step": 11790 }, { "epoch": 18.700475435816166, "grad_norm": 0.32155877351760864, "learning_rate": 1.1498202175419136e-06, "loss": 0.0189, "step": 11800 }, { "epoch": 18.716323296354993, "grad_norm": 0.1156759113073349, "learning_rate": 1.1220515524738017e-06, "loss": 0.0114, "step": 11810 }, { "epoch": 18.73217115689382, "grad_norm": 0.17164525389671326, "learning_rate": 1.0946185064106552e-06, "loss": 0.0142, "step": 11820 }, { "epoch": 18.748019017432647, "grad_norm": 0.15515750646591187, "learning_rate": 1.0675212677206892e-06, "loss": 0.0129, "step": 11830 }, { "epoch": 18.763866877971473, "grad_norm": 0.17644274234771729, "learning_rate": 1.0407600224662917e-06, "loss": 0.0171, "step": 11840 }, { "epoch": 18.7797147385103, "grad_norm": 0.1408577710390091, "learning_rate": 1.0143349544027791e-06, "loss": 0.0149, "step": 11850 }, { "epoch": 18.795562599049127, "grad_norm": 0.23002204298973083, "learning_rate": 9.882462449771035e-07, "loss": 0.0166, "step": 11860 }, { "epoch": 18.811410459587954, "grad_norm": 0.16936403512954712, "learning_rate": 9.624940733266363e-07, "loss": 0.0141, "step": 11870 }, { "epoch": 18.827258320126784, "grad_norm": 0.18246498703956604, "learning_rate": 9.370786162779033e-07, "loss": 0.0146, "step": 11880 }, { "epoch": 18.84310618066561, "grad_norm": 0.14892500638961792, "learning_rate": 9.120000483453961e-07, "loss": 0.0146, "step": 11890 }, { "epoch": 18.858954041204438, "grad_norm": 0.23931749165058136, "learning_rate": 8.872585417303736e-07, "loss": 0.0165, "step": 11900 }, { "epoch": 18.874801901743265, "grad_norm": 0.12671174108982086, "learning_rate": 8.628542663196625e-07, "loss": 0.0148, "step": 11910 }, { "epoch": 18.89064976228209, "grad_norm": 0.12941120564937592, "learning_rate": 8.387873896845144e-07, "loss": 0.014, "step": 11920 }, { "epoch": 18.90649762282092, "grad_norm": 0.17071138322353363, "learning_rate": 8.150580770794336e-07, "loss": 0.0117, "step": 11930 }, { "epoch": 18.922345483359745, "grad_norm": 0.18729938566684723, "learning_rate": 7.916664914410455e-07, "loss": 0.0163, "step": 11940 }, { "epoch": 18.938193343898575, "grad_norm": 0.15300016105175018, "learning_rate": 7.686127933869968e-07, "loss": 0.0136, "step": 11950 }, { "epoch": 18.954041204437402, "grad_norm": 0.24584755301475525, "learning_rate": 7.458971412148241e-07, "loss": 0.0124, "step": 11960 }, { "epoch": 18.96988906497623, "grad_norm": 0.1396564543247223, "learning_rate": 7.235196909008924e-07, "loss": 0.0105, "step": 11970 }, { "epoch": 18.985736925515056, "grad_norm": 0.45825517177581787, "learning_rate": 7.014805960993131e-07, "loss": 0.0173, "step": 11980 }, { "epoch": 19.001584786053883, "grad_norm": 0.3947998285293579, "learning_rate": 6.797800081408845e-07, "loss": 0.0108, "step": 11990 }, { "epoch": 19.01743264659271, "grad_norm": 0.16845867037773132, "learning_rate": 6.584180760320635e-07, "loss": 0.0125, "step": 12000 }, { "epoch": 19.033280507131536, "grad_norm": 0.17425452172756195, "learning_rate": 6.373949464539286e-07, "loss": 0.0124, "step": 12010 }, { "epoch": 19.049128367670363, "grad_norm": 0.29528307914733887, "learning_rate": 6.167107637611858e-07, "loss": 0.012, "step": 12020 }, { "epoch": 19.064976228209193, "grad_norm": 0.22592425346374512, "learning_rate": 5.963656699811693e-07, "loss": 0.0128, "step": 12030 }, { "epoch": 19.08082408874802, "grad_norm": 0.2710273265838623, "learning_rate": 5.763598048128704e-07, "loss": 0.0118, "step": 12040 }, { "epoch": 19.096671949286847, "grad_norm": 0.21170739829540253, "learning_rate": 5.566933056259882e-07, "loss": 0.0149, "step": 12050 }, { "epoch": 19.112519809825674, "grad_norm": 0.20141953229904175, "learning_rate": 5.373663074599522e-07, "loss": 0.0119, "step": 12060 }, { "epoch": 19.1283676703645, "grad_norm": 0.19637355208396912, "learning_rate": 5.183789430230346e-07, "loss": 0.0136, "step": 12070 }, { "epoch": 19.144215530903328, "grad_norm": 0.16335146129131317, "learning_rate": 4.99731342691423e-07, "loss": 0.0138, "step": 12080 }, { "epoch": 19.160063391442154, "grad_norm": 0.16980670392513275, "learning_rate": 4.814236345083156e-07, "loss": 0.0146, "step": 12090 }, { "epoch": 19.17591125198098, "grad_norm": 0.3072208762168884, "learning_rate": 4.6345594418304996e-07, "loss": 0.0141, "step": 12100 }, { "epoch": 19.19175911251981, "grad_norm": 0.18183228373527527, "learning_rate": 4.458283950902642e-07, "loss": 0.014, "step": 12110 }, { "epoch": 19.20760697305864, "grad_norm": 0.160085067152977, "learning_rate": 4.285411082689927e-07, "loss": 0.0131, "step": 12120 }, { "epoch": 19.223454833597465, "grad_norm": 0.24544082581996918, "learning_rate": 4.115942024218944e-07, "loss": 0.011, "step": 12130 }, { "epoch": 19.239302694136292, "grad_norm": 0.21625439822673798, "learning_rate": 3.9498779391439754e-07, "loss": 0.0135, "step": 12140 }, { "epoch": 19.25515055467512, "grad_norm": 0.23580490052700043, "learning_rate": 3.787219967739231e-07, "loss": 0.0131, "step": 12150 }, { "epoch": 19.270998415213946, "grad_norm": 0.3027212917804718, "learning_rate": 3.627969226890959e-07, "loss": 0.0146, "step": 12160 }, { "epoch": 19.286846275752772, "grad_norm": 0.13139936327934265, "learning_rate": 3.4721268100896265e-07, "loss": 0.0111, "step": 12170 }, { "epoch": 19.3026941362916, "grad_norm": 0.18760831654071808, "learning_rate": 3.319693787422751e-07, "loss": 0.0177, "step": 12180 }, { "epoch": 19.31854199683043, "grad_norm": 0.1672467142343521, "learning_rate": 3.170671205567133e-07, "loss": 0.0183, "step": 12190 }, { "epoch": 19.334389857369256, "grad_norm": 0.13311269879341125, "learning_rate": 3.025060087782028e-07, "loss": 0.0158, "step": 12200 }, { "epoch": 19.350237717908083, "grad_norm": 0.1697445511817932, "learning_rate": 2.8828614339018735e-07, "loss": 0.0118, "step": 12210 }, { "epoch": 19.36608557844691, "grad_norm": 0.12869006395339966, "learning_rate": 2.744076220329628e-07, "loss": 0.0131, "step": 12220 }, { "epoch": 19.381933438985737, "grad_norm": 0.1751239150762558, "learning_rate": 2.6087054000298874e-07, "loss": 0.0163, "step": 12230 }, { "epoch": 19.397781299524564, "grad_norm": 0.23643170297145844, "learning_rate": 2.4767499025223904e-07, "loss": 0.0092, "step": 12240 }, { "epoch": 19.41362916006339, "grad_norm": 0.11966560781002045, "learning_rate": 2.3482106338758025e-07, "loss": 0.0106, "step": 12250 }, { "epoch": 19.429477020602217, "grad_norm": 0.15015937387943268, "learning_rate": 2.2230884767011628e-07, "loss": 0.0139, "step": 12260 }, { "epoch": 19.445324881141048, "grad_norm": 0.21535207331180573, "learning_rate": 2.101384290146169e-07, "loss": 0.0129, "step": 12270 }, { "epoch": 19.461172741679874, "grad_norm": 0.19800494611263275, "learning_rate": 1.9830989098890142e-07, "loss": 0.0098, "step": 12280 }, { "epoch": 19.4770206022187, "grad_norm": 0.18215620517730713, "learning_rate": 1.8682331481328364e-07, "loss": 0.0137, "step": 12290 }, { "epoch": 19.492868462757528, "grad_norm": 0.18857750296592712, "learning_rate": 1.756787793600001e-07, "loss": 0.0127, "step": 12300 }, { "epoch": 19.508716323296355, "grad_norm": 0.23549525439739227, "learning_rate": 1.6487636115268824e-07, "loss": 0.0144, "step": 12310 }, { "epoch": 19.52456418383518, "grad_norm": 0.16930314898490906, "learning_rate": 1.5441613436582014e-07, "loss": 0.0136, "step": 12320 }, { "epoch": 19.54041204437401, "grad_norm": 0.20466506481170654, "learning_rate": 1.4429817082425302e-07, "loss": 0.015, "step": 12330 }, { "epoch": 19.556259904912835, "grad_norm": 0.2871796190738678, "learning_rate": 1.3452254000267394e-07, "loss": 0.0117, "step": 12340 }, { "epoch": 19.572107765451666, "grad_norm": 0.2035956084728241, "learning_rate": 1.2508930902517813e-07, "loss": 0.0137, "step": 12350 }, { "epoch": 19.587955625990492, "grad_norm": 0.096625417470932, "learning_rate": 1.1599854266476918e-07, "loss": 0.0136, "step": 12360 }, { "epoch": 19.60380348652932, "grad_norm": 0.1756078600883484, "learning_rate": 1.0725030334292064e-07, "loss": 0.0157, "step": 12370 }, { "epoch": 19.619651347068146, "grad_norm": 0.12860575318336487, "learning_rate": 9.884465112917074e-08, "loss": 0.0138, "step": 12380 }, { "epoch": 19.635499207606973, "grad_norm": 0.2300836741924286, "learning_rate": 9.078164374067833e-08, "loss": 0.0155, "step": 12390 }, { "epoch": 19.6513470681458, "grad_norm": 0.16416415572166443, "learning_rate": 8.306133654185089e-08, "loss": 0.0114, "step": 12400 }, { "epoch": 19.667194928684626, "grad_norm": 0.12426438182592392, "learning_rate": 7.568378254395047e-08, "loss": 0.014, "step": 12410 }, { "epoch": 19.683042789223453, "grad_norm": 0.16134649515151978, "learning_rate": 6.864903240474397e-08, "loss": 0.0129, "step": 12420 }, { "epoch": 19.698890649762284, "grad_norm": 0.10580016672611237, "learning_rate": 6.195713442812556e-08, "loss": 0.0098, "step": 12430 }, { "epoch": 19.71473851030111, "grad_norm": 0.25324514508247375, "learning_rate": 5.560813456382818e-08, "loss": 0.0145, "step": 12440 }, { "epoch": 19.730586370839937, "grad_norm": 0.10774058848619461, "learning_rate": 4.96020764070626e-08, "loss": 0.0117, "step": 12450 }, { "epoch": 19.746434231378764, "grad_norm": 0.14597612619400024, "learning_rate": 4.393900119826211e-08, "loss": 0.0203, "step": 12460 }, { "epoch": 19.76228209191759, "grad_norm": 0.22840741276741028, "learning_rate": 3.861894782276609e-08, "loss": 0.0155, "step": 12470 }, { "epoch": 19.778129952456418, "grad_norm": 0.17510531842708588, "learning_rate": 3.3641952810559155e-08, "loss": 0.015, "step": 12480 }, { "epoch": 19.793977812995244, "grad_norm": 0.1512196958065033, "learning_rate": 2.9008050336032376e-08, "loss": 0.0137, "step": 12490 }, { "epoch": 19.80982567353407, "grad_norm": 0.3339158296585083, "learning_rate": 2.471727221775022e-08, "loss": 0.0121, "step": 12500 }, { "epoch": 19.8256735340729, "grad_norm": 0.17482948303222656, "learning_rate": 2.0769647918206237e-08, "loss": 0.0122, "step": 12510 }, { "epoch": 19.84152139461173, "grad_norm": 0.19882513582706451, "learning_rate": 1.7165204543656554e-08, "loss": 0.0136, "step": 12520 }, { "epoch": 19.857369255150555, "grad_norm": 0.2373553216457367, "learning_rate": 1.3903966843897831e-08, "loss": 0.0152, "step": 12530 }, { "epoch": 19.873217115689382, "grad_norm": 0.17455127835273743, "learning_rate": 1.0985957212122922e-08, "loss": 0.0108, "step": 12540 }, { "epoch": 19.88906497622821, "grad_norm": 0.10235228389501572, "learning_rate": 8.411195684765449e-09, "loss": 0.0143, "step": 12550 }, { "epoch": 19.904912836767036, "grad_norm": 0.3095182776451111, "learning_rate": 6.179699941349926e-09, "loss": 0.011, "step": 12560 }, { "epoch": 19.920760697305862, "grad_norm": 0.11435042321681976, "learning_rate": 4.291485304375176e-09, "loss": 0.0132, "step": 12570 }, { "epoch": 19.936608557844693, "grad_norm": 0.22614504396915436, "learning_rate": 2.7465647392088676e-09, "loss": 0.0135, "step": 12580 }, { "epoch": 19.95245641838352, "grad_norm": 0.22306819260120392, "learning_rate": 1.544948854009798e-09, "loss": 0.0141, "step": 12590 }, { "epoch": 19.968304278922346, "grad_norm": 0.16913668811321259, "learning_rate": 6.866458996279689e-10, "loss": 0.0131, "step": 12600 }, { "epoch": 19.984152139461173, "grad_norm": 0.28053924441337585, "learning_rate": 1.7166176958238746e-10, "loss": 0.0113, "step": 12610 }, { "epoch": 20.0, "grad_norm": 0.22396335005760193, "learning_rate": 0.0, "loss": 0.0095, "step": 12620 }, { "epoch": 20.0, "step": 12620, "total_flos": 1.0795351858465092e+18, "train_loss": 0.03321190329177066, "train_runtime": 10224.5982, "train_samples_per_second": 33.32, "train_steps_per_second": 1.234 } ], "logging_steps": 10, "max_steps": 12620, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0795351858465092e+18, "train_batch_size": 27, "trial_name": null, "trial_params": null }