LegrandFrederic's picture
Upload trainer_state.json with huggingface_hub
3e5bd2d verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 20.0,
"eval_steps": 500,
"global_step": 12620,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01584786053882726,
"grad_norm": 6.662916660308838,
"learning_rate": 1.584786053882726e-06,
"loss": 1.0759,
"step": 10
},
{
"epoch": 0.03169572107765452,
"grad_norm": 5.666072845458984,
"learning_rate": 3.169572107765452e-06,
"loss": 1.0851,
"step": 20
},
{
"epoch": 0.04754358161648178,
"grad_norm": 5.542662620544434,
"learning_rate": 4.754358161648178e-06,
"loss": 1.0049,
"step": 30
},
{
"epoch": 0.06339144215530904,
"grad_norm": 3.666578769683838,
"learning_rate": 6.339144215530904e-06,
"loss": 0.7551,
"step": 40
},
{
"epoch": 0.07923930269413629,
"grad_norm": 1.8371094465255737,
"learning_rate": 7.923930269413629e-06,
"loss": 0.5051,
"step": 50
},
{
"epoch": 0.09508716323296355,
"grad_norm": 1.8207030296325684,
"learning_rate": 9.508716323296357e-06,
"loss": 0.3629,
"step": 60
},
{
"epoch": 0.1109350237717908,
"grad_norm": 1.8528467416763306,
"learning_rate": 1.109350237717908e-05,
"loss": 0.2969,
"step": 70
},
{
"epoch": 0.12678288431061807,
"grad_norm": 1.4607429504394531,
"learning_rate": 1.2678288431061808e-05,
"loss": 0.2692,
"step": 80
},
{
"epoch": 0.14263074484944532,
"grad_norm": 1.288856029510498,
"learning_rate": 1.4263074484944532e-05,
"loss": 0.2774,
"step": 90
},
{
"epoch": 0.15847860538827258,
"grad_norm": 1.119730830192566,
"learning_rate": 1.5847860538827258e-05,
"loss": 0.2193,
"step": 100
},
{
"epoch": 0.17432646592709986,
"grad_norm": 0.9573730826377869,
"learning_rate": 1.7432646592709986e-05,
"loss": 0.2068,
"step": 110
},
{
"epoch": 0.1901743264659271,
"grad_norm": 0.9363383054733276,
"learning_rate": 1.9017432646592713e-05,
"loss": 0.2061,
"step": 120
},
{
"epoch": 0.20602218700475436,
"grad_norm": 0.9630652070045471,
"learning_rate": 2.0602218700475437e-05,
"loss": 0.1937,
"step": 130
},
{
"epoch": 0.2218700475435816,
"grad_norm": 1.1261632442474365,
"learning_rate": 2.218700475435816e-05,
"loss": 0.1769,
"step": 140
},
{
"epoch": 0.23771790808240886,
"grad_norm": 0.859064519405365,
"learning_rate": 2.377179080824089e-05,
"loss": 0.1788,
"step": 150
},
{
"epoch": 0.25356576862123614,
"grad_norm": 0.8056842684745789,
"learning_rate": 2.5356576862123617e-05,
"loss": 0.1486,
"step": 160
},
{
"epoch": 0.2694136291600634,
"grad_norm": 0.889215886592865,
"learning_rate": 2.694136291600634e-05,
"loss": 0.1597,
"step": 170
},
{
"epoch": 0.28526148969889065,
"grad_norm": 0.90467369556427,
"learning_rate": 2.8526148969889065e-05,
"loss": 0.1571,
"step": 180
},
{
"epoch": 0.3011093502377179,
"grad_norm": 0.9579144716262817,
"learning_rate": 3.0110935023771792e-05,
"loss": 0.1378,
"step": 190
},
{
"epoch": 0.31695721077654515,
"grad_norm": 1.2036852836608887,
"learning_rate": 3.1695721077654516e-05,
"loss": 0.1315,
"step": 200
},
{
"epoch": 0.3328050713153724,
"grad_norm": 0.9379674196243286,
"learning_rate": 3.3280507131537244e-05,
"loss": 0.128,
"step": 210
},
{
"epoch": 0.3486529318541997,
"grad_norm": 0.9713570475578308,
"learning_rate": 3.486529318541997e-05,
"loss": 0.124,
"step": 220
},
{
"epoch": 0.36450079239302696,
"grad_norm": 0.9157624244689941,
"learning_rate": 3.64500792393027e-05,
"loss": 0.1264,
"step": 230
},
{
"epoch": 0.3803486529318542,
"grad_norm": 0.5934288501739502,
"learning_rate": 3.8034865293185427e-05,
"loss": 0.1121,
"step": 240
},
{
"epoch": 0.39619651347068147,
"grad_norm": 0.865558922290802,
"learning_rate": 3.961965134706815e-05,
"loss": 0.1222,
"step": 250
},
{
"epoch": 0.4120443740095087,
"grad_norm": 0.8881043195724487,
"learning_rate": 4.1204437400950875e-05,
"loss": 0.1067,
"step": 260
},
{
"epoch": 0.42789223454833597,
"grad_norm": 0.7673732042312622,
"learning_rate": 4.27892234548336e-05,
"loss": 0.1088,
"step": 270
},
{
"epoch": 0.4437400950871632,
"grad_norm": 0.5540139079093933,
"learning_rate": 4.437400950871632e-05,
"loss": 0.094,
"step": 280
},
{
"epoch": 0.4595879556259905,
"grad_norm": 0.8689896464347839,
"learning_rate": 4.595879556259905e-05,
"loss": 0.0978,
"step": 290
},
{
"epoch": 0.4754358161648177,
"grad_norm": 0.6194612979888916,
"learning_rate": 4.754358161648178e-05,
"loss": 0.0791,
"step": 300
},
{
"epoch": 0.49128367670364503,
"grad_norm": 0.698275089263916,
"learning_rate": 4.9128367670364506e-05,
"loss": 0.0883,
"step": 310
},
{
"epoch": 0.5071315372424723,
"grad_norm": 0.7928034663200378,
"learning_rate": 5.071315372424723e-05,
"loss": 0.0994,
"step": 320
},
{
"epoch": 0.5229793977812995,
"grad_norm": 0.5640034675598145,
"learning_rate": 5.2297939778129954e-05,
"loss": 0.0832,
"step": 330
},
{
"epoch": 0.5388272583201268,
"grad_norm": 0.8224833011627197,
"learning_rate": 5.388272583201268e-05,
"loss": 0.0926,
"step": 340
},
{
"epoch": 0.554675118858954,
"grad_norm": 0.9649167656898499,
"learning_rate": 5.546751188589541e-05,
"loss": 0.0896,
"step": 350
},
{
"epoch": 0.5705229793977813,
"grad_norm": 0.7821329832077026,
"learning_rate": 5.705229793977813e-05,
"loss": 0.1003,
"step": 360
},
{
"epoch": 0.5863708399366085,
"grad_norm": 0.5526576638221741,
"learning_rate": 5.863708399366086e-05,
"loss": 0.0826,
"step": 370
},
{
"epoch": 0.6022187004754358,
"grad_norm": 0.6012312769889832,
"learning_rate": 6.0221870047543585e-05,
"loss": 0.0808,
"step": 380
},
{
"epoch": 0.618066561014263,
"grad_norm": 0.6598588824272156,
"learning_rate": 6.18066561014263e-05,
"loss": 0.0781,
"step": 390
},
{
"epoch": 0.6339144215530903,
"grad_norm": 0.6164723634719849,
"learning_rate": 6.339144215530903e-05,
"loss": 0.0725,
"step": 400
},
{
"epoch": 0.6497622820919176,
"grad_norm": 0.6792150139808655,
"learning_rate": 6.497622820919176e-05,
"loss": 0.0694,
"step": 410
},
{
"epoch": 0.6656101426307448,
"grad_norm": 0.5863011479377747,
"learning_rate": 6.656101426307449e-05,
"loss": 0.0771,
"step": 420
},
{
"epoch": 0.6814580031695721,
"grad_norm": 0.6146591305732727,
"learning_rate": 6.814580031695722e-05,
"loss": 0.0768,
"step": 430
},
{
"epoch": 0.6973058637083994,
"grad_norm": 0.4906776547431946,
"learning_rate": 6.973058637083994e-05,
"loss": 0.073,
"step": 440
},
{
"epoch": 0.7131537242472267,
"grad_norm": 0.7824636101722717,
"learning_rate": 7.131537242472267e-05,
"loss": 0.074,
"step": 450
},
{
"epoch": 0.7290015847860539,
"grad_norm": 0.5947498679161072,
"learning_rate": 7.29001584786054e-05,
"loss": 0.0786,
"step": 460
},
{
"epoch": 0.7448494453248812,
"grad_norm": 0.49313023686408997,
"learning_rate": 7.448494453248813e-05,
"loss": 0.0694,
"step": 470
},
{
"epoch": 0.7606973058637084,
"grad_norm": 0.36435437202453613,
"learning_rate": 7.606973058637085e-05,
"loss": 0.067,
"step": 480
},
{
"epoch": 0.7765451664025357,
"grad_norm": 0.6767722368240356,
"learning_rate": 7.765451664025357e-05,
"loss": 0.0747,
"step": 490
},
{
"epoch": 0.7923930269413629,
"grad_norm": 0.5377907752990723,
"learning_rate": 7.92393026941363e-05,
"loss": 0.0709,
"step": 500
},
{
"epoch": 0.8082408874801902,
"grad_norm": 0.740249752998352,
"learning_rate": 8.082408874801902e-05,
"loss": 0.0762,
"step": 510
},
{
"epoch": 0.8240887480190174,
"grad_norm": 0.5422998666763306,
"learning_rate": 8.240887480190175e-05,
"loss": 0.0636,
"step": 520
},
{
"epoch": 0.8399366085578447,
"grad_norm": 0.7832110524177551,
"learning_rate": 8.399366085578448e-05,
"loss": 0.0673,
"step": 530
},
{
"epoch": 0.8557844690966719,
"grad_norm": 0.8280210494995117,
"learning_rate": 8.55784469096672e-05,
"loss": 0.0729,
"step": 540
},
{
"epoch": 0.8716323296354992,
"grad_norm": 0.4729553461074829,
"learning_rate": 8.716323296354992e-05,
"loss": 0.0802,
"step": 550
},
{
"epoch": 0.8874801901743264,
"grad_norm": 0.49598968029022217,
"learning_rate": 8.874801901743265e-05,
"loss": 0.0707,
"step": 560
},
{
"epoch": 0.9033280507131537,
"grad_norm": 0.5164231657981873,
"learning_rate": 9.033280507131537e-05,
"loss": 0.0723,
"step": 570
},
{
"epoch": 0.919175911251981,
"grad_norm": 0.8574791550636292,
"learning_rate": 9.19175911251981e-05,
"loss": 0.0691,
"step": 580
},
{
"epoch": 0.9350237717908082,
"grad_norm": 0.6576387286186218,
"learning_rate": 9.350237717908083e-05,
"loss": 0.0652,
"step": 590
},
{
"epoch": 0.9508716323296355,
"grad_norm": 0.5376480221748352,
"learning_rate": 9.508716323296356e-05,
"loss": 0.0665,
"step": 600
},
{
"epoch": 0.9667194928684627,
"grad_norm": 0.3735605776309967,
"learning_rate": 9.667194928684627e-05,
"loss": 0.0663,
"step": 610
},
{
"epoch": 0.9825673534072901,
"grad_norm": 0.4635787606239319,
"learning_rate": 9.825673534072901e-05,
"loss": 0.0641,
"step": 620
},
{
"epoch": 0.9984152139461173,
"grad_norm": 0.7257930040359497,
"learning_rate": 9.984152139461174e-05,
"loss": 0.0689,
"step": 630
},
{
"epoch": 1.0142630744849446,
"grad_norm": 0.4881118834018707,
"learning_rate": 9.999986095395153e-05,
"loss": 0.0612,
"step": 640
},
{
"epoch": 1.0301109350237718,
"grad_norm": 0.4648587107658386,
"learning_rate": 9.99993803019373e-05,
"loss": 0.0638,
"step": 650
},
{
"epoch": 1.045958795562599,
"grad_norm": 0.47810834646224976,
"learning_rate": 9.999855633063904e-05,
"loss": 0.0594,
"step": 660
},
{
"epoch": 1.0618066561014263,
"grad_norm": 0.5248630046844482,
"learning_rate": 9.999738904571453e-05,
"loss": 0.0613,
"step": 670
},
{
"epoch": 1.0776545166402536,
"grad_norm": 0.5785868167877197,
"learning_rate": 9.999587845517889e-05,
"loss": 0.0658,
"step": 680
},
{
"epoch": 1.0935023771790808,
"grad_norm": 0.5674669742584229,
"learning_rate": 9.999402456940454e-05,
"loss": 0.0584,
"step": 690
},
{
"epoch": 1.109350237717908,
"grad_norm": 0.5009925961494446,
"learning_rate": 9.999182740112115e-05,
"loss": 0.061,
"step": 700
},
{
"epoch": 1.1251980982567353,
"grad_norm": 0.4827341139316559,
"learning_rate": 9.99892869654155e-05,
"loss": 0.0668,
"step": 710
},
{
"epoch": 1.1410459587955626,
"grad_norm": 0.5092630982398987,
"learning_rate": 9.998640327973141e-05,
"loss": 0.0644,
"step": 720
},
{
"epoch": 1.1568938193343898,
"grad_norm": 0.5155372619628906,
"learning_rate": 9.998317636386964e-05,
"loss": 0.0629,
"step": 730
},
{
"epoch": 1.172741679873217,
"grad_norm": 0.49632275104522705,
"learning_rate": 9.997960623998772e-05,
"loss": 0.0617,
"step": 740
},
{
"epoch": 1.1885895404120443,
"grad_norm": 0.44003790616989136,
"learning_rate": 9.997569293259977e-05,
"loss": 0.0492,
"step": 750
},
{
"epoch": 1.2044374009508716,
"grad_norm": 0.5962253212928772,
"learning_rate": 9.997143646857643e-05,
"loss": 0.0606,
"step": 760
},
{
"epoch": 1.2202852614896988,
"grad_norm": 0.5411946177482605,
"learning_rate": 9.996683687714457e-05,
"loss": 0.0613,
"step": 770
},
{
"epoch": 1.236133122028526,
"grad_norm": 0.38321027159690857,
"learning_rate": 9.996189418988715e-05,
"loss": 0.0524,
"step": 780
},
{
"epoch": 1.2519809825673534,
"grad_norm": 0.41845589876174927,
"learning_rate": 9.9956608440743e-05,
"loss": 0.0703,
"step": 790
},
{
"epoch": 1.2678288431061806,
"grad_norm": 0.5127869844436646,
"learning_rate": 9.995097966600655e-05,
"loss": 0.0644,
"step": 800
},
{
"epoch": 1.2836767036450079,
"grad_norm": 0.46290555596351624,
"learning_rate": 9.994500790432762e-05,
"loss": 0.0645,
"step": 810
},
{
"epoch": 1.299524564183835,
"grad_norm": 0.4053901731967926,
"learning_rate": 9.993869319671114e-05,
"loss": 0.0595,
"step": 820
},
{
"epoch": 1.3153724247226624,
"grad_norm": 0.5436721444129944,
"learning_rate": 9.993203558651687e-05,
"loss": 0.0549,
"step": 830
},
{
"epoch": 1.3312202852614896,
"grad_norm": 0.47135990858078003,
"learning_rate": 9.992503511945907e-05,
"loss": 0.0716,
"step": 840
},
{
"epoch": 1.3470681458003169,
"grad_norm": 0.5177005529403687,
"learning_rate": 9.991769184360629e-05,
"loss": 0.0616,
"step": 850
},
{
"epoch": 1.3629160063391441,
"grad_norm": 0.49199798703193665,
"learning_rate": 9.991000580938087e-05,
"loss": 0.0559,
"step": 860
},
{
"epoch": 1.3787638668779714,
"grad_norm": 0.4700946509838104,
"learning_rate": 9.990197706955877e-05,
"loss": 0.0534,
"step": 870
},
{
"epoch": 1.3946117274167986,
"grad_norm": 0.49670886993408203,
"learning_rate": 9.98936056792691e-05,
"loss": 0.0494,
"step": 880
},
{
"epoch": 1.4104595879556259,
"grad_norm": 0.42340198159217834,
"learning_rate": 9.988489169599373e-05,
"loss": 0.0577,
"step": 890
},
{
"epoch": 1.4263074484944531,
"grad_norm": 0.395857036113739,
"learning_rate": 9.987583517956703e-05,
"loss": 0.0465,
"step": 900
},
{
"epoch": 1.4421553090332804,
"grad_norm": 0.4153169095516205,
"learning_rate": 9.986643619217524e-05,
"loss": 0.0497,
"step": 910
},
{
"epoch": 1.4580031695721076,
"grad_norm": 0.36922216415405273,
"learning_rate": 9.985669479835629e-05,
"loss": 0.055,
"step": 920
},
{
"epoch": 1.473851030110935,
"grad_norm": 0.43492555618286133,
"learning_rate": 9.984661106499913e-05,
"loss": 0.0459,
"step": 930
},
{
"epoch": 1.4896988906497624,
"grad_norm": 0.4480608403682709,
"learning_rate": 9.983618506134344e-05,
"loss": 0.0567,
"step": 940
},
{
"epoch": 1.5055467511885894,
"grad_norm": 0.4628249704837799,
"learning_rate": 9.98254168589791e-05,
"loss": 0.0595,
"step": 950
},
{
"epoch": 1.5213946117274166,
"grad_norm": 0.2762962877750397,
"learning_rate": 9.98143065318456e-05,
"loss": 0.055,
"step": 960
},
{
"epoch": 1.537242472266244,
"grad_norm": 0.4730890095233917,
"learning_rate": 9.980285415623172e-05,
"loss": 0.0571,
"step": 970
},
{
"epoch": 1.5530903328050714,
"grad_norm": 0.5532105565071106,
"learning_rate": 9.979105981077483e-05,
"loss": 0.05,
"step": 980
},
{
"epoch": 1.5689381933438986,
"grad_norm": 0.3818027675151825,
"learning_rate": 9.977892357646049e-05,
"loss": 0.0564,
"step": 990
},
{
"epoch": 1.5847860538827259,
"grad_norm": 0.4600647985935211,
"learning_rate": 9.976644553662178e-05,
"loss": 0.0517,
"step": 1000
},
{
"epoch": 1.6006339144215531,
"grad_norm": 0.39071592688560486,
"learning_rate": 9.975362577693879e-05,
"loss": 0.0583,
"step": 1010
},
{
"epoch": 1.6164817749603804,
"grad_norm": 0.5415641069412231,
"learning_rate": 9.974046438543805e-05,
"loss": 0.0549,
"step": 1020
},
{
"epoch": 1.6323296354992076,
"grad_norm": 0.4827348291873932,
"learning_rate": 9.972696145249185e-05,
"loss": 0.0537,
"step": 1030
},
{
"epoch": 1.6481774960380349,
"grad_norm": 0.4411364495754242,
"learning_rate": 9.971311707081769e-05,
"loss": 0.0474,
"step": 1040
},
{
"epoch": 1.6640253565768621,
"grad_norm": 0.46820345520973206,
"learning_rate": 9.96989313354776e-05,
"loss": 0.0491,
"step": 1050
},
{
"epoch": 1.6798732171156894,
"grad_norm": 0.38286513090133667,
"learning_rate": 9.968440434387756e-05,
"loss": 0.0536,
"step": 1060
},
{
"epoch": 1.6957210776545166,
"grad_norm": 0.45724204182624817,
"learning_rate": 9.966953619576667e-05,
"loss": 0.0543,
"step": 1070
},
{
"epoch": 1.7115689381933439,
"grad_norm": 0.4794733226299286,
"learning_rate": 9.965432699323669e-05,
"loss": 0.0456,
"step": 1080
},
{
"epoch": 1.7274167987321711,
"grad_norm": 0.4839860498905182,
"learning_rate": 9.963877684072113e-05,
"loss": 0.0558,
"step": 1090
},
{
"epoch": 1.7432646592709984,
"grad_norm": 0.4146821200847626,
"learning_rate": 9.962288584499466e-05,
"loss": 0.0492,
"step": 1100
},
{
"epoch": 1.7591125198098256,
"grad_norm": 0.4119221568107605,
"learning_rate": 9.960665411517235e-05,
"loss": 0.045,
"step": 1110
},
{
"epoch": 1.7749603803486529,
"grad_norm": 0.5363435745239258,
"learning_rate": 9.959008176270892e-05,
"loss": 0.0508,
"step": 1120
},
{
"epoch": 1.7908082408874801,
"grad_norm": 1.4820852279663086,
"learning_rate": 9.957316890139792e-05,
"loss": 0.0546,
"step": 1130
},
{
"epoch": 1.8066561014263076,
"grad_norm": 1.0511815547943115,
"learning_rate": 9.955591564737099e-05,
"loss": 0.0624,
"step": 1140
},
{
"epoch": 1.8225039619651349,
"grad_norm": 1.2796475887298584,
"learning_rate": 9.953832211909715e-05,
"loss": 0.0498,
"step": 1150
},
{
"epoch": 1.8383518225039621,
"grad_norm": 0.44021984934806824,
"learning_rate": 9.952038843738181e-05,
"loss": 0.0545,
"step": 1160
},
{
"epoch": 1.8541996830427894,
"grad_norm": 0.3289697468280792,
"learning_rate": 9.950211472536609e-05,
"loss": 0.0525,
"step": 1170
},
{
"epoch": 1.8700475435816166,
"grad_norm": 0.33451518416404724,
"learning_rate": 9.948350110852587e-05,
"loss": 0.0532,
"step": 1180
},
{
"epoch": 1.8858954041204439,
"grad_norm": 0.28288567066192627,
"learning_rate": 9.946454771467104e-05,
"loss": 0.0554,
"step": 1190
},
{
"epoch": 1.9017432646592711,
"grad_norm": 0.43782156705856323,
"learning_rate": 9.944525467394452e-05,
"loss": 0.0464,
"step": 1200
},
{
"epoch": 1.9175911251980984,
"grad_norm": 0.43314260244369507,
"learning_rate": 9.94256221188214e-05,
"loss": 0.0457,
"step": 1210
},
{
"epoch": 1.9334389857369256,
"grad_norm": 0.3398579955101013,
"learning_rate": 9.940565018410805e-05,
"loss": 0.0479,
"step": 1220
},
{
"epoch": 1.9492868462757529,
"grad_norm": 0.4561791718006134,
"learning_rate": 9.938533900694118e-05,
"loss": 0.0524,
"step": 1230
},
{
"epoch": 1.9651347068145801,
"grad_norm": 0.4242919981479645,
"learning_rate": 9.93646887267869e-05,
"loss": 0.0472,
"step": 1240
},
{
"epoch": 1.9809825673534074,
"grad_norm": 0.4337559938430786,
"learning_rate": 9.934369948543972e-05,
"loss": 0.0483,
"step": 1250
},
{
"epoch": 1.9968304278922346,
"grad_norm": 0.3361351490020752,
"learning_rate": 9.93223714270217e-05,
"loss": 0.0584,
"step": 1260
},
{
"epoch": 2.012678288431062,
"grad_norm": 0.38473933935165405,
"learning_rate": 9.93007046979813e-05,
"loss": 0.0464,
"step": 1270
},
{
"epoch": 2.028526148969889,
"grad_norm": 0.4139537811279297,
"learning_rate": 9.92786994470925e-05,
"loss": 0.0482,
"step": 1280
},
{
"epoch": 2.0443740095087164,
"grad_norm": 0.4201923906803131,
"learning_rate": 9.92563558254537e-05,
"loss": 0.0385,
"step": 1290
},
{
"epoch": 2.0602218700475436,
"grad_norm": 0.36632242798805237,
"learning_rate": 9.923367398648671e-05,
"loss": 0.0482,
"step": 1300
},
{
"epoch": 2.076069730586371,
"grad_norm": 0.5546051859855652,
"learning_rate": 9.921065408593574e-05,
"loss": 0.0522,
"step": 1310
},
{
"epoch": 2.091917591125198,
"grad_norm": 0.34690526127815247,
"learning_rate": 9.918729628186628e-05,
"loss": 0.0493,
"step": 1320
},
{
"epoch": 2.1077654516640254,
"grad_norm": 0.32220637798309326,
"learning_rate": 9.916360073466397e-05,
"loss": 0.0445,
"step": 1330
},
{
"epoch": 2.1236133122028527,
"grad_norm": 0.3459847569465637,
"learning_rate": 9.913956760703363e-05,
"loss": 0.0528,
"step": 1340
},
{
"epoch": 2.13946117274168,
"grad_norm": 0.5802285671234131,
"learning_rate": 9.911519706399798e-05,
"loss": 0.0517,
"step": 1350
},
{
"epoch": 2.155309033280507,
"grad_norm": 0.49526548385620117,
"learning_rate": 9.909048927289668e-05,
"loss": 0.0521,
"step": 1360
},
{
"epoch": 2.1711568938193344,
"grad_norm": 0.48161160945892334,
"learning_rate": 9.906544440338504e-05,
"loss": 0.0486,
"step": 1370
},
{
"epoch": 2.1870047543581617,
"grad_norm": 0.45289692282676697,
"learning_rate": 9.904006262743293e-05,
"loss": 0.0541,
"step": 1380
},
{
"epoch": 2.202852614896989,
"grad_norm": 0.2760493755340576,
"learning_rate": 9.901434411932358e-05,
"loss": 0.0488,
"step": 1390
},
{
"epoch": 2.218700475435816,
"grad_norm": 0.3549060523509979,
"learning_rate": 9.898828905565236e-05,
"loss": 0.0428,
"step": 1400
},
{
"epoch": 2.2345483359746434,
"grad_norm": 0.39921554923057556,
"learning_rate": 9.896189761532563e-05,
"loss": 0.046,
"step": 1410
},
{
"epoch": 2.2503961965134707,
"grad_norm": 0.31093716621398926,
"learning_rate": 9.89351699795594e-05,
"loss": 0.0507,
"step": 1420
},
{
"epoch": 2.266244057052298,
"grad_norm": 0.48712223768234253,
"learning_rate": 9.890810633187825e-05,
"loss": 0.0537,
"step": 1430
},
{
"epoch": 2.282091917591125,
"grad_norm": 0.2943997085094452,
"learning_rate": 9.888070685811389e-05,
"loss": 0.0434,
"step": 1440
},
{
"epoch": 2.2979397781299524,
"grad_norm": 0.5522529482841492,
"learning_rate": 9.885297174640401e-05,
"loss": 0.0508,
"step": 1450
},
{
"epoch": 2.3137876386687797,
"grad_norm": 0.43696942925453186,
"learning_rate": 9.882490118719095e-05,
"loss": 0.0469,
"step": 1460
},
{
"epoch": 2.329635499207607,
"grad_norm": 0.34890133142471313,
"learning_rate": 9.87964953732204e-05,
"loss": 0.0406,
"step": 1470
},
{
"epoch": 2.345483359746434,
"grad_norm": 0.4267130494117737,
"learning_rate": 9.876775449954003e-05,
"loss": 0.0482,
"step": 1480
},
{
"epoch": 2.3613312202852614,
"grad_norm": 0.44068190455436707,
"learning_rate": 9.873867876349822e-05,
"loss": 0.0509,
"step": 1490
},
{
"epoch": 2.3771790808240887,
"grad_norm": 0.6220930814743042,
"learning_rate": 9.870926836474265e-05,
"loss": 0.0451,
"step": 1500
},
{
"epoch": 2.393026941362916,
"grad_norm": 0.33855652809143066,
"learning_rate": 9.867952350521899e-05,
"loss": 0.0489,
"step": 1510
},
{
"epoch": 2.408874801901743,
"grad_norm": 0.35320836305618286,
"learning_rate": 9.864944438916943e-05,
"loss": 0.0553,
"step": 1520
},
{
"epoch": 2.4247226624405704,
"grad_norm": 0.44654354453086853,
"learning_rate": 9.861903122313132e-05,
"loss": 0.0523,
"step": 1530
},
{
"epoch": 2.4405705229793977,
"grad_norm": 0.41467657685279846,
"learning_rate": 9.858828421593582e-05,
"loss": 0.0457,
"step": 1540
},
{
"epoch": 2.456418383518225,
"grad_norm": 0.4683903157711029,
"learning_rate": 9.855720357870635e-05,
"loss": 0.0451,
"step": 1550
},
{
"epoch": 2.472266244057052,
"grad_norm": 0.3570249378681183,
"learning_rate": 9.852578952485716e-05,
"loss": 0.0496,
"step": 1560
},
{
"epoch": 2.4881141045958794,
"grad_norm": 0.31039777398109436,
"learning_rate": 9.849404227009196e-05,
"loss": 0.047,
"step": 1570
},
{
"epoch": 2.5039619651347067,
"grad_norm": 0.35661402344703674,
"learning_rate": 9.846196203240234e-05,
"loss": 0.0451,
"step": 1580
},
{
"epoch": 2.519809825673534,
"grad_norm": 0.2695527970790863,
"learning_rate": 9.842954903206634e-05,
"loss": 0.0405,
"step": 1590
},
{
"epoch": 2.535657686212361,
"grad_norm": 0.2955043613910675,
"learning_rate": 9.839680349164684e-05,
"loss": 0.0378,
"step": 1600
},
{
"epoch": 2.5515055467511885,
"grad_norm": 0.32479894161224365,
"learning_rate": 9.836372563599017e-05,
"loss": 0.0398,
"step": 1610
},
{
"epoch": 2.5673534072900157,
"grad_norm": 0.2761167883872986,
"learning_rate": 9.833031569222443e-05,
"loss": 0.0373,
"step": 1620
},
{
"epoch": 2.583201267828843,
"grad_norm": 0.42928674817085266,
"learning_rate": 9.829657388975803e-05,
"loss": 0.0445,
"step": 1630
},
{
"epoch": 2.59904912836767,
"grad_norm": 0.27622172236442566,
"learning_rate": 9.826250046027809e-05,
"loss": 0.038,
"step": 1640
},
{
"epoch": 2.6148969889064975,
"grad_norm": 0.3467934727668762,
"learning_rate": 9.822809563774881e-05,
"loss": 0.0417,
"step": 1650
},
{
"epoch": 2.6307448494453247,
"grad_norm": 0.3505828380584717,
"learning_rate": 9.81933596584099e-05,
"loss": 0.0424,
"step": 1660
},
{
"epoch": 2.6465927099841524,
"grad_norm": 0.38430240750312805,
"learning_rate": 9.815829276077492e-05,
"loss": 0.0407,
"step": 1670
},
{
"epoch": 2.662440570522979,
"grad_norm": 0.280718058347702,
"learning_rate": 9.812289518562975e-05,
"loss": 0.0415,
"step": 1680
},
{
"epoch": 2.678288431061807,
"grad_norm": 0.3132197856903076,
"learning_rate": 9.808716717603076e-05,
"loss": 0.0467,
"step": 1690
},
{
"epoch": 2.6941362916006337,
"grad_norm": 0.43638864159584045,
"learning_rate": 9.80511089773033e-05,
"loss": 0.0488,
"step": 1700
},
{
"epoch": 2.7099841521394614,
"grad_norm": 0.39665859937667847,
"learning_rate": 9.801472083703993e-05,
"loss": 0.043,
"step": 1710
},
{
"epoch": 2.7258320126782882,
"grad_norm": 0.43067046999931335,
"learning_rate": 9.797800300509879e-05,
"loss": 0.044,
"step": 1720
},
{
"epoch": 2.741679873217116,
"grad_norm": 0.4771805703639984,
"learning_rate": 9.794095573360173e-05,
"loss": 0.0428,
"step": 1730
},
{
"epoch": 2.7575277337559427,
"grad_norm": 0.28175461292266846,
"learning_rate": 9.790357927693282e-05,
"loss": 0.0407,
"step": 1740
},
{
"epoch": 2.7733755942947704,
"grad_norm": 0.3774772584438324,
"learning_rate": 9.786587389173639e-05,
"loss": 0.0526,
"step": 1750
},
{
"epoch": 2.7892234548335972,
"grad_norm": 0.38130345940589905,
"learning_rate": 9.782783983691534e-05,
"loss": 0.0397,
"step": 1760
},
{
"epoch": 2.805071315372425,
"grad_norm": 0.3435608744621277,
"learning_rate": 9.778947737362942e-05,
"loss": 0.0421,
"step": 1770
},
{
"epoch": 2.8209191759112517,
"grad_norm": 0.3259428143501282,
"learning_rate": 9.775078676529338e-05,
"loss": 0.0534,
"step": 1780
},
{
"epoch": 2.8367670364500794,
"grad_norm": 0.37528592348098755,
"learning_rate": 9.771176827757512e-05,
"loss": 0.0397,
"step": 1790
},
{
"epoch": 2.8526148969889062,
"grad_norm": 0.22234618663787842,
"learning_rate": 9.767242217839397e-05,
"loss": 0.0425,
"step": 1800
},
{
"epoch": 2.868462757527734,
"grad_norm": 0.366315096616745,
"learning_rate": 9.763274873791874e-05,
"loss": 0.048,
"step": 1810
},
{
"epoch": 2.8843106180665607,
"grad_norm": 0.2822195887565613,
"learning_rate": 9.759274822856598e-05,
"loss": 0.0394,
"step": 1820
},
{
"epoch": 2.9001584786053884,
"grad_norm": 0.2494378387928009,
"learning_rate": 9.7552420924998e-05,
"loss": 0.0525,
"step": 1830
},
{
"epoch": 2.9160063391442153,
"grad_norm": 0.3741839528083801,
"learning_rate": 9.751176710412106e-05,
"loss": 0.0406,
"step": 1840
},
{
"epoch": 2.931854199683043,
"grad_norm": 0.436900794506073,
"learning_rate": 9.747078704508343e-05,
"loss": 0.04,
"step": 1850
},
{
"epoch": 2.94770206022187,
"grad_norm": 0.4009973704814911,
"learning_rate": 9.742948102927351e-05,
"loss": 0.0379,
"step": 1860
},
{
"epoch": 2.9635499207606975,
"grad_norm": 0.34224581718444824,
"learning_rate": 9.738784934031781e-05,
"loss": 0.0383,
"step": 1870
},
{
"epoch": 2.9793977812995247,
"grad_norm": 0.32024386525154114,
"learning_rate": 9.734589226407913e-05,
"loss": 0.0421,
"step": 1880
},
{
"epoch": 2.995245641838352,
"grad_norm": 0.2870291769504547,
"learning_rate": 9.730361008865452e-05,
"loss": 0.038,
"step": 1890
},
{
"epoch": 3.011093502377179,
"grad_norm": 0.30015242099761963,
"learning_rate": 9.726100310437327e-05,
"loss": 0.0427,
"step": 1900
},
{
"epoch": 3.0269413629160065,
"grad_norm": 0.37298670411109924,
"learning_rate": 9.721807160379503e-05,
"loss": 0.0368,
"step": 1910
},
{
"epoch": 3.0427892234548337,
"grad_norm": 0.3384378254413605,
"learning_rate": 9.717481588170765e-05,
"loss": 0.0338,
"step": 1920
},
{
"epoch": 3.058637083993661,
"grad_norm": 0.456691175699234,
"learning_rate": 9.713123623512532e-05,
"loss": 0.0406,
"step": 1930
},
{
"epoch": 3.074484944532488,
"grad_norm": 0.28940680623054504,
"learning_rate": 9.70873329632864e-05,
"loss": 0.039,
"step": 1940
},
{
"epoch": 3.0903328050713155,
"grad_norm": 0.3975684940814972,
"learning_rate": 9.704310636765142e-05,
"loss": 0.0445,
"step": 1950
},
{
"epoch": 3.1061806656101427,
"grad_norm": 0.33805158734321594,
"learning_rate": 9.699855675190099e-05,
"loss": 0.0452,
"step": 1960
},
{
"epoch": 3.12202852614897,
"grad_norm": 0.4213513433933258,
"learning_rate": 9.695368442193378e-05,
"loss": 0.0371,
"step": 1970
},
{
"epoch": 3.1378763866877972,
"grad_norm": 0.3289247751235962,
"learning_rate": 9.69084896858643e-05,
"loss": 0.0345,
"step": 1980
},
{
"epoch": 3.1537242472266245,
"grad_norm": 0.3328181505203247,
"learning_rate": 9.68629728540209e-05,
"loss": 0.0436,
"step": 1990
},
{
"epoch": 3.1695721077654517,
"grad_norm": 0.37982073426246643,
"learning_rate": 9.681713423894359e-05,
"loss": 0.0415,
"step": 2000
},
{
"epoch": 3.185419968304279,
"grad_norm": 0.4652085602283478,
"learning_rate": 9.677097415538186e-05,
"loss": 0.0391,
"step": 2010
},
{
"epoch": 3.2012678288431062,
"grad_norm": 0.44633859395980835,
"learning_rate": 9.672449292029257e-05,
"loss": 0.0375,
"step": 2020
},
{
"epoch": 3.2171156893819335,
"grad_norm": 0.4091266095638275,
"learning_rate": 9.66776908528378e-05,
"loss": 0.042,
"step": 2030
},
{
"epoch": 3.2329635499207607,
"grad_norm": 0.37333956360816956,
"learning_rate": 9.663056827438252e-05,
"loss": 0.0416,
"step": 2040
},
{
"epoch": 3.248811410459588,
"grad_norm": 0.3241555988788605,
"learning_rate": 9.65831255084926e-05,
"loss": 0.0375,
"step": 2050
},
{
"epoch": 3.2646592709984152,
"grad_norm": 0.3356478214263916,
"learning_rate": 9.653536288093237e-05,
"loss": 0.0379,
"step": 2060
},
{
"epoch": 3.2805071315372425,
"grad_norm": 0.3036734163761139,
"learning_rate": 9.648728071966251e-05,
"loss": 0.0401,
"step": 2070
},
{
"epoch": 3.2963549920760697,
"grad_norm": 0.3258291780948639,
"learning_rate": 9.64388793548378e-05,
"loss": 0.0403,
"step": 2080
},
{
"epoch": 3.312202852614897,
"grad_norm": 0.24130631983280182,
"learning_rate": 9.639015911880478e-05,
"loss": 0.0429,
"step": 2090
},
{
"epoch": 3.3280507131537242,
"grad_norm": 0.32036861777305603,
"learning_rate": 9.634112034609955e-05,
"loss": 0.044,
"step": 2100
},
{
"epoch": 3.3438985736925515,
"grad_norm": 0.4092555046081543,
"learning_rate": 9.629176337344538e-05,
"loss": 0.04,
"step": 2110
},
{
"epoch": 3.3597464342313788,
"grad_norm": 0.2772470712661743,
"learning_rate": 9.62420885397505e-05,
"loss": 0.0441,
"step": 2120
},
{
"epoch": 3.375594294770206,
"grad_norm": 0.26047825813293457,
"learning_rate": 9.619209618610569e-05,
"loss": 0.0465,
"step": 2130
},
{
"epoch": 3.3914421553090333,
"grad_norm": 0.3633308410644531,
"learning_rate": 9.614178665578199e-05,
"loss": 0.0366,
"step": 2140
},
{
"epoch": 3.4072900158478605,
"grad_norm": 0.3024093806743622,
"learning_rate": 9.609116029422834e-05,
"loss": 0.0394,
"step": 2150
},
{
"epoch": 3.4231378763866878,
"grad_norm": 0.30647802352905273,
"learning_rate": 9.604021744906915e-05,
"loss": 0.0378,
"step": 2160
},
{
"epoch": 3.438985736925515,
"grad_norm": 0.36456671357154846,
"learning_rate": 9.598895847010198e-05,
"loss": 0.0378,
"step": 2170
},
{
"epoch": 3.4548335974643423,
"grad_norm": 0.28538623452186584,
"learning_rate": 9.593738370929513e-05,
"loss": 0.0367,
"step": 2180
},
{
"epoch": 3.4706814580031695,
"grad_norm": 0.3848700523376465,
"learning_rate": 9.588549352078517e-05,
"loss": 0.0376,
"step": 2190
},
{
"epoch": 3.4865293185419968,
"grad_norm": 0.4077780842781067,
"learning_rate": 9.583328826087456e-05,
"loss": 0.0401,
"step": 2200
},
{
"epoch": 3.502377179080824,
"grad_norm": 0.29360130429267883,
"learning_rate": 9.578076828802922e-05,
"loss": 0.0377,
"step": 2210
},
{
"epoch": 3.5182250396196513,
"grad_norm": 0.23208092153072357,
"learning_rate": 9.572793396287598e-05,
"loss": 0.0456,
"step": 2220
},
{
"epoch": 3.5340729001584785,
"grad_norm": 0.2558813691139221,
"learning_rate": 9.567478564820019e-05,
"loss": 0.032,
"step": 2230
},
{
"epoch": 3.5499207606973058,
"grad_norm": 0.24153469502925873,
"learning_rate": 9.562132370894321e-05,
"loss": 0.0374,
"step": 2240
},
{
"epoch": 3.565768621236133,
"grad_norm": 0.3506905436515808,
"learning_rate": 9.55675485121999e-05,
"loss": 0.0455,
"step": 2250
},
{
"epoch": 3.5816164817749603,
"grad_norm": 0.3719404637813568,
"learning_rate": 9.551346042721604e-05,
"loss": 0.042,
"step": 2260
},
{
"epoch": 3.5974643423137875,
"grad_norm": 0.27092769742012024,
"learning_rate": 9.545905982538592e-05,
"loss": 0.0351,
"step": 2270
},
{
"epoch": 3.613312202852615,
"grad_norm": 0.492046982049942,
"learning_rate": 9.540434708024966e-05,
"loss": 0.0414,
"step": 2280
},
{
"epoch": 3.629160063391442,
"grad_norm": 0.3070124387741089,
"learning_rate": 9.534932256749074e-05,
"loss": 0.0396,
"step": 2290
},
{
"epoch": 3.6450079239302693,
"grad_norm": 0.30400729179382324,
"learning_rate": 9.529398666493336e-05,
"loss": 0.0356,
"step": 2300
},
{
"epoch": 3.6608557844690965,
"grad_norm": 0.26050692796707153,
"learning_rate": 9.523833975253988e-05,
"loss": 0.0419,
"step": 2310
},
{
"epoch": 3.676703645007924,
"grad_norm": 0.27897489070892334,
"learning_rate": 9.51823822124082e-05,
"loss": 0.034,
"step": 2320
},
{
"epoch": 3.692551505546751,
"grad_norm": 0.3234636187553406,
"learning_rate": 9.512611442876914e-05,
"loss": 0.0428,
"step": 2330
},
{
"epoch": 3.7083993660855783,
"grad_norm": 0.3589284121990204,
"learning_rate": 9.506953678798378e-05,
"loss": 0.0387,
"step": 2340
},
{
"epoch": 3.7242472266244055,
"grad_norm": 0.4306239187717438,
"learning_rate": 9.501264967854084e-05,
"loss": 0.0474,
"step": 2350
},
{
"epoch": 3.740095087163233,
"grad_norm": 0.22645992040634155,
"learning_rate": 9.495545349105401e-05,
"loss": 0.0385,
"step": 2360
},
{
"epoch": 3.75594294770206,
"grad_norm": 0.3852635622024536,
"learning_rate": 9.489794861825923e-05,
"loss": 0.0345,
"step": 2370
},
{
"epoch": 3.7717908082408877,
"grad_norm": 0.27143415808677673,
"learning_rate": 9.484013545501203e-05,
"loss": 0.0361,
"step": 2380
},
{
"epoch": 3.7876386687797146,
"grad_norm": 0.28437864780426025,
"learning_rate": 9.47820143982848e-05,
"loss": 0.0341,
"step": 2390
},
{
"epoch": 3.8034865293185423,
"grad_norm": 0.2932587265968323,
"learning_rate": 9.472358584716408e-05,
"loss": 0.0462,
"step": 2400
},
{
"epoch": 3.819334389857369,
"grad_norm": 0.4200306236743927,
"learning_rate": 9.466485020284782e-05,
"loss": 0.0421,
"step": 2410
},
{
"epoch": 3.8351822503961968,
"grad_norm": 0.41429468989372253,
"learning_rate": 9.46058078686426e-05,
"loss": 0.0429,
"step": 2420
},
{
"epoch": 3.8510301109350236,
"grad_norm": 0.29475075006484985,
"learning_rate": 9.454645924996087e-05,
"loss": 0.036,
"step": 2430
},
{
"epoch": 3.8668779714738513,
"grad_norm": 0.36007851362228394,
"learning_rate": 9.448680475431819e-05,
"loss": 0.0343,
"step": 2440
},
{
"epoch": 3.882725832012678,
"grad_norm": 0.31012412905693054,
"learning_rate": 9.442684479133044e-05,
"loss": 0.0328,
"step": 2450
},
{
"epoch": 3.8985736925515058,
"grad_norm": 0.3186264932155609,
"learning_rate": 9.436657977271093e-05,
"loss": 0.0357,
"step": 2460
},
{
"epoch": 3.9144215530903326,
"grad_norm": 0.289347380399704,
"learning_rate": 9.430601011226763e-05,
"loss": 0.0322,
"step": 2470
},
{
"epoch": 3.9302694136291603,
"grad_norm": 0.8456028699874878,
"learning_rate": 9.424513622590038e-05,
"loss": 0.0368,
"step": 2480
},
{
"epoch": 3.946117274167987,
"grad_norm": 0.2324322909116745,
"learning_rate": 9.418395853159793e-05,
"loss": 0.0334,
"step": 2490
},
{
"epoch": 3.9619651347068148,
"grad_norm": 0.24869747459888458,
"learning_rate": 9.412247744943512e-05,
"loss": 0.0333,
"step": 2500
},
{
"epoch": 3.9778129952456416,
"grad_norm": 0.31198471784591675,
"learning_rate": 9.406069340157003e-05,
"loss": 0.0335,
"step": 2510
},
{
"epoch": 3.9936608557844693,
"grad_norm": 0.3861044645309448,
"learning_rate": 9.399860681224098e-05,
"loss": 0.0363,
"step": 2520
},
{
"epoch": 4.009508716323296,
"grad_norm": 0.21961505711078644,
"learning_rate": 9.393621810776376e-05,
"loss": 0.0353,
"step": 2530
},
{
"epoch": 4.025356576862124,
"grad_norm": 0.28296446800231934,
"learning_rate": 9.387352771652856e-05,
"loss": 0.0438,
"step": 2540
},
{
"epoch": 4.041204437400951,
"grad_norm": 0.48964765667915344,
"learning_rate": 9.381053606899713e-05,
"loss": 0.0346,
"step": 2550
},
{
"epoch": 4.057052297939778,
"grad_norm": 0.38259637355804443,
"learning_rate": 9.374724359769979e-05,
"loss": 0.0342,
"step": 2560
},
{
"epoch": 4.072900158478605,
"grad_norm": 0.29834380745887756,
"learning_rate": 9.368365073723241e-05,
"loss": 0.031,
"step": 2570
},
{
"epoch": 4.088748019017433,
"grad_norm": 0.41095930337905884,
"learning_rate": 9.361975792425356e-05,
"loss": 0.0344,
"step": 2580
},
{
"epoch": 4.10459587955626,
"grad_norm": 0.21189731359481812,
"learning_rate": 9.355556559748133e-05,
"loss": 0.0367,
"step": 2590
},
{
"epoch": 4.120443740095087,
"grad_norm": 0.2741738557815552,
"learning_rate": 9.349107419769048e-05,
"loss": 0.0366,
"step": 2600
},
{
"epoch": 4.136291600633914,
"grad_norm": 0.28367432951927185,
"learning_rate": 9.342628416770928e-05,
"loss": 0.0301,
"step": 2610
},
{
"epoch": 4.152139461172742,
"grad_norm": 0.29805341362953186,
"learning_rate": 9.336119595241665e-05,
"loss": 0.0332,
"step": 2620
},
{
"epoch": 4.167987321711569,
"grad_norm": 0.340262770652771,
"learning_rate": 9.329580999873887e-05,
"loss": 0.0332,
"step": 2630
},
{
"epoch": 4.183835182250396,
"grad_norm": 0.2894122302532196,
"learning_rate": 9.323012675564668e-05,
"loss": 0.0333,
"step": 2640
},
{
"epoch": 4.199683042789223,
"grad_norm": 0.2781189978122711,
"learning_rate": 9.316414667415216e-05,
"loss": 0.0348,
"step": 2650
},
{
"epoch": 4.215530903328051,
"grad_norm": 0.321756511926651,
"learning_rate": 9.309787020730562e-05,
"loss": 0.0303,
"step": 2660
},
{
"epoch": 4.231378763866878,
"grad_norm": 0.275852233171463,
"learning_rate": 9.303129781019249e-05,
"loss": 0.0407,
"step": 2670
},
{
"epoch": 4.247226624405705,
"grad_norm": 0.44196420907974243,
"learning_rate": 9.296442993993015e-05,
"loss": 0.0395,
"step": 2680
},
{
"epoch": 4.263074484944532,
"grad_norm": 0.2846081852912903,
"learning_rate": 9.289726705566491e-05,
"loss": 0.0344,
"step": 2690
},
{
"epoch": 4.27892234548336,
"grad_norm": 0.31943535804748535,
"learning_rate": 9.282980961856875e-05,
"loss": 0.0388,
"step": 2700
},
{
"epoch": 4.294770206022187,
"grad_norm": 0.4001297354698181,
"learning_rate": 9.276205809183618e-05,
"loss": 0.0366,
"step": 2710
},
{
"epoch": 4.310618066561014,
"grad_norm": 0.2874903976917267,
"learning_rate": 9.26940129406811e-05,
"loss": 0.0302,
"step": 2720
},
{
"epoch": 4.326465927099841,
"grad_norm": 0.3430187404155731,
"learning_rate": 9.262567463233352e-05,
"loss": 0.0368,
"step": 2730
},
{
"epoch": 4.342313787638669,
"grad_norm": 0.3248710632324219,
"learning_rate": 9.255704363603645e-05,
"loss": 0.0337,
"step": 2740
},
{
"epoch": 4.358161648177496,
"grad_norm": 0.3309313654899597,
"learning_rate": 9.248812042304263e-05,
"loss": 0.0328,
"step": 2750
},
{
"epoch": 4.374009508716323,
"grad_norm": 0.2918064594268799,
"learning_rate": 9.24189054666113e-05,
"loss": 0.0394,
"step": 2760
},
{
"epoch": 4.38985736925515,
"grad_norm": 0.35082730650901794,
"learning_rate": 9.23493992420049e-05,
"loss": 0.0406,
"step": 2770
},
{
"epoch": 4.405705229793978,
"grad_norm": 0.32973727583885193,
"learning_rate": 9.227960222648593e-05,
"loss": 0.034,
"step": 2780
},
{
"epoch": 4.4215530903328055,
"grad_norm": 0.23779386281967163,
"learning_rate": 9.220951489931352e-05,
"loss": 0.0371,
"step": 2790
},
{
"epoch": 4.437400950871632,
"grad_norm": 0.2471320629119873,
"learning_rate": 9.213913774174028e-05,
"loss": 0.0317,
"step": 2800
},
{
"epoch": 4.453248811410459,
"grad_norm": 0.3636610805988312,
"learning_rate": 9.20684712370089e-05,
"loss": 0.0356,
"step": 2810
},
{
"epoch": 4.469096671949287,
"grad_norm": 0.18174231052398682,
"learning_rate": 9.199751587034887e-05,
"loss": 0.0258,
"step": 2820
},
{
"epoch": 4.4849445324881145,
"grad_norm": 0.20908503234386444,
"learning_rate": 9.192627212897315e-05,
"loss": 0.0368,
"step": 2830
},
{
"epoch": 4.500792393026941,
"grad_norm": 0.27427220344543457,
"learning_rate": 9.185474050207478e-05,
"loss": 0.0382,
"step": 2840
},
{
"epoch": 4.516640253565768,
"grad_norm": 0.35455378890037537,
"learning_rate": 9.178292148082362e-05,
"loss": 0.0338,
"step": 2850
},
{
"epoch": 4.532488114104596,
"grad_norm": 0.3077165484428406,
"learning_rate": 9.171081555836287e-05,
"loss": 0.032,
"step": 2860
},
{
"epoch": 4.5483359746434235,
"grad_norm": 0.29954010248184204,
"learning_rate": 9.163842322980573e-05,
"loss": 0.0363,
"step": 2870
},
{
"epoch": 4.56418383518225,
"grad_norm": 0.23956748843193054,
"learning_rate": 9.156574499223202e-05,
"loss": 0.0319,
"step": 2880
},
{
"epoch": 4.580031695721077,
"grad_norm": 0.24991659820079803,
"learning_rate": 9.149278134468472e-05,
"loss": 0.0351,
"step": 2890
},
{
"epoch": 4.595879556259905,
"grad_norm": 0.35879701375961304,
"learning_rate": 9.141953278816661e-05,
"loss": 0.0364,
"step": 2900
},
{
"epoch": 4.6117274167987325,
"grad_norm": 0.2529746890068054,
"learning_rate": 9.134599982563674e-05,
"loss": 0.0357,
"step": 2910
},
{
"epoch": 4.627575277337559,
"grad_norm": 0.23599006235599518,
"learning_rate": 9.127218296200705e-05,
"loss": 0.0363,
"step": 2920
},
{
"epoch": 4.643423137876387,
"grad_norm": 0.3693040907382965,
"learning_rate": 9.119808270413891e-05,
"loss": 0.036,
"step": 2930
},
{
"epoch": 4.659270998415214,
"grad_norm": 0.37512966990470886,
"learning_rate": 9.112369956083953e-05,
"loss": 0.0379,
"step": 2940
},
{
"epoch": 4.675118858954042,
"grad_norm": 0.35540756583213806,
"learning_rate": 9.104903404285862e-05,
"loss": 0.0305,
"step": 2950
},
{
"epoch": 4.690966719492868,
"grad_norm": 0.4176557660102844,
"learning_rate": 9.097408666288475e-05,
"loss": 0.0355,
"step": 2960
},
{
"epoch": 4.706814580031696,
"grad_norm": 0.28811272978782654,
"learning_rate": 9.089885793554195e-05,
"loss": 0.0376,
"step": 2970
},
{
"epoch": 4.722662440570523,
"grad_norm": 0.3358956575393677,
"learning_rate": 9.082334837738607e-05,
"loss": 0.0368,
"step": 2980
},
{
"epoch": 4.738510301109351,
"grad_norm": 0.3090055584907532,
"learning_rate": 9.074755850690127e-05,
"loss": 0.0326,
"step": 2990
},
{
"epoch": 4.754358161648177,
"grad_norm": 0.24217335879802704,
"learning_rate": 9.067148884449647e-05,
"loss": 0.0271,
"step": 3000
},
{
"epoch": 4.770206022187005,
"grad_norm": 0.361965149641037,
"learning_rate": 9.059513991250181e-05,
"loss": 0.0361,
"step": 3010
},
{
"epoch": 4.786053882725832,
"grad_norm": 0.36846402287483215,
"learning_rate": 9.051851223516501e-05,
"loss": 0.0381,
"step": 3020
},
{
"epoch": 4.80190174326466,
"grad_norm": 0.3030705451965332,
"learning_rate": 9.044160633864776e-05,
"loss": 0.0363,
"step": 3030
},
{
"epoch": 4.817749603803486,
"grad_norm": 0.40651705861091614,
"learning_rate": 9.036442275102213e-05,
"loss": 0.0305,
"step": 3040
},
{
"epoch": 4.833597464342314,
"grad_norm": 0.2696928381919861,
"learning_rate": 9.0286962002267e-05,
"loss": 0.0386,
"step": 3050
},
{
"epoch": 4.849445324881141,
"grad_norm": 0.3362119197845459,
"learning_rate": 9.020922462426433e-05,
"loss": 0.0318,
"step": 3060
},
{
"epoch": 4.865293185419969,
"grad_norm": 0.21661606431007385,
"learning_rate": 9.013121115079557e-05,
"loss": 0.0338,
"step": 3070
},
{
"epoch": 4.881141045958795,
"grad_norm": 0.2977627217769623,
"learning_rate": 9.005292211753792e-05,
"loss": 0.0323,
"step": 3080
},
{
"epoch": 4.896988906497623,
"grad_norm": 0.3265908658504486,
"learning_rate": 8.997435806206078e-05,
"loss": 0.032,
"step": 3090
},
{
"epoch": 4.91283676703645,
"grad_norm": 0.45224496722221375,
"learning_rate": 8.989551952382192e-05,
"loss": 0.0347,
"step": 3100
},
{
"epoch": 4.928684627575278,
"grad_norm": 0.3116205930709839,
"learning_rate": 8.981640704416385e-05,
"loss": 0.0278,
"step": 3110
},
{
"epoch": 4.944532488114104,
"grad_norm": 0.38788729906082153,
"learning_rate": 8.97370211663101e-05,
"loss": 0.0356,
"step": 3120
},
{
"epoch": 4.960380348652932,
"grad_norm": 0.3053205609321594,
"learning_rate": 8.965736243536152e-05,
"loss": 0.0298,
"step": 3130
},
{
"epoch": 4.976228209191759,
"grad_norm": 0.3261253535747528,
"learning_rate": 8.957743139829243e-05,
"loss": 0.038,
"step": 3140
},
{
"epoch": 4.992076069730587,
"grad_norm": 0.3000582158565521,
"learning_rate": 8.949722860394693e-05,
"loss": 0.0485,
"step": 3150
},
{
"epoch": 5.007923930269413,
"grad_norm": 0.3081798553466797,
"learning_rate": 8.941675460303522e-05,
"loss": 0.0401,
"step": 3160
},
{
"epoch": 5.023771790808241,
"grad_norm": 0.29715317487716675,
"learning_rate": 8.933600994812965e-05,
"loss": 0.0314,
"step": 3170
},
{
"epoch": 5.039619651347068,
"grad_norm": 0.20959503948688507,
"learning_rate": 8.925499519366102e-05,
"loss": 0.0344,
"step": 3180
},
{
"epoch": 5.055467511885896,
"grad_norm": 0.34640997648239136,
"learning_rate": 8.917371089591482e-05,
"loss": 0.0324,
"step": 3190
},
{
"epoch": 5.071315372424722,
"grad_norm": 0.29564642906188965,
"learning_rate": 8.909215761302728e-05,
"loss": 0.0404,
"step": 3200
},
{
"epoch": 5.08716323296355,
"grad_norm": 0.29282501339912415,
"learning_rate": 8.90103359049816e-05,
"loss": 0.0317,
"step": 3210
},
{
"epoch": 5.103011093502377,
"grad_norm": 0.3910326063632965,
"learning_rate": 8.892824633360419e-05,
"loss": 0.0297,
"step": 3220
},
{
"epoch": 5.118858954041205,
"grad_norm": 0.30237722396850586,
"learning_rate": 8.884588946256069e-05,
"loss": 0.0372,
"step": 3230
},
{
"epoch": 5.134706814580031,
"grad_norm": 0.3003133535385132,
"learning_rate": 8.876326585735213e-05,
"loss": 0.0332,
"step": 3240
},
{
"epoch": 5.150554675118859,
"grad_norm": 0.2812441885471344,
"learning_rate": 8.868037608531108e-05,
"loss": 0.0315,
"step": 3250
},
{
"epoch": 5.166402535657686,
"grad_norm": 0.2651035785675049,
"learning_rate": 8.859722071559777e-05,
"loss": 0.0292,
"step": 3260
},
{
"epoch": 5.182250396196514,
"grad_norm": 0.31288737058639526,
"learning_rate": 8.85138003191961e-05,
"loss": 0.0294,
"step": 3270
},
{
"epoch": 5.19809825673534,
"grad_norm": 0.2833364009857178,
"learning_rate": 8.843011546890978e-05,
"loss": 0.0331,
"step": 3280
},
{
"epoch": 5.213946117274168,
"grad_norm": 0.25718948245048523,
"learning_rate": 8.834616673935839e-05,
"loss": 0.0281,
"step": 3290
},
{
"epoch": 5.229793977812995,
"grad_norm": 0.28992629051208496,
"learning_rate": 8.82619547069734e-05,
"loss": 0.034,
"step": 3300
},
{
"epoch": 5.245641838351823,
"grad_norm": 0.2499540150165558,
"learning_rate": 8.817747994999432e-05,
"loss": 0.027,
"step": 3310
},
{
"epoch": 5.261489698890649,
"grad_norm": 0.25445619225502014,
"learning_rate": 8.80927430484646e-05,
"loss": 0.0316,
"step": 3320
},
{
"epoch": 5.277337559429477,
"grad_norm": 0.28179076313972473,
"learning_rate": 8.800774458422765e-05,
"loss": 0.035,
"step": 3330
},
{
"epoch": 5.293185419968304,
"grad_norm": 0.30823758244514465,
"learning_rate": 8.792248514092299e-05,
"loss": 0.0259,
"step": 3340
},
{
"epoch": 5.309033280507132,
"grad_norm": 0.3379741311073303,
"learning_rate": 8.783696530398207e-05,
"loss": 0.033,
"step": 3350
},
{
"epoch": 5.324881141045958,
"grad_norm": 0.29917508363723755,
"learning_rate": 8.775118566062435e-05,
"loss": 0.0278,
"step": 3360
},
{
"epoch": 5.340729001584786,
"grad_norm": 0.15989099442958832,
"learning_rate": 8.766514679985325e-05,
"loss": 0.0315,
"step": 3370
},
{
"epoch": 5.356576862123613,
"grad_norm": 0.2137162983417511,
"learning_rate": 8.757884931245211e-05,
"loss": 0.0333,
"step": 3380
},
{
"epoch": 5.372424722662441,
"grad_norm": 0.30674856901168823,
"learning_rate": 8.749229379098008e-05,
"loss": 0.0308,
"step": 3390
},
{
"epoch": 5.3882725832012675,
"grad_norm": 0.23785285651683807,
"learning_rate": 8.740548082976814e-05,
"loss": 0.0278,
"step": 3400
},
{
"epoch": 5.404120443740095,
"grad_norm": 0.25887709856033325,
"learning_rate": 8.731841102491494e-05,
"loss": 0.0283,
"step": 3410
},
{
"epoch": 5.419968304278922,
"grad_norm": 0.3679006099700928,
"learning_rate": 8.723108497428276e-05,
"loss": 0.0273,
"step": 3420
},
{
"epoch": 5.43581616481775,
"grad_norm": 0.40523847937583923,
"learning_rate": 8.714350327749337e-05,
"loss": 0.0319,
"step": 3430
},
{
"epoch": 5.4516640253565765,
"grad_norm": 0.2975967228412628,
"learning_rate": 8.705566653592393e-05,
"loss": 0.0382,
"step": 3440
},
{
"epoch": 5.467511885895404,
"grad_norm": 0.27645203471183777,
"learning_rate": 8.696757535270285e-05,
"loss": 0.0413,
"step": 3450
},
{
"epoch": 5.483359746434231,
"grad_norm": 0.23291446268558502,
"learning_rate": 8.68792303327057e-05,
"loss": 0.0306,
"step": 3460
},
{
"epoch": 5.499207606973059,
"grad_norm": 0.34922730922698975,
"learning_rate": 8.679063208255095e-05,
"loss": 0.0299,
"step": 3470
},
{
"epoch": 5.5150554675118855,
"grad_norm": 0.2651195228099823,
"learning_rate": 8.67017812105959e-05,
"loss": 0.0279,
"step": 3480
},
{
"epoch": 5.530903328050713,
"grad_norm": 0.23726455867290497,
"learning_rate": 8.661267832693247e-05,
"loss": 0.0311,
"step": 3490
},
{
"epoch": 5.546751188589541,
"grad_norm": 0.22650249302387238,
"learning_rate": 8.6523324043383e-05,
"loss": 0.0319,
"step": 3500
},
{
"epoch": 5.562599049128368,
"grad_norm": 0.275462806224823,
"learning_rate": 8.643371897349609e-05,
"loss": 0.0328,
"step": 3510
},
{
"epoch": 5.5784469096671945,
"grad_norm": 0.30848929286003113,
"learning_rate": 8.63438637325423e-05,
"loss": 0.0353,
"step": 3520
},
{
"epoch": 5.594294770206022,
"grad_norm": 0.22483864426612854,
"learning_rate": 8.625375893751005e-05,
"loss": 0.0291,
"step": 3530
},
{
"epoch": 5.61014263074485,
"grad_norm": 0.2007935345172882,
"learning_rate": 8.616340520710124e-05,
"loss": 0.0287,
"step": 3540
},
{
"epoch": 5.625990491283677,
"grad_norm": 0.24104808270931244,
"learning_rate": 8.607280316172717e-05,
"loss": 0.0296,
"step": 3550
},
{
"epoch": 5.6418383518225035,
"grad_norm": 0.25262153148651123,
"learning_rate": 8.598195342350413e-05,
"loss": 0.0332,
"step": 3560
},
{
"epoch": 5.657686212361331,
"grad_norm": 0.2854628264904022,
"learning_rate": 8.589085661624915e-05,
"loss": 0.0287,
"step": 3570
},
{
"epoch": 5.673534072900159,
"grad_norm": 0.27987590432167053,
"learning_rate": 8.579951336547583e-05,
"loss": 0.0358,
"step": 3580
},
{
"epoch": 5.689381933438986,
"grad_norm": 0.28694331645965576,
"learning_rate": 8.570792429838994e-05,
"loss": 0.0301,
"step": 3590
},
{
"epoch": 5.705229793977813,
"grad_norm": 0.4414514899253845,
"learning_rate": 8.561609004388511e-05,
"loss": 0.0276,
"step": 3600
},
{
"epoch": 5.72107765451664,
"grad_norm": 0.36731958389282227,
"learning_rate": 8.552401123253857e-05,
"loss": 0.0326,
"step": 3610
},
{
"epoch": 5.736925515055468,
"grad_norm": 0.3216352164745331,
"learning_rate": 8.543168849660682e-05,
"loss": 0.0351,
"step": 3620
},
{
"epoch": 5.752773375594295,
"grad_norm": 0.2965521812438965,
"learning_rate": 8.533912247002116e-05,
"loss": 0.0336,
"step": 3630
},
{
"epoch": 5.768621236133122,
"grad_norm": 0.37146931886672974,
"learning_rate": 8.524631378838357e-05,
"loss": 0.041,
"step": 3640
},
{
"epoch": 5.784469096671949,
"grad_norm": 0.27054694294929504,
"learning_rate": 8.515326308896213e-05,
"loss": 0.0333,
"step": 3650
},
{
"epoch": 5.800316957210777,
"grad_norm": 0.30338549613952637,
"learning_rate": 8.505997101068675e-05,
"loss": 0.0305,
"step": 3660
},
{
"epoch": 5.816164817749604,
"grad_norm": 0.2014935314655304,
"learning_rate": 8.496643819414476e-05,
"loss": 0.0292,
"step": 3670
},
{
"epoch": 5.832012678288431,
"grad_norm": 0.3620418906211853,
"learning_rate": 8.48726652815765e-05,
"loss": 0.0345,
"step": 3680
},
{
"epoch": 5.847860538827258,
"grad_norm": 0.22847791016101837,
"learning_rate": 8.477865291687095e-05,
"loss": 0.038,
"step": 3690
},
{
"epoch": 5.863708399366086,
"grad_norm": 0.42736053466796875,
"learning_rate": 8.468440174556127e-05,
"loss": 0.0341,
"step": 3700
},
{
"epoch": 5.879556259904913,
"grad_norm": 0.2668206989765167,
"learning_rate": 8.458991241482036e-05,
"loss": 0.0365,
"step": 3710
},
{
"epoch": 5.89540412044374,
"grad_norm": 0.24107444286346436,
"learning_rate": 8.449518557345645e-05,
"loss": 0.033,
"step": 3720
},
{
"epoch": 5.911251980982567,
"grad_norm": 0.2556779384613037,
"learning_rate": 8.440022187190864e-05,
"loss": 0.0336,
"step": 3730
},
{
"epoch": 5.927099841521395,
"grad_norm": 0.2224377542734146,
"learning_rate": 8.43050219622424e-05,
"loss": 0.0257,
"step": 3740
},
{
"epoch": 5.942947702060222,
"grad_norm": 0.247999370098114,
"learning_rate": 8.420958649814513e-05,
"loss": 0.0325,
"step": 3750
},
{
"epoch": 5.958795562599049,
"grad_norm": 0.3033657670021057,
"learning_rate": 8.411391613492165e-05,
"loss": 0.0336,
"step": 3760
},
{
"epoch": 5.974643423137876,
"grad_norm": 0.3270326852798462,
"learning_rate": 8.401801152948973e-05,
"loss": 0.0302,
"step": 3770
},
{
"epoch": 5.990491283676704,
"grad_norm": 0.23401206731796265,
"learning_rate": 8.392187334037555e-05,
"loss": 0.0308,
"step": 3780
},
{
"epoch": 6.006339144215531,
"grad_norm": 0.2145588994026184,
"learning_rate": 8.382550222770915e-05,
"loss": 0.035,
"step": 3790
},
{
"epoch": 6.022187004754358,
"grad_norm": 0.27132412791252136,
"learning_rate": 8.372889885321996e-05,
"loss": 0.0313,
"step": 3800
},
{
"epoch": 6.038034865293185,
"grad_norm": 0.21529650688171387,
"learning_rate": 8.363206388023224e-05,
"loss": 0.0297,
"step": 3810
},
{
"epoch": 6.053882725832013,
"grad_norm": 0.25313499569892883,
"learning_rate": 8.353499797366051e-05,
"loss": 0.0255,
"step": 3820
},
{
"epoch": 6.06973058637084,
"grad_norm": 0.19570957124233246,
"learning_rate": 8.343770180000497e-05,
"loss": 0.0275,
"step": 3830
},
{
"epoch": 6.085578446909667,
"grad_norm": 0.24506336450576782,
"learning_rate": 8.334017602734697e-05,
"loss": 0.0268,
"step": 3840
},
{
"epoch": 6.101426307448494,
"grad_norm": 0.21346315741539001,
"learning_rate": 8.324242132534435e-05,
"loss": 0.024,
"step": 3850
},
{
"epoch": 6.117274167987322,
"grad_norm": 0.3212679624557495,
"learning_rate": 8.314443836522692e-05,
"loss": 0.036,
"step": 3860
},
{
"epoch": 6.133122028526149,
"grad_norm": 0.24916702508926392,
"learning_rate": 8.304622781979183e-05,
"loss": 0.0271,
"step": 3870
},
{
"epoch": 6.148969889064976,
"grad_norm": 0.30624908208847046,
"learning_rate": 8.294779036339893e-05,
"loss": 0.0318,
"step": 3880
},
{
"epoch": 6.164817749603803,
"grad_norm": 0.2676468789577484,
"learning_rate": 8.284912667196612e-05,
"loss": 0.0294,
"step": 3890
},
{
"epoch": 6.180665610142631,
"grad_norm": 0.24745798110961914,
"learning_rate": 8.275023742296474e-05,
"loss": 0.0303,
"step": 3900
},
{
"epoch": 6.196513470681458,
"grad_norm": 0.2466627061367035,
"learning_rate": 8.265112329541495e-05,
"loss": 0.0255,
"step": 3910
},
{
"epoch": 6.212361331220285,
"grad_norm": 0.3070094883441925,
"learning_rate": 8.255178496988101e-05,
"loss": 0.0284,
"step": 3920
},
{
"epoch": 6.228209191759112,
"grad_norm": 0.3049757778644562,
"learning_rate": 8.245222312846663e-05,
"loss": 0.0286,
"step": 3930
},
{
"epoch": 6.24405705229794,
"grad_norm": 0.3167661428451538,
"learning_rate": 8.235243845481029e-05,
"loss": 0.0256,
"step": 3940
},
{
"epoch": 6.259904912836767,
"grad_norm": 0.2966691851615906,
"learning_rate": 8.225243163408051e-05,
"loss": 0.0332,
"step": 3950
},
{
"epoch": 6.2757527733755945,
"grad_norm": 0.29441869258880615,
"learning_rate": 8.215220335297124e-05,
"loss": 0.0279,
"step": 3960
},
{
"epoch": 6.291600633914421,
"grad_norm": 0.2598278522491455,
"learning_rate": 8.205175429969701e-05,
"loss": 0.0327,
"step": 3970
},
{
"epoch": 6.307448494453249,
"grad_norm": 0.3308967351913452,
"learning_rate": 8.195108516398834e-05,
"loss": 0.0301,
"step": 3980
},
{
"epoch": 6.323296354992076,
"grad_norm": 0.2924744486808777,
"learning_rate": 8.185019663708689e-05,
"loss": 0.035,
"step": 3990
},
{
"epoch": 6.3391442155309035,
"grad_norm": 0.29859915375709534,
"learning_rate": 8.174908941174078e-05,
"loss": 0.0293,
"step": 4000
},
{
"epoch": 6.35499207606973,
"grad_norm": 0.2642618715763092,
"learning_rate": 8.164776418219982e-05,
"loss": 0.0377,
"step": 4010
},
{
"epoch": 6.370839936608558,
"grad_norm": 0.25345122814178467,
"learning_rate": 8.154622164421075e-05,
"loss": 0.0321,
"step": 4020
},
{
"epoch": 6.386687797147385,
"grad_norm": 0.27396586537361145,
"learning_rate": 8.144446249501244e-05,
"loss": 0.0362,
"step": 4030
},
{
"epoch": 6.4025356576862125,
"grad_norm": 0.23460988700389862,
"learning_rate": 8.13424874333311e-05,
"loss": 0.0288,
"step": 4040
},
{
"epoch": 6.418383518225039,
"grad_norm": 0.268079549074173,
"learning_rate": 8.124029715937552e-05,
"loss": 0.0337,
"step": 4050
},
{
"epoch": 6.434231378763867,
"grad_norm": 0.23016807436943054,
"learning_rate": 8.113789237483224e-05,
"loss": 0.0297,
"step": 4060
},
{
"epoch": 6.450079239302694,
"grad_norm": 0.21488989889621735,
"learning_rate": 8.103527378286071e-05,
"loss": 0.0226,
"step": 4070
},
{
"epoch": 6.4659270998415215,
"grad_norm": 0.3006250262260437,
"learning_rate": 8.093244208808847e-05,
"loss": 0.0323,
"step": 4080
},
{
"epoch": 6.481774960380348,
"grad_norm": 0.31131377816200256,
"learning_rate": 8.082939799660641e-05,
"loss": 0.0263,
"step": 4090
},
{
"epoch": 6.497622820919176,
"grad_norm": 0.3602330982685089,
"learning_rate": 8.072614221596372e-05,
"loss": 0.0327,
"step": 4100
},
{
"epoch": 6.513470681458003,
"grad_norm": 0.24554632604122162,
"learning_rate": 8.062267545516323e-05,
"loss": 0.0307,
"step": 4110
},
{
"epoch": 6.5293185419968305,
"grad_norm": 0.3024232089519501,
"learning_rate": 8.05189984246564e-05,
"loss": 0.031,
"step": 4120
},
{
"epoch": 6.545166402535658,
"grad_norm": 0.20746688544750214,
"learning_rate": 8.041511183633855e-05,
"loss": 0.0296,
"step": 4130
},
{
"epoch": 6.561014263074485,
"grad_norm": 0.2613235414028168,
"learning_rate": 8.03110164035439e-05,
"loss": 0.0349,
"step": 4140
},
{
"epoch": 6.576862123613312,
"grad_norm": 0.41507190465927124,
"learning_rate": 8.020671284104072e-05,
"loss": 0.0377,
"step": 4150
},
{
"epoch": 6.5927099841521395,
"grad_norm": 0.2900952696800232,
"learning_rate": 8.010220186502635e-05,
"loss": 0.0296,
"step": 4160
},
{
"epoch": 6.608557844690967,
"grad_norm": 0.26226314902305603,
"learning_rate": 7.999748419312234e-05,
"loss": 0.0289,
"step": 4170
},
{
"epoch": 6.624405705229794,
"grad_norm": 0.3070898950099945,
"learning_rate": 7.989256054436956e-05,
"loss": 0.0298,
"step": 4180
},
{
"epoch": 6.640253565768621,
"grad_norm": 0.2827918231487274,
"learning_rate": 7.978743163922316e-05,
"loss": 0.0299,
"step": 4190
},
{
"epoch": 6.6561014263074485,
"grad_norm": 0.2928052842617035,
"learning_rate": 7.968209819954768e-05,
"loss": 0.0337,
"step": 4200
},
{
"epoch": 6.671949286846276,
"grad_norm": 0.23168888688087463,
"learning_rate": 7.957656094861214e-05,
"loss": 0.0334,
"step": 4210
},
{
"epoch": 6.687797147385103,
"grad_norm": 0.24511629343032837,
"learning_rate": 7.947082061108497e-05,
"loss": 0.0302,
"step": 4220
},
{
"epoch": 6.70364500792393,
"grad_norm": 0.24456819891929626,
"learning_rate": 7.93648779130291e-05,
"loss": 0.0311,
"step": 4230
},
{
"epoch": 6.7194928684627575,
"grad_norm": 0.26930612325668335,
"learning_rate": 7.925873358189699e-05,
"loss": 0.0291,
"step": 4240
},
{
"epoch": 6.735340729001585,
"grad_norm": 0.18482516705989838,
"learning_rate": 7.91523883465256e-05,
"loss": 0.0313,
"step": 4250
},
{
"epoch": 6.751188589540412,
"grad_norm": 0.36619842052459717,
"learning_rate": 7.904584293713134e-05,
"loss": 0.0298,
"step": 4260
},
{
"epoch": 6.767036450079239,
"grad_norm": 0.28840282559394836,
"learning_rate": 7.893909808530518e-05,
"loss": 0.0318,
"step": 4270
},
{
"epoch": 6.7828843106180665,
"grad_norm": 0.2239818572998047,
"learning_rate": 7.883215452400752e-05,
"loss": 0.0295,
"step": 4280
},
{
"epoch": 6.798732171156894,
"grad_norm": 0.21004091203212738,
"learning_rate": 7.872501298756319e-05,
"loss": 0.0284,
"step": 4290
},
{
"epoch": 6.814580031695721,
"grad_norm": 0.21372993290424347,
"learning_rate": 7.861767421165644e-05,
"loss": 0.031,
"step": 4300
},
{
"epoch": 6.830427892234549,
"grad_norm": 0.20823988318443298,
"learning_rate": 7.851013893332584e-05,
"loss": 0.0275,
"step": 4310
},
{
"epoch": 6.8462757527733755,
"grad_norm": 0.24077993631362915,
"learning_rate": 7.84024078909592e-05,
"loss": 0.0267,
"step": 4320
},
{
"epoch": 6.862123613312203,
"grad_norm": 0.29702138900756836,
"learning_rate": 7.82944818242886e-05,
"loss": 0.0293,
"step": 4330
},
{
"epoch": 6.87797147385103,
"grad_norm": 0.23424126207828522,
"learning_rate": 7.818636147438523e-05,
"loss": 0.0254,
"step": 4340
},
{
"epoch": 6.893819334389858,
"grad_norm": 0.28826698660850525,
"learning_rate": 7.807804758365431e-05,
"loss": 0.028,
"step": 4350
},
{
"epoch": 6.9096671949286845,
"grad_norm": 0.25839823484420776,
"learning_rate": 7.796954089583e-05,
"loss": 0.0339,
"step": 4360
},
{
"epoch": 6.925515055467512,
"grad_norm": 0.25523653626441956,
"learning_rate": 7.786084215597029e-05,
"loss": 0.0283,
"step": 4370
},
{
"epoch": 6.941362916006339,
"grad_norm": 0.23376896977424622,
"learning_rate": 7.775195211045193e-05,
"loss": 0.0287,
"step": 4380
},
{
"epoch": 6.957210776545167,
"grad_norm": 0.2951514720916748,
"learning_rate": 7.764287150696523e-05,
"loss": 0.0279,
"step": 4390
},
{
"epoch": 6.9730586370839935,
"grad_norm": 0.3112223446369171,
"learning_rate": 7.753360109450893e-05,
"loss": 0.0348,
"step": 4400
},
{
"epoch": 6.988906497622821,
"grad_norm": 0.3574570119380951,
"learning_rate": 7.742414162338519e-05,
"loss": 0.0315,
"step": 4410
},
{
"epoch": 7.004754358161648,
"grad_norm": 0.25105416774749756,
"learning_rate": 7.73144938451942e-05,
"loss": 0.0259,
"step": 4420
},
{
"epoch": 7.020602218700476,
"grad_norm": 0.313162624835968,
"learning_rate": 7.720465851282927e-05,
"loss": 0.0293,
"step": 4430
},
{
"epoch": 7.0364500792393025,
"grad_norm": 0.2756791412830353,
"learning_rate": 7.70946363804715e-05,
"loss": 0.032,
"step": 4440
},
{
"epoch": 7.05229793977813,
"grad_norm": 0.2672293484210968,
"learning_rate": 7.698442820358463e-05,
"loss": 0.0295,
"step": 4450
},
{
"epoch": 7.068145800316957,
"grad_norm": 0.27197128534317017,
"learning_rate": 7.687403473890988e-05,
"loss": 0.0329,
"step": 4460
},
{
"epoch": 7.083993660855785,
"grad_norm": 0.3267204761505127,
"learning_rate": 7.676345674446077e-05,
"loss": 0.0336,
"step": 4470
},
{
"epoch": 7.0998415213946116,
"grad_norm": 0.3577364683151245,
"learning_rate": 7.665269497951787e-05,
"loss": 0.0253,
"step": 4480
},
{
"epoch": 7.115689381933439,
"grad_norm": 0.25939124822616577,
"learning_rate": 7.65417502046236e-05,
"loss": 0.0257,
"step": 4490
},
{
"epoch": 7.131537242472266,
"grad_norm": 0.211978480219841,
"learning_rate": 7.6430623181577e-05,
"loss": 0.0276,
"step": 4500
},
{
"epoch": 7.147385103011094,
"grad_norm": 0.22676114737987518,
"learning_rate": 7.631931467342853e-05,
"loss": 0.0264,
"step": 4510
},
{
"epoch": 7.163232963549921,
"grad_norm": 0.3186163604259491,
"learning_rate": 7.620782544447483e-05,
"loss": 0.0312,
"step": 4520
},
{
"epoch": 7.179080824088748,
"grad_norm": 0.2680210769176483,
"learning_rate": 7.609615626025342e-05,
"loss": 0.0297,
"step": 4530
},
{
"epoch": 7.194928684627575,
"grad_norm": 0.25488680601119995,
"learning_rate": 7.598430788753748e-05,
"loss": 0.0309,
"step": 4540
},
{
"epoch": 7.210776545166403,
"grad_norm": 0.25716468691825867,
"learning_rate": 7.587228109433061e-05,
"loss": 0.0295,
"step": 4550
},
{
"epoch": 7.22662440570523,
"grad_norm": 0.17865824699401855,
"learning_rate": 7.576007664986149e-05,
"loss": 0.0275,
"step": 4560
},
{
"epoch": 7.242472266244057,
"grad_norm": 0.25337857007980347,
"learning_rate": 7.56476953245787e-05,
"loss": 0.0309,
"step": 4570
},
{
"epoch": 7.258320126782884,
"grad_norm": 0.23190538585186005,
"learning_rate": 7.553513789014531e-05,
"loss": 0.0326,
"step": 4580
},
{
"epoch": 7.274167987321712,
"grad_norm": 0.23697835206985474,
"learning_rate": 7.542240511943362e-05,
"loss": 0.0289,
"step": 4590
},
{
"epoch": 7.290015847860539,
"grad_norm": 0.19046033918857574,
"learning_rate": 7.530949778651995e-05,
"loss": 0.0272,
"step": 4600
},
{
"epoch": 7.305863708399366,
"grad_norm": 0.2411852329969406,
"learning_rate": 7.519641666667918e-05,
"loss": 0.0281,
"step": 4610
},
{
"epoch": 7.321711568938193,
"grad_norm": 0.2323843538761139,
"learning_rate": 7.508316253637951e-05,
"loss": 0.0286,
"step": 4620
},
{
"epoch": 7.337559429477021,
"grad_norm": 0.2985825538635254,
"learning_rate": 7.496973617327714e-05,
"loss": 0.027,
"step": 4630
},
{
"epoch": 7.353407290015848,
"grad_norm": 0.2772405743598938,
"learning_rate": 7.485613835621088e-05,
"loss": 0.0287,
"step": 4640
},
{
"epoch": 7.369255150554675,
"grad_norm": 0.28249087929725647,
"learning_rate": 7.474236986519679e-05,
"loss": 0.029,
"step": 4650
},
{
"epoch": 7.385103011093502,
"grad_norm": 0.2735413908958435,
"learning_rate": 7.462843148142292e-05,
"loss": 0.0285,
"step": 4660
},
{
"epoch": 7.40095087163233,
"grad_norm": 0.3959973454475403,
"learning_rate": 7.451432398724384e-05,
"loss": 0.0314,
"step": 4670
},
{
"epoch": 7.416798732171157,
"grad_norm": 0.23869942128658295,
"learning_rate": 7.440004816617533e-05,
"loss": 0.0302,
"step": 4680
},
{
"epoch": 7.432646592709984,
"grad_norm": 0.2646492123603821,
"learning_rate": 7.428560480288896e-05,
"loss": 0.0277,
"step": 4690
},
{
"epoch": 7.448494453248811,
"grad_norm": 0.23564158380031586,
"learning_rate": 7.417099468320676e-05,
"loss": 0.0284,
"step": 4700
},
{
"epoch": 7.464342313787639,
"grad_norm": 0.19051893055438995,
"learning_rate": 7.405621859409577e-05,
"loss": 0.031,
"step": 4710
},
{
"epoch": 7.480190174326466,
"grad_norm": 0.5017970204353333,
"learning_rate": 7.394127732366264e-05,
"loss": 0.028,
"step": 4720
},
{
"epoch": 7.496038034865293,
"grad_norm": 0.24149303138256073,
"learning_rate": 7.382617166114826e-05,
"loss": 0.0263,
"step": 4730
},
{
"epoch": 7.51188589540412,
"grad_norm": 0.2918100357055664,
"learning_rate": 7.371090239692228e-05,
"loss": 0.029,
"step": 4740
},
{
"epoch": 7.527733755942948,
"grad_norm": 0.41638660430908203,
"learning_rate": 7.359547032247773e-05,
"loss": 0.0279,
"step": 4750
},
{
"epoch": 7.543581616481775,
"grad_norm": 0.24228066205978394,
"learning_rate": 7.347987623042561e-05,
"loss": 0.0249,
"step": 4760
},
{
"epoch": 7.559429477020602,
"grad_norm": 0.3426589369773865,
"learning_rate": 7.336412091448936e-05,
"loss": 0.0291,
"step": 4770
},
{
"epoch": 7.575277337559429,
"grad_norm": 0.381527841091156,
"learning_rate": 7.324820516949946e-05,
"loss": 0.0329,
"step": 4780
},
{
"epoch": 7.591125198098257,
"grad_norm": 0.26290562748908997,
"learning_rate": 7.3132129791388e-05,
"loss": 0.0305,
"step": 4790
},
{
"epoch": 7.606973058637084,
"grad_norm": 0.28301799297332764,
"learning_rate": 7.301589557718315e-05,
"loss": 0.0224,
"step": 4800
},
{
"epoch": 7.622820919175911,
"grad_norm": 0.33471032977104187,
"learning_rate": 7.28995033250038e-05,
"loss": 0.0356,
"step": 4810
},
{
"epoch": 7.638668779714738,
"grad_norm": 0.219041109085083,
"learning_rate": 7.278295383405389e-05,
"loss": 0.0278,
"step": 4820
},
{
"epoch": 7.654516640253566,
"grad_norm": 0.27412205934524536,
"learning_rate": 7.266624790461713e-05,
"loss": 0.0271,
"step": 4830
},
{
"epoch": 7.6703645007923935,
"grad_norm": 0.27656254172325134,
"learning_rate": 7.254938633805137e-05,
"loss": 0.0296,
"step": 4840
},
{
"epoch": 7.68621236133122,
"grad_norm": 0.23747026920318604,
"learning_rate": 7.243236993678311e-05,
"loss": 0.0217,
"step": 4850
},
{
"epoch": 7.702060221870047,
"grad_norm": 0.29850152134895325,
"learning_rate": 7.231519950430212e-05,
"loss": 0.0297,
"step": 4860
},
{
"epoch": 7.717908082408875,
"grad_norm": 0.2872811555862427,
"learning_rate": 7.219787584515567e-05,
"loss": 0.0274,
"step": 4870
},
{
"epoch": 7.7337559429477025,
"grad_norm": 0.26487553119659424,
"learning_rate": 7.208039976494329e-05,
"loss": 0.0267,
"step": 4880
},
{
"epoch": 7.749603803486529,
"grad_norm": 0.32571732997894287,
"learning_rate": 7.196277207031103e-05,
"loss": 0.031,
"step": 4890
},
{
"epoch": 7.765451664025356,
"grad_norm": 0.2101273387670517,
"learning_rate": 7.184499356894606e-05,
"loss": 0.0261,
"step": 4900
},
{
"epoch": 7.781299524564184,
"grad_norm": 0.3179239332675934,
"learning_rate": 7.172706506957095e-05,
"loss": 0.0303,
"step": 4910
},
{
"epoch": 7.7971473851030115,
"grad_norm": 0.1984127014875412,
"learning_rate": 7.160898738193833e-05,
"loss": 0.0226,
"step": 4920
},
{
"epoch": 7.812995245641838,
"grad_norm": 0.19061654806137085,
"learning_rate": 7.149076131682521e-05,
"loss": 0.0219,
"step": 4930
},
{
"epoch": 7.828843106180665,
"grad_norm": 0.27196112275123596,
"learning_rate": 7.137238768602739e-05,
"loss": 0.0327,
"step": 4940
},
{
"epoch": 7.844690966719493,
"grad_norm": 0.2761131525039673,
"learning_rate": 7.125386730235395e-05,
"loss": 0.0258,
"step": 4950
},
{
"epoch": 7.8605388272583205,
"grad_norm": 0.22716206312179565,
"learning_rate": 7.113520097962165e-05,
"loss": 0.0306,
"step": 4960
},
{
"epoch": 7.876386687797147,
"grad_norm": 0.278010755777359,
"learning_rate": 7.101638953264933e-05,
"loss": 0.0261,
"step": 4970
},
{
"epoch": 7.892234548335974,
"grad_norm": 0.19748617708683014,
"learning_rate": 7.08974337772523e-05,
"loss": 0.0216,
"step": 4980
},
{
"epoch": 7.908082408874802,
"grad_norm": 0.35271981358528137,
"learning_rate": 7.077833453023678e-05,
"loss": 0.0236,
"step": 4990
},
{
"epoch": 7.9239302694136295,
"grad_norm": 0.33073899149894714,
"learning_rate": 7.065909260939429e-05,
"loss": 0.0274,
"step": 5000
},
{
"epoch": 7.939778129952456,
"grad_norm": 0.36262351274490356,
"learning_rate": 7.053970883349599e-05,
"loss": 0.0229,
"step": 5010
},
{
"epoch": 7.955625990491284,
"grad_norm": 0.4560012221336365,
"learning_rate": 7.04201840222871e-05,
"loss": 0.027,
"step": 5020
},
{
"epoch": 7.971473851030111,
"grad_norm": 0.3530636727809906,
"learning_rate": 7.03005189964812e-05,
"loss": 0.0307,
"step": 5030
},
{
"epoch": 7.9873217115689386,
"grad_norm": 0.2944605052471161,
"learning_rate": 7.018071457775474e-05,
"loss": 0.0254,
"step": 5040
},
{
"epoch": 8.003169572107765,
"grad_norm": 0.25718453526496887,
"learning_rate": 7.006077158874124e-05,
"loss": 0.0289,
"step": 5050
},
{
"epoch": 8.019017432646592,
"grad_norm": 0.23285925388336182,
"learning_rate": 6.994069085302573e-05,
"loss": 0.0278,
"step": 5060
},
{
"epoch": 8.03486529318542,
"grad_norm": 0.2729281485080719,
"learning_rate": 6.98204731951391e-05,
"loss": 0.0259,
"step": 5070
},
{
"epoch": 8.050713153724248,
"grad_norm": 0.2978493869304657,
"learning_rate": 6.970011944055234e-05,
"loss": 0.0231,
"step": 5080
},
{
"epoch": 8.066561014263074,
"grad_norm": 0.20820550620555878,
"learning_rate": 6.9579630415671e-05,
"loss": 0.0281,
"step": 5090
},
{
"epoch": 8.082408874801901,
"grad_norm": 0.23685221374034882,
"learning_rate": 6.945900694782949e-05,
"loss": 0.0251,
"step": 5100
},
{
"epoch": 8.09825673534073,
"grad_norm": 0.25722959637641907,
"learning_rate": 6.933824986528527e-05,
"loss": 0.0302,
"step": 5110
},
{
"epoch": 8.114104595879557,
"grad_norm": 0.28215500712394714,
"learning_rate": 6.921735999721338e-05,
"loss": 0.0218,
"step": 5120
},
{
"epoch": 8.129952456418383,
"grad_norm": 0.24379587173461914,
"learning_rate": 6.909633817370051e-05,
"loss": 0.0274,
"step": 5130
},
{
"epoch": 8.14580031695721,
"grad_norm": 0.295631468296051,
"learning_rate": 6.897518522573951e-05,
"loss": 0.0226,
"step": 5140
},
{
"epoch": 8.161648177496039,
"grad_norm": 0.24112898111343384,
"learning_rate": 6.885390198522356e-05,
"loss": 0.027,
"step": 5150
},
{
"epoch": 8.177496038034866,
"grad_norm": 0.2933104336261749,
"learning_rate": 6.873248928494046e-05,
"loss": 0.0257,
"step": 5160
},
{
"epoch": 8.193343898573692,
"grad_norm": 0.29547762870788574,
"learning_rate": 6.8610947958567e-05,
"loss": 0.0242,
"step": 5170
},
{
"epoch": 8.20919175911252,
"grad_norm": 0.27927926182746887,
"learning_rate": 6.848927884066311e-05,
"loss": 0.0257,
"step": 5180
},
{
"epoch": 8.225039619651348,
"grad_norm": 0.2721002697944641,
"learning_rate": 6.836748276666627e-05,
"loss": 0.0244,
"step": 5190
},
{
"epoch": 8.240887480190175,
"grad_norm": 0.25311270356178284,
"learning_rate": 6.824556057288563e-05,
"loss": 0.0279,
"step": 5200
},
{
"epoch": 8.256735340729001,
"grad_norm": 0.23902995884418488,
"learning_rate": 6.81235130964964e-05,
"loss": 0.0312,
"step": 5210
},
{
"epoch": 8.272583201267828,
"grad_norm": 0.30612844228744507,
"learning_rate": 6.8001341175534e-05,
"loss": 0.0357,
"step": 5220
},
{
"epoch": 8.288431061806657,
"grad_norm": 0.19130030274391174,
"learning_rate": 6.787904564888837e-05,
"loss": 0.0242,
"step": 5230
},
{
"epoch": 8.304278922345484,
"grad_norm": 0.2579098045825958,
"learning_rate": 6.775662735629816e-05,
"loss": 0.0329,
"step": 5240
},
{
"epoch": 8.32012678288431,
"grad_norm": 0.3037128150463104,
"learning_rate": 6.763408713834498e-05,
"loss": 0.0262,
"step": 5250
},
{
"epoch": 8.335974643423137,
"grad_norm": 0.2066265344619751,
"learning_rate": 6.751142583644767e-05,
"loss": 0.0311,
"step": 5260
},
{
"epoch": 8.351822503961966,
"grad_norm": 0.19183726608753204,
"learning_rate": 6.738864429285648e-05,
"loss": 0.0291,
"step": 5270
},
{
"epoch": 8.367670364500793,
"grad_norm": 0.2202986180782318,
"learning_rate": 6.72657433506473e-05,
"loss": 0.0224,
"step": 5280
},
{
"epoch": 8.38351822503962,
"grad_norm": 0.2542373538017273,
"learning_rate": 6.714272385371585e-05,
"loss": 0.0254,
"step": 5290
},
{
"epoch": 8.399366085578446,
"grad_norm": 0.33272790908813477,
"learning_rate": 6.701958664677191e-05,
"loss": 0.0245,
"step": 5300
},
{
"epoch": 8.415213946117275,
"grad_norm": 0.25956010818481445,
"learning_rate": 6.68963325753335e-05,
"loss": 0.0255,
"step": 5310
},
{
"epoch": 8.431061806656102,
"grad_norm": 0.314311683177948,
"learning_rate": 6.677296248572112e-05,
"loss": 0.0248,
"step": 5320
},
{
"epoch": 8.446909667194928,
"grad_norm": 0.28039562702178955,
"learning_rate": 6.664947722505188e-05,
"loss": 0.0282,
"step": 5330
},
{
"epoch": 8.462757527733755,
"grad_norm": 0.23970749974250793,
"learning_rate": 6.652587764123373e-05,
"loss": 0.0273,
"step": 5340
},
{
"epoch": 8.478605388272584,
"grad_norm": 0.1702006310224533,
"learning_rate": 6.640216458295958e-05,
"loss": 0.0291,
"step": 5350
},
{
"epoch": 8.49445324881141,
"grad_norm": 0.13902607560157776,
"learning_rate": 6.627833889970155e-05,
"loss": 0.0241,
"step": 5360
},
{
"epoch": 8.510301109350237,
"grad_norm": 0.2187580019235611,
"learning_rate": 6.615440144170502e-05,
"loss": 0.027,
"step": 5370
},
{
"epoch": 8.526148969889064,
"grad_norm": 0.2224210649728775,
"learning_rate": 6.603035305998301e-05,
"loss": 0.0235,
"step": 5380
},
{
"epoch": 8.541996830427893,
"grad_norm": 0.32996585965156555,
"learning_rate": 6.590619460631005e-05,
"loss": 0.0267,
"step": 5390
},
{
"epoch": 8.55784469096672,
"grad_norm": 0.31346139311790466,
"learning_rate": 6.578192693321656e-05,
"loss": 0.0194,
"step": 5400
},
{
"epoch": 8.573692551505546,
"grad_norm": 0.198611781001091,
"learning_rate": 6.565755089398285e-05,
"loss": 0.0256,
"step": 5410
},
{
"epoch": 8.589540412044373,
"grad_norm": 0.2415742725133896,
"learning_rate": 6.553306734263342e-05,
"loss": 0.0233,
"step": 5420
},
{
"epoch": 8.605388272583202,
"grad_norm": 0.3221810460090637,
"learning_rate": 6.540847713393088e-05,
"loss": 0.025,
"step": 5430
},
{
"epoch": 8.621236133122029,
"grad_norm": 0.17353218793869019,
"learning_rate": 6.528378112337031e-05,
"loss": 0.0229,
"step": 5440
},
{
"epoch": 8.637083993660855,
"grad_norm": 0.31122300028800964,
"learning_rate": 6.515898016717318e-05,
"loss": 0.0229,
"step": 5450
},
{
"epoch": 8.652931854199682,
"grad_norm": 0.27111196517944336,
"learning_rate": 6.50340751222816e-05,
"loss": 0.0329,
"step": 5460
},
{
"epoch": 8.66877971473851,
"grad_norm": 0.29258912801742554,
"learning_rate": 6.49090668463525e-05,
"loss": 0.0251,
"step": 5470
},
{
"epoch": 8.684627575277338,
"grad_norm": 0.23192371428012848,
"learning_rate": 6.478395619775145e-05,
"loss": 0.0294,
"step": 5480
},
{
"epoch": 8.700475435816164,
"grad_norm": 0.31985238194465637,
"learning_rate": 6.465874403554711e-05,
"loss": 0.0242,
"step": 5490
},
{
"epoch": 8.716323296354991,
"grad_norm": 0.23439311981201172,
"learning_rate": 6.453343121950513e-05,
"loss": 0.0267,
"step": 5500
},
{
"epoch": 8.73217115689382,
"grad_norm": 0.18457037210464478,
"learning_rate": 6.44080186100823e-05,
"loss": 0.0232,
"step": 5510
},
{
"epoch": 8.748019017432647,
"grad_norm": 0.2508156895637512,
"learning_rate": 6.428250706842064e-05,
"loss": 0.0365,
"step": 5520
},
{
"epoch": 8.763866877971473,
"grad_norm": 0.2573819160461426,
"learning_rate": 6.415689745634147e-05,
"loss": 0.029,
"step": 5530
},
{
"epoch": 8.7797147385103,
"grad_norm": 0.2110164314508438,
"learning_rate": 6.403119063633956e-05,
"loss": 0.0254,
"step": 5540
},
{
"epoch": 8.795562599049129,
"grad_norm": 0.3200654089450836,
"learning_rate": 6.390538747157706e-05,
"loss": 0.028,
"step": 5550
},
{
"epoch": 8.811410459587956,
"grad_norm": 0.2371603101491928,
"learning_rate": 6.377948882587777e-05,
"loss": 0.0217,
"step": 5560
},
{
"epoch": 8.827258320126782,
"grad_norm": 0.2176957130432129,
"learning_rate": 6.365349556372105e-05,
"loss": 0.0319,
"step": 5570
},
{
"epoch": 8.843106180665611,
"grad_norm": 0.2418396770954132,
"learning_rate": 6.352740855023594e-05,
"loss": 0.0258,
"step": 5580
},
{
"epoch": 8.858954041204438,
"grad_norm": 0.24693243205547333,
"learning_rate": 6.340122865119524e-05,
"loss": 0.0293,
"step": 5590
},
{
"epoch": 8.874801901743265,
"grad_norm": 0.249970942735672,
"learning_rate": 6.327495673300957e-05,
"loss": 0.0276,
"step": 5600
},
{
"epoch": 8.890649762282091,
"grad_norm": 0.21087859570980072,
"learning_rate": 6.314859366272132e-05,
"loss": 0.0234,
"step": 5610
},
{
"epoch": 8.906497622820918,
"grad_norm": 0.2701822817325592,
"learning_rate": 6.302214030799883e-05,
"loss": 0.022,
"step": 5620
},
{
"epoch": 8.922345483359747,
"grad_norm": 0.261089950799942,
"learning_rate": 6.28955975371304e-05,
"loss": 0.0264,
"step": 5630
},
{
"epoch": 8.938193343898574,
"grad_norm": 0.3843868672847748,
"learning_rate": 6.276896621901825e-05,
"loss": 0.0272,
"step": 5640
},
{
"epoch": 8.9540412044374,
"grad_norm": 0.3247261643409729,
"learning_rate": 6.26422472231726e-05,
"loss": 0.0275,
"step": 5650
},
{
"epoch": 8.969889064976229,
"grad_norm": 0.27681615948677063,
"learning_rate": 6.251544141970578e-05,
"loss": 0.0281,
"step": 5660
},
{
"epoch": 8.985736925515056,
"grad_norm": 0.255501925945282,
"learning_rate": 6.238854967932612e-05,
"loss": 0.0249,
"step": 5670
},
{
"epoch": 9.001584786053883,
"grad_norm": 0.2693521976470947,
"learning_rate": 6.2261572873332e-05,
"loss": 0.0202,
"step": 5680
},
{
"epoch": 9.01743264659271,
"grad_norm": 0.21597042679786682,
"learning_rate": 6.213451187360601e-05,
"loss": 0.0238,
"step": 5690
},
{
"epoch": 9.033280507131538,
"grad_norm": 0.3910636007785797,
"learning_rate": 6.200736755260877e-05,
"loss": 0.023,
"step": 5700
},
{
"epoch": 9.049128367670365,
"grad_norm": 0.22803229093551636,
"learning_rate": 6.188014078337305e-05,
"loss": 0.0227,
"step": 5710
},
{
"epoch": 9.064976228209192,
"grad_norm": 0.22921766340732574,
"learning_rate": 6.175283243949772e-05,
"loss": 0.0225,
"step": 5720
},
{
"epoch": 9.080824088748018,
"grad_norm": 0.2634933590888977,
"learning_rate": 6.162544339514183e-05,
"loss": 0.0304,
"step": 5730
},
{
"epoch": 9.096671949286847,
"grad_norm": 0.5331051349639893,
"learning_rate": 6.149797452501851e-05,
"loss": 0.0282,
"step": 5740
},
{
"epoch": 9.112519809825674,
"grad_norm": 0.2564757466316223,
"learning_rate": 6.137042670438907e-05,
"loss": 0.0262,
"step": 5750
},
{
"epoch": 9.1283676703645,
"grad_norm": 0.24122044444084167,
"learning_rate": 6.124280080905685e-05,
"loss": 0.0243,
"step": 5760
},
{
"epoch": 9.144215530903328,
"grad_norm": 0.20856255292892456,
"learning_rate": 6.111509771536138e-05,
"loss": 0.0255,
"step": 5770
},
{
"epoch": 9.160063391442156,
"grad_norm": 0.39979806542396545,
"learning_rate": 6.098731830017217e-05,
"loss": 0.0281,
"step": 5780
},
{
"epoch": 9.175911251980983,
"grad_norm": 0.16420406103134155,
"learning_rate": 6.0859463440882866e-05,
"loss": 0.0217,
"step": 5790
},
{
"epoch": 9.19175911251981,
"grad_norm": 0.25281447172164917,
"learning_rate": 6.073153401540512e-05,
"loss": 0.0279,
"step": 5800
},
{
"epoch": 9.207606973058637,
"grad_norm": 0.25699812173843384,
"learning_rate": 6.060353090216261e-05,
"loss": 0.0258,
"step": 5810
},
{
"epoch": 9.223454833597465,
"grad_norm": 0.19040873646736145,
"learning_rate": 6.0475454980084945e-05,
"loss": 0.0233,
"step": 5820
},
{
"epoch": 9.239302694136292,
"grad_norm": 0.21894507110118866,
"learning_rate": 6.0347307128601716e-05,
"loss": 0.0203,
"step": 5830
},
{
"epoch": 9.255150554675119,
"grad_norm": 0.35552018880844116,
"learning_rate": 6.021908822763641e-05,
"loss": 0.0238,
"step": 5840
},
{
"epoch": 9.270998415213946,
"grad_norm": 0.328046053647995,
"learning_rate": 6.0090799157600354e-05,
"loss": 0.0249,
"step": 5850
},
{
"epoch": 9.286846275752774,
"grad_norm": 0.23552384972572327,
"learning_rate": 5.996244079938671e-05,
"loss": 0.0236,
"step": 5860
},
{
"epoch": 9.302694136291601,
"grad_norm": 0.2591778337955475,
"learning_rate": 5.983401403436437e-05,
"loss": 0.0248,
"step": 5870
},
{
"epoch": 9.318541996830428,
"grad_norm": 0.16465957462787628,
"learning_rate": 5.970551974437198e-05,
"loss": 0.0208,
"step": 5880
},
{
"epoch": 9.334389857369255,
"grad_norm": 0.25457292795181274,
"learning_rate": 5.957695881171184e-05,
"loss": 0.033,
"step": 5890
},
{
"epoch": 9.350237717908083,
"grad_norm": 0.19111283123493195,
"learning_rate": 5.944833211914382e-05,
"loss": 0.0318,
"step": 5900
},
{
"epoch": 9.36608557844691,
"grad_norm": 0.30721551179885864,
"learning_rate": 5.931964054987935e-05,
"loss": 0.0224,
"step": 5910
},
{
"epoch": 9.381933438985737,
"grad_norm": 0.25978097319602966,
"learning_rate": 5.9190884987575336e-05,
"loss": 0.0251,
"step": 5920
},
{
"epoch": 9.397781299524564,
"grad_norm": 0.2720729112625122,
"learning_rate": 5.906206631632807e-05,
"loss": 0.025,
"step": 5930
},
{
"epoch": 9.413629160063392,
"grad_norm": 0.26405835151672363,
"learning_rate": 5.8933185420667217e-05,
"loss": 0.0266,
"step": 5940
},
{
"epoch": 9.429477020602219,
"grad_norm": 0.27683427929878235,
"learning_rate": 5.880424318554967e-05,
"loss": 0.0256,
"step": 5950
},
{
"epoch": 9.445324881141046,
"grad_norm": 0.2533441185951233,
"learning_rate": 5.867524049635352e-05,
"loss": 0.0255,
"step": 5960
},
{
"epoch": 9.461172741679873,
"grad_norm": 0.3351084589958191,
"learning_rate": 5.854617823887196e-05,
"loss": 0.0257,
"step": 5970
},
{
"epoch": 9.477020602218701,
"grad_norm": 0.2585383951663971,
"learning_rate": 5.841705729930721e-05,
"loss": 0.0257,
"step": 5980
},
{
"epoch": 9.492868462757528,
"grad_norm": 0.2588648796081543,
"learning_rate": 5.828787856426444e-05,
"loss": 0.0226,
"step": 5990
},
{
"epoch": 9.508716323296355,
"grad_norm": 0.2622322738170624,
"learning_rate": 5.8158642920745655e-05,
"loss": 0.0221,
"step": 6000
},
{
"epoch": 9.524564183835182,
"grad_norm": 0.23283162713050842,
"learning_rate": 5.802935125614361e-05,
"loss": 0.0177,
"step": 6010
},
{
"epoch": 9.54041204437401,
"grad_norm": 0.265953928232193,
"learning_rate": 5.790000445823576e-05,
"loss": 0.0237,
"step": 6020
},
{
"epoch": 9.556259904912837,
"grad_norm": 0.23547948896884918,
"learning_rate": 5.777060341517811e-05,
"loss": 0.0254,
"step": 6030
},
{
"epoch": 9.572107765451664,
"grad_norm": 0.3150040805339813,
"learning_rate": 5.764114901549914e-05,
"loss": 0.0298,
"step": 6040
},
{
"epoch": 9.58795562599049,
"grad_norm": 0.23534265160560608,
"learning_rate": 5.7511642148093704e-05,
"loss": 0.0208,
"step": 6050
},
{
"epoch": 9.60380348652932,
"grad_norm": 0.2798217833042145,
"learning_rate": 5.7382083702216925e-05,
"loss": 0.0264,
"step": 6060
},
{
"epoch": 9.619651347068146,
"grad_norm": 0.2324879914522171,
"learning_rate": 5.725247456747809e-05,
"loss": 0.0315,
"step": 6070
},
{
"epoch": 9.635499207606973,
"grad_norm": 0.25599566102027893,
"learning_rate": 5.7122815633834506e-05,
"loss": 0.0227,
"step": 6080
},
{
"epoch": 9.6513470681458,
"grad_norm": 0.1766338348388672,
"learning_rate": 5.699310779158551e-05,
"loss": 0.0222,
"step": 6090
},
{
"epoch": 9.667194928684628,
"grad_norm": 0.2305234670639038,
"learning_rate": 5.686335193136616e-05,
"loss": 0.0229,
"step": 6100
},
{
"epoch": 9.683042789223455,
"grad_norm": 0.24864676594734192,
"learning_rate": 5.673354894414129e-05,
"loss": 0.0259,
"step": 6110
},
{
"epoch": 9.698890649762282,
"grad_norm": 0.25202295184135437,
"learning_rate": 5.660369972119933e-05,
"loss": 0.0237,
"step": 6120
},
{
"epoch": 9.714738510301109,
"grad_norm": 0.32556819915771484,
"learning_rate": 5.6473805154146174e-05,
"loss": 0.02,
"step": 6130
},
{
"epoch": 9.730586370839937,
"grad_norm": 0.2521624267101288,
"learning_rate": 5.634386613489908e-05,
"loss": 0.0242,
"step": 6140
},
{
"epoch": 9.746434231378764,
"grad_norm": 0.25148093700408936,
"learning_rate": 5.6213883555680516e-05,
"loss": 0.0269,
"step": 6150
},
{
"epoch": 9.76228209191759,
"grad_norm": 0.22112874686717987,
"learning_rate": 5.608385830901206e-05,
"loss": 0.0285,
"step": 6160
},
{
"epoch": 9.778129952456418,
"grad_norm": 0.33593472838401794,
"learning_rate": 5.5953791287708254e-05,
"loss": 0.03,
"step": 6170
},
{
"epoch": 9.793977812995246,
"grad_norm": 0.306130975484848,
"learning_rate": 5.5823683384870554e-05,
"loss": 0.0244,
"step": 6180
},
{
"epoch": 9.809825673534073,
"grad_norm": 0.3085562288761139,
"learning_rate": 5.569353549388103e-05,
"loss": 0.027,
"step": 6190
},
{
"epoch": 9.8256735340729,
"grad_norm": 0.2247430682182312,
"learning_rate": 5.556334850839637e-05,
"loss": 0.0234,
"step": 6200
},
{
"epoch": 9.841521394611727,
"grad_norm": 0.26314494013786316,
"learning_rate": 5.543312332234174e-05,
"loss": 0.024,
"step": 6210
},
{
"epoch": 9.857369255150555,
"grad_norm": 0.22496825456619263,
"learning_rate": 5.530286082990454e-05,
"loss": 0.0194,
"step": 6220
},
{
"epoch": 9.873217115689382,
"grad_norm": 0.29987284541130066,
"learning_rate": 5.5172561925528386e-05,
"loss": 0.0252,
"step": 6230
},
{
"epoch": 9.889064976228209,
"grad_norm": 0.3042098581790924,
"learning_rate": 5.5042227503906894e-05,
"loss": 0.0246,
"step": 6240
},
{
"epoch": 9.904912836767036,
"grad_norm": 0.22687886655330658,
"learning_rate": 5.491185845997757e-05,
"loss": 0.026,
"step": 6250
},
{
"epoch": 9.920760697305864,
"grad_norm": 0.2479943484067917,
"learning_rate": 5.478145568891562e-05,
"loss": 0.0289,
"step": 6260
},
{
"epoch": 9.936608557844691,
"grad_norm": 0.20297874510288239,
"learning_rate": 5.465102008612789e-05,
"loss": 0.0233,
"step": 6270
},
{
"epoch": 9.952456418383518,
"grad_norm": 0.17246457934379578,
"learning_rate": 5.452055254724664e-05,
"loss": 0.0253,
"step": 6280
},
{
"epoch": 9.968304278922346,
"grad_norm": 0.24328118562698364,
"learning_rate": 5.4390053968123386e-05,
"loss": 0.025,
"step": 6290
},
{
"epoch": 9.984152139461173,
"grad_norm": 0.18752968311309814,
"learning_rate": 5.425952524482283e-05,
"loss": 0.024,
"step": 6300
},
{
"epoch": 10.0,
"grad_norm": 0.18232440948486328,
"learning_rate": 5.4128967273616625e-05,
"loss": 0.0241,
"step": 6310
},
{
"epoch": 10.015847860538827,
"grad_norm": 0.22801880538463593,
"learning_rate": 5.3998380950977266e-05,
"loss": 0.0209,
"step": 6320
},
{
"epoch": 10.031695721077655,
"grad_norm": 0.21135802567005157,
"learning_rate": 5.386776717357193e-05,
"loss": 0.0234,
"step": 6330
},
{
"epoch": 10.047543581616482,
"grad_norm": 0.2743472754955292,
"learning_rate": 5.373712683825629e-05,
"loss": 0.0237,
"step": 6340
},
{
"epoch": 10.063391442155309,
"grad_norm": 0.2664951682090759,
"learning_rate": 5.3606460842068426e-05,
"loss": 0.0249,
"step": 6350
},
{
"epoch": 10.079239302694136,
"grad_norm": 0.20999731123447418,
"learning_rate": 5.347577008222253e-05,
"loss": 0.0244,
"step": 6360
},
{
"epoch": 10.095087163232964,
"grad_norm": 0.18719319999217987,
"learning_rate": 5.334505545610293e-05,
"loss": 0.0239,
"step": 6370
},
{
"epoch": 10.110935023771791,
"grad_norm": 0.17207162082195282,
"learning_rate": 5.321431786125778e-05,
"loss": 0.0218,
"step": 6380
},
{
"epoch": 10.126782884310618,
"grad_norm": 0.21071314811706543,
"learning_rate": 5.3083558195392936e-05,
"loss": 0.021,
"step": 6390
},
{
"epoch": 10.142630744849445,
"grad_norm": 0.21377994120121002,
"learning_rate": 5.295277735636583e-05,
"loss": 0.0226,
"step": 6400
},
{
"epoch": 10.158478605388273,
"grad_norm": 0.16608726978302002,
"learning_rate": 5.282197624217928e-05,
"loss": 0.0227,
"step": 6410
},
{
"epoch": 10.1743264659271,
"grad_norm": 0.19757942855358124,
"learning_rate": 5.2691155750975316e-05,
"loss": 0.0196,
"step": 6420
},
{
"epoch": 10.190174326465927,
"grad_norm": 0.1993936449289322,
"learning_rate": 5.2560316781029005e-05,
"loss": 0.0199,
"step": 6430
},
{
"epoch": 10.206022187004754,
"grad_norm": 0.20808455348014832,
"learning_rate": 5.2429460230742346e-05,
"loss": 0.0214,
"step": 6440
},
{
"epoch": 10.221870047543582,
"grad_norm": 0.1672813892364502,
"learning_rate": 5.2298586998637956e-05,
"loss": 0.0243,
"step": 6450
},
{
"epoch": 10.23771790808241,
"grad_norm": 0.26778897643089294,
"learning_rate": 5.216769798335311e-05,
"loss": 0.025,
"step": 6460
},
{
"epoch": 10.253565768621236,
"grad_norm": 0.22870604693889618,
"learning_rate": 5.203679408363341e-05,
"loss": 0.021,
"step": 6470
},
{
"epoch": 10.269413629160063,
"grad_norm": 0.2953716516494751,
"learning_rate": 5.190587619832664e-05,
"loss": 0.0215,
"step": 6480
},
{
"epoch": 10.285261489698891,
"grad_norm": 0.3255462944507599,
"learning_rate": 5.1774945226376624e-05,
"loss": 0.0166,
"step": 6490
},
{
"epoch": 10.301109350237718,
"grad_norm": 0.17969000339508057,
"learning_rate": 5.1644002066817063e-05,
"loss": 0.0205,
"step": 6500
},
{
"epoch": 10.316957210776545,
"grad_norm": 0.2460571676492691,
"learning_rate": 5.151304761876536e-05,
"loss": 0.0201,
"step": 6510
},
{
"epoch": 10.332805071315372,
"grad_norm": 0.178553506731987,
"learning_rate": 5.1382082781416396e-05,
"loss": 0.0203,
"step": 6520
},
{
"epoch": 10.3486529318542,
"grad_norm": 0.18054994940757751,
"learning_rate": 5.125110845403638e-05,
"loss": 0.0204,
"step": 6530
},
{
"epoch": 10.364500792393027,
"grad_norm": 0.2226029634475708,
"learning_rate": 5.112012553595671e-05,
"loss": 0.0202,
"step": 6540
},
{
"epoch": 10.380348652931854,
"grad_norm": 0.23070666193962097,
"learning_rate": 5.0989134926567785e-05,
"loss": 0.0205,
"step": 6550
},
{
"epoch": 10.39619651347068,
"grad_norm": 0.1447778195142746,
"learning_rate": 5.085813752531278e-05,
"loss": 0.0273,
"step": 6560
},
{
"epoch": 10.41204437400951,
"grad_norm": 0.18221695721149445,
"learning_rate": 5.072713423168154e-05,
"loss": 0.0196,
"step": 6570
},
{
"epoch": 10.427892234548336,
"grad_norm": 0.2584993839263916,
"learning_rate": 5.0596125945204334e-05,
"loss": 0.0205,
"step": 6580
},
{
"epoch": 10.443740095087163,
"grad_norm": 0.19126753509044647,
"learning_rate": 5.046511356544574e-05,
"loss": 0.0226,
"step": 6590
},
{
"epoch": 10.45958795562599,
"grad_norm": 0.19277669489383698,
"learning_rate": 5.033409799199844e-05,
"loss": 0.0195,
"step": 6600
},
{
"epoch": 10.475435816164818,
"grad_norm": 0.22546206414699554,
"learning_rate": 5.020308012447704e-05,
"loss": 0.022,
"step": 6610
},
{
"epoch": 10.491283676703645,
"grad_norm": 0.26715290546417236,
"learning_rate": 5.0072060862511893e-05,
"loss": 0.0232,
"step": 6620
},
{
"epoch": 10.507131537242472,
"grad_norm": 0.23546898365020752,
"learning_rate": 4.994104110574295e-05,
"loss": 0.0233,
"step": 6630
},
{
"epoch": 10.522979397781299,
"grad_norm": 0.38194459676742554,
"learning_rate": 4.981002175381352e-05,
"loss": 0.0266,
"step": 6640
},
{
"epoch": 10.538827258320127,
"grad_norm": 0.17723363637924194,
"learning_rate": 4.9679003706364185e-05,
"loss": 0.0249,
"step": 6650
},
{
"epoch": 10.554675118858954,
"grad_norm": 0.30575594305992126,
"learning_rate": 4.9547987863026507e-05,
"loss": 0.0268,
"step": 6660
},
{
"epoch": 10.570522979397781,
"grad_norm": 0.2724224328994751,
"learning_rate": 4.9416975123416966e-05,
"loss": 0.0216,
"step": 6670
},
{
"epoch": 10.586370839936608,
"grad_norm": 0.3302716910839081,
"learning_rate": 4.92859663871307e-05,
"loss": 0.0222,
"step": 6680
},
{
"epoch": 10.602218700475436,
"grad_norm": 0.182839035987854,
"learning_rate": 4.915496255373537e-05,
"loss": 0.0241,
"step": 6690
},
{
"epoch": 10.618066561014263,
"grad_norm": 0.18011973798274994,
"learning_rate": 4.902396452276498e-05,
"loss": 0.0166,
"step": 6700
},
{
"epoch": 10.63391442155309,
"grad_norm": 0.2910979688167572,
"learning_rate": 4.8892973193713684e-05,
"loss": 0.0268,
"step": 6710
},
{
"epoch": 10.649762282091917,
"grad_norm": 0.20945270359516144,
"learning_rate": 4.876198946602963e-05,
"loss": 0.0243,
"step": 6720
},
{
"epoch": 10.665610142630745,
"grad_norm": 0.2104242444038391,
"learning_rate": 4.86310142391087e-05,
"loss": 0.0217,
"step": 6730
},
{
"epoch": 10.681458003169572,
"grad_norm": 0.22012865543365479,
"learning_rate": 4.850004841228852e-05,
"loss": 0.0187,
"step": 6740
},
{
"epoch": 10.697305863708399,
"grad_norm": 0.252900093793869,
"learning_rate": 4.836909288484208e-05,
"loss": 0.0284,
"step": 6750
},
{
"epoch": 10.713153724247226,
"grad_norm": 0.2362486571073532,
"learning_rate": 4.8238148555971704e-05,
"loss": 0.0178,
"step": 6760
},
{
"epoch": 10.729001584786054,
"grad_norm": 0.28352028131484985,
"learning_rate": 4.81072163248028e-05,
"loss": 0.0281,
"step": 6770
},
{
"epoch": 10.744849445324881,
"grad_norm": 0.31054121255874634,
"learning_rate": 4.7976297090377706e-05,
"loss": 0.0271,
"step": 6780
},
{
"epoch": 10.760697305863708,
"grad_norm": 0.15438808500766754,
"learning_rate": 4.7845391751649505e-05,
"loss": 0.0256,
"step": 6790
},
{
"epoch": 10.776545166402535,
"grad_norm": 0.17651043832302094,
"learning_rate": 4.7714501207475884e-05,
"loss": 0.0218,
"step": 6800
},
{
"epoch": 10.792393026941363,
"grad_norm": 0.2993830740451813,
"learning_rate": 4.7583626356612954e-05,
"loss": 0.0219,
"step": 6810
},
{
"epoch": 10.80824088748019,
"grad_norm": 0.21443192660808563,
"learning_rate": 4.745276809770905e-05,
"loss": 0.0198,
"step": 6820
},
{
"epoch": 10.824088748019017,
"grad_norm": 0.22990483045578003,
"learning_rate": 4.732192732929858e-05,
"loss": 0.024,
"step": 6830
},
{
"epoch": 10.839936608557844,
"grad_norm": 0.2523830831050873,
"learning_rate": 4.7191104949795845e-05,
"loss": 0.02,
"step": 6840
},
{
"epoch": 10.855784469096672,
"grad_norm": 0.19074945151805878,
"learning_rate": 4.706030185748894e-05,
"loss": 0.0235,
"step": 6850
},
{
"epoch": 10.8716323296355,
"grad_norm": 0.17805525660514832,
"learning_rate": 4.692951895053342e-05,
"loss": 0.024,
"step": 6860
},
{
"epoch": 10.887480190174326,
"grad_norm": 0.25457364320755005,
"learning_rate": 4.6798757126946324e-05,
"loss": 0.0225,
"step": 6870
},
{
"epoch": 10.903328050713153,
"grad_norm": 0.2769658863544464,
"learning_rate": 4.6668017284599866e-05,
"loss": 0.0186,
"step": 6880
},
{
"epoch": 10.919175911251982,
"grad_norm": 0.27840906381607056,
"learning_rate": 4.653730032121539e-05,
"loss": 0.0213,
"step": 6890
},
{
"epoch": 10.935023771790808,
"grad_norm": 0.31035539507865906,
"learning_rate": 4.640660713435709e-05,
"loss": 0.022,
"step": 6900
},
{
"epoch": 10.950871632329635,
"grad_norm": 0.2523256540298462,
"learning_rate": 4.627593862142594e-05,
"loss": 0.0261,
"step": 6910
},
{
"epoch": 10.966719492868462,
"grad_norm": 0.2741487920284271,
"learning_rate": 4.61452956796534e-05,
"loss": 0.0243,
"step": 6920
},
{
"epoch": 10.98256735340729,
"grad_norm": 0.18995286524295807,
"learning_rate": 4.601467920609547e-05,
"loss": 0.0261,
"step": 6930
},
{
"epoch": 10.998415213946117,
"grad_norm": 0.33396896719932556,
"learning_rate": 4.588409009762634e-05,
"loss": 0.0268,
"step": 6940
},
{
"epoch": 11.014263074484944,
"grad_norm": 0.2645708918571472,
"learning_rate": 4.575352925093229e-05,
"loss": 0.0221,
"step": 6950
},
{
"epoch": 11.030110935023771,
"grad_norm": 0.21601872146129608,
"learning_rate": 4.562299756250557e-05,
"loss": 0.0197,
"step": 6960
},
{
"epoch": 11.0459587955626,
"grad_norm": 0.26823803782463074,
"learning_rate": 4.549249592863822e-05,
"loss": 0.0318,
"step": 6970
},
{
"epoch": 11.061806656101426,
"grad_norm": 0.40468984842300415,
"learning_rate": 4.536202524541588e-05,
"loss": 0.0201,
"step": 6980
},
{
"epoch": 11.077654516640253,
"grad_norm": 0.2228170931339264,
"learning_rate": 4.5231586408711684e-05,
"loss": 0.0232,
"step": 6990
},
{
"epoch": 11.09350237717908,
"grad_norm": 0.17821644246578217,
"learning_rate": 4.510118031418009e-05,
"loss": 0.0193,
"step": 7000
},
{
"epoch": 11.109350237717909,
"grad_norm": 0.22201032936573029,
"learning_rate": 4.4970807857250745e-05,
"loss": 0.0235,
"step": 7010
},
{
"epoch": 11.125198098256735,
"grad_norm": 0.16020157933235168,
"learning_rate": 4.4840469933122314e-05,
"loss": 0.0206,
"step": 7020
},
{
"epoch": 11.141045958795562,
"grad_norm": 0.18815340101718903,
"learning_rate": 4.471016743675633e-05,
"loss": 0.0202,
"step": 7030
},
{
"epoch": 11.15689381933439,
"grad_norm": 0.2237204611301422,
"learning_rate": 4.457990126287112e-05,
"loss": 0.021,
"step": 7040
},
{
"epoch": 11.172741679873218,
"grad_norm": 0.2936099171638489,
"learning_rate": 4.444967230593551e-05,
"loss": 0.0203,
"step": 7050
},
{
"epoch": 11.188589540412044,
"grad_norm": 0.1436583399772644,
"learning_rate": 4.431948146016286e-05,
"loss": 0.0197,
"step": 7060
},
{
"epoch": 11.204437400950871,
"grad_norm": 0.2675095796585083,
"learning_rate": 4.418932961950478e-05,
"loss": 0.02,
"step": 7070
},
{
"epoch": 11.2202852614897,
"grad_norm": 0.23882818222045898,
"learning_rate": 4.405921767764511e-05,
"loss": 0.0217,
"step": 7080
},
{
"epoch": 11.236133122028527,
"grad_norm": 0.2709539830684662,
"learning_rate": 4.392914652799368e-05,
"loss": 0.0209,
"step": 7090
},
{
"epoch": 11.251980982567353,
"grad_norm": 0.18802231550216675,
"learning_rate": 4.3799117063680254e-05,
"loss": 0.0173,
"step": 7100
},
{
"epoch": 11.26782884310618,
"grad_norm": 0.25173911452293396,
"learning_rate": 4.366913017754836e-05,
"loss": 0.0228,
"step": 7110
},
{
"epoch": 11.283676703645009,
"grad_norm": 0.2181670218706131,
"learning_rate": 4.3539186762149106e-05,
"loss": 0.016,
"step": 7120
},
{
"epoch": 11.299524564183836,
"grad_norm": 0.18725943565368652,
"learning_rate": 4.3409287709735204e-05,
"loss": 0.0234,
"step": 7130
},
{
"epoch": 11.315372424722662,
"grad_norm": 0.3149115741252899,
"learning_rate": 4.3279433912254675e-05,
"loss": 0.0213,
"step": 7140
},
{
"epoch": 11.33122028526149,
"grad_norm": 0.2042395919561386,
"learning_rate": 4.314962626134484e-05,
"loss": 0.0206,
"step": 7150
},
{
"epoch": 11.347068145800318,
"grad_norm": 0.14478328824043274,
"learning_rate": 4.301986564832613e-05,
"loss": 0.0203,
"step": 7160
},
{
"epoch": 11.362916006339145,
"grad_norm": 0.20697103440761566,
"learning_rate": 4.289015296419603e-05,
"loss": 0.0156,
"step": 7170
},
{
"epoch": 11.378763866877971,
"grad_norm": 0.2516174912452698,
"learning_rate": 4.276048909962286e-05,
"loss": 0.021,
"step": 7180
},
{
"epoch": 11.394611727416798,
"grad_norm": 0.30749985575675964,
"learning_rate": 4.263087494493977e-05,
"loss": 0.0189,
"step": 7190
},
{
"epoch": 11.410459587955627,
"grad_norm": 0.2317238450050354,
"learning_rate": 4.2501311390138574e-05,
"loss": 0.0245,
"step": 7200
},
{
"epoch": 11.426307448494454,
"grad_norm": 0.24530279636383057,
"learning_rate": 4.2371799324863614e-05,
"loss": 0.0185,
"step": 7210
},
{
"epoch": 11.44215530903328,
"grad_norm": 0.16856257617473602,
"learning_rate": 4.224233963840574e-05,
"loss": 0.0223,
"step": 7220
},
{
"epoch": 11.458003169572107,
"grad_norm": 0.15289132297039032,
"learning_rate": 4.2112933219696106e-05,
"loss": 0.0157,
"step": 7230
},
{
"epoch": 11.473851030110936,
"grad_norm": 0.17484936118125916,
"learning_rate": 4.198358095730006e-05,
"loss": 0.0212,
"step": 7240
},
{
"epoch": 11.489698890649763,
"grad_norm": 0.18419259786605835,
"learning_rate": 4.185428373941115e-05,
"loss": 0.0207,
"step": 7250
},
{
"epoch": 11.50554675118859,
"grad_norm": 0.2928980588912964,
"learning_rate": 4.172504245384496e-05,
"loss": 0.0217,
"step": 7260
},
{
"epoch": 11.521394611727416,
"grad_norm": 0.19275160133838654,
"learning_rate": 4.1595857988033e-05,
"loss": 0.0194,
"step": 7270
},
{
"epoch": 11.537242472266245,
"grad_norm": 0.3847340941429138,
"learning_rate": 4.146673122901662e-05,
"loss": 0.0199,
"step": 7280
},
{
"epoch": 11.553090332805072,
"grad_norm": 0.25312259793281555,
"learning_rate": 4.1337663063440946e-05,
"loss": 0.0174,
"step": 7290
},
{
"epoch": 11.568938193343898,
"grad_norm": 0.274879515171051,
"learning_rate": 4.120865437754877e-05,
"loss": 0.0238,
"step": 7300
},
{
"epoch": 11.584786053882725,
"grad_norm": 0.22004622220993042,
"learning_rate": 4.1079706057174455e-05,
"loss": 0.0231,
"step": 7310
},
{
"epoch": 11.600633914421554,
"grad_norm": 0.4630294740200043,
"learning_rate": 4.095081898773787e-05,
"loss": 0.022,
"step": 7320
},
{
"epoch": 11.61648177496038,
"grad_norm": 0.15254133939743042,
"learning_rate": 4.0821994054238325e-05,
"loss": 0.0218,
"step": 7330
},
{
"epoch": 11.632329635499207,
"grad_norm": 0.18909721076488495,
"learning_rate": 4.069323214124845e-05,
"loss": 0.0241,
"step": 7340
},
{
"epoch": 11.648177496038034,
"grad_norm": 0.18203580379486084,
"learning_rate": 4.0564534132908164e-05,
"loss": 0.0206,
"step": 7350
},
{
"epoch": 11.664025356576863,
"grad_norm": 0.31021520495414734,
"learning_rate": 4.04359009129186e-05,
"loss": 0.0229,
"step": 7360
},
{
"epoch": 11.67987321711569,
"grad_norm": 0.21043580770492554,
"learning_rate": 4.0307333364535973e-05,
"loss": 0.0243,
"step": 7370
},
{
"epoch": 11.695721077654516,
"grad_norm": 0.17714616656303406,
"learning_rate": 4.017883237056561e-05,
"loss": 0.02,
"step": 7380
},
{
"epoch": 11.711568938193343,
"grad_norm": 0.23153972625732422,
"learning_rate": 4.005039881335583e-05,
"loss": 0.0178,
"step": 7390
},
{
"epoch": 11.727416798732172,
"grad_norm": 0.7659839391708374,
"learning_rate": 3.99220335747919e-05,
"loss": 0.0213,
"step": 7400
},
{
"epoch": 11.743264659270999,
"grad_norm": 0.2092520147562027,
"learning_rate": 3.979373753628999e-05,
"loss": 0.023,
"step": 7410
},
{
"epoch": 11.759112519809825,
"grad_norm": 0.3415199816226959,
"learning_rate": 3.9665511578791096e-05,
"loss": 0.021,
"step": 7420
},
{
"epoch": 11.774960380348652,
"grad_norm": 0.31222307682037354,
"learning_rate": 3.9537356582755034e-05,
"loss": 0.0214,
"step": 7430
},
{
"epoch": 11.79080824088748,
"grad_norm": 0.18112266063690186,
"learning_rate": 3.940927342815428e-05,
"loss": 0.0234,
"step": 7440
},
{
"epoch": 11.806656101426308,
"grad_norm": 0.28897473216056824,
"learning_rate": 3.9281262994468114e-05,
"loss": 0.0258,
"step": 7450
},
{
"epoch": 11.822503961965134,
"grad_norm": 0.28549882769584656,
"learning_rate": 3.915332616067643e-05,
"loss": 0.0188,
"step": 7460
},
{
"epoch": 11.838351822503961,
"grad_norm": 0.19967828691005707,
"learning_rate": 3.9025463805253765e-05,
"loss": 0.0201,
"step": 7470
},
{
"epoch": 11.85419968304279,
"grad_norm": 0.27357855439186096,
"learning_rate": 3.889767680616324e-05,
"loss": 0.0193,
"step": 7480
},
{
"epoch": 11.870047543581617,
"grad_norm": 0.202061265707016,
"learning_rate": 3.8769966040850566e-05,
"loss": 0.0188,
"step": 7490
},
{
"epoch": 11.885895404120443,
"grad_norm": 0.24488794803619385,
"learning_rate": 3.864233238623796e-05,
"loss": 0.0177,
"step": 7500
},
{
"epoch": 11.90174326465927,
"grad_norm": 0.23348113894462585,
"learning_rate": 3.851477671871818e-05,
"loss": 0.0189,
"step": 7510
},
{
"epoch": 11.917591125198099,
"grad_norm": 0.31944724917411804,
"learning_rate": 3.838729991414852e-05,
"loss": 0.0211,
"step": 7520
},
{
"epoch": 11.933438985736926,
"grad_norm": 0.24721786379814148,
"learning_rate": 3.82599028478447e-05,
"loss": 0.0159,
"step": 7530
},
{
"epoch": 11.949286846275752,
"grad_norm": 0.2412160336971283,
"learning_rate": 3.8132586394574974e-05,
"loss": 0.0231,
"step": 7540
},
{
"epoch": 11.96513470681458,
"grad_norm": 0.2842359244823456,
"learning_rate": 3.8005351428554036e-05,
"loss": 0.0179,
"step": 7550
},
{
"epoch": 11.980982567353408,
"grad_norm": 0.19113971292972565,
"learning_rate": 3.78781988234371e-05,
"loss": 0.0178,
"step": 7560
},
{
"epoch": 11.996830427892235,
"grad_norm": 0.24129873514175415,
"learning_rate": 3.775112945231377e-05,
"loss": 0.0214,
"step": 7570
},
{
"epoch": 12.012678288431061,
"grad_norm": 0.30563119053840637,
"learning_rate": 3.7624144187702174e-05,
"loss": 0.0207,
"step": 7580
},
{
"epoch": 12.028526148969888,
"grad_norm": 0.16946931183338165,
"learning_rate": 3.7497243901542934e-05,
"loss": 0.0194,
"step": 7590
},
{
"epoch": 12.044374009508717,
"grad_norm": 0.23966370522975922,
"learning_rate": 3.7370429465193154e-05,
"loss": 0.0198,
"step": 7600
},
{
"epoch": 12.060221870047544,
"grad_norm": 0.2549941837787628,
"learning_rate": 3.724370174942047e-05,
"loss": 0.023,
"step": 7610
},
{
"epoch": 12.07606973058637,
"grad_norm": 0.2220945656299591,
"learning_rate": 3.711706162439704e-05,
"loss": 0.0174,
"step": 7620
},
{
"epoch": 12.091917591125197,
"grad_norm": 0.16276349127292633,
"learning_rate": 3.699050995969354e-05,
"loss": 0.0192,
"step": 7630
},
{
"epoch": 12.107765451664026,
"grad_norm": 0.3065180778503418,
"learning_rate": 3.6864047624273325e-05,
"loss": 0.019,
"step": 7640
},
{
"epoch": 12.123613312202853,
"grad_norm": 0.19206896424293518,
"learning_rate": 3.67376754864863e-05,
"loss": 0.0149,
"step": 7650
},
{
"epoch": 12.13946117274168,
"grad_norm": 0.21416613459587097,
"learning_rate": 3.6611394414063074e-05,
"loss": 0.0179,
"step": 7660
},
{
"epoch": 12.155309033280506,
"grad_norm": 0.2737729251384735,
"learning_rate": 3.6485205274108936e-05,
"loss": 0.0235,
"step": 7670
},
{
"epoch": 12.171156893819335,
"grad_norm": 0.17268019914627075,
"learning_rate": 3.635910893309792e-05,
"loss": 0.0162,
"step": 7680
},
{
"epoch": 12.187004754358162,
"grad_norm": 0.23836471140384674,
"learning_rate": 3.6233106256866895e-05,
"loss": 0.0174,
"step": 7690
},
{
"epoch": 12.202852614896988,
"grad_norm": 0.447587788105011,
"learning_rate": 3.610719811060952e-05,
"loss": 0.0189,
"step": 7700
},
{
"epoch": 12.218700475435817,
"grad_norm": 0.21118977665901184,
"learning_rate": 3.598138535887041e-05,
"loss": 0.0183,
"step": 7710
},
{
"epoch": 12.234548335974644,
"grad_norm": 0.257715106010437,
"learning_rate": 3.585566886553917e-05,
"loss": 0.0209,
"step": 7720
},
{
"epoch": 12.25039619651347,
"grad_norm": 0.295749694108963,
"learning_rate": 3.5730049493844405e-05,
"loss": 0.0261,
"step": 7730
},
{
"epoch": 12.266244057052297,
"grad_norm": 0.3179740607738495,
"learning_rate": 3.560452810634787e-05,
"loss": 0.0214,
"step": 7740
},
{
"epoch": 12.282091917591124,
"grad_norm": 0.1746010035276413,
"learning_rate": 3.547910556493852e-05,
"loss": 0.0208,
"step": 7750
},
{
"epoch": 12.297939778129953,
"grad_norm": 0.2330365628004074,
"learning_rate": 3.535378273082656e-05,
"loss": 0.0208,
"step": 7760
},
{
"epoch": 12.31378763866878,
"grad_norm": 0.39738985896110535,
"learning_rate": 3.5228560464537535e-05,
"loss": 0.0239,
"step": 7770
},
{
"epoch": 12.329635499207606,
"grad_norm": 0.2947781980037689,
"learning_rate": 3.510343962590653e-05,
"loss": 0.0191,
"step": 7780
},
{
"epoch": 12.345483359746435,
"grad_norm": 0.21791400015354156,
"learning_rate": 3.49784210740721e-05,
"loss": 0.0264,
"step": 7790
},
{
"epoch": 12.361331220285262,
"grad_norm": 0.19092513620853424,
"learning_rate": 3.485350566747049e-05,
"loss": 0.0248,
"step": 7800
},
{
"epoch": 12.377179080824089,
"grad_norm": 0.35505980253219604,
"learning_rate": 3.4728694263829684e-05,
"loss": 0.0199,
"step": 7810
},
{
"epoch": 12.393026941362915,
"grad_norm": 0.1710539311170578,
"learning_rate": 3.460398772016355e-05,
"loss": 0.019,
"step": 7820
},
{
"epoch": 12.408874801901744,
"grad_norm": 0.33750495314598083,
"learning_rate": 3.4479386892765905e-05,
"loss": 0.0205,
"step": 7830
},
{
"epoch": 12.42472266244057,
"grad_norm": 0.2829129099845886,
"learning_rate": 3.43548926372047e-05,
"loss": 0.0198,
"step": 7840
},
{
"epoch": 12.440570522979398,
"grad_norm": 0.18969641625881195,
"learning_rate": 3.423050580831611e-05,
"loss": 0.0205,
"step": 7850
},
{
"epoch": 12.456418383518225,
"grad_norm": 0.2330506592988968,
"learning_rate": 3.410622726019865e-05,
"loss": 0.0213,
"step": 7860
},
{
"epoch": 12.472266244057053,
"grad_norm": 0.2536896765232086,
"learning_rate": 3.398205784620735e-05,
"loss": 0.0207,
"step": 7870
},
{
"epoch": 12.48811410459588,
"grad_norm": 0.16537010669708252,
"learning_rate": 3.3857998418947864e-05,
"loss": 0.0169,
"step": 7880
},
{
"epoch": 12.503961965134707,
"grad_norm": 0.2565062344074249,
"learning_rate": 3.373404983027062e-05,
"loss": 0.0214,
"step": 7890
},
{
"epoch": 12.519809825673534,
"grad_norm": 0.18320074677467346,
"learning_rate": 3.361021293126497e-05,
"loss": 0.0166,
"step": 7900
},
{
"epoch": 12.535657686212362,
"grad_norm": 0.2510707378387451,
"learning_rate": 3.3486488572253385e-05,
"loss": 0.0173,
"step": 7910
},
{
"epoch": 12.551505546751189,
"grad_norm": 0.24890565872192383,
"learning_rate": 3.3362877602785524e-05,
"loss": 0.0196,
"step": 7920
},
{
"epoch": 12.567353407290016,
"grad_norm": 0.25348639488220215,
"learning_rate": 3.3239380871632543e-05,
"loss": 0.0201,
"step": 7930
},
{
"epoch": 12.583201267828843,
"grad_norm": 0.2547270655632019,
"learning_rate": 3.3115999226781135e-05,
"loss": 0.0163,
"step": 7940
},
{
"epoch": 12.599049128367671,
"grad_norm": 0.1903742551803589,
"learning_rate": 3.299273351542773e-05,
"loss": 0.0162,
"step": 7950
},
{
"epoch": 12.614896988906498,
"grad_norm": 0.14592960476875305,
"learning_rate": 3.286958458397273e-05,
"loss": 0.0218,
"step": 7960
},
{
"epoch": 12.630744849445325,
"grad_norm": 0.220992311835289,
"learning_rate": 3.27465532780147e-05,
"loss": 0.0193,
"step": 7970
},
{
"epoch": 12.646592709984152,
"grad_norm": 0.3510618209838867,
"learning_rate": 3.2623640442344505e-05,
"loss": 0.021,
"step": 7980
},
{
"epoch": 12.66244057052298,
"grad_norm": 0.1398414969444275,
"learning_rate": 3.250084692093953e-05,
"loss": 0.0199,
"step": 7990
},
{
"epoch": 12.678288431061807,
"grad_norm": 0.24324694275856018,
"learning_rate": 3.237817355695791e-05,
"loss": 0.0172,
"step": 8000
},
{
"epoch": 12.694136291600634,
"grad_norm": 0.20084106922149658,
"learning_rate": 3.225562119273272e-05,
"loss": 0.0134,
"step": 8010
},
{
"epoch": 12.70998415213946,
"grad_norm": 0.20435374975204468,
"learning_rate": 3.213319066976617e-05,
"loss": 0.019,
"step": 8020
},
{
"epoch": 12.72583201267829,
"grad_norm": 0.21612811088562012,
"learning_rate": 3.201088282872387e-05,
"loss": 0.0159,
"step": 8030
},
{
"epoch": 12.741679873217116,
"grad_norm": 0.2342618703842163,
"learning_rate": 3.188869850942905e-05,
"loss": 0.0186,
"step": 8040
},
{
"epoch": 12.757527733755943,
"grad_norm": 0.20277902483940125,
"learning_rate": 3.176663855085677e-05,
"loss": 0.0209,
"step": 8050
},
{
"epoch": 12.77337559429477,
"grad_norm": 0.2995304763317108,
"learning_rate": 3.164470379112816e-05,
"loss": 0.0247,
"step": 8060
},
{
"epoch": 12.789223454833598,
"grad_norm": 0.23769770562648773,
"learning_rate": 3.15228950675047e-05,
"loss": 0.0152,
"step": 8070
},
{
"epoch": 12.805071315372425,
"grad_norm": 0.1370396465063095,
"learning_rate": 3.140121321638241e-05,
"loss": 0.0177,
"step": 8080
},
{
"epoch": 12.820919175911252,
"grad_norm": 0.4313637614250183,
"learning_rate": 3.127965907328617e-05,
"loss": 0.0154,
"step": 8090
},
{
"epoch": 12.836767036450079,
"grad_norm": 0.2073371410369873,
"learning_rate": 3.115823347286397e-05,
"loss": 0.0165,
"step": 8100
},
{
"epoch": 12.852614896988907,
"grad_norm": 0.32266175746917725,
"learning_rate": 3.103693724888112e-05,
"loss": 0.0212,
"step": 8110
},
{
"epoch": 12.868462757527734,
"grad_norm": 0.1806778460741043,
"learning_rate": 3.091577123421462e-05,
"loss": 0.0145,
"step": 8120
},
{
"epoch": 12.88431061806656,
"grad_norm": 0.25016674399375916,
"learning_rate": 3.079473626084737e-05,
"loss": 0.0211,
"step": 8130
},
{
"epoch": 12.900158478605388,
"grad_norm": 0.16698500514030457,
"learning_rate": 3.067383315986249e-05,
"loss": 0.0228,
"step": 8140
},
{
"epoch": 12.916006339144216,
"grad_norm": 0.22536715865135193,
"learning_rate": 3.055306276143754e-05,
"loss": 0.0213,
"step": 8150
},
{
"epoch": 12.931854199683043,
"grad_norm": 0.17826388776302338,
"learning_rate": 3.0432425894838977e-05,
"loss": 0.023,
"step": 8160
},
{
"epoch": 12.94770206022187,
"grad_norm": 0.22973258793354034,
"learning_rate": 3.031192338841631e-05,
"loss": 0.0188,
"step": 8170
},
{
"epoch": 12.963549920760697,
"grad_norm": 0.3207305669784546,
"learning_rate": 3.0191556069596476e-05,
"loss": 0.0199,
"step": 8180
},
{
"epoch": 12.979397781299525,
"grad_norm": 0.19772501289844513,
"learning_rate": 3.0071324764878155e-05,
"loss": 0.0177,
"step": 8190
},
{
"epoch": 12.995245641838352,
"grad_norm": 0.19332300126552582,
"learning_rate": 2.99512302998261e-05,
"loss": 0.0243,
"step": 8200
},
{
"epoch": 13.011093502377179,
"grad_norm": 0.22696681320667267,
"learning_rate": 2.9831273499065422e-05,
"loss": 0.0178,
"step": 8210
},
{
"epoch": 13.026941362916006,
"grad_norm": 0.2711600065231323,
"learning_rate": 2.9711455186275998e-05,
"loss": 0.0149,
"step": 8220
},
{
"epoch": 13.042789223454834,
"grad_norm": 0.22301819920539856,
"learning_rate": 2.959177618418678e-05,
"loss": 0.0201,
"step": 8230
},
{
"epoch": 13.058637083993661,
"grad_norm": 0.1777944713830948,
"learning_rate": 2.9472237314570134e-05,
"loss": 0.0187,
"step": 8240
},
{
"epoch": 13.074484944532488,
"grad_norm": 0.24867452681064606,
"learning_rate": 2.935283939823621e-05,
"loss": 0.0217,
"step": 8250
},
{
"epoch": 13.090332805071315,
"grad_norm": 0.24219559133052826,
"learning_rate": 2.9233583255027313e-05,
"loss": 0.013,
"step": 8260
},
{
"epoch": 13.106180665610143,
"grad_norm": 0.14742301404476166,
"learning_rate": 2.9114469703812292e-05,
"loss": 0.0199,
"step": 8270
},
{
"epoch": 13.12202852614897,
"grad_norm": 0.167776420712471,
"learning_rate": 2.8995499562480842e-05,
"loss": 0.0183,
"step": 8280
},
{
"epoch": 13.137876386687797,
"grad_norm": 0.29319486021995544,
"learning_rate": 2.8876673647937945e-05,
"loss": 0.0208,
"step": 8290
},
{
"epoch": 13.153724247226624,
"grad_norm": 0.1555861234664917,
"learning_rate": 2.875799277609832e-05,
"loss": 0.0194,
"step": 8300
},
{
"epoch": 13.169572107765452,
"grad_norm": 0.1766081005334854,
"learning_rate": 2.863945776188065e-05,
"loss": 0.0182,
"step": 8310
},
{
"epoch": 13.185419968304279,
"grad_norm": 0.2022436112165451,
"learning_rate": 2.8521069419202195e-05,
"loss": 0.0161,
"step": 8320
},
{
"epoch": 13.201267828843106,
"grad_norm": 0.1649257242679596,
"learning_rate": 2.840282856097304e-05,
"loss": 0.0168,
"step": 8330
},
{
"epoch": 13.217115689381933,
"grad_norm": 0.24146905541419983,
"learning_rate": 2.828473599909055e-05,
"loss": 0.0178,
"step": 8340
},
{
"epoch": 13.232963549920761,
"grad_norm": 0.20440474152565002,
"learning_rate": 2.8166792544433894e-05,
"loss": 0.0251,
"step": 8350
},
{
"epoch": 13.248811410459588,
"grad_norm": 0.21215130388736725,
"learning_rate": 2.8048999006858323e-05,
"loss": 0.0225,
"step": 8360
},
{
"epoch": 13.264659270998415,
"grad_norm": 0.17490635812282562,
"learning_rate": 2.7931356195189735e-05,
"loss": 0.0151,
"step": 8370
},
{
"epoch": 13.280507131537242,
"grad_norm": 0.2777180075645447,
"learning_rate": 2.781386491721908e-05,
"loss": 0.0178,
"step": 8380
},
{
"epoch": 13.29635499207607,
"grad_norm": 0.23932000994682312,
"learning_rate": 2.7696525979696752e-05,
"loss": 0.0147,
"step": 8390
},
{
"epoch": 13.312202852614897,
"grad_norm": 0.19922451674938202,
"learning_rate": 2.7579340188327186e-05,
"loss": 0.0168,
"step": 8400
},
{
"epoch": 13.328050713153724,
"grad_norm": 0.2395889014005661,
"learning_rate": 2.7462308347763127e-05,
"loss": 0.017,
"step": 8410
},
{
"epoch": 13.343898573692552,
"grad_norm": 0.23529374599456787,
"learning_rate": 2.7345431261600317e-05,
"loss": 0.0197,
"step": 8420
},
{
"epoch": 13.35974643423138,
"grad_norm": 0.2671940326690674,
"learning_rate": 2.7228709732371886e-05,
"loss": 0.0155,
"step": 8430
},
{
"epoch": 13.375594294770206,
"grad_norm": 0.2091439962387085,
"learning_rate": 2.7112144561542757e-05,
"loss": 0.0205,
"step": 8440
},
{
"epoch": 13.391442155309033,
"grad_norm": 0.20118452608585358,
"learning_rate": 2.6995736549504315e-05,
"loss": 0.015,
"step": 8450
},
{
"epoch": 13.407290015847861,
"grad_norm": 0.15710382163524628,
"learning_rate": 2.687948649556874e-05,
"loss": 0.0192,
"step": 8460
},
{
"epoch": 13.423137876386688,
"grad_norm": 0.22499555349349976,
"learning_rate": 2.6763395197963626e-05,
"loss": 0.0268,
"step": 8470
},
{
"epoch": 13.438985736925515,
"grad_norm": 0.17233209311962128,
"learning_rate": 2.6647463453826505e-05,
"loss": 0.0191,
"step": 8480
},
{
"epoch": 13.454833597464342,
"grad_norm": 0.28862184286117554,
"learning_rate": 2.6531692059199275e-05,
"loss": 0.0196,
"step": 8490
},
{
"epoch": 13.47068145800317,
"grad_norm": 0.19401662051677704,
"learning_rate": 2.6416081809022887e-05,
"loss": 0.0171,
"step": 8500
},
{
"epoch": 13.486529318541997,
"grad_norm": 0.21995659172534943,
"learning_rate": 2.6300633497131687e-05,
"loss": 0.0195,
"step": 8510
},
{
"epoch": 13.502377179080824,
"grad_norm": 0.2321847379207611,
"learning_rate": 2.618534791624816e-05,
"loss": 0.0209,
"step": 8520
},
{
"epoch": 13.51822503961965,
"grad_norm": 0.21036501228809357,
"learning_rate": 2.6070225857977428e-05,
"loss": 0.0204,
"step": 8530
},
{
"epoch": 13.53407290015848,
"grad_norm": 0.2640347480773926,
"learning_rate": 2.5955268112801656e-05,
"loss": 0.0158,
"step": 8540
},
{
"epoch": 13.549920760697306,
"grad_norm": 0.30468320846557617,
"learning_rate": 2.58404754700749e-05,
"loss": 0.0151,
"step": 8550
},
{
"epoch": 13.565768621236133,
"grad_norm": 0.19475166499614716,
"learning_rate": 2.5725848718017454e-05,
"loss": 0.0194,
"step": 8560
},
{
"epoch": 13.58161648177496,
"grad_norm": 0.18407198786735535,
"learning_rate": 2.561138864371057e-05,
"loss": 0.017,
"step": 8570
},
{
"epoch": 13.597464342313788,
"grad_norm": 0.197821244597435,
"learning_rate": 2.549709603309104e-05,
"loss": 0.0192,
"step": 8580
},
{
"epoch": 13.613312202852615,
"grad_norm": 0.19414368271827698,
"learning_rate": 2.53829716709457e-05,
"loss": 0.0161,
"step": 8590
},
{
"epoch": 13.629160063391442,
"grad_norm": 0.32657763361930847,
"learning_rate": 2.5269016340906138e-05,
"loss": 0.0193,
"step": 8600
},
{
"epoch": 13.645007923930269,
"grad_norm": 0.17926651239395142,
"learning_rate": 2.5155230825443332e-05,
"loss": 0.0172,
"step": 8610
},
{
"epoch": 13.660855784469097,
"grad_norm": 0.1641903668642044,
"learning_rate": 2.504161590586217e-05,
"loss": 0.0171,
"step": 8620
},
{
"epoch": 13.676703645007924,
"grad_norm": 0.23365381360054016,
"learning_rate": 2.4928172362296205e-05,
"loss": 0.0149,
"step": 8630
},
{
"epoch": 13.692551505546751,
"grad_norm": 0.2839002311229706,
"learning_rate": 2.4814900973702183e-05,
"loss": 0.0198,
"step": 8640
},
{
"epoch": 13.708399366085578,
"grad_norm": 0.233973428606987,
"learning_rate": 2.4701802517854822e-05,
"loss": 0.022,
"step": 8650
},
{
"epoch": 13.724247226624406,
"grad_norm": 0.2717144191265106,
"learning_rate": 2.458887777134134e-05,
"loss": 0.0199,
"step": 8660
},
{
"epoch": 13.740095087163233,
"grad_norm": 0.2552318274974823,
"learning_rate": 2.44761275095562e-05,
"loss": 0.019,
"step": 8670
},
{
"epoch": 13.75594294770206,
"grad_norm": 0.17286346852779388,
"learning_rate": 2.4363552506695814e-05,
"loss": 0.0182,
"step": 8680
},
{
"epoch": 13.771790808240887,
"grad_norm": 0.1892533153295517,
"learning_rate": 2.4251153535753107e-05,
"loss": 0.0212,
"step": 8690
},
{
"epoch": 13.787638668779715,
"grad_norm": 0.15570400655269623,
"learning_rate": 2.4138931368512375e-05,
"loss": 0.0178,
"step": 8700
},
{
"epoch": 13.803486529318542,
"grad_norm": 0.287626177072525,
"learning_rate": 2.402688677554381e-05,
"loss": 0.0166,
"step": 8710
},
{
"epoch": 13.819334389857369,
"grad_norm": 0.3084344267845154,
"learning_rate": 2.3915020526198373e-05,
"loss": 0.0148,
"step": 8720
},
{
"epoch": 13.835182250396196,
"grad_norm": 0.13890209794044495,
"learning_rate": 2.3803333388602372e-05,
"loss": 0.0158,
"step": 8730
},
{
"epoch": 13.851030110935024,
"grad_norm": 0.24919134378433228,
"learning_rate": 2.3691826129652267e-05,
"loss": 0.0202,
"step": 8740
},
{
"epoch": 13.866877971473851,
"grad_norm": 0.19362711906433105,
"learning_rate": 2.3580499515009408e-05,
"loss": 0.0186,
"step": 8750
},
{
"epoch": 13.882725832012678,
"grad_norm": 0.23859569430351257,
"learning_rate": 2.346935430909476e-05,
"loss": 0.018,
"step": 8760
},
{
"epoch": 13.898573692551505,
"grad_norm": 0.41652438044548035,
"learning_rate": 2.335839127508359e-05,
"loss": 0.018,
"step": 8770
},
{
"epoch": 13.914421553090333,
"grad_norm": 0.19404253363609314,
"learning_rate": 2.3247611174900375e-05,
"loss": 0.0144,
"step": 8780
},
{
"epoch": 13.93026941362916,
"grad_norm": 0.27209949493408203,
"learning_rate": 2.3137014769213415e-05,
"loss": 0.0181,
"step": 8790
},
{
"epoch": 13.946117274167987,
"grad_norm": 0.15419328212738037,
"learning_rate": 2.3026602817429677e-05,
"loss": 0.0176,
"step": 8800
},
{
"epoch": 13.961965134706814,
"grad_norm": 0.22414186596870422,
"learning_rate": 2.291637607768964e-05,
"loss": 0.0224,
"step": 8810
},
{
"epoch": 13.977812995245642,
"grad_norm": 0.16095861792564392,
"learning_rate": 2.280633530686195e-05,
"loss": 0.0152,
"step": 8820
},
{
"epoch": 13.99366085578447,
"grad_norm": 0.1415528804063797,
"learning_rate": 2.2696481260538393e-05,
"loss": 0.0156,
"step": 8830
},
{
"epoch": 14.009508716323296,
"grad_norm": 0.1570771187543869,
"learning_rate": 2.2586814693028524e-05,
"loss": 0.0173,
"step": 8840
},
{
"epoch": 14.025356576862123,
"grad_norm": 0.2337312251329422,
"learning_rate": 2.247733635735466e-05,
"loss": 0.0197,
"step": 8850
},
{
"epoch": 14.041204437400951,
"grad_norm": 0.2519458532333374,
"learning_rate": 2.2368047005246585e-05,
"loss": 0.0177,
"step": 8860
},
{
"epoch": 14.057052297939778,
"grad_norm": 0.26522183418273926,
"learning_rate": 2.2258947387136415e-05,
"loss": 0.0192,
"step": 8870
},
{
"epoch": 14.072900158478605,
"grad_norm": 0.12336030602455139,
"learning_rate": 2.2150038252153533e-05,
"loss": 0.0175,
"step": 8880
},
{
"epoch": 14.088748019017432,
"grad_norm": 0.15576300024986267,
"learning_rate": 2.204132034811929e-05,
"loss": 0.0174,
"step": 8890
},
{
"epoch": 14.10459587955626,
"grad_norm": 0.21424925327301025,
"learning_rate": 2.1932794421542018e-05,
"loss": 0.0142,
"step": 8900
},
{
"epoch": 14.120443740095087,
"grad_norm": 0.21682120859622955,
"learning_rate": 2.182446121761186e-05,
"loss": 0.0191,
"step": 8910
},
{
"epoch": 14.136291600633914,
"grad_norm": 0.25047534704208374,
"learning_rate": 2.171632148019552e-05,
"loss": 0.0154,
"step": 8920
},
{
"epoch": 14.152139461172741,
"grad_norm": 0.2971823513507843,
"learning_rate": 2.1608375951831383e-05,
"loss": 0.0227,
"step": 8930
},
{
"epoch": 14.16798732171157,
"grad_norm": 0.2523512542247772,
"learning_rate": 2.1500625373724286e-05,
"loss": 0.0144,
"step": 8940
},
{
"epoch": 14.183835182250396,
"grad_norm": 0.21813775599002838,
"learning_rate": 2.1393070485740386e-05,
"loss": 0.0154,
"step": 8950
},
{
"epoch": 14.199683042789223,
"grad_norm": 0.2209501713514328,
"learning_rate": 2.1285712026402215e-05,
"loss": 0.0137,
"step": 8960
},
{
"epoch": 14.21553090332805,
"grad_norm": 0.1733659952878952,
"learning_rate": 2.117855073288346e-05,
"loss": 0.0133,
"step": 8970
},
{
"epoch": 14.231378763866879,
"grad_norm": 0.19718633592128754,
"learning_rate": 2.1071587341004058e-05,
"loss": 0.0212,
"step": 8980
},
{
"epoch": 14.247226624405705,
"grad_norm": 0.23138895630836487,
"learning_rate": 2.0964822585224987e-05,
"loss": 0.0218,
"step": 8990
},
{
"epoch": 14.263074484944532,
"grad_norm": 0.22604243457317352,
"learning_rate": 2.08582571986433e-05,
"loss": 0.0165,
"step": 9000
},
{
"epoch": 14.278922345483359,
"grad_norm": 0.21740014851093292,
"learning_rate": 2.075189191298716e-05,
"loss": 0.018,
"step": 9010
},
{
"epoch": 14.294770206022188,
"grad_norm": 0.5042977333068848,
"learning_rate": 2.0645727458610646e-05,
"loss": 0.015,
"step": 9020
},
{
"epoch": 14.310618066561014,
"grad_norm": 0.17162521183490753,
"learning_rate": 2.0539764564488927e-05,
"loss": 0.0147,
"step": 9030
},
{
"epoch": 14.326465927099841,
"grad_norm": 0.23630589246749878,
"learning_rate": 2.04340039582131e-05,
"loss": 0.0168,
"step": 9040
},
{
"epoch": 14.342313787638668,
"grad_norm": 0.22610369324684143,
"learning_rate": 2.0328446365985253e-05,
"loss": 0.019,
"step": 9050
},
{
"epoch": 14.358161648177497,
"grad_norm": 0.23171366751194,
"learning_rate": 2.022309251261355e-05,
"loss": 0.0185,
"step": 9060
},
{
"epoch": 14.374009508716323,
"grad_norm": 0.20405028760433197,
"learning_rate": 2.0117943121507117e-05,
"loss": 0.018,
"step": 9070
},
{
"epoch": 14.38985736925515,
"grad_norm": 0.20171862840652466,
"learning_rate": 2.0012998914671182e-05,
"loss": 0.0156,
"step": 9080
},
{
"epoch": 14.405705229793977,
"grad_norm": 0.2580902874469757,
"learning_rate": 1.99082606127021e-05,
"loss": 0.018,
"step": 9090
},
{
"epoch": 14.421553090332806,
"grad_norm": 0.16781866550445557,
"learning_rate": 1.9803728934782323e-05,
"loss": 0.0178,
"step": 9100
},
{
"epoch": 14.437400950871632,
"grad_norm": 0.21224135160446167,
"learning_rate": 1.969940459867562e-05,
"loss": 0.0169,
"step": 9110
},
{
"epoch": 14.45324881141046,
"grad_norm": 0.16903094947338104,
"learning_rate": 1.9595288320721923e-05,
"loss": 0.0138,
"step": 9120
},
{
"epoch": 14.469096671949288,
"grad_norm": 0.2130252569913864,
"learning_rate": 1.949138081583265e-05,
"loss": 0.0175,
"step": 9130
},
{
"epoch": 14.484944532488115,
"grad_norm": 0.2133990377187729,
"learning_rate": 1.938768279748566e-05,
"loss": 0.0169,
"step": 9140
},
{
"epoch": 14.500792393026941,
"grad_norm": 0.19141750037670135,
"learning_rate": 1.9284194977720344e-05,
"loss": 0.0139,
"step": 9150
},
{
"epoch": 14.516640253565768,
"grad_norm": 0.18053506314754486,
"learning_rate": 1.9180918067132813e-05,
"loss": 0.0202,
"step": 9160
},
{
"epoch": 14.532488114104595,
"grad_norm": 0.2015606015920639,
"learning_rate": 1.9077852774870945e-05,
"loss": 0.0188,
"step": 9170
},
{
"epoch": 14.548335974643424,
"grad_norm": 0.2063121348619461,
"learning_rate": 1.8974999808629545e-05,
"loss": 0.0141,
"step": 9180
},
{
"epoch": 14.56418383518225,
"grad_norm": 0.14588534832000732,
"learning_rate": 1.887235987464553e-05,
"loss": 0.0147,
"step": 9190
},
{
"epoch": 14.580031695721077,
"grad_norm": 0.17593805491924286,
"learning_rate": 1.876993367769297e-05,
"loss": 0.0139,
"step": 9200
},
{
"epoch": 14.595879556259906,
"grad_norm": 0.15790753066539764,
"learning_rate": 1.8667721921078397e-05,
"loss": 0.0123,
"step": 9210
},
{
"epoch": 14.611727416798733,
"grad_norm": 0.23879548907279968,
"learning_rate": 1.8565725306635806e-05,
"loss": 0.0186,
"step": 9220
},
{
"epoch": 14.62757527733756,
"grad_norm": 0.23344580829143524,
"learning_rate": 1.8463944534722e-05,
"loss": 0.0158,
"step": 9230
},
{
"epoch": 14.643423137876386,
"grad_norm": 0.219131201505661,
"learning_rate": 1.83623803042117e-05,
"loss": 0.0197,
"step": 9240
},
{
"epoch": 14.659270998415215,
"grad_norm": 0.17857685685157776,
"learning_rate": 1.826103331249267e-05,
"loss": 0.0128,
"step": 9250
},
{
"epoch": 14.675118858954042,
"grad_norm": 0.19189006090164185,
"learning_rate": 1.8159904255461108e-05,
"loss": 0.0172,
"step": 9260
},
{
"epoch": 14.690966719492868,
"grad_norm": 0.18938252329826355,
"learning_rate": 1.8058993827516697e-05,
"loss": 0.0212,
"step": 9270
},
{
"epoch": 14.706814580031695,
"grad_norm": 0.20771273970603943,
"learning_rate": 1.795830272155796e-05,
"loss": 0.0248,
"step": 9280
},
{
"epoch": 14.722662440570524,
"grad_norm": 0.22910486161708832,
"learning_rate": 1.7857831628977456e-05,
"loss": 0.015,
"step": 9290
},
{
"epoch": 14.73851030110935,
"grad_norm": 0.20048457384109497,
"learning_rate": 1.7757581239656984e-05,
"loss": 0.0168,
"step": 9300
},
{
"epoch": 14.754358161648177,
"grad_norm": 0.21910695731639862,
"learning_rate": 1.7657552241962904e-05,
"loss": 0.0119,
"step": 9310
},
{
"epoch": 14.770206022187004,
"grad_norm": 0.214069664478302,
"learning_rate": 1.7557745322741433e-05,
"loss": 0.0167,
"step": 9320
},
{
"epoch": 14.786053882725833,
"grad_norm": 0.20221184194087982,
"learning_rate": 1.745816116731383e-05,
"loss": 0.0153,
"step": 9330
},
{
"epoch": 14.80190174326466,
"grad_norm": 0.1907825767993927,
"learning_rate": 1.735880045947183e-05,
"loss": 0.016,
"step": 9340
},
{
"epoch": 14.817749603803486,
"grad_norm": 0.2389329969882965,
"learning_rate": 1.7259663881472787e-05,
"loss": 0.0168,
"step": 9350
},
{
"epoch": 14.833597464342313,
"grad_norm": 0.2041391283273697,
"learning_rate": 1.716075211403516e-05,
"loss": 0.0166,
"step": 9360
},
{
"epoch": 14.849445324881142,
"grad_norm": 0.3064650595188141,
"learning_rate": 1.7062065836333696e-05,
"loss": 0.0166,
"step": 9370
},
{
"epoch": 14.865293185419969,
"grad_norm": 0.25269177556037903,
"learning_rate": 1.6963605725994807e-05,
"loss": 0.0179,
"step": 9380
},
{
"epoch": 14.881141045958795,
"grad_norm": 0.13689862191677094,
"learning_rate": 1.686537245909201e-05,
"loss": 0.0136,
"step": 9390
},
{
"epoch": 14.896988906497622,
"grad_norm": 0.2099904716014862,
"learning_rate": 1.6767366710141125e-05,
"loss": 0.0188,
"step": 9400
},
{
"epoch": 14.91283676703645,
"grad_norm": 0.20536595582962036,
"learning_rate": 1.666958915209578e-05,
"loss": 0.0161,
"step": 9410
},
{
"epoch": 14.928684627575278,
"grad_norm": 0.18782939016819,
"learning_rate": 1.6572040456342737e-05,
"loss": 0.0249,
"step": 9420
},
{
"epoch": 14.944532488114104,
"grad_norm": 0.29753440618515015,
"learning_rate": 1.6474721292697247e-05,
"loss": 0.0174,
"step": 9430
},
{
"epoch": 14.960380348652931,
"grad_norm": 0.14820578694343567,
"learning_rate": 1.6377632329398507e-05,
"loss": 0.0229,
"step": 9440
},
{
"epoch": 14.97622820919176,
"grad_norm": 0.26186251640319824,
"learning_rate": 1.628077423310503e-05,
"loss": 0.0203,
"step": 9450
},
{
"epoch": 14.992076069730587,
"grad_norm": 0.2948777675628662,
"learning_rate": 1.6184147668890116e-05,
"loss": 0.0192,
"step": 9460
},
{
"epoch": 15.007923930269413,
"grad_norm": 0.20523428916931152,
"learning_rate": 1.608775330023727e-05,
"loss": 0.0171,
"step": 9470
},
{
"epoch": 15.02377179080824,
"grad_norm": 0.28263282775878906,
"learning_rate": 1.599159178903557e-05,
"loss": 0.0149,
"step": 9480
},
{
"epoch": 15.039619651347069,
"grad_norm": 0.2222396433353424,
"learning_rate": 1.5895663795575255e-05,
"loss": 0.0174,
"step": 9490
},
{
"epoch": 15.055467511885896,
"grad_norm": 0.2283553034067154,
"learning_rate": 1.5799969978543072e-05,
"loss": 0.0152,
"step": 9500
},
{
"epoch": 15.071315372424722,
"grad_norm": 0.19190463423728943,
"learning_rate": 1.570451099501781e-05,
"loss": 0.0193,
"step": 9510
},
{
"epoch": 15.08716323296355,
"grad_norm": 0.2034788280725479,
"learning_rate": 1.560928750046582e-05,
"loss": 0.0142,
"step": 9520
},
{
"epoch": 15.103011093502378,
"grad_norm": 0.1533176153898239,
"learning_rate": 1.5514300148736405e-05,
"loss": 0.0147,
"step": 9530
},
{
"epoch": 15.118858954041205,
"grad_norm": 0.16323472559452057,
"learning_rate": 1.5419549592057485e-05,
"loss": 0.0128,
"step": 9540
},
{
"epoch": 15.134706814580031,
"grad_norm": 0.1336495280265808,
"learning_rate": 1.532503648103095e-05,
"loss": 0.0152,
"step": 9550
},
{
"epoch": 15.150554675118858,
"grad_norm": 0.23295193910598755,
"learning_rate": 1.5230761464628351e-05,
"loss": 0.0202,
"step": 9560
},
{
"epoch": 15.166402535657687,
"grad_norm": 0.21971255540847778,
"learning_rate": 1.5136725190186312e-05,
"loss": 0.0127,
"step": 9570
},
{
"epoch": 15.182250396196514,
"grad_norm": 0.12831509113311768,
"learning_rate": 1.5042928303402155e-05,
"loss": 0.0131,
"step": 9580
},
{
"epoch": 15.19809825673534,
"grad_norm": 0.2782778739929199,
"learning_rate": 1.4949371448329491e-05,
"loss": 0.0134,
"step": 9590
},
{
"epoch": 15.213946117274167,
"grad_norm": 0.15872108936309814,
"learning_rate": 1.4856055267373704e-05,
"loss": 0.0126,
"step": 9600
},
{
"epoch": 15.229793977812996,
"grad_norm": 0.1593102514743805,
"learning_rate": 1.476298040128763e-05,
"loss": 0.0168,
"step": 9610
},
{
"epoch": 15.245641838351823,
"grad_norm": 0.21707729995250702,
"learning_rate": 1.4670147489167157e-05,
"loss": 0.0128,
"step": 9620
},
{
"epoch": 15.26148969889065,
"grad_norm": 0.13602186739444733,
"learning_rate": 1.4577557168446704e-05,
"loss": 0.0163,
"step": 9630
},
{
"epoch": 15.277337559429476,
"grad_norm": 0.15380342304706573,
"learning_rate": 1.4485210074895028e-05,
"loss": 0.0131,
"step": 9640
},
{
"epoch": 15.293185419968305,
"grad_norm": 0.23396658897399902,
"learning_rate": 1.4393106842610765e-05,
"loss": 0.0182,
"step": 9650
},
{
"epoch": 15.309033280507132,
"grad_norm": 0.351018488407135,
"learning_rate": 1.4301248104018039e-05,
"loss": 0.0163,
"step": 9660
},
{
"epoch": 15.324881141045958,
"grad_norm": 0.15941226482391357,
"learning_rate": 1.4209634489862228e-05,
"loss": 0.0151,
"step": 9670
},
{
"epoch": 15.340729001584785,
"grad_norm": 0.31737878918647766,
"learning_rate": 1.4118266629205501e-05,
"loss": 0.016,
"step": 9680
},
{
"epoch": 15.356576862123614,
"grad_norm": 0.1942298859357834,
"learning_rate": 1.4027145149422637e-05,
"loss": 0.0138,
"step": 9690
},
{
"epoch": 15.37242472266244,
"grad_norm": 0.20650826394557953,
"learning_rate": 1.3936270676196605e-05,
"loss": 0.0196,
"step": 9700
},
{
"epoch": 15.388272583201267,
"grad_norm": 0.13685113191604614,
"learning_rate": 1.3845643833514294e-05,
"loss": 0.015,
"step": 9710
},
{
"epoch": 15.404120443740094,
"grad_norm": 0.22127866744995117,
"learning_rate": 1.3755265243662308e-05,
"loss": 0.0146,
"step": 9720
},
{
"epoch": 15.419968304278923,
"grad_norm": 0.1102658063173294,
"learning_rate": 1.3665135527222566e-05,
"loss": 0.0132,
"step": 9730
},
{
"epoch": 15.43581616481775,
"grad_norm": 0.17032739520072937,
"learning_rate": 1.3575255303068157e-05,
"loss": 0.0168,
"step": 9740
},
{
"epoch": 15.451664025356576,
"grad_norm": 0.20449472963809967,
"learning_rate": 1.3485625188359008e-05,
"loss": 0.0155,
"step": 9750
},
{
"epoch": 15.467511885895403,
"grad_norm": 0.2856760323047638,
"learning_rate": 1.3396245798537655e-05,
"loss": 0.0174,
"step": 9760
},
{
"epoch": 15.483359746434232,
"grad_norm": 0.17707166075706482,
"learning_rate": 1.3307117747325104e-05,
"loss": 0.0145,
"step": 9770
},
{
"epoch": 15.499207606973059,
"grad_norm": 0.2179175168275833,
"learning_rate": 1.321824164671649e-05,
"loss": 0.0142,
"step": 9780
},
{
"epoch": 15.515055467511885,
"grad_norm": 0.14933204650878906,
"learning_rate": 1.3129618106976966e-05,
"loss": 0.0166,
"step": 9790
},
{
"epoch": 15.530903328050712,
"grad_norm": 0.23230569064617157,
"learning_rate": 1.3041247736637497e-05,
"loss": 0.02,
"step": 9800
},
{
"epoch": 15.54675118858954,
"grad_norm": 0.2146037369966507,
"learning_rate": 1.2953131142490621e-05,
"loss": 0.0187,
"step": 9810
},
{
"epoch": 15.562599049128368,
"grad_norm": 0.27099379897117615,
"learning_rate": 1.2865268929586399e-05,
"loss": 0.0175,
"step": 9820
},
{
"epoch": 15.578446909667194,
"grad_norm": 0.21641230583190918,
"learning_rate": 1.2777661701228094e-05,
"loss": 0.0131,
"step": 9830
},
{
"epoch": 15.594294770206023,
"grad_norm": 0.206056609749794,
"learning_rate": 1.2690310058968208e-05,
"loss": 0.0124,
"step": 9840
},
{
"epoch": 15.61014263074485,
"grad_norm": 0.2695901095867157,
"learning_rate": 1.2603214602604251e-05,
"loss": 0.017,
"step": 9850
},
{
"epoch": 15.625990491283677,
"grad_norm": 0.24454373121261597,
"learning_rate": 1.2516375930174607e-05,
"loss": 0.0185,
"step": 9860
},
{
"epoch": 15.641838351822503,
"grad_norm": 0.24143637716770172,
"learning_rate": 1.2429794637954505e-05,
"loss": 0.0167,
"step": 9870
},
{
"epoch": 15.65768621236133,
"grad_norm": 0.24098831415176392,
"learning_rate": 1.234347132045185e-05,
"loss": 0.0156,
"step": 9880
},
{
"epoch": 15.673534072900159,
"grad_norm": 0.2231469452381134,
"learning_rate": 1.2257406570403158e-05,
"loss": 0.0162,
"step": 9890
},
{
"epoch": 15.689381933438986,
"grad_norm": 0.18433237075805664,
"learning_rate": 1.217160097876956e-05,
"loss": 0.0148,
"step": 9900
},
{
"epoch": 15.705229793977812,
"grad_norm": 0.24673160910606384,
"learning_rate": 1.2086055134732604e-05,
"loss": 0.0156,
"step": 9910
},
{
"epoch": 15.721077654516641,
"grad_norm": 0.2098625749349594,
"learning_rate": 1.2000769625690367e-05,
"loss": 0.0123,
"step": 9920
},
{
"epoch": 15.736925515055468,
"grad_norm": 0.19441257417201996,
"learning_rate": 1.1915745037253273e-05,
"loss": 0.0149,
"step": 9930
},
{
"epoch": 15.752773375594295,
"grad_norm": 0.30163636803627014,
"learning_rate": 1.1830981953240183e-05,
"loss": 0.0145,
"step": 9940
},
{
"epoch": 15.768621236133121,
"grad_norm": 0.2016548216342926,
"learning_rate": 1.1746480955674371e-05,
"loss": 0.0157,
"step": 9950
},
{
"epoch": 15.78446909667195,
"grad_norm": 0.16448210179805756,
"learning_rate": 1.1662242624779413e-05,
"loss": 0.0093,
"step": 9960
},
{
"epoch": 15.800316957210777,
"grad_norm": 0.1529219001531601,
"learning_rate": 1.1578267538975384e-05,
"loss": 0.016,
"step": 9970
},
{
"epoch": 15.816164817749604,
"grad_norm": 0.11220666021108627,
"learning_rate": 1.1494556274874736e-05,
"loss": 0.0151,
"step": 9980
},
{
"epoch": 15.83201267828843,
"grad_norm": 0.1833869069814682,
"learning_rate": 1.1411109407278425e-05,
"loss": 0.0126,
"step": 9990
},
{
"epoch": 15.847860538827259,
"grad_norm": 0.24351130425930023,
"learning_rate": 1.1327927509171948e-05,
"loss": 0.0148,
"step": 10000
},
{
"epoch": 15.863708399366086,
"grad_norm": 0.18271566927433014,
"learning_rate": 1.1245011151721358e-05,
"loss": 0.0153,
"step": 10010
},
{
"epoch": 15.879556259904913,
"grad_norm": 0.17010100185871124,
"learning_rate": 1.1162360904269399e-05,
"loss": 0.0139,
"step": 10020
},
{
"epoch": 15.89540412044374,
"grad_norm": 0.20020832121372223,
"learning_rate": 1.1079977334331593e-05,
"loss": 0.014,
"step": 10030
},
{
"epoch": 15.911251980982568,
"grad_norm": 0.31756097078323364,
"learning_rate": 1.0997861007592297e-05,
"loss": 0.0137,
"step": 10040
},
{
"epoch": 15.927099841521395,
"grad_norm": 0.20857271552085876,
"learning_rate": 1.0916012487900901e-05,
"loss": 0.0187,
"step": 10050
},
{
"epoch": 15.942947702060222,
"grad_norm": 0.21330268681049347,
"learning_rate": 1.0834432337267835e-05,
"loss": 0.0182,
"step": 10060
},
{
"epoch": 15.958795562599049,
"grad_norm": 0.2602750360965729,
"learning_rate": 1.0753121115860859e-05,
"loss": 0.0126,
"step": 10070
},
{
"epoch": 15.974643423137877,
"grad_norm": 0.10706225037574768,
"learning_rate": 1.0672079382001076e-05,
"loss": 0.0141,
"step": 10080
},
{
"epoch": 15.990491283676704,
"grad_norm": 0.18691207468509674,
"learning_rate": 1.0591307692159175e-05,
"loss": 0.018,
"step": 10090
},
{
"epoch": 16.00633914421553,
"grad_norm": 0.16258151829242706,
"learning_rate": 1.0510806600951634e-05,
"loss": 0.0138,
"step": 10100
},
{
"epoch": 16.022187004754358,
"grad_norm": 0.22133781015872955,
"learning_rate": 1.0430576661136809e-05,
"loss": 0.0136,
"step": 10110
},
{
"epoch": 16.038034865293184,
"grad_norm": 0.14174553751945496,
"learning_rate": 1.0350618423611258e-05,
"loss": 0.012,
"step": 10120
},
{
"epoch": 16.05388272583201,
"grad_norm": 0.21903228759765625,
"learning_rate": 1.0270932437405894e-05,
"loss": 0.0162,
"step": 10130
},
{
"epoch": 16.06973058637084,
"grad_norm": 0.15532748401165009,
"learning_rate": 1.0191519249682202e-05,
"loss": 0.0129,
"step": 10140
},
{
"epoch": 16.08557844690967,
"grad_norm": 0.2952392101287842,
"learning_rate": 1.0112379405728512e-05,
"loss": 0.014,
"step": 10150
},
{
"epoch": 16.101426307448495,
"grad_norm": 0.1566477119922638,
"learning_rate": 1.003351344895624e-05,
"loss": 0.0168,
"step": 10160
},
{
"epoch": 16.117274167987322,
"grad_norm": 0.18433576822280884,
"learning_rate": 9.954921920896181e-06,
"loss": 0.0141,
"step": 10170
},
{
"epoch": 16.13312202852615,
"grad_norm": 0.1970781683921814,
"learning_rate": 9.876605361194784e-06,
"loss": 0.014,
"step": 10180
},
{
"epoch": 16.148969889064976,
"grad_norm": 0.22483587265014648,
"learning_rate": 9.798564307610397e-06,
"loss": 0.0172,
"step": 10190
},
{
"epoch": 16.164817749603802,
"grad_norm": 0.12307272851467133,
"learning_rate": 9.720799296009652e-06,
"loss": 0.0142,
"step": 10200
},
{
"epoch": 16.18066561014263,
"grad_norm": 0.09929801523685455,
"learning_rate": 9.64331086036372e-06,
"loss": 0.0157,
"step": 10210
},
{
"epoch": 16.19651347068146,
"grad_norm": 0.22220948338508606,
"learning_rate": 9.566099532744666e-06,
"loss": 0.0144,
"step": 10220
},
{
"epoch": 16.212361331220286,
"grad_norm": 0.21739843487739563,
"learning_rate": 9.48916584332184e-06,
"loss": 0.0141,
"step": 10230
},
{
"epoch": 16.228209191759113,
"grad_norm": 0.20657970011234283,
"learning_rate": 9.412510320358148e-06,
"loss": 0.0125,
"step": 10240
},
{
"epoch": 16.24405705229794,
"grad_norm": 0.1589168906211853,
"learning_rate": 9.336133490206527e-06,
"loss": 0.0146,
"step": 10250
},
{
"epoch": 16.259904912836767,
"grad_norm": 0.20785082876682281,
"learning_rate": 9.260035877306222e-06,
"loss": 0.015,
"step": 10260
},
{
"epoch": 16.275752773375594,
"grad_norm": 0.3436870872974396,
"learning_rate": 9.184218004179296e-06,
"loss": 0.0142,
"step": 10270
},
{
"epoch": 16.29160063391442,
"grad_norm": 0.19214791059494019,
"learning_rate": 9.108680391426944e-06,
"loss": 0.0153,
"step": 10280
},
{
"epoch": 16.307448494453247,
"grad_norm": 0.18752476572990417,
"learning_rate": 9.033423557725968e-06,
"loss": 0.0198,
"step": 10290
},
{
"epoch": 16.323296354992078,
"grad_norm": 0.2008536010980606,
"learning_rate": 8.958448019825238e-06,
"loss": 0.0139,
"step": 10300
},
{
"epoch": 16.339144215530904,
"grad_norm": 0.3124418258666992,
"learning_rate": 8.883754292542073e-06,
"loss": 0.0184,
"step": 10310
},
{
"epoch": 16.35499207606973,
"grad_norm": 0.18249309062957764,
"learning_rate": 8.809342888758787e-06,
"loss": 0.012,
"step": 10320
},
{
"epoch": 16.370839936608558,
"grad_norm": 0.27810513973236084,
"learning_rate": 8.735214319419122e-06,
"loss": 0.012,
"step": 10330
},
{
"epoch": 16.386687797147385,
"grad_norm": 0.25395792722702026,
"learning_rate": 8.66136909352469e-06,
"loss": 0.0175,
"step": 10340
},
{
"epoch": 16.40253565768621,
"grad_norm": 0.10935286432504654,
"learning_rate": 8.587807718131607e-06,
"loss": 0.0138,
"step": 10350
},
{
"epoch": 16.41838351822504,
"grad_norm": 0.20935213565826416,
"learning_rate": 8.514530698346911e-06,
"loss": 0.0149,
"step": 10360
},
{
"epoch": 16.434231378763865,
"grad_norm": 0.15524841845035553,
"learning_rate": 8.4415385373251e-06,
"loss": 0.016,
"step": 10370
},
{
"epoch": 16.450079239302696,
"grad_norm": 0.17828898131847382,
"learning_rate": 8.368831736264738e-06,
"loss": 0.0155,
"step": 10380
},
{
"epoch": 16.465927099841522,
"grad_norm": 0.11186101287603378,
"learning_rate": 8.296410794404925e-06,
"loss": 0.0146,
"step": 10390
},
{
"epoch": 16.48177496038035,
"grad_norm": 0.1628289818763733,
"learning_rate": 8.22427620902197e-06,
"loss": 0.0138,
"step": 10400
},
{
"epoch": 16.497622820919176,
"grad_norm": 0.20246130228042603,
"learning_rate": 8.152428475425876e-06,
"loss": 0.017,
"step": 10410
},
{
"epoch": 16.513470681458003,
"grad_norm": 0.2126418799161911,
"learning_rate": 8.080868086957e-06,
"loss": 0.0181,
"step": 10420
},
{
"epoch": 16.52931854199683,
"grad_norm": 0.27646327018737793,
"learning_rate": 8.009595534982684e-06,
"loss": 0.0138,
"step": 10430
},
{
"epoch": 16.545166402535656,
"grad_norm": 0.23372896015644073,
"learning_rate": 7.938611308893796e-06,
"loss": 0.0206,
"step": 10440
},
{
"epoch": 16.561014263074483,
"grad_norm": 0.21742697060108185,
"learning_rate": 7.867915896101475e-06,
"loss": 0.0117,
"step": 10450
},
{
"epoch": 16.576862123613314,
"grad_norm": 0.30523791909217834,
"learning_rate": 7.797509782033696e-06,
"loss": 0.0189,
"step": 10460
},
{
"epoch": 16.59270998415214,
"grad_norm": 0.33640623092651367,
"learning_rate": 7.727393450131976e-06,
"loss": 0.0147,
"step": 10470
},
{
"epoch": 16.608557844690967,
"grad_norm": 0.14561405777931213,
"learning_rate": 7.65756738184808e-06,
"loss": 0.0119,
"step": 10480
},
{
"epoch": 16.624405705229794,
"grad_norm": 0.27383899688720703,
"learning_rate": 7.588032056640643e-06,
"loss": 0.0181,
"step": 10490
},
{
"epoch": 16.64025356576862,
"grad_norm": 0.2113339751958847,
"learning_rate": 7.518787951971951e-06,
"loss": 0.0151,
"step": 10500
},
{
"epoch": 16.656101426307448,
"grad_norm": 0.22912786900997162,
"learning_rate": 7.449835543304645e-06,
"loss": 0.013,
"step": 10510
},
{
"epoch": 16.671949286846274,
"grad_norm": 0.24694296717643738,
"learning_rate": 7.381175304098398e-06,
"loss": 0.0124,
"step": 10520
},
{
"epoch": 16.687797147385105,
"grad_norm": 0.14873796701431274,
"learning_rate": 7.3128077058067675e-06,
"loss": 0.0166,
"step": 10530
},
{
"epoch": 16.70364500792393,
"grad_norm": 0.14333923161029816,
"learning_rate": 7.244733217873834e-06,
"loss": 0.0128,
"step": 10540
},
{
"epoch": 16.71949286846276,
"grad_norm": 0.17385222017765045,
"learning_rate": 7.1769523077310885e-06,
"loss": 0.0172,
"step": 10550
},
{
"epoch": 16.735340729001585,
"grad_norm": 0.1889476180076599,
"learning_rate": 7.1094654407941945e-06,
"loss": 0.0105,
"step": 10560
},
{
"epoch": 16.751188589540412,
"grad_norm": 0.13638252019882202,
"learning_rate": 7.042273080459716e-06,
"loss": 0.0137,
"step": 10570
},
{
"epoch": 16.76703645007924,
"grad_norm": 0.16387833654880524,
"learning_rate": 6.97537568810207e-06,
"loss": 0.0121,
"step": 10580
},
{
"epoch": 16.782884310618066,
"grad_norm": 0.18849371373653412,
"learning_rate": 6.908773723070228e-06,
"loss": 0.0112,
"step": 10590
},
{
"epoch": 16.798732171156892,
"grad_norm": 0.2580081522464752,
"learning_rate": 6.842467642684619e-06,
"loss": 0.0164,
"step": 10600
},
{
"epoch": 16.814580031695723,
"grad_norm": 0.19095416367053986,
"learning_rate": 6.7764579022340405e-06,
"loss": 0.0156,
"step": 10610
},
{
"epoch": 16.83042789223455,
"grad_norm": 0.38263216614723206,
"learning_rate": 6.71074495497242e-06,
"loss": 0.0141,
"step": 10620
},
{
"epoch": 16.846275752773376,
"grad_norm": 0.19752560555934906,
"learning_rate": 6.645329252115812e-06,
"loss": 0.0134,
"step": 10630
},
{
"epoch": 16.862123613312203,
"grad_norm": 0.21061812341213226,
"learning_rate": 6.580211242839207e-06,
"loss": 0.0161,
"step": 10640
},
{
"epoch": 16.87797147385103,
"grad_norm": 0.30705246329307556,
"learning_rate": 6.515391374273522e-06,
"loss": 0.0136,
"step": 10650
},
{
"epoch": 16.893819334389857,
"grad_norm": 0.16327637434005737,
"learning_rate": 6.4508700915025145e-06,
"loss": 0.0178,
"step": 10660
},
{
"epoch": 16.909667194928684,
"grad_norm": 0.19477631151676178,
"learning_rate": 6.3866478375596454e-06,
"loss": 0.0155,
"step": 10670
},
{
"epoch": 16.92551505546751,
"grad_norm": 0.10037015378475189,
"learning_rate": 6.322725053425166e-06,
"loss": 0.0141,
"step": 10680
},
{
"epoch": 16.94136291600634,
"grad_norm": 0.14681270718574524,
"learning_rate": 6.259102178023019e-06,
"loss": 0.0132,
"step": 10690
},
{
"epoch": 16.957210776545168,
"grad_norm": 0.14646220207214355,
"learning_rate": 6.1957796482177865e-06,
"loss": 0.015,
"step": 10700
},
{
"epoch": 16.973058637083994,
"grad_norm": 0.14095987379550934,
"learning_rate": 6.1327578988118086e-06,
"loss": 0.0117,
"step": 10710
},
{
"epoch": 16.98890649762282,
"grad_norm": 0.17115886509418488,
"learning_rate": 6.070037362542058e-06,
"loss": 0.0113,
"step": 10720
},
{
"epoch": 17.004754358161648,
"grad_norm": 0.18565335869789124,
"learning_rate": 6.00761847007727e-06,
"loss": 0.0144,
"step": 10730
},
{
"epoch": 17.020602218700475,
"grad_norm": 0.13037702441215515,
"learning_rate": 5.945501650014951e-06,
"loss": 0.0137,
"step": 10740
},
{
"epoch": 17.0364500792393,
"grad_norm": 0.18481898307800293,
"learning_rate": 5.883687328878423e-06,
"loss": 0.0133,
"step": 10750
},
{
"epoch": 17.05229793977813,
"grad_norm": 0.2438468635082245,
"learning_rate": 5.822175931113933e-06,
"loss": 0.0163,
"step": 10760
},
{
"epoch": 17.06814580031696,
"grad_norm": 0.18955622613430023,
"learning_rate": 5.760967879087675e-06,
"loss": 0.0113,
"step": 10770
},
{
"epoch": 17.083993660855786,
"grad_norm": 0.3023121953010559,
"learning_rate": 5.700063593082971e-06,
"loss": 0.0142,
"step": 10780
},
{
"epoch": 17.099841521394612,
"grad_norm": 0.19684407114982605,
"learning_rate": 5.639463491297314e-06,
"loss": 0.0183,
"step": 10790
},
{
"epoch": 17.11568938193344,
"grad_norm": 0.1771165281534195,
"learning_rate": 5.579167989839512e-06,
"loss": 0.0149,
"step": 10800
},
{
"epoch": 17.131537242472266,
"grad_norm": 0.31700730323791504,
"learning_rate": 5.519177502726897e-06,
"loss": 0.0149,
"step": 10810
},
{
"epoch": 17.147385103011093,
"grad_norm": 0.24914953112602234,
"learning_rate": 5.459492441882369e-06,
"loss": 0.0096,
"step": 10820
},
{
"epoch": 17.16323296354992,
"grad_norm": 0.17742785811424255,
"learning_rate": 5.400113217131669e-06,
"loss": 0.0126,
"step": 10830
},
{
"epoch": 17.179080824088746,
"grad_norm": 0.19636410474777222,
"learning_rate": 5.341040236200512e-06,
"loss": 0.0148,
"step": 10840
},
{
"epoch": 17.194928684627577,
"grad_norm": 0.16673442721366882,
"learning_rate": 5.282273904711793e-06,
"loss": 0.0149,
"step": 10850
},
{
"epoch": 17.210776545166404,
"grad_norm": 0.17502924799919128,
"learning_rate": 5.223814626182804e-06,
"loss": 0.0156,
"step": 10860
},
{
"epoch": 17.22662440570523,
"grad_norm": 0.24374344944953918,
"learning_rate": 5.165662802022469e-06,
"loss": 0.0156,
"step": 10870
},
{
"epoch": 17.242472266244057,
"grad_norm": 0.20077760517597198,
"learning_rate": 5.107818831528593e-06,
"loss": 0.0152,
"step": 10880
},
{
"epoch": 17.258320126782884,
"grad_norm": 0.19688129425048828,
"learning_rate": 5.050283111885123e-06,
"loss": 0.0108,
"step": 10890
},
{
"epoch": 17.27416798732171,
"grad_norm": 0.2499351054430008,
"learning_rate": 4.9930560381593825e-06,
"loss": 0.0174,
"step": 10900
},
{
"epoch": 17.290015847860538,
"grad_norm": 0.11787986755371094,
"learning_rate": 4.936138003299412e-06,
"loss": 0.011,
"step": 10910
},
{
"epoch": 17.305863708399364,
"grad_norm": 0.10276877880096436,
"learning_rate": 4.879529398131227e-06,
"loss": 0.0151,
"step": 10920
},
{
"epoch": 17.321711568938195,
"grad_norm": 0.21218866109848022,
"learning_rate": 4.823230611356155e-06,
"loss": 0.0188,
"step": 10930
},
{
"epoch": 17.33755942947702,
"grad_norm": 0.12643927335739136,
"learning_rate": 4.767242029548186e-06,
"loss": 0.0137,
"step": 10940
},
{
"epoch": 17.35340729001585,
"grad_norm": 0.22125521302223206,
"learning_rate": 4.711564037151261e-06,
"loss": 0.0137,
"step": 10950
},
{
"epoch": 17.369255150554675,
"grad_norm": 0.18663623929023743,
"learning_rate": 4.656197016476716e-06,
"loss": 0.0169,
"step": 10960
},
{
"epoch": 17.385103011093502,
"grad_norm": 0.1977252960205078,
"learning_rate": 4.60114134770055e-06,
"loss": 0.0142,
"step": 10970
},
{
"epoch": 17.40095087163233,
"grad_norm": 0.1531880646944046,
"learning_rate": 4.54639740886093e-06,
"loss": 0.0134,
"step": 10980
},
{
"epoch": 17.416798732171156,
"grad_norm": 0.25299400091171265,
"learning_rate": 4.4919655758555055e-06,
"loss": 0.0115,
"step": 10990
},
{
"epoch": 17.432646592709983,
"grad_norm": 0.15232089161872864,
"learning_rate": 4.4378462224388514e-06,
"loss": 0.0121,
"step": 11000
},
{
"epoch": 17.448494453248813,
"grad_norm": 0.2122395932674408,
"learning_rate": 4.3840397202199515e-06,
"loss": 0.0138,
"step": 11010
},
{
"epoch": 17.46434231378764,
"grad_norm": 0.4084971845149994,
"learning_rate": 4.330546438659555e-06,
"loss": 0.0169,
"step": 11020
},
{
"epoch": 17.480190174326466,
"grad_norm": 0.13414064049720764,
"learning_rate": 4.2773667450677346e-06,
"loss": 0.0115,
"step": 11030
},
{
"epoch": 17.496038034865293,
"grad_norm": 0.24400712549686432,
"learning_rate": 4.224501004601311e-06,
"loss": 0.0165,
"step": 11040
},
{
"epoch": 17.51188589540412,
"grad_norm": 0.15812645852565765,
"learning_rate": 4.1719495802613254e-06,
"loss": 0.0139,
"step": 11050
},
{
"epoch": 17.527733755942947,
"grad_norm": 0.32170212268829346,
"learning_rate": 4.119712832890599e-06,
"loss": 0.0173,
"step": 11060
},
{
"epoch": 17.543581616481774,
"grad_norm": 0.26727718114852905,
"learning_rate": 4.0677911211712494e-06,
"loss": 0.0137,
"step": 11070
},
{
"epoch": 17.5594294770206,
"grad_norm": 0.2177404910326004,
"learning_rate": 4.0161848016221804e-06,
"loss": 0.0115,
"step": 11080
},
{
"epoch": 17.57527733755943,
"grad_norm": 0.08245435357093811,
"learning_rate": 3.964894228596683e-06,
"loss": 0.0125,
"step": 11090
},
{
"epoch": 17.591125198098258,
"grad_norm": 0.2010851800441742,
"learning_rate": 3.913919754279966e-06,
"loss": 0.0196,
"step": 11100
},
{
"epoch": 17.606973058637085,
"grad_norm": 0.333839476108551,
"learning_rate": 3.8632617286867845e-06,
"loss": 0.0168,
"step": 11110
},
{
"epoch": 17.62282091917591,
"grad_norm": 0.09080642461776733,
"learning_rate": 3.8129204996589894e-06,
"loss": 0.017,
"step": 11120
},
{
"epoch": 17.638668779714738,
"grad_norm": 0.12128207087516785,
"learning_rate": 3.7628964128631428e-06,
"loss": 0.0146,
"step": 11130
},
{
"epoch": 17.654516640253565,
"grad_norm": 0.15360169112682343,
"learning_rate": 3.7131898117881924e-06,
"loss": 0.0125,
"step": 11140
},
{
"epoch": 17.67036450079239,
"grad_norm": 0.18307170271873474,
"learning_rate": 3.6638010377430476e-06,
"loss": 0.012,
"step": 11150
},
{
"epoch": 17.686212361331222,
"grad_norm": 0.17119954526424408,
"learning_rate": 3.6147304298542963e-06,
"loss": 0.0159,
"step": 11160
},
{
"epoch": 17.70206022187005,
"grad_norm": 0.18213894963264465,
"learning_rate": 3.5659783250638344e-06,
"loss": 0.0119,
"step": 11170
},
{
"epoch": 17.717908082408876,
"grad_norm": 0.22571374475955963,
"learning_rate": 3.517545058126548e-06,
"loss": 0.0142,
"step": 11180
},
{
"epoch": 17.733755942947703,
"grad_norm": 0.1815493106842041,
"learning_rate": 3.4694309616080665e-06,
"loss": 0.02,
"step": 11190
},
{
"epoch": 17.74960380348653,
"grad_norm": 0.18275891244411469,
"learning_rate": 3.4216363658824136e-06,
"loss": 0.0135,
"step": 11200
},
{
"epoch": 17.765451664025356,
"grad_norm": 0.14396269619464874,
"learning_rate": 3.3741615991297938e-06,
"loss": 0.0138,
"step": 11210
},
{
"epoch": 17.781299524564183,
"grad_norm": 0.16869209706783295,
"learning_rate": 3.327006987334308e-06,
"loss": 0.015,
"step": 11220
},
{
"epoch": 17.79714738510301,
"grad_norm": 0.2123693972826004,
"learning_rate": 3.2801728542817155e-06,
"loss": 0.0125,
"step": 11230
},
{
"epoch": 17.812995245641837,
"grad_norm": 0.16149067878723145,
"learning_rate": 3.2336595215572364e-06,
"loss": 0.013,
"step": 11240
},
{
"epoch": 17.828843106180667,
"grad_norm": 0.21204307675361633,
"learning_rate": 3.1874673085432848e-06,
"loss": 0.0165,
"step": 11250
},
{
"epoch": 17.844690966719494,
"grad_norm": 0.16572032868862152,
"learning_rate": 3.1415965324173567e-06,
"loss": 0.0148,
"step": 11260
},
{
"epoch": 17.86053882725832,
"grad_norm": 0.11765672266483307,
"learning_rate": 3.0960475081497966e-06,
"loss": 0.0174,
"step": 11270
},
{
"epoch": 17.876386687797147,
"grad_norm": 0.11968225240707397,
"learning_rate": 3.0508205485016426e-06,
"loss": 0.0144,
"step": 11280
},
{
"epoch": 17.892234548335974,
"grad_norm": 0.2578866183757782,
"learning_rate": 3.0059159640225097e-06,
"loss": 0.0146,
"step": 11290
},
{
"epoch": 17.9080824088748,
"grad_norm": 0.21918439865112305,
"learning_rate": 2.961334063048393e-06,
"loss": 0.0155,
"step": 11300
},
{
"epoch": 17.923930269413628,
"grad_norm": 0.16526588797569275,
"learning_rate": 2.917075151699622e-06,
"loss": 0.0177,
"step": 11310
},
{
"epoch": 17.939778129952458,
"grad_norm": 0.21584181487560272,
"learning_rate": 2.8731395338787215e-06,
"loss": 0.0164,
"step": 11320
},
{
"epoch": 17.955625990491285,
"grad_norm": 0.19519634544849396,
"learning_rate": 2.8295275112683207e-06,
"loss": 0.0166,
"step": 11330
},
{
"epoch": 17.97147385103011,
"grad_norm": 0.21799765527248383,
"learning_rate": 2.7862393833291036e-06,
"loss": 0.0152,
"step": 11340
},
{
"epoch": 17.98732171156894,
"grad_norm": 0.13291296362876892,
"learning_rate": 2.743275447297733e-06,
"loss": 0.0142,
"step": 11350
},
{
"epoch": 18.003169572107765,
"grad_norm": 0.21296854317188263,
"learning_rate": 2.7006359981848196e-06,
"loss": 0.0135,
"step": 11360
},
{
"epoch": 18.019017432646592,
"grad_norm": 0.16779382526874542,
"learning_rate": 2.6583213287729115e-06,
"loss": 0.0107,
"step": 11370
},
{
"epoch": 18.03486529318542,
"grad_norm": 0.3331531882286072,
"learning_rate": 2.616331729614424e-06,
"loss": 0.015,
"step": 11380
},
{
"epoch": 18.050713153724246,
"grad_norm": 0.18505552411079407,
"learning_rate": 2.574667489029725e-06,
"loss": 0.0143,
"step": 11390
},
{
"epoch": 18.066561014263076,
"grad_norm": 0.2790921628475189,
"learning_rate": 2.533328893105108e-06,
"loss": 0.0161,
"step": 11400
},
{
"epoch": 18.082408874801903,
"grad_norm": 0.2316390722990036,
"learning_rate": 2.492316225690827e-06,
"loss": 0.0158,
"step": 11410
},
{
"epoch": 18.09825673534073,
"grad_norm": 0.2133951485157013,
"learning_rate": 2.4516297683991773e-06,
"loss": 0.014,
"step": 11420
},
{
"epoch": 18.114104595879557,
"grad_norm": 0.23200847208499908,
"learning_rate": 2.411269800602517e-06,
"loss": 0.013,
"step": 11430
},
{
"epoch": 18.129952456418383,
"grad_norm": 0.18895216286182404,
"learning_rate": 2.371236599431387e-06,
"loss": 0.0163,
"step": 11440
},
{
"epoch": 18.14580031695721,
"grad_norm": 0.18238064646720886,
"learning_rate": 2.3315304397726e-06,
"loss": 0.0124,
"step": 11450
},
{
"epoch": 18.161648177496037,
"grad_norm": 0.1541883647441864,
"learning_rate": 2.2921515942673276e-06,
"loss": 0.0119,
"step": 11460
},
{
"epoch": 18.177496038034864,
"grad_norm": 0.1508757770061493,
"learning_rate": 2.2531003333092826e-06,
"loss": 0.0179,
"step": 11470
},
{
"epoch": 18.193343898573694,
"grad_norm": 0.18820171058177948,
"learning_rate": 2.2143769250427883e-06,
"loss": 0.0123,
"step": 11480
},
{
"epoch": 18.20919175911252,
"grad_norm": 0.15870815515518188,
"learning_rate": 2.175981635361013e-06,
"loss": 0.0126,
"step": 11490
},
{
"epoch": 18.225039619651348,
"grad_norm": 0.16902989149093628,
"learning_rate": 2.1379147279040777e-06,
"loss": 0.0123,
"step": 11500
},
{
"epoch": 18.240887480190175,
"grad_norm": 0.12179669737815857,
"learning_rate": 2.1001764640572963e-06,
"loss": 0.0154,
"step": 11510
},
{
"epoch": 18.256735340729,
"grad_norm": 0.23037730157375336,
"learning_rate": 2.0627671029493535e-06,
"loss": 0.0153,
"step": 11520
},
{
"epoch": 18.272583201267828,
"grad_norm": 0.21997253596782684,
"learning_rate": 2.02568690145053e-06,
"loss": 0.0123,
"step": 11530
},
{
"epoch": 18.288431061806655,
"grad_norm": 0.1361498236656189,
"learning_rate": 1.988936114170953e-06,
"loss": 0.0179,
"step": 11540
},
{
"epoch": 18.304278922345482,
"grad_norm": 0.20903484523296356,
"learning_rate": 1.9525149934588314e-06,
"loss": 0.0118,
"step": 11550
},
{
"epoch": 18.320126782884312,
"grad_norm": 0.34163960814476013,
"learning_rate": 1.916423789398725e-06,
"loss": 0.0145,
"step": 11560
},
{
"epoch": 18.33597464342314,
"grad_norm": 0.226941779255867,
"learning_rate": 1.8806627498098305e-06,
"loss": 0.0139,
"step": 11570
},
{
"epoch": 18.351822503961966,
"grad_norm": 0.1451648324728012,
"learning_rate": 1.8452321202442724e-06,
"loss": 0.0116,
"step": 11580
},
{
"epoch": 18.367670364500793,
"grad_norm": 0.19719494879245758,
"learning_rate": 1.810132143985438e-06,
"loss": 0.0123,
"step": 11590
},
{
"epoch": 18.38351822503962,
"grad_norm": 0.1801900416612625,
"learning_rate": 1.7753630620463035e-06,
"loss": 0.0094,
"step": 11600
},
{
"epoch": 18.399366085578446,
"grad_norm": 0.17285539209842682,
"learning_rate": 1.740925113167735e-06,
"loss": 0.0184,
"step": 11610
},
{
"epoch": 18.415213946117273,
"grad_norm": 0.1344527304172516,
"learning_rate": 1.7068185338169174e-06,
"loss": 0.0123,
"step": 11620
},
{
"epoch": 18.4310618066561,
"grad_norm": 0.21449725329875946,
"learning_rate": 1.6730435581856719e-06,
"loss": 0.0127,
"step": 11630
},
{
"epoch": 18.44690966719493,
"grad_norm": 0.153366357088089,
"learning_rate": 1.6396004181888803e-06,
"loss": 0.0115,
"step": 11640
},
{
"epoch": 18.462757527733757,
"grad_norm": 0.20724429190158844,
"learning_rate": 1.6064893434628914e-06,
"loss": 0.0213,
"step": 11650
},
{
"epoch": 18.478605388272584,
"grad_norm": 0.20763236284255981,
"learning_rate": 1.5737105613639336e-06,
"loss": 0.0165,
"step": 11660
},
{
"epoch": 18.49445324881141,
"grad_norm": 0.1959114670753479,
"learning_rate": 1.5412642969665546e-06,
"loss": 0.0165,
"step": 11670
},
{
"epoch": 18.510301109350237,
"grad_norm": 0.1801106333732605,
"learning_rate": 1.5091507730620735e-06,
"loss": 0.0201,
"step": 11680
},
{
"epoch": 18.526148969889064,
"grad_norm": 0.23710688948631287,
"learning_rate": 1.4773702101570807e-06,
"loss": 0.0127,
"step": 11690
},
{
"epoch": 18.54199683042789,
"grad_norm": 0.19984515011310577,
"learning_rate": 1.4459228264718683e-06,
"loss": 0.0134,
"step": 11700
},
{
"epoch": 18.557844690966718,
"grad_norm": 0.34799715876579285,
"learning_rate": 1.41480883793898e-06,
"loss": 0.0139,
"step": 11710
},
{
"epoch": 18.573692551505548,
"grad_norm": 0.23937344551086426,
"learning_rate": 1.3840284582017193e-06,
"loss": 0.0168,
"step": 11720
},
{
"epoch": 18.589540412044375,
"grad_norm": 0.1457284539937973,
"learning_rate": 1.3535818986126492e-06,
"loss": 0.0149,
"step": 11730
},
{
"epoch": 18.605388272583202,
"grad_norm": 0.14516040682792664,
"learning_rate": 1.3234693682321886e-06,
"loss": 0.0136,
"step": 11740
},
{
"epoch": 18.62123613312203,
"grad_norm": 0.1960999220609665,
"learning_rate": 1.2936910738271524e-06,
"loss": 0.0197,
"step": 11750
},
{
"epoch": 18.637083993660855,
"grad_norm": 0.17722713947296143,
"learning_rate": 1.264247219869319e-06,
"loss": 0.0101,
"step": 11760
},
{
"epoch": 18.652931854199682,
"grad_norm": 0.12621328234672546,
"learning_rate": 1.2351380085340592e-06,
"loss": 0.0099,
"step": 11770
},
{
"epoch": 18.66877971473851,
"grad_norm": 0.19239826500415802,
"learning_rate": 1.206363639698921e-06,
"loss": 0.017,
"step": 11780
},
{
"epoch": 18.684627575277336,
"grad_norm": 0.21515102684497833,
"learning_rate": 1.1779243109422632e-06,
"loss": 0.016,
"step": 11790
},
{
"epoch": 18.700475435816166,
"grad_norm": 0.32155877351760864,
"learning_rate": 1.1498202175419136e-06,
"loss": 0.0189,
"step": 11800
},
{
"epoch": 18.716323296354993,
"grad_norm": 0.1156759113073349,
"learning_rate": 1.1220515524738017e-06,
"loss": 0.0114,
"step": 11810
},
{
"epoch": 18.73217115689382,
"grad_norm": 0.17164525389671326,
"learning_rate": 1.0946185064106552e-06,
"loss": 0.0142,
"step": 11820
},
{
"epoch": 18.748019017432647,
"grad_norm": 0.15515750646591187,
"learning_rate": 1.0675212677206892e-06,
"loss": 0.0129,
"step": 11830
},
{
"epoch": 18.763866877971473,
"grad_norm": 0.17644274234771729,
"learning_rate": 1.0407600224662917e-06,
"loss": 0.0171,
"step": 11840
},
{
"epoch": 18.7797147385103,
"grad_norm": 0.1408577710390091,
"learning_rate": 1.0143349544027791e-06,
"loss": 0.0149,
"step": 11850
},
{
"epoch": 18.795562599049127,
"grad_norm": 0.23002204298973083,
"learning_rate": 9.882462449771035e-07,
"loss": 0.0166,
"step": 11860
},
{
"epoch": 18.811410459587954,
"grad_norm": 0.16936403512954712,
"learning_rate": 9.624940733266363e-07,
"loss": 0.0141,
"step": 11870
},
{
"epoch": 18.827258320126784,
"grad_norm": 0.18246498703956604,
"learning_rate": 9.370786162779033e-07,
"loss": 0.0146,
"step": 11880
},
{
"epoch": 18.84310618066561,
"grad_norm": 0.14892500638961792,
"learning_rate": 9.120000483453961e-07,
"loss": 0.0146,
"step": 11890
},
{
"epoch": 18.858954041204438,
"grad_norm": 0.23931749165058136,
"learning_rate": 8.872585417303736e-07,
"loss": 0.0165,
"step": 11900
},
{
"epoch": 18.874801901743265,
"grad_norm": 0.12671174108982086,
"learning_rate": 8.628542663196625e-07,
"loss": 0.0148,
"step": 11910
},
{
"epoch": 18.89064976228209,
"grad_norm": 0.12941120564937592,
"learning_rate": 8.387873896845144e-07,
"loss": 0.014,
"step": 11920
},
{
"epoch": 18.90649762282092,
"grad_norm": 0.17071138322353363,
"learning_rate": 8.150580770794336e-07,
"loss": 0.0117,
"step": 11930
},
{
"epoch": 18.922345483359745,
"grad_norm": 0.18729938566684723,
"learning_rate": 7.916664914410455e-07,
"loss": 0.0163,
"step": 11940
},
{
"epoch": 18.938193343898575,
"grad_norm": 0.15300016105175018,
"learning_rate": 7.686127933869968e-07,
"loss": 0.0136,
"step": 11950
},
{
"epoch": 18.954041204437402,
"grad_norm": 0.24584755301475525,
"learning_rate": 7.458971412148241e-07,
"loss": 0.0124,
"step": 11960
},
{
"epoch": 18.96988906497623,
"grad_norm": 0.1396564543247223,
"learning_rate": 7.235196909008924e-07,
"loss": 0.0105,
"step": 11970
},
{
"epoch": 18.985736925515056,
"grad_norm": 0.45825517177581787,
"learning_rate": 7.014805960993131e-07,
"loss": 0.0173,
"step": 11980
},
{
"epoch": 19.001584786053883,
"grad_norm": 0.3947998285293579,
"learning_rate": 6.797800081408845e-07,
"loss": 0.0108,
"step": 11990
},
{
"epoch": 19.01743264659271,
"grad_norm": 0.16845867037773132,
"learning_rate": 6.584180760320635e-07,
"loss": 0.0125,
"step": 12000
},
{
"epoch": 19.033280507131536,
"grad_norm": 0.17425452172756195,
"learning_rate": 6.373949464539286e-07,
"loss": 0.0124,
"step": 12010
},
{
"epoch": 19.049128367670363,
"grad_norm": 0.29528307914733887,
"learning_rate": 6.167107637611858e-07,
"loss": 0.012,
"step": 12020
},
{
"epoch": 19.064976228209193,
"grad_norm": 0.22592425346374512,
"learning_rate": 5.963656699811693e-07,
"loss": 0.0128,
"step": 12030
},
{
"epoch": 19.08082408874802,
"grad_norm": 0.2710273265838623,
"learning_rate": 5.763598048128704e-07,
"loss": 0.0118,
"step": 12040
},
{
"epoch": 19.096671949286847,
"grad_norm": 0.21170739829540253,
"learning_rate": 5.566933056259882e-07,
"loss": 0.0149,
"step": 12050
},
{
"epoch": 19.112519809825674,
"grad_norm": 0.20141953229904175,
"learning_rate": 5.373663074599522e-07,
"loss": 0.0119,
"step": 12060
},
{
"epoch": 19.1283676703645,
"grad_norm": 0.19637355208396912,
"learning_rate": 5.183789430230346e-07,
"loss": 0.0136,
"step": 12070
},
{
"epoch": 19.144215530903328,
"grad_norm": 0.16335146129131317,
"learning_rate": 4.99731342691423e-07,
"loss": 0.0138,
"step": 12080
},
{
"epoch": 19.160063391442154,
"grad_norm": 0.16980670392513275,
"learning_rate": 4.814236345083156e-07,
"loss": 0.0146,
"step": 12090
},
{
"epoch": 19.17591125198098,
"grad_norm": 0.3072208762168884,
"learning_rate": 4.6345594418304996e-07,
"loss": 0.0141,
"step": 12100
},
{
"epoch": 19.19175911251981,
"grad_norm": 0.18183228373527527,
"learning_rate": 4.458283950902642e-07,
"loss": 0.014,
"step": 12110
},
{
"epoch": 19.20760697305864,
"grad_norm": 0.160085067152977,
"learning_rate": 4.285411082689927e-07,
"loss": 0.0131,
"step": 12120
},
{
"epoch": 19.223454833597465,
"grad_norm": 0.24544082581996918,
"learning_rate": 4.115942024218944e-07,
"loss": 0.011,
"step": 12130
},
{
"epoch": 19.239302694136292,
"grad_norm": 0.21625439822673798,
"learning_rate": 3.9498779391439754e-07,
"loss": 0.0135,
"step": 12140
},
{
"epoch": 19.25515055467512,
"grad_norm": 0.23580490052700043,
"learning_rate": 3.787219967739231e-07,
"loss": 0.0131,
"step": 12150
},
{
"epoch": 19.270998415213946,
"grad_norm": 0.3027212917804718,
"learning_rate": 3.627969226890959e-07,
"loss": 0.0146,
"step": 12160
},
{
"epoch": 19.286846275752772,
"grad_norm": 0.13139936327934265,
"learning_rate": 3.4721268100896265e-07,
"loss": 0.0111,
"step": 12170
},
{
"epoch": 19.3026941362916,
"grad_norm": 0.18760831654071808,
"learning_rate": 3.319693787422751e-07,
"loss": 0.0177,
"step": 12180
},
{
"epoch": 19.31854199683043,
"grad_norm": 0.1672467142343521,
"learning_rate": 3.170671205567133e-07,
"loss": 0.0183,
"step": 12190
},
{
"epoch": 19.334389857369256,
"grad_norm": 0.13311269879341125,
"learning_rate": 3.025060087782028e-07,
"loss": 0.0158,
"step": 12200
},
{
"epoch": 19.350237717908083,
"grad_norm": 0.1697445511817932,
"learning_rate": 2.8828614339018735e-07,
"loss": 0.0118,
"step": 12210
},
{
"epoch": 19.36608557844691,
"grad_norm": 0.12869006395339966,
"learning_rate": 2.744076220329628e-07,
"loss": 0.0131,
"step": 12220
},
{
"epoch": 19.381933438985737,
"grad_norm": 0.1751239150762558,
"learning_rate": 2.6087054000298874e-07,
"loss": 0.0163,
"step": 12230
},
{
"epoch": 19.397781299524564,
"grad_norm": 0.23643170297145844,
"learning_rate": 2.4767499025223904e-07,
"loss": 0.0092,
"step": 12240
},
{
"epoch": 19.41362916006339,
"grad_norm": 0.11966560781002045,
"learning_rate": 2.3482106338758025e-07,
"loss": 0.0106,
"step": 12250
},
{
"epoch": 19.429477020602217,
"grad_norm": 0.15015937387943268,
"learning_rate": 2.2230884767011628e-07,
"loss": 0.0139,
"step": 12260
},
{
"epoch": 19.445324881141048,
"grad_norm": 0.21535207331180573,
"learning_rate": 2.101384290146169e-07,
"loss": 0.0129,
"step": 12270
},
{
"epoch": 19.461172741679874,
"grad_norm": 0.19800494611263275,
"learning_rate": 1.9830989098890142e-07,
"loss": 0.0098,
"step": 12280
},
{
"epoch": 19.4770206022187,
"grad_norm": 0.18215620517730713,
"learning_rate": 1.8682331481328364e-07,
"loss": 0.0137,
"step": 12290
},
{
"epoch": 19.492868462757528,
"grad_norm": 0.18857750296592712,
"learning_rate": 1.756787793600001e-07,
"loss": 0.0127,
"step": 12300
},
{
"epoch": 19.508716323296355,
"grad_norm": 0.23549525439739227,
"learning_rate": 1.6487636115268824e-07,
"loss": 0.0144,
"step": 12310
},
{
"epoch": 19.52456418383518,
"grad_norm": 0.16930314898490906,
"learning_rate": 1.5441613436582014e-07,
"loss": 0.0136,
"step": 12320
},
{
"epoch": 19.54041204437401,
"grad_norm": 0.20466506481170654,
"learning_rate": 1.4429817082425302e-07,
"loss": 0.015,
"step": 12330
},
{
"epoch": 19.556259904912835,
"grad_norm": 0.2871796190738678,
"learning_rate": 1.3452254000267394e-07,
"loss": 0.0117,
"step": 12340
},
{
"epoch": 19.572107765451666,
"grad_norm": 0.2035956084728241,
"learning_rate": 1.2508930902517813e-07,
"loss": 0.0137,
"step": 12350
},
{
"epoch": 19.587955625990492,
"grad_norm": 0.096625417470932,
"learning_rate": 1.1599854266476918e-07,
"loss": 0.0136,
"step": 12360
},
{
"epoch": 19.60380348652932,
"grad_norm": 0.1756078600883484,
"learning_rate": 1.0725030334292064e-07,
"loss": 0.0157,
"step": 12370
},
{
"epoch": 19.619651347068146,
"grad_norm": 0.12860575318336487,
"learning_rate": 9.884465112917074e-08,
"loss": 0.0138,
"step": 12380
},
{
"epoch": 19.635499207606973,
"grad_norm": 0.2300836741924286,
"learning_rate": 9.078164374067833e-08,
"loss": 0.0155,
"step": 12390
},
{
"epoch": 19.6513470681458,
"grad_norm": 0.16416415572166443,
"learning_rate": 8.306133654185089e-08,
"loss": 0.0114,
"step": 12400
},
{
"epoch": 19.667194928684626,
"grad_norm": 0.12426438182592392,
"learning_rate": 7.568378254395047e-08,
"loss": 0.014,
"step": 12410
},
{
"epoch": 19.683042789223453,
"grad_norm": 0.16134649515151978,
"learning_rate": 6.864903240474397e-08,
"loss": 0.0129,
"step": 12420
},
{
"epoch": 19.698890649762284,
"grad_norm": 0.10580016672611237,
"learning_rate": 6.195713442812556e-08,
"loss": 0.0098,
"step": 12430
},
{
"epoch": 19.71473851030111,
"grad_norm": 0.25324514508247375,
"learning_rate": 5.560813456382818e-08,
"loss": 0.0145,
"step": 12440
},
{
"epoch": 19.730586370839937,
"grad_norm": 0.10774058848619461,
"learning_rate": 4.96020764070626e-08,
"loss": 0.0117,
"step": 12450
},
{
"epoch": 19.746434231378764,
"grad_norm": 0.14597612619400024,
"learning_rate": 4.393900119826211e-08,
"loss": 0.0203,
"step": 12460
},
{
"epoch": 19.76228209191759,
"grad_norm": 0.22840741276741028,
"learning_rate": 3.861894782276609e-08,
"loss": 0.0155,
"step": 12470
},
{
"epoch": 19.778129952456418,
"grad_norm": 0.17510531842708588,
"learning_rate": 3.3641952810559155e-08,
"loss": 0.015,
"step": 12480
},
{
"epoch": 19.793977812995244,
"grad_norm": 0.1512196958065033,
"learning_rate": 2.9008050336032376e-08,
"loss": 0.0137,
"step": 12490
},
{
"epoch": 19.80982567353407,
"grad_norm": 0.3339158296585083,
"learning_rate": 2.471727221775022e-08,
"loss": 0.0121,
"step": 12500
},
{
"epoch": 19.8256735340729,
"grad_norm": 0.17482948303222656,
"learning_rate": 2.0769647918206237e-08,
"loss": 0.0122,
"step": 12510
},
{
"epoch": 19.84152139461173,
"grad_norm": 0.19882513582706451,
"learning_rate": 1.7165204543656554e-08,
"loss": 0.0136,
"step": 12520
},
{
"epoch": 19.857369255150555,
"grad_norm": 0.2373553216457367,
"learning_rate": 1.3903966843897831e-08,
"loss": 0.0152,
"step": 12530
},
{
"epoch": 19.873217115689382,
"grad_norm": 0.17455127835273743,
"learning_rate": 1.0985957212122922e-08,
"loss": 0.0108,
"step": 12540
},
{
"epoch": 19.88906497622821,
"grad_norm": 0.10235228389501572,
"learning_rate": 8.411195684765449e-09,
"loss": 0.0143,
"step": 12550
},
{
"epoch": 19.904912836767036,
"grad_norm": 0.3095182776451111,
"learning_rate": 6.179699941349926e-09,
"loss": 0.011,
"step": 12560
},
{
"epoch": 19.920760697305862,
"grad_norm": 0.11435042321681976,
"learning_rate": 4.291485304375176e-09,
"loss": 0.0132,
"step": 12570
},
{
"epoch": 19.936608557844693,
"grad_norm": 0.22614504396915436,
"learning_rate": 2.7465647392088676e-09,
"loss": 0.0135,
"step": 12580
},
{
"epoch": 19.95245641838352,
"grad_norm": 0.22306819260120392,
"learning_rate": 1.544948854009798e-09,
"loss": 0.0141,
"step": 12590
},
{
"epoch": 19.968304278922346,
"grad_norm": 0.16913668811321259,
"learning_rate": 6.866458996279689e-10,
"loss": 0.0131,
"step": 12600
},
{
"epoch": 19.984152139461173,
"grad_norm": 0.28053924441337585,
"learning_rate": 1.7166176958238746e-10,
"loss": 0.0113,
"step": 12610
},
{
"epoch": 20.0,
"grad_norm": 0.22396335005760193,
"learning_rate": 0.0,
"loss": 0.0095,
"step": 12620
},
{
"epoch": 20.0,
"step": 12620,
"total_flos": 1.0795351858465092e+18,
"train_loss": 0.03321190329177066,
"train_runtime": 10224.5982,
"train_samples_per_second": 33.32,
"train_steps_per_second": 1.234
}
],
"logging_steps": 10,
"max_steps": 12620,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0795351858465092e+18,
"train_batch_size": 27,
"trial_name": null,
"trial_params": null
}